From c8880e1ba025e681d45834e34a1fb7847082b26a Mon Sep 17 00:00:00 2001 From: Timothy Kielan Date: Fri, 21 Feb 2025 15:16:22 +0100 Subject: [PATCH 01/23] support building for QNX --- src/prim/unix/prim.c | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 37dd873d..9e1c53ef 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -190,6 +190,8 @@ int _mi_prim_free(void* addr, size_t size ) { static int unix_madvise(void* addr, size_t size, int advice) { #if defined(__sun) int res = madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) + #elif defined(__QNX__) + int res = posix_madvise(addr, size, advice); #else int res = madvise(addr, size, advice); #endif @@ -411,7 +413,11 @@ int _mi_prim_commit(void* start, size_t size, bool* is_zero) { int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { int err = 0; // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) + #if defined(__QNX__) + err = posix_madvise(start, size, POSIX_MADV_DONTNEED); + #else err = unix_madvise(start, size, MADV_DONTNEED); + #endif #if !MI_DEBUG && !MI_SECURE *needs_recommit = false; #else @@ -443,6 +449,8 @@ int _mi_prim_reset(void* start, size_t size) { mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED); err = unix_madvise(start, size, MADV_DONTNEED); } + #elif defined(__QNX__) + int err = posix_madvise(start, size, POSIX_MADV_DONTNEED); #else int err = unix_madvise(start, size, MADV_DONTNEED); #endif From c1cbe7183680e069271cc627fc752e34e85ce79d Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 3 Mar 2025 18:08:57 -0800 Subject: [PATCH 02/23] add numa-affine allocation, and per-heap numa affinity --- include/mimalloc.h | 20 +++++++------- include/mimalloc/internal.h | 24 +++-------------- include/mimalloc/types.h | 2 ++ src/arena-meta.c | 4 +-- src/arena.c | 53 ++++++++++++++++++++++--------------- src/heap.c | 12 ++++++--- src/init.c | 5 ++++ src/os.c | 18 +++++++++---- 8 files changed, 75 insertions(+), 63 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index c821d7b4..a858008f 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -266,7 +266,7 @@ typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); -// Experimental +// Advanced mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept; @@ -279,7 +279,7 @@ mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_commi mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept; mi_decl_export void mi_arenas_print(void) mi_attr_noexcept; -// Experimental: heaps associated with specific memory arena's +// Advanced: heaps associated with specific memory arena's typedef void* mi_arena_id_t; mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; @@ -292,7 +292,7 @@ mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t a #endif -// Experimental: allow sub-processes whose memory areas stay separated (and no reclamation between them) +// Advanced: allow sub-processes whose memory areas stay separated (and no reclamation between them) // Used for 
example for separate interpreters in one process. typedef void* mi_subproc_id_t; mi_decl_export mi_subproc_id_t mi_subproc_main(void); @@ -300,10 +300,15 @@ mi_decl_export mi_subproc_id_t mi_subproc_new(void); mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc); mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) -// Experimental: visit abandoned heap areas (that are not owned by a specific heap) +// Advanced: visit abandoned heap areas (that are not owned by a specific heap) mi_decl_export bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); +// Experimental: set numa-affinity of a heap +mi_decl_export void mi_heap_set_numa_affinity(mi_heap_t* heap, int numa_node); + // Experimental: objects followed by a guard page. +// Setting the sample rate on a specific heap can be used to test parts of the program more +// specifically (in combination with `mi_heap_set_default`). // A sample rate of 0 disables guarded objects, while 1 uses a guard page for every object. // A seed of 0 uses a random start point. Only objects within the size bound are eligable for guard pages. mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed); @@ -324,13 +329,6 @@ mi_decl_export void mi_collect_reduce(size_t target_thread_owned) mi_attr_noexce // experimental -//mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size); -//mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size); -//mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size); -//mi_decl_export void mi_os_free(void* p, size_t size); -//mi_decl_export void mi_os_commit(void* p, size_t size); -//mi_decl_export void mi_os_decommit(void* p, size_t size); - mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size); mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id); mi_decl_export bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena); diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 7f610e84..b4515831 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -159,6 +159,8 @@ bool _mi_os_secure_guard_page_set_before(void* addr, bool is_pinned); bool _mi_os_secure_guard_page_reset_at(void* addr); bool _mi_os_secure_guard_page_reset_before(void* addr); +int _mi_os_numa_node(void); +size_t _mi_os_numa_node_count(void); void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); @@ -174,8 +176,8 @@ mi_arena_id_t _mi_arena_id_none(void); mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); -void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); -void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); 
+void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid); +void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid); void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); bool _mi_arenas_contain(const void* p); void _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld); @@ -1026,24 +1028,6 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) { return x; } -// ------------------------------------------------------------------- -// Optimize numa node access for the common case (= one node) -// ------------------------------------------------------------------- - -int _mi_os_numa_node_get(void); -size_t _mi_os_numa_node_count_get(void); - -extern mi_decl_hidden _Atomic(size_t) _mi_numa_node_count; -static inline int _mi_os_numa_node(void) { - if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } - else return _mi_os_numa_node_get(); -} -static inline size_t _mi_os_numa_node_count(void) { - const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count); - if mi_likely(count > 0) { return count; } - else return _mi_os_numa_node_count_get(); -} - // --------------------------------------------------------------------------------- // Provide our own `_mi_memcpy` for potential performance optimizations. diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 355293d2..daf44a22 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -424,6 +424,7 @@ typedef struct mi_padding_s { struct mi_heap_s { mi_tld_t* tld; // thread-local data mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL) + int numa_node; // preferred numa node (or -1 for no preference) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) mi_random_ctx_t random; // random number context used for secure allocation size_t page_count; // total number of pages in the `pages` queues. @@ -485,6 +486,7 @@ typedef int64_t mi_msecs_t; struct mi_tld_s { mi_threadid_t thread_id; // thread id of this thread size_t thread_seq; // thread sequence id (linear count of created threads) + int numa_node; // thread preferred numa node mi_subproc_t* subproc; // sub-process this thread belongs to. mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) diff --git a/src/arena-meta.c b/src/arena-meta.c index 530e42cb..a6cc965d 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -64,11 +64,11 @@ static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { // allocate a fresh meta page and add it to the global list. static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice - // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. + // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. (same with _mi_os_numa_node()...) mi_memid_t memid; uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0, true /* commit*/, (MI_SECURE==0) /* allow large? 
*/, - NULL /* req arena */, 0 /* thread_seq */, &memid); + NULL /* req arena */, 0 /* thread_seq */, -1 /* numa node */, &memid); if (base == NULL) return NULL; mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { diff --git a/src/arena.c b/src/arena.c index 9086c155..70e1802b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -335,12 +335,13 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ Arena iteration ----------------------------------------------------------- */ -static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_pinned) { +static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, bool match_numa, int numa_node, bool allow_pinned) { if (!allow_pinned && arena->memid.is_pinned) return false; if (!mi_arena_id_is_suitable(arena, req_arena)) return false; if (req_arena == NULL) { // if not specific, check numa affinity const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (!numa_suitable) return false; + if (match_numa) { if (!numa_suitable) return false; } + else { if (numa_suitable) return false; } } return true; } @@ -375,9 +376,9 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena } \ } -#define mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, name_arena) \ +#define mi_forall_suitable_arenas(subproc, req_arena, tseq, match_numa, numa_node, allow_large, name_arena) \ mi_forall_arenas(subproc, req_arena,tseq,name_arena) { \ - if (mi_arena_is_suitable(name_arena, req_arena, -1 /* todo: numa node */, allow_large)) { \ + if (mi_arena_is_suitable(name_arena, req_arena, match_numa, numa_node, allow_large)) { \ #define mi_forall_suitable_arenas_end() \ }} \ @@ -390,19 +391,28 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena // allocate slices from the arenas static mi_decl_noinline void* mi_arenas_try_find_free( mi_subproc_t* subproc, size_t slice_count, size_t alignment, - bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) + bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid) { mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); if (alignment > MI_ARENA_SLICE_ALIGN) return NULL; - // search arena's - mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena) + // search arena's + mi_forall_suitable_arenas(subproc, req_arena, tseq, true /* only numa matching */, numa_node, allow_large, arena) { void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); if (p != NULL) return p; } mi_forall_suitable_arenas_end(); + if (numa_node < 0) return NULL; + + // search again but now regardless of preferred numa affinity + mi_forall_suitable_arenas(subproc, req_arena, tseq, false /* numa non-matching now */, numa_node, allow_large, arena) + { + void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); + if (p != NULL) return p; + } + mi_forall_suitable_arenas_end(); return NULL; } @@ -411,14 +421,14 @@ static mi_decl_noinline void* mi_arenas_try_alloc( mi_subproc_t* subproc, size_t slice_count, size_t alignment, bool commit, bool allow_large, - mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) + mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid) { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); 
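The two-pass search in mi_arenas_try_find_free above first restricts itself to arenas on the requested numa node and only then falls back to the remaining arenas. A rough stand-alone sketch of how the match_numa flag partitions the arenas (an illustration of the logic in mi_arena_is_suitable, not the mimalloc source itself):

#include <stdbool.h>

// Pass 1 (match_numa == true) accepts arenas whose node matches the request,
// or where either side has no preference; pass 2 (match_numa == false) accepts
// exactly the arenas rejected by pass 1, so together the two passes visit every
// arena once while preferring node-local memory.
static bool numa_suitable(int arena_node, int request_node) {
  return (request_node < 0 || arena_node < 0 || arena_node == request_node);
}
static bool pass_accepts(bool match_numa, int arena_node, int request_node) {
  const bool suitable = numa_suitable(arena_node, request_node);
  return (match_numa ? suitable : !suitable);
}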
mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; // try to find free slices in the arena's - p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid); if (p != NULL) return p; // did we need a specific arena? @@ -441,7 +451,7 @@ static mi_decl_noinline void* mi_arenas_try_alloc( } // try once more to allocate in the new arena mi_assert_internal(req_arena == NULL); - p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid); if (p != NULL) return p; return NULL; @@ -472,21 +482,18 @@ static void* mi_arena_os_alloc_aligned( void* _mi_arenas_alloc_aligned( mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) + mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid) { mi_assert_internal(memid != NULL); mi_assert_internal(size > 0); - // *memid = _mi_memid_none(); - // const int numa_node = _mi_os_numa_node(&tld->os); // current numa node - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { const size_t slice_count = mi_slice_count_of_size(size); - void* p = mi_arenas_try_alloc(subproc,slice_count, alignment, commit, allow_large, req_arena, tseq, memid); + void* p = mi_arenas_try_alloc(subproc,slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid); if (p != NULL) return p; } @@ -495,9 +502,9 @@ void* _mi_arenas_alloc_aligned( mi_subproc_t* subproc, return p; } -void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) +void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid) { - return _mi_arenas_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid); + return _mi_arenas_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, numa_node, memid); } @@ -547,7 +554,9 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ // search arena's const bool allow_large = true; - mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena) + const int any_numa = -1; + const bool match_numa = true; + mi_forall_suitable_arenas(subproc, req_arena, tseq, match_numa, any_numa, allow_large, arena) { size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; @@ -582,7 +591,7 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ // Allocate a fresh page static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, - mi_arena_t* req_arena, size_t tseq, bool commit) + mi_arena_t* req_arena, size_t tseq, int numa_node, bool commit) { const bool allow_large = (MI_SECURE < 2); // 2 = guard 
page at end of each arena page const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); @@ -596,7 +605,7 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { - page = (mi_page_t*)mi_arenas_try_alloc(subproc, slice_count, page_alignment, commit, allow_large, req_arena, tseq, &memid); + page = (mi_page_t*)mi_arenas_try_alloc(subproc, slice_count, page_alignment, commit, allow_large, req_arena, tseq, numa_node, &memid); if (page != NULL) { mi_assert_internal(mi_bitmap_is_clearN(memid.mem.arena.arena->pages, memid.mem.arena.slice_index, memid.mem.arena.slice_count)); mi_bitmap_set(memid.mem.arena.arena->pages, memid.mem.arena.slice_index); @@ -727,7 +736,7 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou const long commit_on_demand = mi_option_get(mi_option_page_commit_on_demand); const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) || // always commit small pages (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 0)); - page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, commit); + page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, heap->numa_node, commit); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -749,7 +758,7 @@ static mi_page_t* mi_arenas_page_singleton_alloc(mi_heap_t* heap, size_t block_s const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, _mi_os_secure_guard_page_size()) + _mi_os_secure_guard_page_size()); #endif - mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq, true /* commit singletons always */); + mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq, heap->numa_node, true /* commit singletons always */); if (page == NULL) return NULL; mi_assert(page->reserved == 1); diff --git a/src/heap.c b/src/heap.c index 54c94179..8655bd27 100644 --- a/src/heap.c +++ b/src/heap.c @@ -182,12 +182,13 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, mi_memid_t memid = heap->memid; _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); heap->memid = memid; - heap->tld = tld; // avoid reading the thread-local tld during initialization + heap->tld = tld; // avoid reading the thread-local tld during initialization + heap->tag = heap_tag; + heap->numa_node = tld->numa_node; heap->exclusive_arena = _mi_arena_from_id(arena_id); heap->allow_page_reclaim = (!allow_destroy && mi_option_get(mi_option_page_reclaim_on_free) >= 0); heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0); heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); - heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. 
// this is checked in `free.c:mi_free_try_collect_mt` @@ -227,7 +228,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena else { // heaps associated wita a specific arena are allocated in that arena // note: takes up at least one slice which is quite wasteful... - heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); + heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, tld->numa_node, &memid); } if (heap==NULL) { _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); @@ -261,6 +262,11 @@ uintptr_t _mi_heap_random_next(mi_heap_t* heap) { return _mi_random_next(&heap->random); } +void mi_heap_set_numa_affinity(mi_heap_t* heap, int numa_node) { + if (heap == NULL) return; + heap->numa_node = (numa_node < 0 ? -1 : numa_node % _mi_os_numa_node_count()); +} + // zero out the page queues static void mi_heap_reset_pages(mi_heap_t* heap) { mi_assert_internal(heap != NULL); diff --git a/src/init.c b/src/init.c index 20c97217..0ed0198a 100644 --- a/src/init.c +++ b/src/init.c @@ -104,6 +104,7 @@ static mi_decl_cache_align mi_subproc_t subproc_main static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id 0, // thread_seq + 0, // default numa node &subproc_main, // subproc NULL, // heap_backing NULL, // heaps list @@ -117,6 +118,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = { mi_decl_cache_align const mi_heap_t _mi_heap_empty = { &tld_empty, // tld NULL, // exclusive_arena + 0, // preferred numa node 0, // cookie //{ 0, 0 }, // keys { {0}, {0}, 0, true }, // random @@ -141,6 +143,7 @@ extern mi_decl_hidden mi_decl_cache_align mi_heap_t heap_main; static mi_decl_cache_align mi_tld_t tld_main = { 0, // thread_id 0, // thread_seq + 0, // numa node &subproc_main, // subproc &heap_main, // heap_backing &heap_main, // heaps list @@ -154,6 +157,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { mi_decl_cache_align mi_heap_t heap_main = { &tld_main, // thread local data NULL, // exclusive arena + 0, // preferred numa node 0, // initial cookie //{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) { {0x846ca68b}, {0}, 0, true }, // random @@ -306,6 +310,7 @@ static mi_tld_t* mi_tld_alloc(void) { tld->heap_backing = NULL; tld->heaps = NULL; tld->subproc = &subproc_main; + tld->numa_node = _mi_os_numa_node(); tld->thread_id = _mi_prim_thread_id(); tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1); tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); diff --git a/src/os.c b/src/os.c index dcfe5ccf..69decb71 100644 --- a/src/os.c +++ b/src/os.c @@ -694,18 +694,18 @@ static void mi_os_free_huge_os_pages(void* p, size_t size) { Support NUMA aware allocation -----------------------------------------------------------------------------*/ -_Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count +static _Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count -size_t _mi_os_numa_node_count_get(void) { +size_t _mi_os_numa_node_count(void) { size_t count = mi_atomic_load_acquire(&_mi_numa_node_count); - if (count <= 0) { + if mi_unlikely(count <= 0) { long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly? 
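A minimal usage sketch of the per-heap numa affinity added in this patch (assuming the mi_heap_set_numa_affinity declaration from the mimalloc.h hunk above; the heap is owned by the caller and is eventually released with mi_heap_delete):

#include <mimalloc.h>

void* alloc_on_node(size_t size, int numa_node, mi_heap_t** out_heap) {
  mi_heap_t* heap = mi_heap_new();
  if (heap == NULL) return NULL;
  mi_heap_set_numa_affinity(heap, numa_node);  // -1 means "no preference"
  *out_heap = heap;                            // caller keeps the heap alive and deletes it later
  return mi_heap_malloc(heap, size);           // fresh pages for this heap now prefer `numa_node`
}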
if (ncount > 0) { count = (size_t)ncount; } else { count = _mi_prim_numa_node_count(); // or detect dynamically - if (count == 0) count = 1; + if (count == 0) { count = 1; } } mi_atomic_store_release(&_mi_numa_node_count, count); // save it _mi_verbose_message("using %zd numa regions\n", count); @@ -713,7 +713,8 @@ size_t _mi_os_numa_node_count_get(void) { return count; } -int _mi_os_numa_node_get(void) { + +static int mi_os_numa_node_get(void) { size_t numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 @@ -722,6 +723,13 @@ int _mi_os_numa_node_get(void) { return (int)numa_node; } +int _mi_os_numa_node(void) { + if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } + else return mi_os_numa_node_get(); +} + + + /* ---------------------------------------------------------------------------- Public API From ce74c905f877c2a8e5ccb8d1fc767e467df47175 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 3 Mar 2025 20:18:40 -0800 Subject: [PATCH 03/23] improve generic_find_free --- src/bitmap.c | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 3907e91d..2d0bb8f3 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1560,11 +1560,18 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); size_t eidx = 0; - mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) + mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) { mi_assert_internal(eidx <= MI_BFIELD_BITS); + + // don't search into non-acgcessed memory until we tried other size bins as well + if (bin < bbin && eidx >= cmap_entry_cycle) break; + + // get the chunk idx const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + mi_assert_internal(bin >= bbin || chunk_idx <= chunk_acc); + // only in the current size class! const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_relaxed(&bbitmap->chunk_bins[chunk_idx]); if ((mi_bbin_t)bin == chunk_bin || (bin == bbin && chunk_bin == MI_BBIN_NONE)) // only allow NONE at the final run From c18a5537dc94d6103f0f4bb97a7175a2a57b0abc Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Mon, 3 Mar 2025 20:50:21 -0800 Subject: [PATCH 04/23] reduce medium page block size to 64k to reducemem usage --- include/mimalloc/types.h | 4 ++-- src/bitmap.c | 2 +- src/options.c | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index daf44a22..2f76cfe6 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -343,10 +343,10 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
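The 12.5% bound follows directly from the divide-by-8 caps below: if the largest block in a page is at most page_size/8, at least 8 blocks fit in the page, so the unused tail at the end of the page is always smaller than one block, i.e. less than 1/8 = 12.5% of the page.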
#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < ~8 KiB #if MI_ENABLE_LARGE_PAGES -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < ~64 KiB #define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 512KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #else -#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/4) // <= 128 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB #define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #endif #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/bitmap.c b/src/bitmap.c index 2d0bb8f3..908562c0 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1560,7 +1560,7 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); size_t eidx = 0; - mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) + mi_bfield_cycle_iterate(cmap_entry, tseq, cmap_entry_cycle, eidx, Y) { mi_assert_internal(eidx <= MI_BFIELD_BITS); diff --git a/src/options.c b/src/options.c index bf6cf437..94cb8b67 100644 --- a/src/options.c +++ b/src/options.c @@ -175,7 +175,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this only on overcommit systems (like Linux)) + { 1, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this only on overcommit systems (like Linux)) { 16, UNINIT, MI_OPTION(page_reclaim_max) }, // don't reclaim pages if we already own N pages (in that size class) }; From 45f0b0a8a6d66ee3bc5eed73b622bb3abfaae68a Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 4 Mar 2025 07:46:10 -0800 Subject: [PATCH 05/23] remove extra verbose messages --- src/init.c | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/src/init.c b/src/init.c index 0ed0198a..0e867f8b 100644 --- a/src/init.c +++ b/src/init.c @@ -157,7 +157,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { mi_decl_cache_align mi_heap_t heap_main = { &tld_main, // thread local data NULL, // exclusive arena - 0, // preferred numa node + 0, // preferred numa node 0, // initial cookie //{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) 
{ {0x846ca68b}, {0}, 0, true }, // random @@ -690,15 +690,6 @@ void mi_process_init(void) mi_attr_noexcept { // the following two can potentially allocate (on freeBSD for locks and thread keys) mi_subproc_main_init(); mi_process_setup_auto_thread_done(); - - #if MI_DEBUG - _mi_verbose_message("debug level : %d\n", MI_DEBUG); - #endif - _mi_verbose_message("secure level: %d\n", MI_SECURE); - _mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL); - #if MI_TSAN - _mi_verbose_message("thread santizer enabled\n"); - #endif mi_thread_init(); #if defined(_WIN32) && defined(MI_WIN_USE_FLS) From 6093971bcb9bd4b07a6e6af8e32c4de2b7d1aa2b Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 4 Mar 2025 09:22:21 -0800 Subject: [PATCH 06/23] use per cmap entry size class binning --- src/arena.c | 9 ++-- src/bitmap.c | 119 ++++++++++++++++++++++++++++---------------------- src/bitmap.h | 14 ++++-- src/options.c | 2 +- 4 files changed, 83 insertions(+), 61 deletions(-) diff --git a/src/arena.c b/src/arena.c index 70e1802b..62811720 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1384,7 +1384,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, return bit_set_count; } -static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) { +static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) { _mi_raw_message("\x1B[37m%s%s%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header1, header2, header3); const size_t fields_per_line = (narrow ? 2 : 4); size_t bit_count = 0; @@ -1400,11 +1400,12 @@ static size_t mi_debug_show_chunks(const char* header1, const char* header2, con char chunk_kind = ' '; if (chunk_bins != NULL) { - switch (mi_atomic_load_relaxed(&chunk_bins[i])) { + switch (mi_bbitmap_debug_get_bin(chunk_bins,i)) { case MI_BBIN_SMALL: chunk_kind = 'S'; break; case MI_BBIN_MEDIUM: chunk_kind = 'M'; break; case MI_BBIN_LARGE: chunk_kind = 'L'; break; case MI_BBIN_OTHER: chunk_kind = 'X'; break; + default: chunk_kind = ' '; break; // suppress warning // case MI_BBIN_NONE: chunk_kind = 'N'; break; } } @@ -1441,7 +1442,7 @@ static size_t mi_debug_show_chunks(const char* header1, const char* header2, con return bit_set_count; } -static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count, mi_bitmap_t* bitmap, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) { +static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count, mi_bitmap_t* bitmap, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) { return mi_debug_show_chunks(header1, header2, header3, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena, narrow); } @@ -1472,7 +1473,7 @@ static void mi_debug_show_arenas_ex(bool show_pages, bool narrow) mi_attr_noexce const char* header1 = "pages (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:meta-data, ~:free-purgable, _:free-committed, .:free-reserved)"; const char* header2 = (narrow ? 
"\n " : " "); const char* header3 = "(chunk bin: S:small, M : medium, L : large, X : other)"; - page_total += mi_debug_show_bitmap_binned(header1, header2, header3, arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena, narrow); + page_total += mi_debug_show_bitmap_binned(header1, header2, header3, arena->slice_count, arena->pages, arena->slices_free->chunkmap_bins, false, arena, narrow); } } // if (show_inuse) _mi_raw_message("total inuse slices : %zu\n", slice_total - free_total); diff --git a/src/bitmap.c b/src/bitmap.c index 908562c0..a2e29645 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -218,39 +218,39 @@ static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b, bool* all // ------- mi_bfield_atomic_is_set --------------------------------------- // Check if a bit is set -static inline bool mi_bfield_atomic_is_set(_Atomic(mi_bfield_t)*b, const size_t idx) { +static inline bool mi_bfield_atomic_is_set(const _Atomic(mi_bfield_t)*b, const size_t idx) { const mi_bfield_t x = mi_atomic_load_relaxed(b); return ((x & mi_bfield_mask(1,idx)) != 0); } // Check if a bit is clear -static inline bool mi_bfield_atomic_is_clear(_Atomic(mi_bfield_t)*b, const size_t idx) { +static inline bool mi_bfield_atomic_is_clear(const _Atomic(mi_bfield_t)*b, const size_t idx) { const mi_bfield_t x = mi_atomic_load_relaxed(b); return ((x & mi_bfield_mask(1, idx)) == 0); } // Check if a bit is xset -static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, const size_t idx) { +static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, const _Atomic(mi_bfield_t)*b, const size_t idx) { if (set) return mi_bfield_atomic_is_set(b, idx); else return mi_bfield_atomic_is_clear(b, idx); } // Check if all bits corresponding to a mask are set. -static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_set_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) { mi_assert_internal(mask != 0); const mi_bfield_t x = mi_atomic_load_relaxed(b); return ((x & mask) == mask); } // Check if all bits corresponding to a mask are clear. -static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_clear_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) { mi_assert_internal(mask != 0); const mi_bfield_t x = mi_atomic_load_relaxed(b); return ((x & mask) == 0); } // Check if all bits corresponding to a mask are set/cleared. -static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) { mi_assert_internal(mask != 0); if (set) return mi_bfield_atomic_is_set_mask(b, mask); else return mi_bfield_atomic_is_clear_mask(b, mask); @@ -371,7 +371,7 @@ static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, b // Check if a sequence of `n` bits within a chunk are all set/cleared. 
// This can cross bfield's -mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) { +mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, const mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) { mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS); while (n > 0) { size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field @@ -391,7 +391,7 @@ mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, mi_bchunk_t* chu } // Check if a sequence of `n` bits within a chunk are all set/cleared. -static inline bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) { +static inline bool mi_bchunk_is_xsetN(mi_xset_t set, const mi_bchunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; @@ -1413,7 +1413,23 @@ void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { // Assign a specific size bin to a chunk static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) { mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); - mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin); + for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin < MI_BBIN_NONE; ibin = mi_bbin_inc(ibin)) { + if (ibin == bin) { + mi_bchunk_set(& bbitmap->chunkmap_bins[ibin], chunk_idx, NULL); + } + else { + mi_bchunk_clear(&bbitmap->chunkmap_bins[ibin], chunk_idx, NULL); + } + } +} + +mi_bbin_t mi_bbitmap_debug_get_bin(const mi_bchunkmap_t* chunkmap_bins, size_t chunk_idx) { + for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin < MI_BBIN_NONE; ibin = mi_bbin_inc(ibin)) { + if (mi_bchunk_is_xsetN(MI_BIT_SET, &chunkmap_bins[ibin], chunk_idx, 1)) { + return ibin; + } + } + return MI_BBIN_NONE; } // Track the index of the highest chunk that is accessed. @@ -1542,62 +1558,59 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); const size_t cmap_cycle = cmap_acc+1; const mi_bbin_t bbin = mi_bbin_of(n); - // visit bins from smallest to largest (to reduce fragmentation on the larger blocks) - for(mi_bbin_t bin = MI_BBIN_SMALL; bin <= bbin; bin = mi_bbin_inc(bin)) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL - // (int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // visit bins from largest size bin up to the NONE bin + // visit each cmap entry + size_t cmap_idx = 0; + mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { - size_t cmap_idx = 0; - mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) + // and for each chunkmap entry we iterate over its bits to find the chunks + const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); + const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? 
MI_BFIELD_BITS : cmap_acc_bits); + if (cmap_entry == 0) continue; + + // get size bin masks + mi_bfield_t cmap_bins[MI_BBIN_COUNT] = { 0 }; + cmap_bins[MI_BBIN_NONE] = cmap_entry; + for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin < MI_BBIN_NONE; ibin = mi_bbin_inc(ibin)) { + const mi_bfield_t cmap_bin = mi_atomic_load_relaxed(&bbitmap->chunkmap_bins[ibin].bfields[cmap_idx]); + cmap_bins[ibin] = cmap_bin & cmap_entry; + cmap_bins[MI_BBIN_NONE] &= ~cmap_bin; // clear bits that are in an assigned size bin + } + + // consider only chunks for a particular size bin at a time + for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin <= MI_BBIN_NONE; + // skip from bbin to NONE (so, say, a SMALL will never be placed in a OTHER, MEDIUM, or LARGE chunk to reduce fragmentation) + ibin = (ibin == bbin ? MI_BBIN_NONE : mi_bbin_inc(ibin))) { - // don't search into non-accessed memory until we tried other size bins as well - if (bin < bbin && cmap_idx > cmap_acc) - // (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) // large to small - { - break; - } - - // and for each chunkmap entry we iterate over its bits to find the chunks - const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); - const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); + const mi_bfield_t cmap_bin = cmap_bins[ibin]; size_t eidx = 0; - mi_bfield_cycle_iterate(cmap_entry, tseq, cmap_entry_cycle, eidx, Y) + mi_bfield_cycle_iterate(cmap_bin, tseq, cmap_entry_cycle, eidx, Y) { - mi_assert_internal(eidx <= MI_BFIELD_BITS); - - // don't search into non-acgcessed memory until we tried other size bins as well - if (bin < bbin && eidx >= cmap_entry_cycle) break; + // assertion doesn't quite hold as the max_accessed may be out-of-date + // mi_assert_internal(cmap_entry_cycle > eidx || ibin == MI_BBIN_NONE); - // get the chunk idx + // get the chunk const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; - mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); - mi_assert_internal(bin >= bbin || chunk_idx <= chunk_acc); - - // only in the current size class! - const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_relaxed(&bbitmap->chunk_bins[chunk_idx]); - if ((mi_bbin_t)bin == chunk_bin || (bin == bbin && chunk_bin == MI_BBIN_NONE)) // only allow NONE at the final run - // ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { largest to smallest - { - mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; - size_t cidx; - if ((*on_find)(chunk, n, &cidx)) { - if (cidx==0 && chunk_bin == MI_BBIN_NONE) { // only the first determines the size bin - // this chunk is now reserved for the `bbin` size class - mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin); - } - *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap)); - return true; - } - else { - /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ - mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); + mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; + + size_t cidx; + if ((*on_find)(chunk, n, &cidx)) { + if (cidx==0 && ibin == MI_BBIN_NONE) { // only the first block determines the size bin + // this chunk is now reserved for the `bbin` size class + mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin); } + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap)); + return true; + } + else { + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ + mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } } mi_bfield_cycle_iterate_end(Y); } - mi_bfield_cycle_iterate_end(X); } + mi_bfield_cycle_iterate_end(X); return false; } diff --git a/src/bitmap.h b/src/bitmap.h index b17d83e5..e797bd8e 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -215,18 +215,24 @@ bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* vis // Size bins; larger bins are allowed to go into smaller bins. // SMALL can only be in small (and NONE), so they cannot fragment the larger bins. typedef enum mi_bbin_e { - MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free) MI_BBIN_SMALL, // slice_count == 1 MI_BBIN_OTHER, // slice_count: any other from the other bins, and 1 <= slice_count <= MI_BCHUNK_BITS MI_BBIN_MEDIUM, // slice_count == 8 MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS -- only used if MI_ENABLE_LARGE_PAGES is 1 + MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free) MI_BBIN_COUNT } mi_bbin_t; static inline mi_bbin_t mi_bbin_inc(mi_bbin_t bbin) { + mi_assert_internal(bbin < MI_BBIN_COUNT); return (mi_bbin_t)((int)bbin + 1); } +static inline mi_bbin_t mi_bbin_dec(mi_bbin_t bbin) { + mi_assert_internal(bbin > MI_BBIN_NONE); + return (mi_bbin_t)((int)bbin - 1); +} + static inline mi_bbin_t mi_bbin_of(size_t slice_count) { if (slice_count==1) return MI_BBIN_SMALL; if (slice_count==8) return MI_BBIN_MEDIUM; @@ -241,8 +247,8 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s { _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc - mi_bchunkmap_t chunkmap; - _Atomic(uint8_t) chunk_bins[MI_BITMAP_MAX_CHUNK_COUNT]; // 512b + mi_bchunkmap_t chunkmap; + mi_bchunkmap_t chunkmap_bins[MI_BBIN_COUNT - 1]; // chunkmaps with bit set if the chunk is in that size class (except MI_BBIN_NONE) mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT } mi_bbitmap_t; @@ -255,6 +261,8 @@ static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) { return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS); } +mi_bbin_t mi_bbitmap_debug_get_bin(const mi_bchunk_t* chunkmap_bins, size_t chunk_idx); + size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count); diff --git a/src/options.c b/src/options.c index 94cb8b67..e8eb85ad 100644 --- a/src/options.c +++ b/src/options.c @@ -202,7 +202,7 @@ void _mi_options_init(void) { } } #endif - if (!mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } + if (mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } } #define mi_stringifyx(str) #str // and stringify From 1c6b40d8bd7a3fd77f4590f50e88bf4d040a5375 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 4 Mar 2025 09:22:41 -0800 Subject: [PATCH 
07/23] fix verbose option printing --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index 94cb8b67..e8eb85ad 100644 --- a/src/options.c +++ b/src/options.c @@ -202,7 +202,7 @@ void _mi_options_init(void) { } } #endif - if (!mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } + if (mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } } #define mi_stringifyx(str) #str // and stringify From 119f2eff6c61eb4f0c315773944b914af47d48ef Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Mar 2025 09:51:40 -0800 Subject: [PATCH 08/23] use int for numa node count --- include/mimalloc/internal.h | 2 +- src/arena.c | 10 +++++----- src/os.c | 20 +++++++++++--------- 3 files changed, 17 insertions(+), 15 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index b4515831..e8b1c919 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -160,7 +160,7 @@ bool _mi_os_secure_guard_page_reset_at(void* addr); bool _mi_os_secure_guard_page_reset_before(void* addr); int _mi_os_numa_node(void); -size_t _mi_os_numa_node_count(void); +int _mi_os_numa_node_count(void); void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); diff --git a/src/arena.c b/src/arena.c index 70e1802b..2aa1f8fe 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1524,17 +1524,17 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t if (pages == 0) return 0; // pages per numa node - size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); - if (numa_count <= 0) numa_count = 1; + int numa_count = (numa_nodes > 0 && numa_nodes <= INT_MAX ? (int)numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) { numa_count = 1; } const size_t pages_per = pages / numa_count; const size_t pages_mod = pages % numa_count; const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); // reserve evenly among numa nodes - for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { size_t node_pages = pages_per; // can be 0 - if (numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); + if ((size_t)numa_node < pages_mod) { node_pages++; } + int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per); if (err) return err; if (pages < node_pages) { pages = 0; diff --git a/src/os.c b/src/os.c index 69decb71..2ee4c897 100644 --- a/src/os.c +++ b/src/os.c @@ -694,18 +694,19 @@ static void mi_os_free_huge_os_pages(void* p, size_t size) { Support NUMA aware allocation -----------------------------------------------------------------------------*/ -static _Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count +static _Atomic(int) _mi_numa_node_count; // = 0 // cache the node count -size_t _mi_os_numa_node_count(void) { +int _mi_os_numa_node_count(void) { size_t count = mi_atomic_load_acquire(&_mi_numa_node_count); if mi_unlikely(count <= 0) { long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly? 
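A usage sketch of the interleaved reservation that this hunk adjusts (per the code above, passing 0 for numa_nodes means "spread over all detected nodes", and the timeout budget is divided evenly among them):

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // try to reserve 8 x 1GiB huge OS pages, interleaved over all numa nodes,
  // giving up after roughly 2 seconds in total
  int err = mi_reserve_huge_os_pages_interleave(8, 0, 2000);
  if (err != 0) { fprintf(stderr, "huge page reservation failed (%d)\n", err); }
  return 0;
}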
- if (ncount > 0) { - count = (size_t)ncount; + if (ncount > 0 && ncount < INT_MAX) { + count = (int)ncount; } else { - count = _mi_prim_numa_node_count(); // or detect dynamically - if (count == 0) { count = 1; } + const size_t n = _mi_prim_numa_node_count(); // or detect dynamically + if (n == 0 || n > INT_MAX) { count = 1; } + else { count = (int)n; } } mi_atomic_store_release(&_mi_numa_node_count, count); // save it _mi_verbose_message("using %zd numa regions\n", count); @@ -715,12 +716,13 @@ size_t _mi_os_numa_node_count(void) { static int mi_os_numa_node_get(void) { - size_t numa_count = _mi_os_numa_node_count(); + int numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 - size_t numa_node = _mi_prim_numa_node(); + const size_t n = _mi_prim_numa_node(); + int numa_node = (n < INT_MAX ? (int)n : 0); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } - return (int)numa_node; + return numa_node; } int _mi_os_numa_node(void) { From 5c6ab532d98d2c617a7c21849d60c8e06a618a76 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Wed, 5 Mar 2025 15:37:37 -0800 Subject: [PATCH 09/23] make MI_OPT_ARCH by default OFF except for arm64 where we assume v8.1-a for fast atomics --- CMakeLists.txt | 77 +++++++++++++++++++++++--------------------------- 1 file changed, 36 insertions(+), 41 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 68153468..17e8d3f5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhea option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF) option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) -option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for arm64: '-march=armv8.1-a' (2016))" ON) +option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for arm64: '-march=armv8.1-a' (2016))" OFF) option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) @@ -116,9 +116,44 @@ if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$") set(MI_SECURE "ON") endif() + +# Determine architecture +set(MI_OPT_ARCH_FLAGS "") +set(MI_ARCH "unknown") +if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|i[3456]86)$" OR CMAKE_GENERATOR_PLATFORM MATCHES "^(x86|Win32)$") + set(MI_ARCH "x86") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64" OR "x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) # must be before arm64 + set(MI_ARCH "x64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv[89].?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) + set(MI_ARCH "arm64") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567]|ARM)$") + set(MI_ARCH "arm32") +elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$") + if(CMAKE_SIZEOF_VOID_P==4) + set(MI_ARCH "riscv32") + else() + set(MI_ARCH "riscv64") + endif() +else() + set(MI_ARCH ${CMAKE_SYSTEM_PROCESSOR}) +endif() +message(STATUS "Architecture: ${MI_ARCH}") # (${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_GENERATOR_PLATFORM}, ${CMAKE_GENERATOR})") 
+ +# negative overrides (mainly to support vcpkg features) +if(MI_NO_USE_CXX) + set(MI_USE_CXX "OFF") +endif() +if(MI_NO_OPT_ARCH) + set(MI_OPT_ARCH "OFF") +elseif(MI_ARCH STREQUAL "arm64") + set(MI_OPT_ARCH "ON") # enable armv8.1-a by default on arm64 unless MI_NO_OPT_ARCH is set +endif() + + # ----------------------------------------------------------------------------- # Process options # ----------------------------------------------------------------------------- + if(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC") set(MI_CLANG_CL "ON") endif() @@ -138,27 +173,10 @@ if(CMAKE_C_COMPILER_ID MATCHES "Intel") list(APPEND mi_cflags -Wall) endif() -# negative overrides (mainly to support vcpkg features) -if(MI_NO_USE_CXX) - set(MI_USE_CXX "OFF") -endif() -if(MI_NO_OPT_ARCH) - set(MI_OPT_ARCH "OFF") -endif() - - if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel") set(MI_USE_CXX "ON") endif() -if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo") - if (NOT MI_OPT_ARCH) - message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)") - endif() -else() - set(MI_OPT_ARCH OFF) -endif() - if(MI_OVERRIDE) message(STATUS "Override standard malloc (MI_OVERRIDE=ON)") if(APPLE) @@ -365,28 +383,6 @@ if(MI_WIN_USE_FIXED_TLS) list(APPEND mi_defines MI_WIN_USE_FIXED_TLS=1) endif() -# Determine architecture -set(MI_OPT_ARCH_FLAGS "") -set(MI_ARCH "unknown") -if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|i[3456]86)$" OR CMAKE_GENERATOR_PLATFORM MATCHES "^(x86|Win32)$") - set(MI_ARCH "x86") -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64" OR "x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) # must be before arm64 - set(MI_ARCH "x64") -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv[89].?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) - set(MI_ARCH "arm64") -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567]|ARM)$") - set(MI_ARCH "arm32") -elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$") - if(CMAKE_SIZEOF_VOID_P==4) - set(MI_ARCH "riscv32") - else() - set(MI_ARCH "riscv64") - endif() -else() - set(MI_ARCH ${CMAKE_SYSTEM_PROCESSOR}) -endif() -message(STATUS "Architecture: ${MI_ARCH}") # (${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_GENERATOR_PLATFORM}, ${CMAKE_GENERATOR})") - # Check /proc/cpuinfo for an SV39 MMU and limit the virtual address bits. # (this will skip the aligned hinting in that case. 
Issue #939, #949) if (EXISTS /proc/cpuinfo) @@ -439,7 +435,6 @@ endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") if(MI_OPT_ARCH) if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999) - set(MI_OPT_ARCH_FLAGS "") if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a") endif() From 8f40bed0864364adcf34f6bbf13bd8b8962a15a4 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Mar 2025 15:48:57 -0800 Subject: [PATCH 10/23] fix erms detection --- src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index 4ddc5bd1..215eed20 100644 --- a/src/init.c +++ b/src/init.c @@ -600,7 +600,7 @@ static void mi_detect_cpu_features(void) { int32_t cpu_info[4]; __cpuid(cpu_info, 7); _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see - _mi_cpu_has_erms = ((cpu_info[2] & (1 << 9)) != 0); // bit 9 of ECX : see + _mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see } #else static void mi_detect_cpu_features(void) { From ad52fc1b7e1ef0f33696c5d89e5dbc7dcef772d8 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Mar 2025 16:09:22 -0800 Subject: [PATCH 11/23] fix type --- src/os.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/os.c b/src/os.c index 2ee4c897..01ec2c46 100644 --- a/src/os.c +++ b/src/os.c @@ -697,7 +697,7 @@ Support NUMA aware allocation static _Atomic(int) _mi_numa_node_count; // = 0 // cache the node count int _mi_os_numa_node_count(void) { - size_t count = mi_atomic_load_acquire(&_mi_numa_node_count); + int count = mi_atomic_load_acquire(&_mi_numa_node_count); if mi_unlikely(count <= 0) { long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly? if (ncount > 0 && ncount < INT_MAX) { From 7e721c881b01c96137c8b246bececff47a5dde20 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 6 Mar 2025 16:50:56 -0800 Subject: [PATCH 12/23] add comments --- src/bitmap.c | 11 ++++++++--- src/bitmap.h | 2 +- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index a2e29645..f7f94ddb 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1557,7 +1557,7 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); const size_t cmap_cycle = cmap_acc+1; - const mi_bbin_t bbin = mi_bbin_of(n); + const mi_bbin_t bbin = mi_bbin_of(n); // visit each cmap entry size_t cmap_idx = 0; mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) @@ -1576,11 +1576,15 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, cmap_bins[MI_BBIN_NONE] &= ~cmap_bin; // clear bits that are in an assigned size bin } - // consider only chunks for a particular size bin at a time + // consider only chunks for a particular size bin at a time + // this picks the best bin only within a cmap entry (~ 1GiB address space), but avoids multiple + // iterations through all entries. + mi_assert_internal(bbin < MI_BBIN_NONE); for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin <= MI_BBIN_NONE; // skip from bbin to NONE (so, say, a SMALL will never be placed in a OTHER, MEDIUM, or LARGE chunk to reduce fragmentation) ibin = (ibin == bbin ? 
MI_BBIN_NONE : mi_bbin_inc(ibin))) { + mi_assert_internal(ibin < MI_BBIN_COUNT); const mi_bfield_t cmap_bin = cmap_bins[ibin]; size_t eidx = 0; mi_bfield_cycle_iterate(cmap_bin, tseq, cmap_entry_cycle, eidx, Y) @@ -1603,7 +1607,8 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, return true; } else { - /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ + // todo: should _on_find_ return a boolen if there is a chance all are clear to avoid calling `try_clear?` + // we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } } diff --git a/src/bitmap.h b/src/bitmap.h index e797bd8e..0237d005 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -248,7 +248,7 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s { _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc mi_bchunkmap_t chunkmap; - mi_bchunkmap_t chunkmap_bins[MI_BBIN_COUNT - 1]; // chunkmaps with bit set if the chunk is in that size class (except MI_BBIN_NONE) + mi_bchunkmap_t chunkmap_bins[MI_BBIN_COUNT - 1]; // chunkmaps with bit set if the chunk is in that size class (excluding MI_BBIN_NONE) mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT } mi_bbitmap_t; From d9580f3bfb4491f21fb7b40d54ca3d93465a7902 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 6 Mar 2025 18:54:04 -0800 Subject: [PATCH 13/23] update popcnt to be more efficient on x64 even without MI_OPT_ARCH=ON --- ide/vs2022/mimalloc-lib.vcxproj | 2 +- include/mimalloc/bits.h | 35 +++++++++++++++++++++++----- include/mimalloc/internal.h | 23 ++---------------- src/init.c | 41 +++++++++++++++++++++++++++------ src/libc.c | 6 ++--- 5 files changed, 68 insertions(+), 39 deletions(-) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index b4bf013e..c294ea0e 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -316,7 +316,7 @@ CompileAsCpp true stdcpp20 - AdvancedVectorExtensions2 + StreamingSIMDExtensions /Zc:__cplusplus %(AdditionalOptions) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index fc56e8ea..2debaf25 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -90,7 +90,7 @@ typedef int32_t mi_ssize_t; #endif #endif -#if MI_ARCH_X64 && defined(__AVX2__) +#if (MI_ARCH_X86 || MI_ARCH_X64) #include #elif MI_ARCH_ARM64 && MI_OPT_SIMD #include @@ -134,6 +134,18 @@ typedef int32_t mi_ssize_t; Builtin's -------------------------------------------------------------------------------- */ +#if defined(__GNUC__) || defined(__clang__) +#define mi_unlikely(x) (__builtin_expect(!!(x),false)) +#define mi_likely(x) (__builtin_expect(!!(x),true)) +#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) +#define mi_unlikely(x) (x) [[unlikely]] +#define mi_likely(x) (x) [[likely]] +#else +#define mi_unlikely(x) (x) +#define mi_likely(x) (x) +#endif + + #ifndef __has_builtin #define __has_builtin(x) 0 #endif @@ -171,14 +183,25 @@ typedef int32_t mi_ssize_t; -------------------------------------------------------------------------------- */ size_t _mi_popcount_generic(size_t x); +extern bool _mi_cpu_has_popcnt; static inline size_t mi_popcount(size_t x) { - #if 
mi_has_builtinz(popcount) + #if defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) + #if !defined(__BMI1__) + if mi_unlikely(!_mi_cpu_has_popcnt) { return _mi_popcount_generic(x); } + #endif + size_t r; + __asm ("popcnt\t%1,%0" : "=r"(r) : "r"(x) : "cc"); + return r; + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86) + #if !defined(__BMI1__) + if mi_unlikely(!_mi_cpu_has_popcnt) { return _mi_popcount_generic(x); } + #endif + return (size_t)mi_msc_builtinz(__popcnt)(x); + #elif defined(_MSC_VER) && MI_ARCH_ARM64 + return (size_t)mi_msc_builtinz(__popcnt)(x); + #elif mi_has_builtinz(popcount) return mi_builtinz(popcount)(x); - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - return mi_msc_builtinz(__popcnt)(x); - #elif MI_ARCH_X64 && defined(__BMI1__) - return (size_t)_mm_popcnt_u64(x); #else #define MI_HAS_FAST_POPCOUNT 0 return (x<=1 ? x : _mi_popcount_generic(x)); diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e8b1c919..8a880b8d 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -256,25 +256,6 @@ bool _mi_page_is_valid(mi_page_t* page); #endif -// ------------------------------------------------------ -// Branches -// ------------------------------------------------------ - -#if defined(__GNUC__) || defined(__clang__) -#define mi_unlikely(x) (__builtin_expect(!!(x),false)) -#define mi_likely(x) (__builtin_expect(!!(x),true)) -#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) -#define mi_unlikely(x) (x) [[unlikely]] -#define mi_likely(x) (x) [[likely]] -#else -#define mi_unlikely(x) (x) -#define mi_likely(x) (x) -#endif - -#ifndef __has_builtin -#define __has_builtin(x) 0 -#endif - /* ----------------------------------------------------------- Assertions @@ -1037,10 +1018,10 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) { // (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253. 
// ---------------------------------------------------------------------------------
 
-#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
-#include <intrin.h>
+#if !MI_TRACK_ENABLED && defined(_WIN32) && (MI_ARCH_X64 || MI_ARCH_X86)
 extern bool _mi_cpu_has_fsrm;
 extern bool _mi_cpu_has_erms;
+
 static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
   if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) {
     __movsb((unsigned char*)dst, (const unsigned char*)src, n);
diff --git a/src/init.c b/src/init.c
index f9678cc5..54905dc8 100644
--- a/src/init.c
+++ b/src/init.c
@@ -652,25 +652,52 @@ void _mi_process_load(void) {
   _mi_random_reinit_if_weak(&heap_main.random);
 }
 
-#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
-#include <intrin.h>
+// CPU features
 mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
 mi_decl_cache_align bool _mi_cpu_has_erms = false;
+mi_decl_cache_align bool _mi_cpu_has_popcnt = false;
+
+#if (MI_ARCH_X64 || MI_ARCH_X86)
+#if defined(__GNUC__)
+#include <cpuid.h>
+static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
+  return (__get_cpuid(level, &regs4[0], &regs4[1], &regs4[2], &regs4[3]) == 1);
+}
+
+#elif defined(_MSC_VER)
+static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
+  __cpuid((int32_t*)regs4, (int32_t)level);
+  return true;
+}
+#else
+static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
+  MI_UNUSED(regs4); MI_UNUSED(level);
+  return false;
+}
+#endif
 
 static void mi_detect_cpu_features(void) {
   // FSRM for fast short rep movsb/stosb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
   // EMRS for fast enhanced rep movsb/stosb support
-  int32_t cpu_info[4];
-  __cpuid(cpu_info, 7);
-  _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see
-  _mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see
+  uint32_t cpu_info[4];
+  if (mi_cpuid(cpu_info, 7)) {
+    _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see
+    _mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see
+  }
+  if (mi_cpuid(cpu_info, 1)) {
+    _mi_cpu_has_popcnt = ((cpu_info[2] & (1 << 23)) != 0); // bit 23 of ECX : see
+  }
 }
+
 #else
 static void mi_detect_cpu_features(void) {
-  // nothing
+  #if MI_ARCH_ARM64
+  _mi_cpu_has_popcnt = true;
+  #endif
 }
 #endif
 
+
 // Initialize the process; called by thread_init or the process loader
 void mi_process_init(void) mi_attr_noexcept {
   // ensure we are called once
diff --git a/src/libc.c b/src/libc.c
index b18dff2c..a54eec5b 100644
--- a/src/libc.c
+++ b/src/libc.c
@@ -355,7 +355,6 @@ size_t _mi_clz_generic(size_t x) {
 #endif
 
 // bit scan
-#if !MI_HAS_FAST_POPCOUNT
 
 #if MI_SIZE_SIZE == 4
 #define mi_mask_even_bits32      (0x55555555)
@@ -383,7 +382,7 @@ static size_t mi_popcount_generic32(uint32_t x) {
   return mi_byte_sum32(x);
 }
 
-size_t _mi_popcount_generic(size_t x) {
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
   return mi_popcount_generic32(x);
 }
 
@@ -407,9 +406,8 @@ static size_t mi_popcount_generic64(uint64_t x) {
   return mi_byte_sum64(x);
 }
 
-size_t _mi_popcount_generic(size_t x) {
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
  return mi_popcount_generic64(x);
 }
 #endif
-#endif // popcount

From 7d6304347e1d73cd9b4165c71f307dabbdda7eda Mon Sep 17 00:00:00 2001
From: Daan
Date: Thu, 6 Mar 2025 19:09:51 -0800
Subject: [PATCH 14/23] revert back to _WIN32_WINNT=x0600 as we dynamically
 check for GetPhysicalInstalledMemory now (issue #976)

---
 CMakeLists.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 
17e8d3f5..0d780fa1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -454,7 +454,7 @@ if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) endif() if(MINGW) - add_definitions(-D_WIN32_WINNT=0x601) # issue #976 + add_definitions(-D_WIN32_WINNT=0x600) # issue #976 endif() if(MI_OPT_ARCH_FLAGS) From e1fde6b5ce040f36bdd4b44c92c18a059370bcaa Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Mar 2025 19:21:30 -0800 Subject: [PATCH 15/23] update vcpkg file --- contrib/vcpkg/portfile.cmake | 6 +++--- contrib/vcpkg/vcpkg.json | 9 +++++++++ 2 files changed, 12 insertions(+), 3 deletions(-) diff --git a/contrib/vcpkg/portfile.cmake b/contrib/vcpkg/portfile.cmake index f5f39009..69661526 100644 --- a/contrib/vcpkg/portfile.cmake +++ b/contrib/vcpkg/portfile.cmake @@ -18,6 +18,8 @@ vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS guarded MI_GUARDED secure MI_SECURE override MI_OVERRIDE + optarch MI_OPT_ARCH + optsimd MI_OPT_SIMD xmalloc MI_XMALLOC asm MI_SEE_ASM ) @@ -26,16 +28,14 @@ string(COMPARE EQUAL "${VCPKG_LIBRARY_LINKAGE}" "dynamic" MI_BUILD_SHARED) vcpkg_cmake_configure( SOURCE_PATH "${SOURCE_PATH}" - OPTIONS_RELEASE - -DMI_OPT_ARCH=ON OPTIONS -DMI_USE_CXX=ON -DMI_BUILD_TESTS=OFF -DMI_BUILD_OBJECT=ON - ${FEATURE_OPTIONS} -DMI_BUILD_STATIC=${MI_BUILD_STATIC} -DMI_BUILD_SHARED=${MI_BUILD_SHARED} -DMI_INSTALL_TOPLEVEL=ON + ${FEATURE_OPTIONS} ) vcpkg_cmake_install() diff --git a/contrib/vcpkg/vcpkg.json b/contrib/vcpkg/vcpkg.json index bdbe9ba1..45f8097b 100644 --- a/contrib/vcpkg/vcpkg.json +++ b/contrib/vcpkg/vcpkg.json @@ -26,9 +26,18 @@ "secure": { "description": "Use full security mitigations (like guard pages and randomization)" }, + "guarded": { + "description": "Use build that support guard pages after objects controlled with MIMALLOC_GUARDED_SAMPLE_RATE" + }, "xmalloc": { "description": "If out-of-memory, call abort() instead of returning NULL" }, + "optarch": { + "description": "Use architecture specific optimizations (on x64: '-march=haswell;-mavx2', on arm64: '-march=armv8.1-a')" + }, + "optsimd": { + "description": "Allow use of SIMD instructions (avx2 or neon) (requires 'optarch' to be enabled)" + }, "asm": { "description": "Generate assembly files" } From 0f60b12769234614e7477d5e4d653556db10c8d4 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Mar 2025 19:40:40 -0800 Subject: [PATCH 16/23] prepare readme for new release --- readme.md | 32 +++++++++++++++++++------------- 1 file changed, 19 insertions(+), 13 deletions(-) diff --git a/readme.md b/readme.md index 30875b63..1ea10883 100644 --- a/readme.md +++ b/readme.md @@ -12,8 +12,9 @@ is a general purpose allocator with excellent [performance](#performance) charac Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. -Latest release tag: `v2.1.9` (2025-01-03). -Latest v1 tag: `v1.8.9` (2024-01-03). +Latest release : `v3.0.2` (beta) (2025-03-06) +Latest v2 release: `v2.2.2` (2025-03-06). +Latest v1 release: `v1.9.2` (2024-03-06). mimalloc is a drop-in replacement for `malloc` and can be used in other programs without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as: @@ -71,17 +72,22 @@ Enjoy! ### Branches -* `master`: latest stable release (based on `dev2`). -* `dev`: development branch for mimalloc v1. Use this branch for submitting PR's. +* `master`: latest stable release (still based on `dev2`). +* `dev`: development branch for mimalloc v1. 
Use this branch for submitting PR's. * `dev2`: development branch for mimalloc v2. This branch is downstream of `dev` - (and is essentially equal to `dev` except for `src/segment.c`). Uses larger sliced segments to manage - mimalloc pages what can reduce fragmentation. -* `dev3`: development branch for mimalloc v3-alpha. This branch is downstream of `dev`. This is still experimental, - but simplifies previous versions by having no segments any more. This improves sharing of memory - between threads, and on certain large workloads uses less memory with less fragmentation. + (and is essentially equal to `dev` except for `src/segment.c`). Uses larger sliced segments to manage + mimalloc pages that can reduce fragmentation. +* `dev3`: development branch for mimalloc v3-beta. This branch is downstream of `dev`. This version + simplifies the lock-free ownership of previous versions, has no thread-local segments any more. + This improves sharing of memory between threads, and on certain large workloads may use less memory + with less fragmentation. ### Releases +* 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes. + Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`. + Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile. + On Windows, use `mimalloc.lib` for the static library, and `mimalloc.dll` for the dynamic override (which used to be `mimalloc-override.dll`) -- and use `mimalloc-dll.lib` for the export library of `mimalloc.dll`. Upgrade redirect to v1.3.2. * 2025-01-03, `v1.8.9`, `v2.1.9`, `v3.0.1-alpha`: Interim release. Support Windows arm64. New [guarded](#guarded) build that can place OS guard pages behind objects to catch buffer overflows as they occur. Many small fixes: build on Windows arm64, cygwin, riscV, and dragonfly; fix Windows static library initialization to account for @@ -167,7 +173,7 @@ mimalloc is used in various large scale low-latency services and programs, for e Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build. The `mimalloc-lib` project builds a static library (in `out/msvc-x64`), while the -`mimalloc-override-dll` project builds a DLL for overriding malloc +`mimalloc-override-dll` project builds DLL for overriding malloc in the entire program. ## Linux, macOS, BSD, etc. @@ -240,13 +246,13 @@ on Windows to build with the `clang-cl` compiler directly: ``` -## Single source +## Single Source You can also directly build the single `src/static.c` file as part of your project without needing `cmake` at all. Make sure to also add the mimalloc `include` directory to the include path. -# Using the library +# Using the Library The preferred usage is including ``, linking with the shared- or static library, and using the `mi_malloc` API exclusively for allocation. For example, @@ -474,7 +480,7 @@ Note that certain security restrictions may apply when doing this from the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash). 
-# Windows Override
+### Dynamic Override on Windows
 
 We use a separate redirection DLL to override mimalloc on Windows
 such that we redirect all malloc/free calls that go through the (dynamic) C runtime allocator,

From 71c61c4b91f4974e87c88b14f8c49a774fb688c2 Mon Sep 17 00:00:00 2001
From: Daan
Date: Thu, 6 Mar 2025 19:52:06 -0800
Subject: [PATCH 17/23] fix multi-threaded access in stats merging

---
 include/mimalloc/atomic.h |  5 +++++
 src/stats.c               | 12 ++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h
index dbd7160c..3f0bbc68 100644
--- a/include/mimalloc/atomic.h
+++ b/include/mimalloc/atomic.h
@@ -129,6 +129,11 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
 static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
   return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
 }
+static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, int64_t add) {
+  if (add != 0) {
+    mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
+  }
+}
 static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
   int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
   while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
diff --git a/src/stats.c b/src/stats.c
index d9b26863..bce343f6 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -65,12 +65,12 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
 
 // must be thread safe as it is called from stats_merge
 static void mi_stat_count_add(mi_stat_count_t* stat, const mi_stat_count_t* src) {
   if (stat==src) return;
-  if (src->total!=0)   { mi_atomic_addi64_relaxed(&stat->total, src->total); }
-  if (src->current!=0) { mi_atomic_addi64_relaxed(&stat->current, src->current); }
-  // peak scores do really not work across threads ... we use conservative max
-  if (src->peak > stat->peak) {
-    mi_atomic_maxi64_relaxed(&stat->peak, src->peak);   // or: mi_atomic_addi64_relaxed( &stat->peak, src->peak);
-  }
+  mi_atomic_void_addi64_relaxed(&stat->total, src->total);
+  mi_atomic_void_addi64_relaxed(&stat->current, src->current);
+  // peak scores do really not work across threads .. we just add them
+  mi_atomic_void_addi64_relaxed( &stat->peak, src->peak);
+  // or, take the max?
+  // mi_atomic_maxi64_relaxed(&stat->peak, src->peak);
 }
 
 static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src) {

From a085c305551d06fb8cfa51889edea729cf9ff857 Mon Sep 17 00:00:00 2001
From: Daan
Date: Thu, 6 Mar 2025 19:59:37 -0800
Subject: [PATCH 18/23] update readme

---
 readme.md | 7 ++++---
 1 file changed, 4 insertions(+), 3 deletions(-)

diff --git a/readme.md b/readme.md
index 1ea10883..66d0755e 100644
--- a/readme.md
+++ b/readme.md
@@ -12,7 +12,7 @@ is a general purpose allocator with excellent [performance](#performance) charac
 Initially developed by Daan Leijen for the runtime systems of the
 [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
 
-Latest release : `v3.0.2` (beta) (2025-03-06)
+Latest release : `v3.0.2` (beta) (2025-03-06).
 Latest v2 release: `v2.2.2` (2025-03-06).
 Latest v1 release: `v1.9.2` (2024-03-06).
 
@@ -87,12 +87,13 @@ Enjoy!
 
 * 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes.
   Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`.
Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile. - On Windows, use `mimalloc.lib` for the static library, and `mimalloc.dll` for the dynamic override (which used to be `mimalloc-override.dll`) -- and use `mimalloc-dll.lib` for the export library of `mimalloc.dll`. Upgrade redirect to v1.3.2. + Upgrade mimalloc-redirect to v1.3.2. `MI_OPT_ARCH` is off by default now but still assumes armv8.1-a on arm64 + for fast atomic operations. * 2025-01-03, `v1.8.9`, `v2.1.9`, `v3.0.1-alpha`: Interim release. Support Windows arm64. New [guarded](#guarded) build that can place OS guard pages behind objects to catch buffer overflows as they occur. Many small fixes: build on Windows arm64, cygwin, riscV, and dragonfly; fix Windows static library initialization to account for thread local destructors (in Rust/C++); macOS tag change; macOS TLS slot fix; improve stats; - consistent mimalloc.dll on Windows (instead of mimalloc-override.dll); fix mimalloc-redirect on Win11 H2; + consistent `mimalloc.dll` on Windows (instead of `mimalloc-override.dll`); fix mimalloc-redirect on Win11 H2; add 0-byte to canary; upstream CPython fixes; reduce .bss size; allow fixed TLS slot on Windows for improved performance. * 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches. From 9eac969ea5caf105191bd974b7317aee696cdd48 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Mar 2025 20:14:33 -0800 Subject: [PATCH 19/23] improve atomic stat merging --- include/mimalloc/atomic.h | 3 ++- src/stats.c | 24 ++++++++++++------------ 2 files changed, 14 insertions(+), 13 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 3f0bbc68..6eaa6f99 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -129,7 +129,8 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub); static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) { return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed)); } -static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, int64_t add) { +static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) { + const int64_t add = mi_atomic_load_relaxed((_Atomic(int64_t)*)padd); if (add != 0) { mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed)); } diff --git a/src/stats.c b/src/stats.c index bce343f6..1cfc3104 100644 --- a/src/stats.c +++ b/src/stats.c @@ -63,23 +63,23 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { // must be thread safe as it is called from stats_merge -static void mi_stat_count_add(mi_stat_count_t* stat, const mi_stat_count_t* src) { +static void mi_stat_count_add_mt(mi_stat_count_t* stat, const mi_stat_count_t* src) { if (stat==src) return; - mi_atomic_void_addi64_relaxed(&stat->total, src->total); - mi_atomic_void_addi64_relaxed(&stat->current, src->current); + mi_atomic_void_addi64_relaxed(&stat->total, &src->total); + mi_atomic_void_addi64_relaxed(&stat->current, &src->current); // peak scores do really not work across threads .. we just add them - mi_atomic_void_addi64_relaxed( &stat->peak, src->peak); + mi_atomic_void_addi64_relaxed( &stat->peak, &src->peak); // or, take the max? 
// mi_atomic_maxi64_relaxed(&stat->peak, src->peak); } -static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src) { +static void mi_stat_counter_add_mt(mi_stat_counter_t* stat, const mi_stat_counter_t* src) { if (stat==src) return; - if (src->total!=0) { mi_atomic_addi64_relaxed(&stat->total, src->total); } + mi_atomic_void_addi64_relaxed(&stat->total, &src->total); } -#define MI_STAT_COUNT(stat) mi_stat_count_add(&stats->stat, &src->stat); -#define MI_STAT_COUNTER(stat) mi_stat_counter_add(&stats->stat, &src->stat); +#define MI_STAT_COUNT(stat) mi_stat_count_add_mt(&stats->stat, &src->stat); +#define MI_STAT_COUNTER(stat) mi_stat_counter_add_mt(&stats->stat, &src->stat); // must be thread safe as it is called from stats_merge static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { @@ -90,11 +90,11 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { - mi_stat_count_add(&stats->malloc_bins[i], &src->malloc_bins[i]); + mi_stat_count_add_mt(&stats->malloc_bins[i], &src->malloc_bins[i]); } #endif for (size_t i = 0; i <= MI_BIN_HUGE; i++) { - mi_stat_count_add(&stats->page_bins[i], &src->page_bins[i]); + mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]); } } @@ -289,8 +289,8 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print(&stats->malloc_normal, "normal", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg); mi_stat_print(&stats->malloc_huge, "huge", (stats->malloc_huge_count.total == 0 ? 1 : -1), out, arg); mi_stat_count_t total = { 0,0,0 }; - mi_stat_count_add(&total, &stats->malloc_normal); - mi_stat_count_add(&total, &stats->malloc_huge); + mi_stat_count_add_mt(&total, &stats->malloc_normal); + mi_stat_count_add_mt(&total, &stats->malloc_huge); mi_stat_print_ex(&total, "total", 1, out, arg, ""); #endif #if MI_STAT>1 From 783c3ba486d388119fcf80995b5f79ed42f2b783 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Mar 2025 20:40:52 -0800 Subject: [PATCH 20/23] improve QNX support --- src/prim/unix/prim.c | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 9e1c53ef..8e3180e6 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -62,7 +62,14 @@ terms of the MIT license. A copy of the license can be found in the file #include #endif +#if !defined(MADV_DONTNEED) && defined(POSIX_MADV_DONTNEED) // QNX +#define MADV_DONTNEED POSIX_MADV_DONTNEED +#endif +#if !defined(MADV_FREE) && defined(POSIX_MADV_FREE) // QNX +#define MADV_FREE POSIX_MADV_FREE +#endif + //------------------------------------------------------------------------------------ // Use syscalls for some primitives to allow for libraries that override open/read/close etc. 
// and do allocation themselves; using syscalls prevents recursion when mimalloc is @@ -413,11 +420,7 @@ int _mi_prim_commit(void* start, size_t size, bool* is_zero) { int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { int err = 0; // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) - #if defined(__QNX__) - err = posix_madvise(start, size, POSIX_MADV_DONTNEED); - #else err = unix_madvise(start, size, MADV_DONTNEED); - #endif #if !MI_DEBUG && !MI_SECURE *needs_recommit = false; #else @@ -449,8 +452,6 @@ int _mi_prim_reset(void* start, size_t size) { mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED); err = unix_madvise(start, size, MADV_DONTNEED); } - #elif defined(__QNX__) - int err = posix_madvise(start, size, POSIX_MADV_DONTNEED); #else int err = unix_madvise(start, size, MADV_DONTNEED); #endif From f57086f9f1e2c24f2ecbc8909a44f4284ffb38f2 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Mar 2025 20:46:54 -0800 Subject: [PATCH 21/23] update simd pipeline build --- azure-pipelines.yml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index a4266ae1..3ed715f3 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -32,7 +32,7 @@ jobs: MSBuildConfiguration: Release Release SIMD: BuildType: release-simd - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON -DMI_WIN_USE_FIXED_TLS=ON + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_ARCH=ON -DMI_OPT_SIMD=ON -DMI_WIN_USE_FIXED_TLS=ON MSBuildConfiguration: Release Secure: BuildType: secure @@ -97,7 +97,7 @@ jobs: CC: clang CXX: clang++ BuildType: release-simd-clang - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_ARCH=ON -DMI_OPT_SIMD=ON Secure Clang: CC: clang CXX: clang++ @@ -159,7 +159,7 @@ jobs: cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release Release SIMD: BuildType: release-simd - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_ARCH=ON -DMI_OPT_SIMD=ON Secure: BuildType: secure cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON From 21f6edf9c6135fb13b4af68438b5d47bc63efad1 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Mar 2025 20:54:44 -0800 Subject: [PATCH 22/23] update readme --- readme.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/readme.md b/readme.md index 66d0755e..81f2057e 100644 --- a/readme.md +++ b/readme.md @@ -88,7 +88,7 @@ Enjoy! Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`. Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile. Upgrade mimalloc-redirect to v1.3.2. `MI_OPT_ARCH` is off by default now but still assumes armv8.1-a on arm64 - for fast atomic operations. + for fast atomic operations. Add QNX support. * 2025-01-03, `v1.8.9`, `v2.1.9`, `v3.0.1-alpha`: Interim release. Support Windows arm64. New [guarded](#guarded) build that can place OS guard pages behind objects to catch buffer overflows as they occur. 
Many small fixes: build on Windows arm64, cygwin, riscV, and dragonfly; fix Windows static library initialization to account for From 18124909a3d9335350995c951061423c2a60b84f Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Mar 2025 21:05:21 -0800 Subject: [PATCH 23/23] bump version to 3.0.3 for further development --- cmake/mimalloc-config-version.cmake | 2 +- include/mimalloc.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index 527b1874..bd4adb2a 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 3) set(mi_version_minor 0) -set(mi_version_patch 2) +set(mi_version_patch 3) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/include/mimalloc.h b/include/mimalloc.h index a858008f..5b0445bf 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 302 // major + 2 digits minor +#define MI_MALLOC_VERSION 303 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes
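
For reference, the CPU-feature probing that patches 10 and 13 rework (CPUID leaf 7 for FSRM/ERMS, CPUID leaf 1 for POPCNT) can be exercised outside the allocator. A minimal sketch, assuming GCC or Clang on x86/x86-64 with `<cpuid.h>`; the helper and variable names are illustrative and not mimalloc's:

```c
#include <cpuid.h>     // __get_cpuid_count (GCC/Clang)
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// Query one CPUID leaf/subleaf; r[0..3] receive EAX, EBX, ECX, EDX.
static bool cpuid_leaf(uint32_t leaf, uint32_t subleaf, uint32_t r[4]) {
  return (__get_cpuid_count(leaf, subleaf, &r[0], &r[1], &r[2], &r[3]) == 1);
}

int main(void) {
  uint32_t r[4];
  bool has_fsrm = false, has_erms = false, has_popcnt = false;
  if (cpuid_leaf(7, 0, r)) {
    has_fsrm = ((r[3] & (1u << 4)) != 0);    // EDX bit 4 : fast short rep movsb/stosb
    has_erms = ((r[1] & (1u << 9)) != 0);    // EBX bit 9 : enhanced rep movsb/stosb
  }
  if (cpuid_leaf(1, 0, r)) {
    has_popcnt = ((r[2] & (1u << 23)) != 0); // ECX bit 23: POPCNT instruction
  }
  printf("fsrm=%d erms=%d popcnt=%d\n", has_fsrm, has_erms, has_popcnt);
  return 0;
}
```

This is why the popcnt change helps even without `MI_OPT_ARCH=ON`: the fast `popcnt` path is selected at run time when the feature bit is set, and the generic fallback is only used on CPUs that lack the instruction.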
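
Patches 17 and 19 make stats merging safe for concurrent callers by replacing the read-check-update pattern with unconditional per-field relaxed atomic additions (skipping zero adds). A small self-contained sketch of that scheme in standard C11 atomics, with illustrative type and function names rather than mimalloc's:

```c
#include <stdatomic.h>
#include <stdint.h>

typedef struct stat_count_s {
  _Atomic(int64_t) total;
  _Atomic(int64_t) current;
  _Atomic(int64_t) peak;
} stat_count_t;

// Add one source field into the destination with relaxed ordering; a zero
// addition is skipped to avoid needless contention on the destination line.
static void stat_field_add_mt(_Atomic(int64_t)* dst, const _Atomic(int64_t)* src) {
  const int64_t add = atomic_load_explicit(src, memory_order_relaxed);
  if (add != 0) { atomic_fetch_add_explicit(dst, add, memory_order_relaxed); }
}

// Merge thread-local statistics into shared ones; safe to call concurrently.
static void stat_count_add_mt(stat_count_t* dst, const stat_count_t* src) {
  if (dst == src) return;
  stat_field_add_mt(&dst->total,   &src->total);
  stat_field_add_mt(&dst->current, &src->current);
  stat_field_add_mt(&dst->peak,    &src->peak);  // peaks are summed (conservative), as in the patch
}

int main(void) {
  stat_count_t global = {0}, local = {0};
  atomic_store_explicit(&local.total, 42, memory_order_relaxed);
  stat_count_add_mt(&global, &local);
  return (atomic_load_explicit(&global.total, memory_order_relaxed) == 42) ? 0 : 1;
}
```

A compare-then-max update of the peak can lose a concurrent increment; an unconditional relaxed add never does, at the cost of over-approximating the merged peak.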