From eb0081382b16114de189f529ad59c6cd8441287e Mon Sep 17 00:00:00 2001 From: Jo Bates <29763794+jbatez@users.noreply.github.com> Date: Sat, 1 Feb 2025 17:04:46 -0800 Subject: [PATCH 01/15] support MI_OPT_ARCH when targeting multiple CMAKE_OSX_ARCHITECTURES --- CMakeLists.txt | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index c766ce3a..e30c40c8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -425,7 +425,9 @@ endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") if(MI_OPT_ARCH) - if(MI_ARCH STREQUAL "arm64") + if(APPLE AND "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) + set(MI_OPT_ARCH_FLAGS "-Xarch_arm64" "-march=armv8.1-a") + elseif(MI_ARCH STREQUAL "arm64") set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics endif() endif() From 64aaf9d88f507c60ffc9ede4c8aea3b512867456 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Feb 2025 17:08:06 -0800 Subject: [PATCH 02/15] fix performance bug in mi_bchunk_try_find _and_clearNX --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 8a7a9442..d1719c3b 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -804,7 +804,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const size_t post = mi_bfield_clz(~b); if (post > 0) { const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1])); - if (post + pre <= n) { + if (post + pre >= n) { // it fits -- try to claim it atomically const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post); if (mi_bchunk_try_clearNX(chunk, cidx, n, NULL)) { From 7931678899281766f6fb03678928e615bfbcd571 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 6 Feb 2025 22:59:14 -0800 Subject: [PATCH 03/15] further optimize mi_bchunk_try_find_and_clearNX --- include/mimalloc/bits.h | 8 ++++++-- src/bitmap.c | 14 ++++++++------ src/options.c | 2 +- src/page.c | 4 ++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 64875e9d..d4632441 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -199,6 +199,8 @@ static inline size_t mi_ctz(size_t x) { size_t r; __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return _tzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); @@ -221,6 +223,8 @@ static inline size_t mi_clz(size_t x) { size_t r; __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return _lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS); @@ -254,7 +258,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { bool is_zero; __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); return !is_zero; - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false); #else @@ -271,7 +275,7 @@ static inline bool mi_bsr(size_t x, size_t* idx) { bool is_zero; __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); return !is_zero; - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false); #else diff --git a/src/bitmap.c b/src/bitmap.c index d1719c3b..0b13e2ec 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -773,9 +773,10 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t idx; + // is there a range inside the field? while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - if (idx + n > MI_BFIELD_BITS) break; // too short, maybe cross over, or continue with the next field + if (idx + n > MI_BFIELD_BITS) break; // too short: maybe cross over, or continue with the next field const size_t bmask = mask<>idx == mask); @@ -792,15 +793,16 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, } } else { - // advance - const size_t ones = mi_bfield_ctz(~(b>>idx)); // skip all ones (since it didn't fit the mask) - mi_assert_internal(ones>0); - b = b & ~mi_bfield_mask(ones, idx); // clear the ones + // advance by clearing the least run of ones, for example, with n>=4, idx=2: + // b = 1111 1101 1010 1100 + // .. + (1< 0) { const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1])); diff --git a/src/options.c b/src/options.c index 485beb48..d1bdd716 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index b3dabb41..4e1f683c 100644 --- a/src/page.c +++ b/src/page.c @@ -137,7 +137,7 @@ bool _mi_page_is_valid(mi_page_t* page) { Page collect the `local_free` and `thread_free` lists ----------------------------------------------------------- */ -static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) +static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) { if (head == NULL) return; @@ -167,7 +167,7 @@ static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) } // Collect the local `thread_free` list using an atomic exchange. -static void mi_page_thread_free_collect(mi_page_t* page) +static mi_decl_noinline void mi_page_thread_free_collect(mi_page_t* page) { // atomically capture the thread free list mi_block_t* head; From 9053cf0cd25e7a59750eb974012c0f371ce3e312 Mon Sep 17 00:00:00 2001 From: Sergey Markelov Date: Fri, 7 Feb 2025 12:35:59 -0700 Subject: [PATCH 04/15] prim: fix dev3 UWP build (#1005) --- src/prim/windows/prim.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 0916a7ea..f91925fc 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -127,9 +127,11 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) config->has_partial_free = false; config->has_virtual_reserve = true; // windows version - const DWORD win_version = GetVersion(); - win_major_version = (DWORD)(LOBYTE(LOWORD(win_version))); - win_minor_version = (DWORD)(HIBYTE(LOWORD(win_version))); + OSVERSIONINFOW version{sizeof(version)}; + if (GetVersionExW(&version)) { + win_major_version = version.dwMajorVersion; + win_minor_version = version.dwMinorVersion; + } // get the page size SYSTEM_INFO si; GetSystemInfo(&si); From ca25fb3d17a1326f89a13c4c01d5a6d67b973af2 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 7 Feb 2025 17:38:53 -0800 Subject: [PATCH 05/15] avoid reload on clearing mask --- src/bitmap.c | 55 ++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 0b13e2ec..c096bd4a 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -165,25 +165,31 @@ static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already // Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0 // and false otherwise (leaving the bit field as is). // `all_clear` is set to `true` if the new bfield became zero. -static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { +static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) { mi_assert_internal(mask != 0); - mi_bfield_t old = mi_atomic_load_relaxed(b); - do { - if ((old&mask) != mask) { - // the mask bits are no longer set - if (all_clear != NULL) { *all_clear = (old==0); } + mi_assert_internal((expect & mask) == mask); + // try to atomically clear the mask bits + while mi_unlikely(!mi_atomic_cas_strong_acq_rel(b, &expect, expect & ~mask)) { + if ((expect & mask) != mask) { + if (all_clear != NULL) { *all_clear = (expect == 0); } return false; } - } while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits - if (all_clear != NULL) { *all_clear = ((old&~mask) == 0); } + } + if (all_clear != NULL) { *all_clear = ((expect & ~mask) == 0); } return true; } +static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask, bool* all_clear) { + mi_assert_internal(mask != 0); + const mi_bfield_t expect = mi_atomic_load_relaxed(b); + return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear); +} + // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. // `all_clear` is set to true if the new bfield became zero (and false otherwise) -static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { +static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[chunk_idx]); size_t idx; - if (!allow_allset && (~b == 0)) return false; if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], mi_bfield_mask(1,idx), b, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; @@ -565,7 +570,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } @@ -600,7 +605,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. chunk_idx = mi_ctz(mask) / 8; #endif - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } @@ -621,17 +626,13 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((mi_ctz(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. const size_t chunk_idx = mi_ctz(mask) / 8; - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else - // try first to find a field that is not all set (to reduce fragmentation) (not needed for binned bitmaps) - // for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - // if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true; - // } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx)) return true; } return false; #endif @@ -643,9 +644,8 @@ static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, } #if !(MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)) -static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { +static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - if (!allow_all_set && (~b == 0)) return false; // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F @@ -655,7 +655,7 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); mi_assert_internal((idx%8)==0); - if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // unset the byte atomically + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], (mi_bfield_t)0xFF << idx, b, NULL)) { // unset the byte atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; @@ -701,7 +701,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; // } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true; + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true; } return false; #endif @@ -771,7 +771,8 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const mi_bfield_t mask = mi_bfield_mask(n, 0); // for all fields in the chunk for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + mi_bfield_t b0 = mi_atomic_load_relaxed(&chunk->bfields[i]); + mi_bfield_t b = b0; size_t idx; // is there a range inside the field? @@ -781,7 +782,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const size_t bmask = mask<>idx == mask); if ((b&bmask) == bmask) { // found a match with all bits set, try clearing atomically - if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[i], bmask, b0, NULL)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); @@ -789,7 +790,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, } else { // if we failed to atomically commit, reload b and try again from the start - b = mi_atomic_load_acquire(&chunk->bfields[i]); + b = b0 = mi_atomic_load_acquire(&chunk->bfields[i]); } } else { From 9b7914fd3fb165a8caebc3a37179eee2447ecd93 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:35:21 -0800 Subject: [PATCH 06/15] fix bug in mi_page_free_collect_partly where the tail of the free list was kept --- src/page.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/page.c b/src/page.c index 4e1f683c..f25d0d9b 100644 --- a/src/page.c +++ b/src/page.c @@ -137,7 +137,7 @@ bool _mi_page_is_valid(mi_page_t* page) { Page collect the `local_free` and `thread_free` lists ----------------------------------------------------------- */ -static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) +static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) { if (head == NULL) return; @@ -167,7 +167,7 @@ static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi } // Collect the local `thread_free` list using an atomic exchange. -static mi_decl_noinline void mi_page_thread_free_collect(mi_page_t* page) +static void mi_page_thread_free_collect(mi_page_t* page) { // atomically capture the thread free list mi_block_t* head; @@ -215,11 +215,17 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(!force || page->local_free == NULL); } -// collect elements in the thread-free list starting at `head`. +// Collect elements in the thread-free list starting at `head`. This is an optimized +// version of `_mi_page_free_collect` to be used from `free.c:_mi_free_collect_mt` that avoids atomic access to `xthread_free`. +// +// `head` must be in the `xthread_free` list. It will not collect `head` itself +// so the `used` count is not fully updated in general. However, if the `head` is +// the last remaining element, it will be collected and the used count will become `0` (so `mi_page_all_free` becomes true). void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) { if (head == NULL) return; - mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point at it (and we want to avoid atomic ops) + mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point to it (and we want to avoid atomic ops) if (next != NULL) { + mi_block_set_next(page, head, NULL); mi_page_thread_collect_to_local(page, next); if (page->local_free != NULL && page->free == NULL) { page->free = page->local_free; @@ -229,6 +235,8 @@ void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) { } if (page->used == 1) { // all elements are free'd since we skipped the `head` element itself + mi_assert_internal(mi_tf_block(mi_atomic_load_relaxed(&page->xthread_free)) == head); + mi_assert_internal(mi_block_next(page,head) == NULL); _mi_page_free_collect(page, false); // collect the final element } } @@ -816,31 +824,25 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // Find a page with free blocks of `size`. -static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) { +static mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) { // mi_page_queue_t* pq = mi_page_queue(heap, size); mi_assert_internal(!mi_page_queue_is_huge(pq)); // check the first page: we even do this with candidate search or otherwise we re-search every time mi_page_t* page = pq->first; - if (page != NULL) { - #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness + if mi_likely(page != NULL && mi_page_immediate_available(page)) { + #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { mi_page_extend_free(heap, page); mi_assert_internal(mi_page_immediate_available(page)); } - else - #endif - { - _mi_page_free_collect(page,false); - } - - if (mi_page_immediate_available(page)) { - page->retire_expire = 0; - return page; // fast path - } + #endif + page->retire_expire = 0; + return page; // fast path + } + else { + return mi_page_queue_find_free_ex(heap, pq, true); } - - return mi_page_queue_find_free_ex(heap, pq, true); } From bc7fe059a6d87cb01a58c8f604f5b7764813c659 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:35:52 -0800 Subject: [PATCH 07/15] improve performance of mi_free_collect_mt by specializing mi_page_unown --- src/free.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/free.c b/src/free.c index 3fdb35aa..1df10728 100644 --- a/src/free.c +++ b/src/free.c @@ -201,7 +201,7 @@ void mi_free(void* p) mi_attr_noexcept // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ - +static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free); static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept { mi_assert_internal(mi_page_is_owned(page)); @@ -269,7 +269,36 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* // not reclaimed or free'd, unown again - _mi_page_unown(page); + // _mi_page_unown(page); + mi_page_unown_from_free(page, mt_free); +} + + +// release ownership of a page. This may free the page if all (other) blocks were concurrently +// freed in the meantime. Returns true if the page was freed. +// This is a specialized version of `mi_page_unown` to (try to) avoid calling `mi_page_free_collect` again. +static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mt_free != NULL); + mi_assert_internal(page->used > 1); + mi_thread_free_t tf_expect = mi_tf_create(mt_free, true); + mi_thread_free_t tf_new = mi_tf_create(mt_free, false); + while mi_unlikely(!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_expect, tf_new)) { + mi_assert_internal(mi_tf_is_owned(tf_expect)); + while (mi_tf_block(tf_expect) != NULL) { + _mi_page_free_collect(page,false); // update used + if (mi_page_all_free(page)) { // it may become free just before unowning it + _mi_arenas_page_unabandon(page); + _mi_arenas_page_free(page); + return true; + } + tf_expect = mi_atomic_load_relaxed(&page->xthread_free); + } + mi_assert_internal(mi_tf_block(tf_expect)==NULL); + tf_new = mi_tf_create(NULL, false); + } + return false; } From 2017181a6913e174f875c85c250dba3144ac9f04 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:36:09 -0800 Subject: [PATCH 08/15] improve performance of clearNX --- src/bitmap.c | 18 ++---------------- src/bitmap.h | 4 ---- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index c096bd4a..623f921d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -167,14 +167,13 @@ static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already // `all_clear` is set to `true` if the new bfield became zero. static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) { mi_assert_internal(mask != 0); - mi_assert_internal((expect & mask) == mask); // try to atomically clear the mask bits - while mi_unlikely(!mi_atomic_cas_strong_acq_rel(b, &expect, expect & ~mask)) { + do { if ((expect & mask) != mask) { if (all_clear != NULL) { *all_clear = (expect == 0); } return false; } - } + } while (!mi_atomic_cas_weak_acq_rel(b, &expect, expect & ~mask)); if (all_clear != NULL) { *all_clear = ((expect & ~mask) == 0); } return true; } @@ -696,10 +695,6 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } } #else - // first skip allset fields to reduce fragmentation (not needed for binned bitmaps) - // for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - // if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; - // } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true; } @@ -892,15 +887,6 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, } -//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { -// if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages -// if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages -// // if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages -// if (n==0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk -// if (n<=MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); -// return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); -//} - // ------- mi_bchunk_clear_once_set --------------------------------------- diff --git a/src/bitmap.h b/src/bitmap.h index 9afdffce..b17d83e5 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -271,10 +271,6 @@ void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); -// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's -// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -bool mi_bbitmap_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); - // Is a sequence of n bits already all set/cleared? bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n); From 2048fa2d17684dde6a588a3aa444149b0cb1d842 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:53:00 -0800 Subject: [PATCH 09/15] fix comments --- include/mimalloc/types.h | 4 ++-- src/bitmap.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6ed17f09..29d6fde9 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -99,7 +99,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ENCODE_FREELIST 1 #endif -// Enable large pages for objects between 64KiB and 256KiB. +// Enable large pages for objects between 64KiB and 512KiB. // Disabled by default as for many workloads the block sizes above 64 KiB are quite random which can lead to too many partially used large pages. #ifndef MI_ENABLE_LARGE_PAGES #define MI_ENABLE_LARGE_PAGES 0 @@ -342,7 +342,7 @@ typedef struct mi_page_s { #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB #if MI_ENABLE_LARGE_PAGES #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 256 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 512KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #else #define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB #define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // <= 64 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` diff --git a/src/bitmap.c b/src/bitmap.c index 623f921d..b458d5e8 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -184,7 +184,7 @@ static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_b return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear); } - +/* // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. // `all_clear` is set to true if the new bfield became zero (and false otherwise) @@ -203,6 +203,7 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t id const mi_bfield_t mask = ((mi_bfield_t)0xFF)< Date: Sat, 8 Feb 2025 11:51:18 -0800 Subject: [PATCH 10/15] set the option commit_on_demand back to 2 as we only do this for medium/large pages --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index d1bdd716..485beb48 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); From 0e9159e0bf0943b4c11e3d0746d164033a42f3c8 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 8 Feb 2025 12:06:39 -0800 Subject: [PATCH 11/15] add comment --- CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e30c40c8..530afcf8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -425,7 +425,7 @@ endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") if(MI_OPT_ARCH) - if(APPLE AND "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) + if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999) set(MI_OPT_ARCH_FLAGS "-Xarch_arm64" "-march=armv8.1-a") elseif(MI_ARCH STREQUAL "arm64") set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics From 069279e3e7f6424eed1ea79f129ab26b22195b55 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 8 Feb 2025 12:18:27 -0800 Subject: [PATCH 12/15] improve cmake for multi-arch binaries on apple --- CMakeLists.txt | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 530afcf8..cd1582b2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -425,10 +425,13 @@ endif() if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku") if(MI_OPT_ARCH) - if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999) - set(MI_OPT_ARCH_FLAGS "-Xarch_arm64" "-march=armv8.1-a") + if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999) + set(MI_OPT_ARCH_FLAGS "") + if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) + list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a") + endif() elseif(MI_ARCH STREQUAL "arm64") - set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics + set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics endif() endif() endif() From c7f7c23dc15a27abb6a26e78fd7b3c073f43b388 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 8 Feb 2025 12:43:00 -0800 Subject: [PATCH 13/15] make C compatible --- src/prim/windows/prim.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index f91925fc..31ef0e94 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -127,10 +127,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) config->has_partial_free = false; config->has_virtual_reserve = true; // windows version - OSVERSIONINFOW version{sizeof(version)}; + OSVERSIONINFOW version; _mi_memzero_var(version); if (GetVersionExW(&version)) { - win_major_version = version.dwMajorVersion; - win_minor_version = version.dwMinorVersion; + win_major_version = version.dwMajorVersion; + win_minor_version = version.dwMinorVersion; } // get the page size SYSTEM_INFO si; From 5f9b42685efd0800389fe785f4bfeb805bc484f8 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 8 Feb 2025 12:51:06 -0800 Subject: [PATCH 14/15] remove declspec(dllexport) from DllMain on Windows (issue #1008) --- src/prim/windows/prim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index e3ad1e57..aa79eb91 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -651,7 +651,7 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { #define MI_PRIM_HAS_PROCESS_ATTACH 1 // Windows DLL: easy to hook into process_init and thread_done - __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { + BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { mi_win_main((PVOID)inst,reason,reserved); return TRUE; } From 9dd753d2c0aee48b38a56d513ae01231ca6901ac Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 13:12:19 -0800 Subject: [PATCH 15/15] add comment --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index b458d5e8..f3030153 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -169,7 +169,7 @@ static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi mi_assert_internal(mask != 0); // try to atomically clear the mask bits do { - if ((expect & mask) != mask) { + if ((expect & mask) != mask) { // are all bits still set? if (all_clear != NULL) { *all_clear = (expect == 0); } return false; }