From 616db104a9147071a406a320fee6f51cf858ea74 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 1 Jun 2024 12:29:48 -0700 Subject: [PATCH 01/18] prevent UB in arena reservation --- include/mimalloc/internal.h | 8 ++++++++ src/arena.c | 8 +++++++- 2 files changed, 15 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 2954eabd..2a21f34b 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -329,6 +329,14 @@ static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { return (divider == 0 ? size : ((size + divider - 1) / divider)); } + +// clamp an integer +static inline size_t _mi_clamp(size_t sz, size_t min, size_t max) { + if (sz < min) return min; + else if (sz > max) return max; + else return sz; +} + // Is memory zero initialized? static inline bool mi_mem_is_zero(const void* p, size_t size) { for (size_t i = 0; i < size; i++) { diff --git a/src/arena.c b/src/arena.c index 25ce56ec..445cc309 100644 --- a/src/arena.c +++ b/src/arena.c @@ -358,8 +358,14 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) } arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); if (arena_count >= 8 && arena_count <= 128) { - arena_reserve = ((size_t)1<<(arena_count/8)) * arena_reserve; // scale up the arena sizes exponentially + // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } } if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size From aeee7907a0324f6d7ee8b01b55721c4efe2dec7e Mon Sep 17 00:00:00 2001 
From: daanx Date: Sat, 1 Jun 2024 13:20:28 -0700 Subject: [PATCH 02/18] fix spelling --- src/arena.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 445cc309..83582bad 100644 --- a/src/arena.c +++ b/src/arena.c @@ -510,7 +510,7 @@ static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitidx = startidx; bool all_purged = false; while (bitidx < endidx) { - // count consequetive ones in the purge mask + // count consecutive ones in the purge mask size_t count = 0; while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { count++; @@ -547,7 +547,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (purge != 0) { size_t bitidx = 0; while (bitidx < MI_BITMAP_FIELD_BITS) { - // find consequetive range of ones in the purge mask + // find consecutive range of ones in the purge mask size_t bitlen = 0; while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { bitlen++; @@ -927,7 +927,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; arena->search_idx = 0; - // consequetive bitmaps + // consecutive bitmaps arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap arena->blocks_committed = (arena->memid.is_pinned ? 
NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap From f87a4c15b285a6d4c04c8813db2a26ebba807a4d Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 1 Jun 2024 13:41:13 -0700 Subject: [PATCH 03/18] increase max arenas --- src/arena.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index 83582bad..d97bf628 100644 --- a/src/arena.c +++ b/src/arena.c @@ -36,7 +36,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo typedef uintptr_t mi_block_info_t; #define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (112) // not more than 126 (since we use 7 bits in the memid and an arena index + 1) +#define MI_MAX_ARENAS (255) // Limited as the reservation exponentially increases (and takes up .bss) // A memory arena descriptor typedef struct mi_arena_s { @@ -552,6 +552,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { bitlen++; } + // temporarily claim the purge range as "in-use" to be thread-safe with allocation // try to claim the longest range of corresponding in_use bits const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); while( bitlen > 0 ) { From d9aa19a7636d457f0b7b50e599649b86e8ade666 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 1 Jun 2024 15:57:18 -0700 Subject: [PATCH 04/18] add support for sub-processes (to supportpython/cpython#113717) --- include/mimalloc.h | 10 ++++++- include/mimalloc/internal.h | 9 ++++--- include/mimalloc/types.h | 20 ++++++++++++-- src/arena.c | 54 ++++++++++++++++++++----------------- src/free.c | 3 ++- src/init.c | 43 ++++++++++++++++++++++++++--- src/segment.c | 24 ++++++++++------- 7 files changed, 119 insertions(+), 44 deletions(-) diff --git a/include/mimalloc.h 
b/include/mimalloc.h index 0173a323..26bb849d 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -288,8 +288,16 @@ mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_co mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id); #endif + +// Experimental: allow sub-processes whose memory segments stay separated (and no reclamation between them) +// Used for example for separate interpreter's in one process. +typedef void* mi_subproc_id_t; +mi_decl_export mi_subproc_id_t mi_subproc_new(void); +mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc); +mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) + // deprecated -mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; +mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; // ------------------------------------------------------ diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 2a21f34b..65cd3569 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -130,14 +130,17 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); -size_t _mi_arena_segment_abandoned_count(void); -typedef struct mi_arena_field_cursor_s { // abstract +void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); +void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); + +typedef struct mi_arena_field_cursor_s { // abstract struct mi_arena_id_t start; int count; size_t bitmap_idx; + mi_subproc_t* subproc; } mi_arena_field_cursor_t; -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current); +void 
_mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current); mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); // "segment-map.c" diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ed326c69..6b90bf5d 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -307,7 +307,7 @@ typedef struct mi_page_s { mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) uint16_t used; // number of blocks in use (including blocks in `thread_free`) uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t heap_tag; // tag of the owning heap, used for separated heaps by object type + uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the page area containing the blocks @@ -387,6 +387,7 @@ typedef struct mi_memid_s { // --------------------------------------------------------------- // Segments contain mimalloc pages // --------------------------------------------------------------- +typedef struct mi_subproc_s mi_subproc_t; // Segments are large allocated memory blocks (2MiB on 64 bit) from the OS. // Inside segments we allocated fixed size _pages_ that contain blocks. @@ -409,6 +410,7 @@ typedef struct mi_segment_s { size_t capacity; // count of available pages (`#free + used`) size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages. 
uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` + mi_subproc_t* subproc; // segment belongs to sub process // layout like this to optimize access in `mi_free` _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment @@ -600,10 +602,23 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) +// ------------------------------------------------------ +// Sub processes do not reclaim or visit segments +// from other sub processes +// ------------------------------------------------------ + +struct mi_subproc_s { + _Atomic(size_t) abandoned_count; // count of abandoned segments for this sup-process + mi_memid_t memid; // provenance +}; + +mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id); + // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ +// Milliseconds as in `int64_t` to avoid overflows typedef int64_t mi_msecs_t; // Queue of segments @@ -628,8 +643,9 @@ typedef struct mi_segments_tld_s { size_t current_size; // current size of all segments size_t peak_size; // peak size of all segments size_t reclaim_count;// number of reclaimed (abandoned) segments + mi_subproc_t* subproc; // sub-process this thread belongs to. 
mi_stats_t* stats; // points to tld stats - mi_os_tld_t* os; // points to os stats + mi_os_tld_t* os; // points to os tld } mi_segments_tld_t; // Thread local data diff --git a/src/arena.c b/src/arena.c index d97bf628..aeadd604 100644 --- a/src/arena.c +++ b/src/arena.c @@ -172,7 +172,7 @@ static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* m return p; } -static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { *memid = _mi_memid_none(); // try static @@ -180,7 +180,7 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st if (p != NULL) return p; // or fall back to the OS - p = _mi_os_alloc(size, memid, stats); + p = _mi_os_alloc(size, memid, &_mi_stats_main); if (p == NULL) return NULL; // zero the OS memory if needed @@ -191,9 +191,9 @@ static void* mi_arena_meta_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* st return p; } -static void mi_arena_meta_free(void* p, mi_memid_t memid, size_t size, mi_stats_t* stats) { +void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid, stats); + _mi_os_free(p, size, memid, &_mi_stats_main); } else { mi_assert(memid.memkind == MI_MEM_STATIC); @@ -709,7 +709,7 @@ static void mi_arenas_unsafe_destroy(void) { else { new_max_arena = i; } - mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size, &_mi_stats_main); + _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); } } @@ -752,13 +752,6 @@ bool _mi_arena_contains(const void* p) { the arena bitmaps. ----------------------------------------------------------- */ -// Maintain a count of all abandoned segments -static mi_decl_cache_align _Atomic(size_t)abandoned_count; - -size_t _mi_arena_segment_abandoned_count(void) { - return mi_atomic_load_relaxed(&abandoned_count); -} - // reclaim a specific abandoned segment; `true` on success. 
// sets the thread_id. bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) @@ -768,7 +761,7 @@ bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) // but we need to still claim it atomically -- we use the thread_id for that. size_t expected = 0; if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected, _mi_thread_id())) { - mi_atomic_decrement_relaxed(&abandoned_count); + mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); return true; } else { @@ -785,7 +778,7 @@ bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); if (was_marked) { mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - mi_atomic_decrement_relaxed(&abandoned_count); + mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); } // mi_assert_internal(was_marked); @@ -802,9 +795,10 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) mi_assert_internal(segment->used == segment->abandoned); if (segment->memid.memkind != MI_MEM_ARENA) { // not in an arena; count it as abandoned and return - mi_atomic_increment_relaxed(&abandoned_count); + mi_atomic_increment_relaxed(&segment->subproc->abandoned_count); return; } + // segment is in an arena size_t arena_idx; size_t bitmap_idx; mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); @@ -812,17 +806,19 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - if (was_unmarked) { mi_atomic_increment_relaxed(&abandoned_count); } + if (was_unmarked) { mi_atomic_increment_relaxed(&segment->subproc->abandoned_count); } mi_assert_internal(was_unmarked); 
mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); } // start a cursor at a randomized arena -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current) { +void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current) { + mi_assert_internal(heap->tld->segments.subproc == subproc); const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); current->start = (max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); current->count = 0; current->bitmap_idx = 0; + current->subproc = subproc; } // reclaim abandoned segments @@ -830,7 +826,7 @@ void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* curre mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) { const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); - if (max_arena <= 0 || mi_atomic_load_relaxed(&abandoned_count) == 0) return NULL; + if (max_arena <= 0 || mi_atomic_load_relaxed(&previous->subproc->abandoned_count) == 0) return NULL; int count = previous->count; size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); @@ -853,14 +849,24 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); // try to reclaim it atomically if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { - mi_atomic_decrement_relaxed(&abandoned_count); - previous->bitmap_idx = bitmap_idx; - previous->count = count; mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, 
bitmap_idx)); - return segment; + // check that belongs to our sub-process + if (segment->subproc != previous->subproc) { + // it is from another subprocess, re-mark it and continue searching + const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + mi_assert_internal(was_zero); + } + else { + // success, we unabandoned a segment in our sub-process + mi_atomic_decrement_relaxed(&previous->subproc->abandoned_count); + previous->bitmap_idx = bitmap_idx; + previous->count = count; + + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + return segment; + } } } } @@ -911,7 +917,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int const size_t bitmaps = (memid.is_pinned ? 3 : 5); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); mi_memid_t meta_memid; - mi_arena_t* arena = (mi_arena_t*)mi_arena_meta_zalloc(asize, &meta_memid, &_mi_stats_main); // TODO: can we avoid allocating from the OS? 
+ mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); if (arena == NULL) return false; // already zero'd due to zalloc diff --git a/src/free.c b/src/free.c index c065d2f3..191ec9bf 100644 --- a/src/free.c +++ b/src/free.c @@ -240,7 +240,8 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg { // the segment is abandoned, try to reclaim it into our heap if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc); mi_free(block); // recursively free as now it will be a local free in our heap return; } diff --git a/src/init.c b/src/init.c index 62bb69dd..1922907b 100644 --- a/src/init.c +++ b/src/init.c @@ -125,18 +125,20 @@ mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; extern mi_heap_t _mi_heap_main; -static mi_tld_t tld_main = { +static mi_decl_cache_align mi_subproc_t mi_subproc_default; + +static mi_decl_cache_align mi_tld_t tld_main = { 0, false, - &_mi_heap_main, &_mi_heap_main, + &_mi_heap_main, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, - 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, &mi_subproc_default, &tld_main.stats, &tld_main.os }, // segments { 0, &tld_main.stats }, // os { MI_STATS_NULL } // stats }; -mi_heap_t _mi_heap_main = { +mi_decl_cache_align mi_heap_t _mi_heap_main = { &tld_main, MI_ATOMIC_VAR_INIT(NULL), 0, // thread id @@ -179,6 +181,38 @@ mi_heap_t* _mi_heap_main_get(void) { } +/* ----------------------------------------------------------- + Sub process +----------------------------------------------------------- */ + +mi_subproc_id_t mi_subproc_new(void) { + mi_memid_t memid = _mi_memid_none(); + mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); 
+ if (subproc == NULL) return NULL; + subproc->memid = memid; + return subproc; +} + +mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id) { + return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id); +} + +void mi_subproc_delete(mi_subproc_id_t subproc_id) { + if (subproc_id == NULL) return; + mi_subproc_t* subproc = mi_subproc_from_id(subproc_id); + _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); +} + +void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { + mi_heap_t* heap = mi_heap_get_default(); + if (heap == NULL) return; + mi_assert(heap->tld->segments.subproc == &mi_subproc_default); + if (heap->tld->segments.subproc != &mi_subproc_default) return; + heap->tld->segments.subproc = mi_subproc_from_id(subproc_id); +} + + + /* ----------------------------------------------------------- Initialization and freeing of the thread local heaps ----------------------------------------------------------- */ @@ -295,6 +329,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { _mi_memzero_aligned(tld,sizeof(mi_tld_t)); tld->heap_backing = bheap; tld->heaps = NULL; + tld->segments.subproc = &mi_subproc_default; tld->segments.stats = &tld->stats; tld->segments.os = &tld->os; tld->os.stats = &tld->stats; diff --git a/src/segment.c b/src/segment.c index fc13d2e7..205d8753 100644 --- a/src/segment.c +++ b/src/segment.c @@ -628,7 +628,8 @@ static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, segment->page_shift = page_shift; segment->segment_info_size = pre_size; segment->thread_id = _mi_thread_id(); - segment->cookie = _mi_ptr_cookie(segment); + segment->cookie = _mi_ptr_cookie(segment); + segment->subproc = tld->subproc; // set protection mi_segment_protect(segment, true, tld->os); @@ -880,6 +881,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } // can be 0 still with abandoned_next, 
or already a thread id for segments outside an arena that are reclaimed on a free. mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); + mi_assert_internal(segment->subproc == heap->tld->segments.subproc); // only reclaim within the same subprocess mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); segment->abandoned_visits = 0; segment->was_reclaimed = true; @@ -899,12 +901,13 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, segment->abandoned--; mi_assert(page->next == NULL); _mi_stat_decrease(&tld->stats->pages_abandoned, 1); - // set the heap again and allow heap thread delayed free again. + // get the target heap for this thread which has a matching heap tag (so we reclaim into a matching heap) mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects if (target_heap == NULL) { target_heap = heap; - _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using %u instead)\n", page->heap_tag, heap->tag ); + _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using tag %u instead)\n", page->heap_tag, heap->tag ); } + // associate the heap with this page, and allow heap thread delayed free again. 
mi_page_set_heap(page, target_heap); _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) _mi_page_free_collect(page, false); // ensure used count is up to date @@ -944,7 +947,8 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, // attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned - // don't reclaim more from a free than half the current segments + if (segment->subproc != heap->tld->segments.subproc) return false; // only reclaim within the same subprocess + // don't reclaim more from a `free` call than half the current segments // this is to prevent a pure free-ing thread to start owning too many segments if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon @@ -957,17 +961,17 @@ bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, ¤t); while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { mi_segment_reclaim(segment, heap, 0, NULL, tld); } } -static long mi_segment_get_reclaim_tries(void) { +static long mi_segment_get_reclaim_tries(mi_segments_tld_t* tld) { // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. 
const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); if (perc <= 0) return 0; - const size_t total_count = _mi_arena_segment_abandoned_count(); + const size_t total_count = mi_atomic_load_relaxed(&tld->subproc->abandoned_count); if (total_count == 0) return 0; const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 1024 : relative_count)); @@ -978,13 +982,14 @@ static long mi_segment_get_reclaim_tries(void) { static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) { *reclaimed = false; - long max_tries = mi_segment_get_reclaim_tries(); + long max_tries = mi_segment_get_reclaim_tries(tld); if (max_tries <= 0) return NULL; mi_segment_t* segment; - mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, ¤t); + mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, ¤t); while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) { + mi_assert(segment->subproc == heap->tld->segments.subproc); // cursor only visits segments in our sub-process segment->abandoned_visits++; // todo: should we respect numa affinity for abondoned reclaim? perhaps only for the first visit? // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries @@ -1232,5 +1237,6 @@ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t pag mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); // mi_segment_try_purge(tld); mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); + mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; } From 0b3cd5124999efc673afb26bab3f5a1c8eff4c22 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 1 Jun 2024 16:45:20 -0700 Subject: [PATCH 05/18] add initial primitive api for locks --- include/mimalloc/atomic.h | 21 +++++++++------- include/mimalloc/internal.h | 5 ---- include/mimalloc/prim.h | 24 +++++++++++++++--- include/mimalloc/track.h | 8 ++---- src/alloc.c | 10 ++++---- src/prim/emscripten/prim.c | 49 ++++++++++++++++++++++++++++++++++++- src/prim/unix/prim.c | 46 ++++++++++++++++++++++++++++++++++ src/prim/wasi/prim.c | 48 +++++++++++++++++++++++++++++++++--- src/prim/windows/prim.c | 35 +++++++++++++++++++------- test/main-override.cpp | 4 +-- test/test-stress.c | 6 ++--- 11 files changed, 208 insertions(+), 48 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index d5333dd9..2c313fdb 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -8,6 +8,17 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_ATOMIC_H #define MIMALLOC_ATOMIC_H +// include windows.h or pthreads.h +#if defined(_WIN32) +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif +#include +#elif !defined(_WIN32) && (defined(__EMSCRIPTEN_SHARED_MEMORY__) || !defined(__wasi__)) +#define MI_USE_PTHREADS +#include +#endif + // -------------------------------------------------------------------------------------------- // Atomics // We need to be portable between C, C++, and MSVC. @@ -133,10 +144,6 @@ static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) { #elif defined(_MSC_VER) // Legacy MSVC plain C compilation wrapper that uses Interlocked operations to model C11 atomics. 
-#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include #include #ifdef _WIN64 typedef LONG64 msc_intptr_t; @@ -306,7 +313,7 @@ typedef _Atomic(uintptr_t) mi_atomic_once_t; // Returns true only on the first invocation static inline bool mi_atomic_once( mi_atomic_once_t* once ) { - if (mi_atomic_load_relaxed(once) != 0) return false; // quick test + if (mi_atomic_load_relaxed(once) != 0) return false; // quick test uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(once, &expected, (uintptr_t)1); // try to set to 1 } @@ -329,10 +336,6 @@ static inline void mi_atomic_yield(void) { std::this_thread::yield(); } #elif defined(_WIN32) -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include static inline void mi_atomic_yield(void) { YieldProcessor(); } diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 65cd3569..9046e3ad 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -53,11 +53,6 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_decl_externc #endif -// pthreads -#if !defined(_WIN32) && !defined(__wasi__) -#define MI_USE_PTHREADS -#include -#endif // "options.c" void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 3f4574dd..ba305dc1 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -114,6 +114,24 @@ void _mi_prim_thread_done_auto_done(void); // Called when the default heap for a thread changes void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); +// Locks are only used if abandoned segment visiting is permitted +#if defined(_WIN32) +#define mi_lock_t CRITICAL_SECTION +#elif defined(MI_USE_PTHREADS) +#define mi_lock_t pthread_mutex_t +#else +#define mi_lock_t _Atomic(uintptr_t) +#endif + +// Take a lock (blocking). Return `true` on success. 
+bool _mi_prim_lock(mi_lock_t* lock); + +// Try to take lock and return `true` if successful. +bool _mi_prim_try_lock(mi_lock_t* lock); + +// Release a lock. +void _mi_prim_unlock(mi_lock_t* lock); + //------------------------------------------------------------------- // Thread id: `_mi_prim_thread_id()` @@ -235,10 +253,6 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #elif defined(_WIN32) -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { // Windows: works on Intel and ARM in both 32- and 64-bit return (uintptr_t)NtCurrentTeb(); @@ -370,4 +384,6 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { + + #endif // MIMALLOC_PRIM_H diff --git a/include/mimalloc/track.h b/include/mimalloc/track.h index a659d940..4b5709e2 100644 --- a/include/mimalloc/track.h +++ b/include/mimalloc/track.h @@ -34,7 +34,7 @@ The corresponding `mi_track_free` still uses the block start pointer and origina The `mi_track_resize` is currently unused but could be called on reallocations within a block. `mi_track_init` is called at program start. 
-The following macros are for tools like asan and valgrind to track whether memory is +The following macros are for tools like asan and valgrind to track whether memory is defined, undefined, or not accessible at all: #define mi_track_mem_defined(p,size) @@ -82,10 +82,6 @@ defined, undefined, or not accessible at all: #define MI_TRACK_HEAP_DESTROY 1 #define MI_TRACK_TOOL "ETW" -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include #include "../src/prim/windows/etw.h" #define mi_track_init() EventRegistermicrosoft_windows_mimalloc(); @@ -96,7 +92,7 @@ defined, undefined, or not accessible at all: // no tracking #define MI_TRACK_ENABLED 0 -#define MI_TRACK_HEAP_DESTROY 0 +#define MI_TRACK_HEAP_DESTROY 0 #define MI_TRACK_TOOL "none" #define mi_track_malloc_size(p,reqsize,size,zero) diff --git a/src/alloc.c b/src/alloc.c index 6c9c5baf..5ba8bb33 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -28,7 +28,7 @@ terms of the MIT license. A copy of the license can be found in the file // Fast allocation in a page: just pop from the free list. // Fall back to generic allocation only if the list is empty. // Note: in release mode the (inlined) routine is about 7 instructions with a single test. 
-extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept +extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); mi_block_t* const block = page->free; @@ -61,7 +61,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ } else { _mi_memzero_aligned(block, page->block_size - MI_PADDING_SIZE); - } + } } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN @@ -123,9 +123,9 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, #if (MI_PADDING) if (size == 0) { size = sizeof(void*); } #endif - + mi_page_t* page = _mi_heap_get_free_small_page(heap, size + MI_PADDING_SIZE); - void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); + void* const p = _mi_page_malloc_zero(heap, page, size + MI_PADDING_SIZE, zero); mi_track_malloc(p,size,zero); #if MI_STAT>1 @@ -362,7 +362,7 @@ mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_ #ifndef PATH_MAX #define PATH_MAX MAX_PATH #endif -#include + mi_decl_nodiscard mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { // todo: use GetFullPathNameW to allow longer file names char buf[PATH_MAX]; diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index f3797c9e..6b5aa452 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -200,7 +200,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { // Thread init/done //---------------------------------------------------------------- -#ifdef __EMSCRIPTEN_SHARED_MEMORY__ +#if defined(MI_USE_PTHREADS) // use pthread local storage keys to detect thread ending // (and used with MI_TLS_PTHREADS for the default heap) @@ -242,3 +242,50 @@ void 
_mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif + +//---------------------------------------------------------------- +// Locks +//---------------------------------------------------------------- + +#if defined(MI_USE_PTHREADS) + +bool _mi_prim_lock(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + return (pthread_mutex_trylock(lock) == 0); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + pthread_mutex_unlock(lock); +} + +#else + +#include + +// fall back to poor man's locks. +bool _mi_prim_lock(mi_lock_t* lock) { + for(int i = 0; i < 1000; i++) { // for at most 1 second? + if (_mi_prim_try_lock(lock)) return true; + if (i < 25) { + mi_atomic_yield(); // first yield a bit + } + else { + emscripten_sleep(1); // then sleep for 1ms intervals + } + } + return true; +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + mi_atomic_store_release(lock,(uintptr_t)0); +} + +#endif \ No newline at end of file diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 99325d03..7935c1c6 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -880,3 +880,49 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif + + +//---------------------------------------------------------------- +// Locks +//---------------------------------------------------------------- + +#if defined(MI_USE_PTHREADS) + +bool _mi_prim_lock(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + return (pthread_mutex_trylock(lock) == 0); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + pthread_mutex_unlock(lock); +} + +#else + +// fall back to poor man's locks. +bool _mi_prim_lock(mi_lock_t* lock) { + for(int i = 0; i < 1000; i++) { // for at most 1 second? 
+ if (_mi_prim_try_lock(lock)) return true; + if (i < 25) { + mi_atomic_yield(); // first yield a bit + } + else { + usleep(1000); // then sleep for 1ms intervals + } + } + return true; +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + mi_atomic_store_release(lock,(uintptr_t)0); +} + +#endif diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c index e95f67f5..3f3a2ea1 100644 --- a/src/prim/wasi/prim.c +++ b/src/prim/wasi/prim.c @@ -22,7 +22,7 @@ terms of the MIT license. A copy of the license can be found in the file void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB config->alloc_granularity = 16; - config->has_overcommit = false; + config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = false; } @@ -134,7 +134,7 @@ int _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_la //--------------------------------------------- int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { - MI_UNUSED(addr); MI_UNUSED(size); + MI_UNUSED(addr); MI_UNUSED(size); *is_zero = false; return 0; } @@ -199,9 +199,9 @@ mi_msecs_t _mi_prim_clock_now(void) { // low resolution timer mi_msecs_t _mi_prim_clock_now(void) { #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) - return (mi_msecs_t)clock(); + return (mi_msecs_t)clock(); #elif (CLOCKS_PER_SEC < 1000) - return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); + return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); #else return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); #endif @@ -278,3 +278,43 @@ void _mi_prim_thread_done_auto_done(void) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } + 
+//---------------------------------------------------------------- +// Locks +//---------------------------------------------------------------- + +#if defined(MI_USE_PTHREADS) + +bool _mi_prim_lock(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + return (pthread_mutex_trylock(lock) == 0); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + pthread_mutex_unlock(lock); +} + +#else + +// fall back to poor man's locks. +bool _mi_prim_lock(mi_lock_t* lock) { + for(int i = 0; i < 1000; i++) { // for at most 1 second? + if (_mi_prim_try_lock(lock)) return true; + mi_atomic_yield(); // this should never happen as wasi is single threaded? + } + return true; +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + mi_atomic_store_release(lock,(uintptr_t)0); +} + +#endif \ No newline at end of file diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 5074ad4c..760debb3 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -231,7 +231,7 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) && (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 && win_is_out_of_memory_error(GetLastError())) { - // if committing regular memory and being out-of-memory, + // if committing regular memory and being out-of-memory, // keep trying for a bit in case memory frees up after all. See issue #894 _mi_warning_message("out-of-memory on OS allocation, try again... 
(attempt %lu, 0x%zx bytes, error code: 0x%x, address: %p, alignment: 0x%zx, flags: 0x%x)\n", tries, size, GetLastError(), addr, try_alignment, flags); long sleep_msecs = tries*40; // increasing waits @@ -316,7 +316,7 @@ int _mi_prim_commit(void* addr, size_t size, bool* is_zero) { return 0; } -int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { +int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) { BOOL ok = VirtualFree(addr, size, MEM_DECOMMIT); *needs_recommit = true; // for safety, assume always decommitted even in the case of an error. return (ok ? 0 : (int)GetLastError()); @@ -468,7 +468,6 @@ mi_msecs_t _mi_prim_clock_now(void) { // Process Info //---------------------------------------------------------------- -#include #include static mi_msecs_t filetime_msecs(const FILETIME* ftime) { @@ -491,7 +490,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); pinfo->utime = filetime_msecs(&ut); pinfo->stime = filetime_msecs(&st); - + // load psapi on demand if (pGetProcessMemoryInfo == NULL) { HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll")); @@ -505,7 +504,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) memset(&info, 0, sizeof(info)); if (pGetProcessMemoryInfo != NULL) { pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); - } + } pinfo->current_rss = (size_t)info.WorkingSetSize; pinfo->peak_rss = (size_t)info.PeakWorkingSetSize; pinfo->current_commit = (size_t)info.PagefileUsage; @@ -517,7 +516,7 @@ void _mi_prim_process_info(mi_process_info_t* pinfo) // Output //---------------------------------------------------------------- -void _mi_prim_out_stderr( const char* msg ) +void _mi_prim_out_stderr( const char* msg ) { // on windows with redirection, the C runtime cannot handle locale dependent output // after the main thread closes so we use direct console output. 
@@ -564,6 +563,23 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { } +//---------------------------------------------------------------- +// Locks +//---------------------------------------------------------------- + +bool _mi_prim_lock(mi_lock_t* lock) { + EnterCriticalSection(lock); + return true; +} + +bool _mi_prim_try_lock(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} + +void _mi_prim_unlock(mi_lock_t* lock) { + LeaveCriticalSection(lock); +} + //---------------------------------------------------------------- // Random @@ -600,7 +616,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { } if (pBCryptGenRandom == NULL) return false; } - return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); + return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); } #endif // MI_USE_RTLGENRANDOM @@ -636,9 +652,9 @@ void _mi_prim_thread_init_auto_done(void) { } void _mi_prim_thread_done_auto_done(void) { - // call thread-done on all threads (except the main thread) to prevent + // call thread-done on all threads (except the main thread) to prevent // dangling callback pointer if statically linked with a DLL; Issue #208 - FlsFree(mi_fls_key); + FlsFree(mi_fls_key); } void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { @@ -661,3 +677,4 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif + diff --git a/test/main-override.cpp b/test/main-override.cpp index 64ea178b..fc7f70f0 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -19,7 +19,7 @@ #endif #ifdef _WIN32 -#include +#include static void msleep(unsigned long msecs) { Sleep(msecs); } #else #include @@ -43,7 +43,7 @@ static void test_stl_allocators(); int main() { // mi_stats_reset(); // ignore earlier allocations - + test_std_string(); // heap_thread_free_huge(); /* diff --git a/test/test-stress.c b/test/test-stress.c index 14b3c3ae..0368007a 
100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -200,7 +200,7 @@ static void test_stress(void) { #ifndef NDEBUG //mi_collect(false); //mi_debug_show_arenas(); - #endif + #endif #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif @@ -232,7 +232,7 @@ static void test_leak(void) { int main(int argc, char** argv) { #ifndef USE_STD_MALLOC mi_stats_reset(); - #endif + #endif // > mimalloc-test-stress [THREADS] [SCALE] [ITER] if (argc >= 2) { @@ -285,7 +285,7 @@ static void (*thread_entry_fun)(intptr_t) = &stress; #ifdef _WIN32 -#include +#include static DWORD WINAPI thread_entry(LPVOID param) { thread_entry_fun((intptr_t)param); From f93fb900b7495d320b2cfae4e69f1091917d278d Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 1 Jun 2024 17:25:45 -0700 Subject: [PATCH 06/18] move lock code to atomic.h --- include/mimalloc/atomic.h | 91 ++++++++++++++++++++++++++++++++++++++ include/mimalloc/prim.h | 17 ------- include/mimalloc/types.h | 2 - src/init.c | 2 +- src/prim/emscripten/prim.c | 47 -------------------- src/prim/unix/prim.c | 47 -------------------- src/prim/wasi/prim.c | 41 ----------------- src/prim/windows/prim.c | 19 -------- 8 files changed, 92 insertions(+), 174 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 2c313fdb..4e3250f9 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -309,6 +309,11 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) { return (intptr_t)mi_atomic_addi(p, -sub); } + +// ---------------------------------------------------------------------- +// Once and Guard +// ---------------------------------------------------------------------- + typedef _Atomic(uintptr_t) mi_atomic_once_t; // Returns true only on the first invocation @@ -329,7 +334,9 @@ typedef _Atomic(uintptr_t) mi_atomic_guard_t; +// ---------------------------------------------------------------------- // 
Yield +// ---------------------------------------------------------------------- #if defined(__cplusplus) #include static inline void mi_atomic_yield(void) { @@ -393,4 +400,88 @@ static inline void mi_atomic_yield(void) { #endif +// ---------------------------------------------------------------------- +// Locks are only used for abandoned segment visiting +// ---------------------------------------------------------------------- +#if defined(_WIN32) + +#define mi_lock_t CRITICAL_SECTION + +static inline bool _mi_prim_lock(mi_lock_t* lock) { + EnterCriticalSection(lock); + return true; +} + +static inline bool _mi_prim_try_lock(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} + +static inline void _mi_prim_unlock(mi_lock_t* lock) { + LeaveCriticalSection(lock); +} + + +#elif defined(MI_USE_PTHREADS) + +#define mi_lock_t pthread_mutex_t + +static inline bool _mi_prim_lock(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} + +static inline bool _mi_prim_try_lock(mi_lock_t* lock) { + return (pthread_mutex_trylock(lock) == 0); +} + +static inline void _mi_prim_unlock(mi_lock_t* lock) { + pthread_mutex_unlock(lock); +} + +#elif defined(__cplusplus) + +#include +#define mi_lock_t std::mutex + +static inline bool _mi_prim_lock(mi_lock_t* lock) { + lock->lock(); + return true; +} + +static inline bool _mi_prim_try_lock(mi_lock_t* lock) { + return (lock->try_lock(); +} + +static inline void _mi_prim_unlock(mi_lock_t* lock) { + lock->unlock(); +} + +#else + +// fall back to poor man's locks. +// this should only be the case in a single-threaded environment (like __wasi__) + +#define mi_lock_t _Atomic(uintptr_t) + +static inline bool _mi_prim_try_lock(mi_lock_t* lock) { + uintptr_t expected = 0; + return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); +} + +static inline bool _mi_prim_lock(mi_lock_t* lock) { + for (int i = 0; i < 1000; i++) { // for at most 1000 tries? 
+ if (_mi_prim_try_lock(lock)) return true; + mi_atomic_yield(); + } + return true; +} + +static inline void _mi_prim_unlock(mi_lock_t* lock) { + mi_atomic_store_release(lock, (uintptr_t)0); +} + +#endif + + + + #endif // __MIMALLOC_ATOMIC_H diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index ba305dc1..640c966f 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -114,23 +114,6 @@ void _mi_prim_thread_done_auto_done(void); // Called when the default heap for a thread changes void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); -// Locks are only used if abandoned segment visiting is permitted -#if defined(_WIN32) -#define mi_lock_t CRITICAL_SECTION -#elif defined(MI_USE_PTHREADS) -#define mi_lock_t pthread_mutex_t -#else -#define mi_lock_t _Atomic(uintptr_t) -#endif - -// Take a lock (blocking). Return `true` on success. -bool _mi_prim_lock(mi_lock_t* lock); - -// Try to take lock and return `true` if successful. -bool _mi_prim_try_lock(mi_lock_t* lock); - -// Release a lock. -void _mi_prim_unlock(mi_lock_t* lock); //------------------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6b90bf5d..f4ba6739 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -612,8 +612,6 @@ struct mi_subproc_s { mi_memid_t memid; // provenance }; -mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id); - // ------------------------------------------------------ // Thread Local data // ------------------------------------------------------ diff --git a/src/init.c b/src/init.c index 1922907b..01625891 100644 --- a/src/init.c +++ b/src/init.c @@ -193,7 +193,7 @@ mi_subproc_id_t mi_subproc_new(void) { return subproc; } -mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id) { +static mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id) { return (subproc_id == NULL ? 
&mi_subproc_default : (mi_subproc_t*)subproc_id); } diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index 6b5aa452..944c0cb4 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -242,50 +242,3 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif - -//---------------------------------------------------------------- -// Locks -//---------------------------------------------------------------- - -#if defined(MI_USE_PTHREADS) - -bool _mi_prim_lock(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - return (pthread_mutex_trylock(lock) == 0); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - pthread_mutex_unlock(lock); -} - -#else - -#include - -// fall back to poor man's locks. -bool _mi_prim_lock(mi_lock_t* lock) { - for(int i = 0; i < 1000; i++) { // for at most 1 second? - if (_mi_prim_try_lock(lock)) return true; - if (i < 25) { - mi_atomic_yield(); // first yield a bit - } - else { - emscripten_sleep(1); // then sleep for 1ms intervals - } - } - return true; -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - uintptr_t expected = 0; - return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - mi_atomic_store_release(lock,(uintptr_t)0); -} - -#endif \ No newline at end of file diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 7935c1c6..93785b22 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -22,7 +22,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // mmap @@ -880,49 +879,3 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif - - -//---------------------------------------------------------------- -// Locks -//---------------------------------------------------------------- - -#if defined(MI_USE_PTHREADS) - -bool _mi_prim_lock(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - return (pthread_mutex_trylock(lock) == 0); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - pthread_mutex_unlock(lock); -} - -#else - -// fall back to poor man's locks. -bool _mi_prim_lock(mi_lock_t* lock) { - for(int i = 0; i < 1000; i++) { // for at most 1 second? - if (_mi_prim_try_lock(lock)) return true; - if (i < 25) { - mi_atomic_yield(); // first yield a bit - } - else { - usleep(1000); // then sleep for 1ms intervals - } - } - return true; -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - uintptr_t expected = 0; - return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - mi_atomic_store_release(lock,(uintptr_t)0); -} - -#endif diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c index 3f3a2ea1..5d7a8132 100644 --- a/src/prim/wasi/prim.c +++ b/src/prim/wasi/prim.c @@ -9,7 +9,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // fputs @@ -278,43 +277,3 @@ void _mi_prim_thread_done_auto_done(void) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } - -//---------------------------------------------------------------- -// Locks -//---------------------------------------------------------------- - -#if defined(MI_USE_PTHREADS) - -bool _mi_prim_lock(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - return (pthread_mutex_trylock(lock) == 0); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - pthread_mutex_unlock(lock); -} - -#else - -// fall back to poor man's locks. -bool _mi_prim_lock(mi_lock_t* lock) { - for(int i = 0; i < 1000; i++) { // for at most 1 second? - if (_mi_prim_try_lock(lock)) return true; - mi_atomic_yield(); // this should never happen as wasi is single threaded? - } - return true; -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - uintptr_t expected = 0; - return mi_atomic_cas_strong_acq_rel(lock,&expected,(uintptr_t)1); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - mi_atomic_store_release(lock,(uintptr_t)0); -} - -#endif \ No newline at end of file diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 760debb3..bd874f9b 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -9,7 +9,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" #include // fputs, stderr @@ -563,24 +562,6 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { } -//---------------------------------------------------------------- -// Locks -//---------------------------------------------------------------- - -bool _mi_prim_lock(mi_lock_t* lock) { - EnterCriticalSection(lock); - return true; -} - -bool _mi_prim_try_lock(mi_lock_t* lock) { - return TryEnterCriticalSection(lock); -} - -void _mi_prim_unlock(mi_lock_t* lock) { - LeaveCriticalSection(lock); -} - - //---------------------------------------------------------------- // Random //---------------------------------------------------------------- From 8f874555d5d42c4e1006bfc78f6cadfb167b1e30 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 07:47:08 -0700 Subject: [PATCH 07/18] add initial support for visiting abandoned segments per subprocess, upstream for python/cpython#114133 --- include/mimalloc.h | 11 +++-- include/mimalloc/atomic.h | 83 ++++++++++++++++++++------------- include/mimalloc/internal.h | 10 ++-- src/arena.c | 93 +++++++++++++++++++++++++++---------- src/heap.c | 45 ++++++++++-------- src/init.c | 14 ++++-- src/options.c | 5 ++ src/segment.c | 33 ++++++++++++- 8 files changed, 206 insertions(+), 88 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 26bb849d..9fc770cc 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -262,7 +262,7 @@ typedef struct mi_heap_area_s { typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg); -mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_blocks, mi_block_visit_fun* visitor, void* arg); +mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, 
void* arg); // Experimental mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; @@ -292,9 +292,13 @@ mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t a // Experimental: allow sub-processes whose memory segments stay separated (and no reclamation between them) // Used for example for separate interpreter's in one process. typedef void* mi_subproc_id_t; +mi_decl_export mi_subproc_id_t mi_subproc_main(void); mi_decl_export mi_subproc_id_t mi_subproc_new(void); -mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc); -mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) +mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc); +mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet) + +// Experimental: visit abandoned heap areas (from threads that have been terminated) +mi_decl_export bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // deprecated mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; @@ -355,6 +359,7 @@ typedef enum mi_option_e { mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
(only on windows) + mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 4e3250f9..d2711019 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -14,7 +14,7 @@ terms of the MIT license. A copy of the license can be found in the file #define WIN32_LEAN_AND_MEAN #endif #include -#elif !defined(_WIN32) && (defined(__EMSCRIPTEN_SHARED_MEMORY__) || !defined(__wasi__)) +#elif !defined(__wasi__) && (!defined(__EMSCRIPTEN__) || defined(__EMSCRIPTEN_PTHREADS__)) #define MI_USE_PTHREADS #include #endif @@ -35,9 +35,9 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_atomic(name) std::atomic_##name #define mi_memory_order(name) std::memory_order_##name #if (__cplusplus >= 202002L) // c++20, see issue #571 -#define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #elif !defined(ATOMIC_VAR_INIT) -#define MI_ATOMIC_VAR_INIT(x) x + #define MI_ATOMIC_VAR_INIT(x) x #else #define MI_ATOMIC_VAR_INIT(x) ATOMIC_VAR_INIT(x) #endif @@ -337,6 +337,7 @@ typedef _Atomic(uintptr_t) mi_atomic_guard_t; // ---------------------------------------------------------------------- // Yield // ---------------------------------------------------------------------- + #if defined(__cplusplus) #include static inline void mi_atomic_yield(void) { @@ -401,59 +402,73 @@ static inline void mi_atomic_yield(void) { // ---------------------------------------------------------------------- -// Locks are only used for abandoned segment visiting +// Locks are only used for abandoned segment visiting in `arena.c` // ---------------------------------------------------------------------- + #if defined(_WIN32) -#define mi_lock_t CRITICAL_SECTION +#define mi_lock_t CRITICAL_SECTION -static inline bool _mi_prim_lock(mi_lock_t* lock) { +static inline bool 
mi_lock_try_acquire(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} +static inline bool mi_lock_acquire(mi_lock_t* lock) { EnterCriticalSection(lock); return true; } - -static inline bool _mi_prim_try_lock(mi_lock_t* lock) { - return TryEnterCriticalSection(lock); -} - -static inline void _mi_prim_unlock(mi_lock_t* lock) { +static inline void mi_lock_release(mi_lock_t* lock) { LeaveCriticalSection(lock); } +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeCriticalSection(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + DeleteCriticalSection(lock); +} #elif defined(MI_USE_PTHREADS) #define mi_lock_t pthread_mutex_t -static inline bool _mi_prim_lock(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); -} - -static inline bool _mi_prim_try_lock(mi_lock_t* lock) { +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return (pthread_mutex_trylock(lock) == 0); } - -static inline void _mi_prim_unlock(mi_lock_t* lock) { +static inline bool mi_lock_acquire(mi_lock_t* lock) { + return (pthread_mutex_lock(lock) == 0); +} +static inline void mi_lock_release(mi_lock_t* lock) { pthread_mutex_unlock(lock); } +static inline void mi_lock_init(mi_lock_t* lock) { + (void)(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} + #elif defined(__cplusplus) #include #define mi_lock_t std::mutex -static inline bool _mi_prim_lock(mi_lock_t* lock) { +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return lock->lock_try_acquire(); +} +static inline bool mi_lock_acquire(mi_lock_t* lock) { lock->lock(); return true; } - -static inline bool _mi_prim_try_lock(mi_lock_t* lock) { - return (lock->try_lock(); -} - -static inline void _mi_prim_unlock(mi_lock_t* lock) { +static inline void mi_lock_release(mi_lock_t* lock) { lock->unlock(); } +static inline void mi_lock_init(mi_lock_t* lock) { + (void)(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} #else @@ -462,22 +477,26 
@@ static inline void _mi_prim_unlock(mi_lock_t* lock) { #define mi_lock_t _Atomic(uintptr_t) -static inline bool _mi_prim_try_lock(mi_lock_t* lock) { +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); } - -static inline bool _mi_prim_lock(mi_lock_t* lock) { +static inline bool mi_lock_acquire(mi_lock_t* lock) { for (int i = 0; i < 1000; i++) { // for at most 1000 tries? - if (_mi_prim_try_lock(lock)) return true; + if (mi_lock_try_acquire(lock)) return true; mi_atomic_yield(); } return true; } - -static inline void _mi_prim_unlock(mi_lock_t* lock) { +static inline void mi_lock_release(mi_lock_t* lock) { mi_atomic_store_release(lock, (uintptr_t)0); } +static inline void mi_lock_init(mi_lock_t* lock) { + mi_lock_release(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + (void)(lock); +} #endif diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 9046e3ad..89f04103 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -79,11 +79,12 @@ extern mi_decl_cache_align const mi_page_t _mi_page_empty; bool _mi_is_main_thread(void); size_t _mi_current_thread_count(void); bool _mi_preloading(void); // true while the C runtime is not initialized yet -mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; -mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); +mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; +mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); // os.c void _mi_os_init(void); // called from process init @@ -136,7 +137,7 @@ typedef struct mi_arena_field_cursor_s { // abstract struct mi_subproc_t* subproc; } mi_arena_field_cursor_t; void 
_mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current); -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous, bool visit_all); // "segment-map.c" void _mi_segment_map_allocated_at(const mi_segment_t* segment); @@ -158,6 +159,7 @@ void _mi_segments_collect(bool force, mi_segments_tld_t* tld); void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); void _mi_abandoned_await_readers(void); bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); +bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -189,6 +191,8 @@ void _mi_heap_set_default_direct(mi_heap_t* heap); bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); void _mi_heap_unsafe_destroy_all(void); mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); // "stats.c" void _mi_stats_done(mi_stats_t* stats); diff --git a/src/arena.c b/src/arena.c index aeadd604..59514950 100644 --- a/src/arena.c +++ b/src/arena.c @@ -40,23 +40,24 @@ typedef uintptr_t mi_block_info_t; // A memory arena descriptor typedef struct mi_arena_s { - mi_arena_id_t id; // arena id; 0 for non-specific - mi_memid_t memid; // memid of the memory area - _Atomic(uint8_t*) start; // the start of the memory area - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) - size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) - size_t meta_size; // size of the 
arena structure itself (including its bitmaps) - mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) - int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena - bool is_large; // memory area consists of large- or huge OS pages (always committed) - _Atomic(size_t) search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) - mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) - mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) + mi_arena_id_t id; // arena id; 0 for non-specific + mi_memid_t memid; // memid of the memory area + _Atomic(uint8_t*) start; // the start of the memory area + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) + size_t meta_size; // size of the arena structure itself (including its bitmaps) + mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) + int numa_node; // associated NUMA node + bool exclusive; // only allow allocations if specifically for this arena + bool is_large; // memory area consists of large- or huge OS pages (always committed) + mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited + _Atomic(size_t) search_idx; // optimization to start the search 
for free blocks + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) + mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) + mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. } mi_arena_t; @@ -65,7 +66,6 @@ typedef struct mi_arena_s { static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - //static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept; /* ----------------------------------------------------------- @@ -702,6 +702,7 @@ static void mi_arenas_unsafe_destroy(void) { for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); @@ -813,9 +814,9 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) // start a cursor at a randomized arena void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current) { - mi_assert_internal(heap->tld->segments.subproc == 
subproc); + mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - current->start = (max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); + current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); current->count = 0; current->bitmap_idx = 0; current->subproc = subproc; @@ -823,7 +824,7 @@ void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_aren // reclaim abandoned segments // this does not set the thread id (so it appears as still abandoned) -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous ) +mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous, bool visit_all ) { const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); if (max_arena <= 0 || mi_atomic_load_relaxed(&previous->subproc->abandoned_count) == 0) return NULL; @@ -831,18 +832,31 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr int count = previous->count; size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1; - // visit arena's (from previous) + // visit arena's (from the previous cursor) for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) { mi_arena_id_t arena_idx = previous->start + count; if (arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); if (arena != NULL) { + bool has_lock = false; // visit the abandoned fields (starting at previous_idx) - for ( ; field_idx < arena->field_count; field_idx++, bit_idx = 0) { + for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) { size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); if mi_unlikely(field != 0) { // skip 
zero fields quickly + // we only take the arena lock if there are actually abandoned segments present + if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { + has_lock = (visit_all ? mi_lock_acquire(&arena->abandoned_visit_lock) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); + if (!has_lock) { + if (visit_all) { + _mi_error_message(EINVAL, "failed to visit all abandoned segments due to failure to acquire the visitor lock"); + } + // skip to next arena + break; + } + } + mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned)); // visit each set bit in the field (todo: maybe use `ctz` here?) - for ( ; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { + for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { // pre-check if the bit is set size_t mask = ((size_t)1 << bit_idx); if mi_unlikely((field & mask) == mask) { @@ -852,7 +866,10 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - // check that belongs to our sub-process + // check that the segment belongs to our sub-process + // note: this is the reason we need a lock in the case abandoned visiting is enabled. + // without the lock an abandoned visit may otherwise fail to visit all segments. + // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the arena lock. 
if (segment->subproc != previous->subproc) { // it is from another subprocess, re-mark it and continue searching const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); @@ -865,6 +882,7 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr previous->count = count; //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } return segment; } } @@ -872,6 +890,7 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr } } } + if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } } } // no more found @@ -881,6 +900,29 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr } +static bool mi_arena_visit_abandoned_blocks(mi_subproc_t* subproc, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + mi_arena_field_cursor_t current; + _mi_arena_field_cursor_init(NULL, subproc, ¤t); + mi_segment_t* segment; + while ((segment = _mi_arena_segment_clear_abandoned_next(¤t, true /* visit all */)) != NULL) { + if (!_mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg)) return false; + } + return true; +} + +bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + // (unfortunately) the visit_abandoned option must be enabled from the start. 
+ // This is to avoid taking locks if abandoned list visiting is not required (as for most programs) + if (!mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_assert(false); + _mi_error_message(EINVAL, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); + return false; + } + // visit abandoned segments in the arena's + return mi_arena_visit_abandoned_blocks(_mi_subproc_from_id(subproc_id), heap_tag, visit_blocks, visitor, arg); +} + + /* ----------------------------------------------------------- Add an arena. ----------------------------------------------------------- */ @@ -934,6 +976,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; arena->search_idx = 0; + mi_lock_init(&arena->abandoned_visit_lock); // consecutive bitmaps arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap diff --git a/src/heap.c b/src/heap.c index f6f23549..2cde5fb0 100644 --- a/src/heap.c +++ b/src/heap.c @@ -137,6 +137,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) { // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. // if all memory is freed by now, all segments should be freed. 
+ // note: this only collects in the current subprocess _mi_abandoned_reclaim_all(heap, &heap->tld->segments); } @@ -515,17 +516,21 @@ bool mi_check_owned(const void* p) { enable visiting all blocks of all heaps across threads ----------------------------------------------------------- */ -// Separate struct to keep `mi_page_t` out of the public interface -typedef struct mi_heap_area_ex_s { - mi_heap_area_t area; - mi_page_t* page; -} mi_heap_area_ex_t; +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) { + const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); + area->reserved = page->reserved * bsize; + area->committed = page->capacity * bsize; + area->blocks = mi_page_start(page); + area->used = page->used; // number of blocks in use (#553) + area->block_size = ubsize; + area->full_block_size = bsize; +} -static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_visit_fun* visitor, void* arg) { - mi_assert(xarea != NULL); - if (xarea==NULL) return true; - const mi_heap_area_t* area = &xarea->area; - mi_page_t* page = xarea->page; + +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) { + mi_assert(area != NULL); + if (area==NULL) return true; mi_assert(page != NULL); if (page == NULL) return true; @@ -590,23 +595,23 @@ static bool mi_heap_area_visit_blocks(const mi_heap_area_ex_t* xarea, mi_block_v return true; } -typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg); +// Separate struct to keep `mi_page_t` out of the public interface +typedef struct mi_heap_area_ex_s { + mi_heap_area_t area; + mi_page_t* page; +} mi_heap_area_ex_t; + +typedef bool (mi_heap_area_visit_fun)(const mi_heap_t* heap, const mi_heap_area_ex_t* area, void* arg); + static bool mi_heap_visit_areas_page(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* vfun, void* arg) { 
MI_UNUSED(heap); MI_UNUSED(pq); mi_heap_area_visit_fun* fun = (mi_heap_area_visit_fun*)vfun; mi_heap_area_ex_t xarea; - const size_t bsize = mi_page_block_size(page); - const size_t ubsize = mi_page_usable_block_size(page); xarea.page = page; - xarea.area.reserved = page->reserved * bsize; - xarea.area.committed = page->capacity * bsize; - xarea.area.blocks = mi_page_start(page); - xarea.area.used = page->used; // number of blocks in use (#553) - xarea.area.block_size = ubsize; - xarea.area.full_block_size = bsize; + _mi_heap_area_init(&xarea.area, page); return fun(heap, &xarea, arg); } @@ -627,7 +632,7 @@ static bool mi_heap_area_visitor(const mi_heap_t* heap, const mi_heap_area_ex_t* mi_visit_blocks_args_t* args = (mi_visit_blocks_args_t*)arg; if (!args->visitor(heap, &xarea->area, NULL, xarea->area.block_size, args->arg)) return false; if (args->visit_blocks) { - return mi_heap_area_visit_blocks(xarea, args->visitor, args->arg); + return _mi_heap_area_visit_blocks(&xarea->area, xarea->page, args->visitor, args->arg); } else { return true; diff --git a/src/init.c b/src/init.c index 01625891..be8c16de 100644 --- a/src/init.c +++ b/src/init.c @@ -185,22 +185,30 @@ mi_heap_t* _mi_heap_main_get(void) { Sub process ----------------------------------------------------------- */ +static mi_decl_cache_align _Atomic(uintptr_t) mi_subproc_count; + +mi_subproc_id_t mi_subproc_main(void) { + return NULL; +} + mi_subproc_id_t mi_subproc_new(void) { mi_memid_t memid = _mi_memid_none(); mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); if (subproc == NULL) return NULL; + mi_atomic_increment_relaxed(&mi_subproc_count); subproc->memid = memid; return subproc; } -static mi_subproc_t* mi_subproc_from_id(mi_subproc_id_t subproc_id) { +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { return (subproc_id == NULL ? 
&mi_subproc_default : (mi_subproc_t*)subproc_id); } void mi_subproc_delete(mi_subproc_id_t subproc_id) { if (subproc_id == NULL) return; - mi_subproc_t* subproc = mi_subproc_from_id(subproc_id); + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); + mi_atomic_decrement_relaxed(&mi_subproc_count); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { @@ -208,7 +216,7 @@ void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { if (heap == NULL) return; mi_assert(heap->tld->segments.subproc == &mi_subproc_default); if (heap->tld->segments.subproc != &mi_subproc_default) return; - heap->tld->segments.subproc = mi_subproc_from_id(subproc_id); + heap->tld->segments.subproc = _mi_subproc_from_id(subproc_id); } diff --git a/src/options.c b/src/options.c index db6e040f..32fa212b 100644 --- a/src/options.c +++ b/src/options.c @@ -94,6 +94,11 @@ static mi_option_desc_t options[_mi_option_last] = { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 0, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. +#if defined(MI_VISIT_ABANDONED) + { 1, INITIALIZED, MI_OPTION(visit_abandoned) }, // allow visiting heap blocks in abandonded segments; requires taking locks during reclaim. 
+#else + { 0, UNINIT, MI_OPTION(visit_abandoned) }, +#endif }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/segment.c b/src/segment.c index 205d8753..dc82b89d 100644 --- a/src/segment.c +++ b/src/segment.c @@ -962,7 +962,7 @@ bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { mi_segment_t* segment; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, ¤t); - while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { + while ((segment = _mi_arena_segment_clear_abandoned_next(¤t, true /* blocking */)) != NULL) { mi_segment_reclaim(segment, heap, 0, NULL, tld); } } @@ -987,7 +987,7 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_segment_t* segment; mi_arena_field_cursor_t current; _mi_arena_field_cursor_init(heap, tld->subproc, ¤t); - while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) + while ((max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t, false /* non-blocking */)) != NULL)) { mi_assert(segment->subproc == heap->tld->segments.subproc); // cursor only visits segments in our sub-process segment->abandoned_visits++; @@ -1240,3 +1240,32 @@ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t pag mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; } + + +/* ----------------------------------------------------------- + Visit blocks in a segment (only used for abandoned segments) +----------------------------------------------------------- */ + +static bool mi_segment_visit_page(mi_page_t* page, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + mi_heap_area_t area; + _mi_heap_area_init(&area, page); + if (!visitor(NULL, &area, NULL, area.block_size, arg)) return false; + if (visit_blocks) { + return 
_mi_heap_area_visit_blocks(&area, page, visitor, arg); + } + else { + return true; + } +} + +bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + for (size_t i = 0; i < segment->capacity; i++) { + mi_page_t* const page = &segment->pages[i]; + if (page->segment_in_use) { + if (heap_tag < 0 || (int)page->heap_tag == heap_tag) { + if (!mi_segment_visit_page(page, visit_blocks, visitor, arg)) return false; + } + } + } + return true; +} From 855e3b2549e0f2aa0277e43c4eeb8b1cbe1ea497 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 09:41:12 -0700 Subject: [PATCH 08/18] add support to visit _all_ abandoned segment blocks per sub-process, upstream for python/cpython#114133 --- include/mimalloc/types.h | 7 +- src/arena.c | 138 ++++++++++++++++++++++++++++----------- src/init.c | 21 ++++-- 3 files changed, 121 insertions(+), 45 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f4ba6739..2506d454 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -404,7 +404,7 @@ typedef struct mi_segment_s { bool was_reclaimed; // true if it was reclaimed (used to limit on-free reclamation) size_t abandoned; // abandoned pages (i.e. 
the original owning thread stopped) (`abandoned <= used`) - size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim if it is too long) + size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it is too long) size_t used; // count of pages in use (`used <= capacity`) size_t capacity; // count of available pages (`#free + used`) @@ -412,6 +412,9 @@ typedef struct mi_segment_s { uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` mi_subproc_t* subproc; // segment belongs to sub process + struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled + struct mi_segment_s* abandoned_os_prev; + // layout like this to optimize access in `mi_free` _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). @@ -609,6 +612,8 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); struct mi_subproc_s { _Atomic(size_t) abandoned_count; // count of abandoned segments for this sup-process + mi_lock_t abandoned_os_lock; // lock for the abandoned segments outside of arena's + mi_segment_t* abandoned_os_list; // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory) mi_memid_t memid; // provenance }; diff --git a/src/arena.c b/src/arena.c index 59514950..913a02a9 100644 --- a/src/arena.c +++ b/src/arena.c @@ -757,17 +757,34 @@ bool _mi_arena_contains(const void* p) { // sets the thread_id. bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment ) { - if (segment->memid.memkind != MI_MEM_ARENA) { - // not in an arena, consider it un-abandoned now. - // but we need to still claim it atomically -- we use the thread_id for that. 
+ if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena + // if abandoned visiting is allowed, we need to take a lock on the abandoned os list + bool has_lock = false; + if (mi_option_is_enabled(mi_option_visit_abandoned)) { + has_lock = mi_lock_try_acquire(&segment->subproc->abandoned_os_lock); + if (!has_lock) { + return false; // failed to acquire the lock, we just give up + } + } + // abandon it, but we need to still claim it atomically -- we use the thread_id for that. + bool reclaimed = false; size_t expected = 0; if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected, _mi_thread_id())) { + // reclaim mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); - return true; - } - else { - return false; + reclaimed = true; + // and remove from the abandoned os list (if needed) + mi_segment_t* const next = segment->abandoned_os_next; + mi_segment_t* const prev = segment->abandoned_os_prev; + if (prev != NULL) { prev->abandoned_os_next = next; } + else { segment->subproc->abandoned_os_list = next; } + if (next != NULL) { next->abandoned_os_prev = prev; } + segment->abandoned_os_next = NULL; + segment->abandoned_os_prev = NULL; } + if (has_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); } + return reclaimed; } // arena segment: use the blocks_abandoned bitmap. 
size_t arena_idx; @@ -794,12 +811,30 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) { mi_atomic_store_release(&segment->thread_id, 0); mi_assert_internal(segment->used == segment->abandoned); - if (segment->memid.memkind != MI_MEM_ARENA) { - // not in an arena; count it as abandoned and return + if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { + // not in an arena; count it as abandoned and return (these can be reclaimed on a `free`) mi_atomic_increment_relaxed(&segment->subproc->abandoned_count); + // if abandoned visiting is allowed, we need to take a lock on the abandoned os list to insert it + if (mi_option_is_enabled(mi_option_visit_abandoned)) { + if (!mi_lock_acquire(&segment->subproc->abandoned_os_lock)) { + _mi_error_message(EFAULT, "internal error: failed to acquire the abandoned (os) segment lock to mark abandonment"); + } + else { + // push on the front of the list + mi_segment_t* next = segment->subproc->abandoned_os_list; + mi_assert_internal(next == NULL || next->abandoned_os_prev == NULL); + mi_assert_internal(segment->abandoned_os_prev == NULL); + mi_assert_internal(segment->abandoned_os_next == NULL); + if (next != NULL) { next->abandoned_os_prev = segment; } + segment->abandoned_os_prev = NULL; + segment->abandoned_os_next = next; + segment->subproc->abandoned_os_list = segment; + mi_lock_release(&segment->subproc->abandoned_os_lock); + } + } return; } - // segment is in an arena + // segment is in an arena, mark it in the arena `blocks_abandoned` bitmap size_t arena_idx; size_t bitmap_idx; mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); @@ -822,6 +857,29 @@ void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_aren current->subproc = subproc; } +static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { + // try to reclaim an abandoned segment in the arena atomically + if 
(!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL; + mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); + mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); + mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); + // check that the segment belongs to our sub-process + // note: this is the reason we need a lock in the case abandoned visiting is enabled. + // without the lock an abandoned visit may otherwise fail to visit all segments. + // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the arena lock. + if (segment->subproc != subproc) { + // it is from another subprocess, re-mark it and continue searching + const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); + mi_assert_internal(was_zero); MI_UNUSED(was_zero); + return NULL; + } + else { + // success, we unabandoned a segment in our sub-process + mi_atomic_decrement_relaxed(&subproc->abandoned_count); + return segment; + } +} + // reclaim abandoned segments // this does not set the thread id (so it appears as still abandoned) mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous, bool visit_all ) @@ -848,7 +906,7 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr has_lock = (visit_all ? 
mi_lock_acquire(&arena->abandoned_visit_lock) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); if (!has_lock) { if (visit_all) { - _mi_error_message(EINVAL, "failed to visit all abandoned segments due to failure to acquire the visitor lock"); + _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); } // skip to next arena break; @@ -860,31 +918,14 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr // pre-check if the bit is set size_t mask = ((size_t)1 << bit_idx); if mi_unlikely((field & mask) == mask) { - mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - // try to reclaim it atomically - if (_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) { - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - // check that the segment belongs to our sub-process - // note: this is the reason we need a lock in the case abandoned visiting is enabled. - // without the lock an abandoned visit may otherwise fail to visit all segments. - // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the arena lock. 
- if (segment->subproc != previous->subproc) { - // it is from another subprocess, re-mark it and continue searching - const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - mi_assert_internal(was_zero); - } - else { - // success, we unabandoned a segment in our sub-process - mi_atomic_decrement_relaxed(&previous->subproc->abandoned_count); - previous->bitmap_idx = bitmap_idx; - previous->count = count; - - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - return segment; - } + const mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); + mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); + if (segment != NULL) { + previous->bitmap_idx = bitmap_idx; + previous->count = count; + //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); + if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } + return segment; } } } @@ -910,16 +951,35 @@ static bool mi_arena_visit_abandoned_blocks(mi_subproc_t* subproc, int heap_tag, return true; } +static bool mi_subproc_visit_abandoned_os_blocks(mi_subproc_t* subproc, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + if (!mi_lock_acquire(&subproc->abandoned_os_lock)) { + _mi_error_message(EFAULT, "internal error: failed to acquire abandoned (OS) segment lock"); + return false; + } + bool all_visited = true; + for (mi_segment_t* segment = subproc->abandoned_os_list; segment != NULL; segment = segment->abandoned_os_next) { + if (!_mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg)) { + all_visited = false; + break; + } + } + mi_lock_release(&subproc->abandoned_os_lock); + return all_visited; +} + bool 
mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { // (unfortunately) the visit_abandoned option must be enabled from the start. // This is to avoid taking locks if abandoned list visiting is not required (as for most programs) if (!mi_option_is_enabled(mi_option_visit_abandoned)) { - mi_assert(false); - _mi_error_message(EINVAL, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); + _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); return false; } + mi_subproc_t* const subproc = _mi_subproc_from_id(subproc_id); // visit abandoned segments in the arena's - return mi_arena_visit_abandoned_blocks(_mi_subproc_from_id(subproc_id), heap_tag, visit_blocks, visitor, arg); + if (!mi_arena_visit_abandoned_blocks(subproc, heap_tag, visit_blocks, visitor, arg)) return false; + // and visit abandoned segments outside arena's (in OS allocated memory) + if (!mi_subproc_visit_abandoned_os_blocks(subproc, heap_tag, visit_blocks, visitor, arg)) return false; + return true; } diff --git a/src/init.c b/src/init.c index be8c16de..f2d99d9e 100644 --- a/src/init.c +++ b/src/init.c @@ -171,7 +171,8 @@ static void mi_heap_main_init(void) { #endif _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); + _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); + mi_lock_init(&mi_subproc_default.abandoned_os_lock); } } @@ -185,8 +186,6 @@ mi_heap_t* _mi_heap_main_get(void) { Sub process ----------------------------------------------------------- */ -static mi_decl_cache_align _Atomic(uintptr_t) mi_subproc_count; - mi_subproc_id_t mi_subproc_main(void) { return NULL; } @@ -195,8 +194,9 @@ mi_subproc_id_t mi_subproc_new(void) { mi_memid_t memid = _mi_memid_none(); mi_subproc_t* subproc = 
(mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); if (subproc == NULL) return NULL; - mi_atomic_increment_relaxed(&mi_subproc_count); subproc->memid = memid; + subproc->abandoned_os_list = NULL; + mi_lock_init(&subproc->abandoned_os_lock); return subproc; } @@ -207,8 +207,19 @@ mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { void mi_subproc_delete(mi_subproc_id_t subproc_id) { if (subproc_id == NULL) return; mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + // check if there are no abandoned segments still.. + bool safe_to_delete = false; + if (mi_lock_acquire(&subproc->abandoned_os_lock)) { + if (subproc->abandoned_os_list == NULL) { + safe_to_delete = true; + } + mi_lock_release(&subproc->abandoned_os_lock); + } + if (!safe_to_delete) return; + // safe to release + // todo: should we refcount subprocesses? + mi_lock_done(&subproc->abandoned_os_lock); _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); - mi_atomic_decrement_relaxed(&mi_subproc_count); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { From f7fe5bf20ea8a88f8a55f58549e21dfeadc5dc1f Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 10:28:30 -0700 Subject: [PATCH 09/18] optimize heap walks, by Sam Gross, upstream of python/cpython#114133 --- src/heap.c | 98 ++++++++++++++++++++++++++++++++++------------ test/test-stress.c | 14 +++++++ 2 files changed, 87 insertions(+), 25 deletions(-) diff --git a/src/heap.c b/src/heap.c index 2cde5fb0..be2800c1 100644 --- a/src/heap.c +++ b/src/heap.c @@ -528,46 +528,83 @@ void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) { } +static void mi_get_fast_divisor(size_t divisor, uint64_t* magic, size_t* shift) { + mi_assert_internal(divisor > 0 && divisor <= UINT32_MAX); + *shift = 64 - mi_clz(divisor - 1); + *magic = ((((uint64_t)1 << 32) * (((uint64_t)1 << *shift) - divisor)) / divisor + 1); +} + +static size_t mi_fast_divide(size_t n, uint64_t magic, size_t shift) { + 
mi_assert_internal(n <= UINT32_MAX); + return ((((uint64_t)n * magic) >> 32) + n) >> shift; +} + bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) { mi_assert(area != NULL); if (area==NULL) return true; mi_assert(page != NULL); if (page == NULL) return true; - _mi_page_free_collect(page,true); + _mi_page_free_collect(page,true); // collect both thread_delayed and local_free mi_assert_internal(page->local_free == NULL); if (page->used == 0) return true; - const size_t bsize = mi_page_block_size(page); - const size_t ubsize = mi_page_usable_block_size(page); // without padding - size_t psize; - uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + size_t psize; + uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + mi_heap_t* const heap = mi_page_heap(page); + const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); // without padding + // optimize page with one block if (page->capacity == 1) { - // optimize page with one block mi_assert_internal(page->used == 1 && page->free == NULL); return visitor(mi_page_heap(page), area, pstart, ubsize, arg); } + mi_assert(bsize <= UINT32_MAX); + + // optimize full pages + if (page->used == page->capacity) { + uint8_t* block = pstart; + for (size_t i = 0; i < page->capacity; i++) { + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } + return true; + } // create a bitmap of free blocks. 
#define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*)) - uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)]; - memset(free_map, 0, sizeof(free_map)); + uintptr_t free_map[MI_MAX_BLOCKS / MI_INTPTR_BITS]; + const uintptr_t bmapsize = _mi_divide_up(page->capacity, MI_INTPTR_BITS); + memset(free_map, 0, bmapsize * sizeof(intptr_t)); + if (page->capacity % MI_INTPTR_BITS != 0) { + // mark left-over bits at the end as free + size_t shift = (page->capacity % MI_INTPTR_BITS); + uintptr_t mask = (UINTPTR_MAX << shift); + free_map[bmapsize - 1] = mask; + } + + // fast repeated division by the block size + uint64_t magic; + size_t shift; + mi_get_fast_divisor(bsize, &magic, &shift); #if MI_DEBUG>1 size_t free_count = 0; #endif - for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) { + for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) { #if MI_DEBUG>1 free_count++; #endif mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize)); size_t offset = (uint8_t*)block - pstart; mi_assert_internal(offset % bsize == 0); - size_t blockidx = offset / bsize; // Todo: avoid division? 
- mi_assert_internal( blockidx < MI_MAX_BLOCKS); - size_t bitidx = (blockidx / sizeof(uintptr_t)); - size_t bit = blockidx - (bitidx * sizeof(uintptr_t)); + mi_assert_internal(offset <= UINT32_MAX); + size_t blockidx = mi_fast_divide(offset, magic, shift); + mi_assert_internal(blockidx == offset / bsize); + mi_assert_internal(blockidx < MI_MAX_BLOCKS); + size_t bitidx = (blockidx / MI_INTPTR_BITS); + size_t bit = blockidx - (bitidx * MI_INTPTR_BITS); free_map[bitidx] |= ((uintptr_t)1 << bit); } mi_assert_internal(page->capacity == (free_count + page->used)); @@ -576,19 +613,30 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_ #if MI_DEBUG>1 size_t used_count = 0; #endif - for (size_t i = 0; i < page->capacity; i++) { - size_t bitidx = (i / sizeof(uintptr_t)); - size_t bit = i - (bitidx * sizeof(uintptr_t)); - uintptr_t m = free_map[bitidx]; - if (bit == 0 && m == UINTPTR_MAX) { - i += (sizeof(uintptr_t) - 1); // skip a run of free blocks + uint8_t* block = pstart; + for (size_t i = 0; i < bmapsize; i++) { + if (free_map[i] == 0) { + // every block is in use + for (size_t j = 0; j < MI_INTPTR_BITS; j++) { + #if MI_DEBUG>1 + used_count++; + #endif + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } } - else if ((m & ((uintptr_t)1 << bit)) == 0) { - #if MI_DEBUG>1 - used_count++; - #endif - uint8_t* block = pstart + (i * bsize); - if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false; + else { + // visit the used blocks in the mask + uintptr_t m = ~free_map[i]; + while (m != 0) { + #if MI_DEBUG>1 + used_count++; + #endif + size_t bitidx = mi_ctz(m); + if (!visitor(heap, area, block + (bitidx * bsize), ubsize, arg)) return false; + m &= m - 1; // clear least significant bit + } + block += bsize * MI_INTPTR_BITS; } } mi_assert_internal(page->used == used_count); diff --git a/test/test-stress.c b/test/test-stress.c index 0368007a..f82b9743 100644 --- a/test/test-stress.c +++ 
b/test/test-stress.c @@ -129,6 +129,16 @@ static void free_items(void* p) { custom_free(p); } +/* +static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { + (void)(heap); (void)(area); + size_t* total = (size_t*)arg; + if (block != NULL) { + *total += block_size; + } + return true; +} +*/ static void stress(intptr_t tid) { //bench_start_thread(); @@ -173,6 +183,10 @@ static void stress(intptr_t tid) { data[data_idx] = q; } } + // walk the heap + // size_t total = 0; + // mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total); + // free everything that is left for (size_t i = 0; i < retain_top; i++) { free_items(retained[i]); From 635d626c82e636e89163da6601dfe1f02a57e4a9 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 10:43:41 -0700 Subject: [PATCH 10/18] fix leak in abandoned block visiting --- src/arena.c | 4 +++- test/test-stress.c | 19 +++++++++++++++---- 2 files changed, 18 insertions(+), 5 deletions(-) diff --git a/src/arena.c b/src/arena.c index 913a02a9..801475fd 100644 --- a/src/arena.c +++ b/src/arena.c @@ -946,7 +946,9 @@ static bool mi_arena_visit_abandoned_blocks(mi_subproc_t* subproc, int heap_tag, _mi_arena_field_cursor_init(NULL, subproc, &current); mi_segment_t* segment; while ((segment = _mi_arena_segment_clear_abandoned_next(&current, true /* visit all */)) != NULL) { - if (!_mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg)) return false; + bool ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg); + _mi_arena_segment_mark_abandoned(segment); + if (!ok) return false; } return true; } diff --git a/test/test-stress.c b/test/test-stress.c index f82b9743..c3afde9b 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -39,6 +39,10 @@ static int ITER = 50; // N full iterations destructing and re-creating a #define STRESS // undefine for leak test +#ifndef NDEBUG +#define HEAP_WALK // walk the heap objects?
+#endif + static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? @@ -129,7 +133,7 @@ static void free_items(void* p) { custom_free(p); } -/* +#ifdef HEAP_WALK static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { (void)(heap); (void)(area); size_t* total = (size_t*)arg; @@ -138,7 +142,7 @@ static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void } return true; } -*/ +#endif static void stress(intptr_t tid) { //bench_start_thread(); @@ -183,9 +187,12 @@ static void stress(intptr_t tid) { data[data_idx] = q; } } + + #ifdef HEAP_WALK // walk the heap - // size_t total = 0; - // mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total); + size_t total = 0; + mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total); + #endif // free everything that is left for (size_t i = 0; i < retain_top; i++) { @@ -205,6 +212,10 @@ static void test_stress(void) { uintptr_t r = rand(); for (int n = 0; n < ITER; n++) { run_os_threads(THREADS, &stress); + #ifdef HEAP_WALK + size_t total = 0; + mi_abandoned_visit_blocks(mi_subproc_main(), -1, true, visit_blocks, &total); + #endif for (int i = 0; i < TRANSFERS; i++) { if (chance(50, &r) || n + 1 == ITER) { // free all on last run, otherwise free half of the transfers void* p = atomic_exchange_ptr(&transfer[i], NULL); From 5501f59f6ce044b33149391132c3dd83b964e710 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 13:16:20 -0700 Subject: [PATCH 11/18] only reclaim for exclusive heaps in their associated arena --- include/mimalloc.h | 5 +++++ include/mimalloc/internal.h | 4 ++-- src/arena.c | 35 +++++++++++++++++++++-------------- src/heap.c | 13 +++++++++---- src/segment.c | 3 ++- test/test-stress.c | 6 ++++++ 6 files changed, 45 insertions(+), 21 deletions(-) diff --git 
a/include/mimalloc.h b/include/mimalloc.h index 9fc770cc..0b4b182c 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -300,6 +300,11 @@ mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t sub // Experimental: visit abandoned heap areas (from threads that have been terminated) mi_decl_export bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); +// Experimental: create a new heap with a specified heap tag. Set `allow_destroy` to false to allow the thread +// to reclaim abandoned memory (with a compatible heap_tag and arena_id) but in that case `mi_heap_destroy` will +// fall back to `mi_heap_delete`. +mi_decl_export mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id); + // deprecated mi_decl_export int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept; diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 89f04103..0b6cf056 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -131,8 +131,8 @@ void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); typedef struct mi_arena_field_cursor_s { // abstract struct - mi_arena_id_t start; - int count; + size_t start; + size_t end; size_t bitmap_idx; mi_subproc_t* subproc; } mi_arena_field_cursor_t; diff --git a/src/arena.c b/src/arena.c index 801475fd..095c5745 100644 --- a/src/arena.c +++ b/src/arena.c @@ -850,11 +850,20 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) // start a cursor at a randomized arena void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, mi_arena_field_cursor_t* current) { mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - current->start = (heap == NULL || 
max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena)); - current->count = 0; - current->bitmap_idx = 0; + current->bitmap_idx = 0; current->subproc = subproc; + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + if (heap != NULL && heap->arena_id != _mi_arena_id_none()) { + // for a heap that is bound to one arena, only visit that arena + current->start = mi_arena_id_index(heap->arena_id); + current->end = current->start + 1; + } + else { + // otherwise visit all starting at a random location + current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); + current->end = current->start + max_arena; + } + mi_assert_internal(current->start < max_arena); } static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { @@ -884,16 +893,15 @@ static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_s // this does not set the thread id (so it appears as still abandoned) mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous, bool visit_all ) { - const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); if (max_arena <= 0 || mi_atomic_load_relaxed(&previous->subproc->abandoned_count) == 0) return NULL; - int count = previous->count; size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1; // visit arena's (from the previous cursor) - for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) { - mi_arena_id_t arena_idx = previous->start + count; - if (arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around + for ( ; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) { + // index wraps around + size_t arena_idx = (previous->start >= max_arena ? 
previous->start % max_arena : previous->start); mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); if (arena != NULL) { bool has_lock = false; @@ -918,11 +926,9 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr // pre-check if the bit is set size_t mask = ((size_t)1 << bit_idx); if mi_unlikely((field & mask) == mask) { - const mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); + previous->bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); + mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, previous->bitmap_idx); if (segment != NULL) { - previous->bitmap_idx = bitmap_idx; - previous->count = count; //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } return segment; @@ -935,8 +941,9 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr } } // no more found + mi_assert(previous->start == previous->end); previous->bitmap_idx = 0; - previous->count = 0; + previous->start = previous->end = 0; return NULL; } diff --git a/src/heap.c b/src/heap.c index be2800c1..0049abc3 100644 --- a/src/heap.c +++ b/src/heap.c @@ -226,17 +226,22 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool heap->tld->heaps = heap; } -mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { +mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) { mi_heap_t* bheap = mi_heap_get_backing(); mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? 
if (heap == NULL) return NULL; - // don't reclaim abandoned pages or otherwise destroy is unsafe - _mi_heap_init(heap, bheap->tld, arena_id, true /* no reclaim */, 0 /* default tag */); + mi_assert(heap_tag >= 0 && heap_tag < 256); + _mi_heap_init(heap, bheap->tld, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */); return heap; } +mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { + return mi_heap_new_ex(0 /* default heap tag */, false /* don't allow `mi_heap_destroy` */, arena_id); +} + mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { - return mi_heap_new_in_arena(_mi_arena_id_none()); + // don't reclaim abandoned memory or otherwise destroy is unsafe + return mi_heap_new_ex(0 /* default heap tag */, true /* no reclaim */, _mi_arena_id_none()); } bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { diff --git a/src/segment.c b/src/segment.c index dc82b89d..8fccf18e 100644 --- a/src/segment.c +++ b/src/segment.c @@ -905,7 +905,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects if (target_heap == NULL) { target_heap = heap; - _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using tag %u instead)\n", page->heap_tag, heap->tag ); + _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); } // associate the heap with this page, and allow heap thread delayed free again. 
mi_page_set_heap(page, target_heap); @@ -948,6 +948,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned if (segment->subproc != heap->tld->segments.subproc) return false; // only reclaim within the same subprocess + if (!_mi_heap_memid_is_suitable(heap,segment->memid)) return false; // don't reclaim between exclusive and non-exclusive arena's // don't reclaim more from a `free` call than half the current segments // this is to prevent a pure free-ing thread to start owning too many segments if (heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) return false; diff --git a/test/test-stress.c b/test/test-stress.c index c3afde9b..599c6c2e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -255,6 +255,12 @@ static void test_leak(void) { #endif int main(int argc, char** argv) { + #ifdef HEAP_WALK + mi_option_enable(mi_option_visit_abandoned); + #endif + #ifndef NDEBUG + mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + #endif #ifndef USE_STD_MALLOC mi_stats_reset(); #endif From a964322a21907206909798771ab90a9ccf27f8d8 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 14:46:59 -0700 Subject: [PATCH 12/18] revise the segment map to only apply to OS allocated segments and reduce the .BSS footprint --- src/arena.c | 4 +- src/os.c | 15 ++-- src/segment-map.c | 166 +++++++++++++++++++-------------------------- test/test-stress.c | 2 +- 4 files changed, 82 insertions(+), 105 deletions(-) diff --git a/src/arena.c b/src/arena.c index 095c5745..24f1299c 100644 --- a/src/arena.c +++ b/src/arena.c @@ -36,7 +36,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo typedef uintptr_t mi_block_info_t; #define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) #define
MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (255) // Limited as the reservation exponentially increases (and takes up .bss) +#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) // A memory arena descriptor typedef struct mi_arena_s { @@ -735,7 +735,7 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { return true; } diff --git a/src/os.c b/src/os.c index 88e7fcb3..4babd8da 100644 --- a/src/os.c +++ b/src/os.c @@ -157,7 +157,8 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_st _mi_stat_decrease(&stats->reserved, size); } -void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* tld_stats) { +void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats) { + if (stats == NULL) stats = &_mi_stats_main; if (mi_memkind_is_os(memid.memkind)) { size_t csize = _mi_os_good_alloc_size(size); void* base = addr; @@ -171,10 +172,10 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me // free it if (memid.memkind == MI_MEM_OS_HUGE) { mi_assert(memid.is_pinned); - mi_os_free_huge_os_pages(base, csize, tld_stats); + mi_os_free_huge_os_pages(base, csize, stats); } else { - mi_os_prim_free(base, csize, still_committed, tld_stats); + mi_os_prim_free(base, csize, still_committed, stats); } } else { @@ -183,8 +184,9 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me } } -void _mi_os_free(void* 
p, size_t size, mi_memid_t memid, mi_stats_t* tld_stats) { - _mi_os_free_ex(p, size, true, memid, tld_stats); +void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats) { + if (stats == NULL) stats = &_mi_stats_main; + _mi_os_free_ex(p, size, true, memid, stats); } @@ -299,6 +301,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { *memid = _mi_memid_none(); if (size == 0) return NULL; + if (stats == NULL) stats = &_mi_stats_main; size = _mi_os_good_alloc_size(size); bool os_is_large = false; bool os_is_zero = false; @@ -314,6 +317,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings *memid = _mi_memid_none(); if (size == 0) return NULL; + if (stats == NULL) stats = &_mi_stats_main; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); @@ -342,6 +346,7 @@ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offse mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); *memid = _mi_memid_none(); + if (stats == NULL) stats = &_mi_stats_main; if (offset > MI_SEGMENT_SIZE) return NULL; if (offset == 0) { // regular aligned allocation diff --git a/src/segment-map.c b/src/segment-map.c index 1efb1e23..be461d7e 100644 --- a/src/segment-map.c +++ b/src/segment-map.c @@ -16,140 +16,112 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/internal.h" #include "mimalloc/atomic.h" -#if (MI_INTPTR_SIZE>=8) && MI_TRACK_ASAN -#define MI_MAX_ADDRESS ((size_t)140 << 40) // 140TB (see issue #881) -#elif (MI_INTPTR_SIZE >= 8) -#define MI_MAX_ADDRESS ((size_t)40 << 40) // 40TB (to include huge page areas) +// Reduce total address space to reduce .bss (due to the `mi_segment_map`) +#if (MI_INTPTR_SIZE > 4) && MI_TRACK_ASAN +#define MI_SEGMENT_MAP_MAX_ADDRESS (128*1024ULL*MI_GiB) // 128 TiB (see issue #881) +#elif (MI_INTPTR_SIZE > 4) +#define MI_SEGMENT_MAP_MAX_ADDRESS (48*1024ULL*MI_GiB) // 48 TiB #else -#define MI_MAX_ADDRESS ((size_t)2 << 30) // 2Gb +#define MI_SEGMENT_MAP_MAX_ADDRESS (UINT32_MAX) #endif -#define MI_SEGMENT_MAP_BITS (MI_MAX_ADDRESS / MI_SEGMENT_SIZE) -#define MI_SEGMENT_MAP_SIZE (MI_SEGMENT_MAP_BITS / 8) -#define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE) +#define MI_SEGMENT_MAP_PART_SIZE (MI_INTPTR_SIZE*MI_KiB - 128) // 128 > sizeof(mi_memid_t) ! +#define MI_SEGMENT_MAP_PART_BITS (8*MI_SEGMENT_MAP_PART_SIZE) +#define MI_SEGMENT_MAP_PART_ENTRIES (MI_SEGMENT_MAP_PART_SIZE / MI_INTPTR_SIZE) +#define MI_SEGMENT_MAP_PART_BIT_SPAN (MI_SEGMENT_ALIGN) +#define MI_SEGMENT_MAP_PART_SPAN (MI_SEGMENT_MAP_PART_BITS * MI_SEGMENT_MAP_PART_BIT_SPAN) +#define MI_SEGMENT_MAP_MAX_PARTS ((MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_SPAN) + 1) -static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE + 1]; // 2KiB per TB with 64MiB segments +// A part of the segment map. +typedef struct mi_segmap_part_s { + mi_memid_t memid; + _Atomic(uintptr_t) map[MI_SEGMENT_MAP_PART_ENTRIES]; +} mi_segmap_part_t; -static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) { +// Allocate parts on-demand to reduce .bss footprint +_Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. 
} + + +static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) { // note: segment can be invalid or NULL. mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? - if ((uintptr_t)segment >= MI_MAX_ADDRESS) { - *bitidx = 0; - return MI_SEGMENT_MAP_WSIZE; - } - else { - const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_SIZE; - *bitidx = segindex % MI_INTPTR_BITS; - const size_t mapindex = segindex / MI_INTPTR_BITS; - mi_assert_internal(mapindex < MI_SEGMENT_MAP_WSIZE); - return mapindex; + *idx = 0; + *bitidx = 0; + if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL; + const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN; + if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL; + mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t*, &mi_segment_map[segindex]); + + // allocate on demand to reduce .bss footprint + if (part == NULL) { + if (!create_on_demand) return NULL; + mi_memid_t memid; + part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid, NULL); + if (part == NULL) return NULL; + mi_segmap_part_t* expected = NULL; + if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) { + _mi_os_free(part, sizeof(mi_segmap_part_t), memid, NULL); + part = expected; + if (part == NULL) return NULL; + } } + mi_assert(part != NULL); + const uintptr_t offset = ((uintptr_t)segment) % MI_SEGMENT_MAP_PART_SPAN; + const uintptr_t bitofs = offset / MI_SEGMENT_MAP_PART_BIT_SPAN; + *idx = bitofs / MI_INTPTR_BITS; + *bitidx = bitofs % MI_INTPTR_BITS; + return part; } void _mi_segment_map_allocated_at(const mi_segment_t* segment) { + if (segment->memid.memkind == MI_MEM_ARENA) return; // we lookup segments first in the arena's and don't need the segment map + size_t index; size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - 
mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); - if (index==MI_SEGMENT_MAP_WSIZE) return; - uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + mi_segmap_part_t* part = mi_segment_map_index_of(segment, true /* alloc map if needed */, &index, &bitidx); + if (part == NULL) return; // outside our address range.. + uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); uintptr_t newmask; do { newmask = (mask | ((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); + } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); } void _mi_segment_map_freed_at(const mi_segment_t* segment) { + if (segment->memid.memkind == MI_MEM_ARENA) return; + size_t index; size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - mi_assert_internal(index <= MI_SEGMENT_MAP_WSIZE); - if (index == MI_SEGMENT_MAP_WSIZE) return; - uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* don't alloc if not present */, &index, &bitidx); + if (part == NULL) return; // outside our address range.. + uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); uintptr_t newmask; do { newmask = (mask & ~((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask)); + } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); } // Determine the segment belonging to a pointer or NULL if it is not in a valid segment. 
static mi_segment_t* _mi_segment_of(const void* p) { if (p == NULL) return NULL; mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL + size_t index; size_t bitidx; - size_t index = mi_segment_map_index_of(segment, &bitidx); - // fast path: for any pointer to valid small/medium/large object or first MI_SEGMENT_SIZE in huge - const uintptr_t mask = mi_atomic_load_relaxed(&mi_segment_map[index]); + mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx); + if (part == NULL) return NULL; + const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) { + bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); + mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok); return segment; // yes, allocated by us } - if (index==MI_SEGMENT_MAP_WSIZE) return NULL; - - // TODO: maintain max/min allocated range for efficiency for more efficient rejection of invalid pointers? - - // search downwards for the first segment in case it is an interior pointer - // could be slow but searches in MI_INTPTR_SIZE * MI_SEGMENT_SIZE (512MiB) steps trough - // valid huge objects - // note: we could maintain a lowest index to speed up the path for invalid pointers? - size_t lobitidx; - size_t loindex; - uintptr_t lobits = mask & (((uintptr_t)1 << bitidx) - 1); - if (lobits != 0) { - loindex = index; - lobitidx = mi_bsr(lobits); // lobits != 0 - } - else if (index == 0) { - return NULL; - } - else { - mi_assert_internal(index > 0); - uintptr_t lomask = mask; - loindex = index; - do { - loindex--; - lomask = mi_atomic_load_relaxed(&mi_segment_map[loindex]); - } while (lomask != 0 && loindex > 0); - if (lomask == 0) return NULL; - lobitidx = mi_bsr(lomask); // lomask != 0 - } - mi_assert_internal(loindex < MI_SEGMENT_MAP_WSIZE); - // take difference as the addresses could be larger than the MAX_ADDRESS space. 
- size_t diff = (((index - loindex) * (8*MI_INTPTR_SIZE)) + bitidx - lobitidx) * MI_SEGMENT_SIZE; - segment = (mi_segment_t*)((uint8_t*)segment - diff); - - if (segment == NULL) return NULL; - mi_assert_internal((void*)segment < p); - bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(cookie_ok); - if mi_unlikely(!cookie_ok) return NULL; - if (((uint8_t*)segment + mi_segment_size(segment)) <= (uint8_t*)p) return NULL; // outside the range - mi_assert_internal(p >= (void*)segment && (uint8_t*)p < (uint8_t*)segment + mi_segment_size(segment)); - return segment; + return NULL; } // Is this a valid pointer in our heap? -static bool mi_is_valid_pointer(const void* p) { - return ((_mi_segment_of(p) != NULL) || (_mi_arena_contains(p))); +static bool mi_is_valid_pointer(const void* p) { + // first check if it is in an arena, then check if it is OS allocated + return (_mi_arena_contains(p) || _mi_segment_of(p) != NULL); } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { return mi_is_valid_pointer(p); } - -/* -// Return the full segment range belonging to a pointer -static void* mi_segment_range_of(const void* p, size_t* size) { - mi_segment_t* segment = _mi_segment_of(p); - if (segment == NULL) { - if (size != NULL) *size = 0; - return NULL; - } - else { - if (size != NULL) *size = segment->segment_size; - return segment; - } - mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); - mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); - mi_reset_delayed(tld); - mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); - return page; -} -*/ diff --git a/test/test-stress.c b/test/test-stress.c index 599c6c2e..24dcf00f 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -258,7 +258,7 @@ int main(int argc, char** argv) { #ifdef HEAP_WALK mi_option_enable(mi_option_visit_abandoned); #endif - #ifndef NDBEBUG + #ifndef NDEBUG mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); #endif #ifndef USE_STD_MALLOC From e8f4bdd1ea568b723da5c8362d5cdb092fa4cbc2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 14:59:37 -0700 Subject: [PATCH 13/18] fix cast; make segment map static --- src/segment-map.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/src/segment-map.c b/src/segment-map.c index be461d7e..8927a8bd 100644 --- a/src/segment-map.c +++ b/src/segment-map.c @@ -39,8 +39,7 @@ typedef struct mi_segmap_part_s { } mi_segmap_part_t; // Allocate parts on-demand to reduce .bss footprint -_Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. } - +static _Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. } static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) { // note: segment can be invalid or NULL. 
@@ -50,7 +49,7 @@ static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bo if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL; const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN; if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL; - mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t*, &mi_segment_map[segindex]); + mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[segindex]); // allocate on demand to reduce .bss footprint if (part == NULL) { From f87ec74bb3103f68f4e8b6f34098e09cbd1b306d Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 15:10:17 -0700 Subject: [PATCH 14/18] reduce delayed output from redirection to 16KiB to reduce the .bss size --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index 32fa212b..462a7c71 100644 --- a/src/options.c +++ b/src/options.c @@ -200,7 +200,7 @@ static void mi_cdecl mi_out_stderr(const char* msg, void* arg) { // an output function is registered it is called immediately with // the output up to that point. 
#ifndef MI_MAX_DELAY_OUTPUT -#define MI_MAX_DELAY_OUTPUT ((size_t)(32*1024)) +#define MI_MAX_DELAY_OUTPUT ((size_t)(16*1024)) #endif static char out_buf[MI_MAX_DELAY_OUTPUT+1]; static _Atomic(size_t) out_len; From f9076a5cf83a4326cee17e70c0b11baa056a5e57 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 15:54:49 -0700 Subject: [PATCH 15/18] use EFAULT if a target heap tag cannot be found on reclaim --- src/segment.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/segment.c b/src/segment.c index 8fccf18e..e484a38f 100644 --- a/src/segment.c +++ b/src/segment.c @@ -905,7 +905,7 @@ static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects if (target_heap == NULL) { target_heap = heap; - _mi_error_message(EINVAL, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); + _mi_error_message(EFAULT, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); } // associate the heap with this page, and allow heap thread delayed free again. mi_page_set_heap(page, target_heap); From e4c8f42bb6b4169ff329393621474612f4cce4f5 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 2 Jun 2024 16:10:08 -0700 Subject: [PATCH 16/18] always include sys/prctl.h on linux to disable THP if large_os_pages are not enabled --- src/prim/unix/prim.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 93785b22..63a36f25 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -30,9 +30,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #if defined(__linux__) #include - #if defined(MI_NO_THP) - #include - #endif + //#if defined(MI_NO_THP) + #include // THP disable + //#endif #if defined(__GLIBC__) #include // linux mmap flags #else From 768872e4e0bdec168fe82358614e4dfbfde1c779 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 2 Jun 2024 16:24:13 -0700 Subject: [PATCH 17/18] typo in stress test --- test/test-stress.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test-stress.c b/test/test-stress.c index 24dcf00f..544c2838 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -138,7 +138,7 @@ static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void (void)(heap); (void)(area); size_t* total = (size_t*)arg; if (block != NULL) { - total += block_size; + *total += block_size; } return true; } From 6b15342709241f278256d0392926752b130b9d5e Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 2 Jun 2024 16:41:07 -0700 Subject: [PATCH 18/18] fix pthread initialization of mutexes --- include/mimalloc/atomic.h | 7 ++++--- src/arena.c | 2 +- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index d2711019..3a0d4892 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -441,13 +441,13 @@ static inline void mi_lock_release(mi_lock_t* lock) { pthread_mutex_unlock(lock); } static inline void mi_lock_init(mi_lock_t* lock) { - (void)(lock); + pthread_mutex_init(lock, NULL); } static inline void mi_lock_done(mi_lock_t* lock) { - (void)(lock); + pthread_mutex_destroy(lock); } - +/* #elif defined(__cplusplus) #include @@ -469,6 +469,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } +*/ #else diff --git a/src/arena.c b/src/arena.c index 24f1299c..7d7eb089 100644 --- a/src/arena.c +++ b/src/arena.c @@ -863,7 +863,7 @@ void _mi_arena_field_cursor_init(mi_heap_t* heap, 
mi_subproc_t* subproc, mi_aren current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); current->end = current->start + max_arena; } - mi_assert_internal(current->start < max_arena); + mi_assert_internal(current->start <= max_arena); } static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) {