From 2f789aae9a1ed271e3feb22e4ead04db809e4e2e Mon Sep 17 00:00:00 2001
From: daanx
Date: Sun, 1 Dec 2024 16:26:59 -0800
Subject: [PATCH] wip: cannot compile

---
 include/mimalloc/internal.h | 84 +++++++++++++++++++------------------
 include/mimalloc/types.h    | 20 +++++----
 src/bitmap.c                | 45 ++++++++++++++++++++
 src/bitmap.h                | 28 ++++++++++++-
 src/free.c                  | 81 +++++++++++++++++++++++------------
 5 files changed, 181 insertions(+), 77 deletions(-)

diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index ec106047..84244c21 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -92,11 +92,13 @@ bool _mi_preloading(void); // true while the C runtime is not in
 void _mi_thread_done(mi_heap_t* heap);
 void _mi_thread_data_collect(void);
 void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap);
+
 mi_threadid_t _mi_thread_id(void) mi_attr_noexcept;
-size_t _mi_thread_seq_id(void) mi_attr_noexcept;
+size_t _mi_thread_seq_id(void) mi_attr_noexcept;
+
 mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap
 mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id);
-void _mi_heap_guarded_init(mi_heap_t* heap);
+void _mi_heap_guarded_init(mi_heap_t* heap);

 // os.c
 void _mi_os_init(void); // called from process init

@@ -180,8 +182,6 @@ void _mi_heap_delayed_free_all(mi_heap_t* heap);
 bool _mi_heap_delayed_free_partial(mi_heap_t* heap);
 void _mi_heap_collect_retired(mi_heap_t* heap, bool force);

-void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
-bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
 size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append);
 void _mi_deferred_free(mi_heap_t* heap, bool force);

@@ -426,6 +426,10 @@ static inline uintptr_t _mi_ptr_cookie(const void* p) {
   return ((uintptr_t)p ^ _mi_heap_main.cookie);
 }

+static inline mi_tld_t* _mi_tld(void) {
+  return mi_heap_get_default()->tld;
+}
+
 /* -----------------------------------------------------------
   Pages
 ----------------------------------------------------------- */

@@ -507,53 +511,53 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
   return mi_page_block_size(page) - MI_PADDING_SIZE;
 }

+//static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
+//  mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
+//  if (heap != NULL) {
+//    mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
+//    page->heap_tag = heap->tag;
+//    mi_atomic_store_release(&page->xthread_id, heap->thread_id);
+//  }
+//  else {
+//    mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc);
+//    mi_atomic_store_release(&page->xthread_id,0);
+//  }
+//}
+
+// Thread free flag helpers
+static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {
+  return (mi_block_t*)(tf & ~1);
+}
+static inline bool mi_tf_is_owned(mi_thread_free_t tf) {
+  return ((tf & 1) == 0);
+}
+static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) {
+  return (mi_thread_free_t)((uintptr_t)block | (owned ? 0 : 1));
+}
+
+
 // Thread free access
 static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
-  return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3);
+  return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free));
 }

-static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) {
-  return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3);
-}
-
-// Heap access
-static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
-  return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap));
+// Owned?
+static inline bool mi_page_is_owned(const mi_page_t* page) {
+  return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free));
 }

+// Thread id of the thread that owns this page
 static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) {
   return mi_atomic_load_relaxed(&page->xthread_id);
 }

-static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
-  mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
-  if (heap != NULL) {
-    mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
-    page->heap_tag = heap->tag;
-    mi_atomic_store_release(&page->xthread_id, heap->thread_id);
-  }
-  else {
-    mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc);
-    mi_atomic_store_release(&page->xthread_id,0);
-  }
-}
-
-// Thread free flag helpers
-static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {
-  return (mi_block_t*)(tf & ~0x03);
-}
-static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) {
-  return (mi_delayed_t)(tf & 0x03);
-}
-static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) {
-  return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed);
-}
-static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) {
-  return mi_tf_make(mi_tf_block(tf),delayed);
-}
-static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) {
-  return mi_tf_make(block, mi_tf_delayed(tf));
-}
+//static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) {
+//  return mi_tf_make(mi_tf_block(tf),delayed);
+//}
+//static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) {
+//  return mi_tf_make(block, mi_tf_delayed(tf));
+//}

 // are all blocks in a page freed?
 // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.

diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index 271c7efb..7329cb86 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -216,13 +216,14 @@ typedef struct mi_block_s {
 #endif

-// The delayed flags are used for efficient multi-threaded free-ing
-typedef enum mi_delayed_e {
-  MI_USE_DELAYED_FREE   = 0, // push on the owning heap thread delayed list
-  MI_DELAYED_FREEING    = 1, // temporary: another thread is accessing the owning heap
-  MI_NO_DELAYED_FREE    = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
-  MI_NEVER_DELAYED_FREE = 3  // sticky: used for abondoned pages without a owning heap; this only resets on page reclaim
-} mi_delayed_t;
+// The owned flag is used for efficient multi-threaded freeing.
+// When we push on the page thread free queue of an abandoned page,
+// we also atomically get to own it.
+// This is needed to atomically abandon a page (while other threads could concurrently free blocks in it).
+typedef enum mi_owned_e {
+  MI_OWNED     = 0, // some heap owns this page
+  MI_ABANDONED = 1, // the page is abandoned
+} mi_owned_t;


 // The `in_full` and `has_aligned` page flags are put in a union to efficiently
@@ -247,7 +248,7 @@ typedef union mi_page_flags_s {
 #endif

 // Thread free list.
-// We use the bottom 2 bits of the pointer for mi_delayed_t flags
+// We use the bottom bit of the pointer for the `mi_owned_t` flag
 typedef uintptr_t mi_thread_free_t;

 // Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython)

@@ -304,10 +305,11 @@ typedef struct mi_page_s {
 #endif

   _Atomic(mi_thread_free_t) xthread_free;  // list of deferred free blocks freed by other threads
-  _Atomic(uintptr_t)        xheap;         // heap this threads belong to.
+  // _Atomic(uintptr_t)     xheap;         // heap this page belongs to.

   struct mi_page_s*      next;    // next page owned by the heap with the same `block_size`
   struct mi_page_s*      prev;    // previous page owned by the heap with the same `block_size`
+  mi_subproc_t*          subproc; // sub-process of this page
   mi_memid_t             memid;   // provenance of the page memory
 } mi_page_t;

diff --git a/src/bitmap.c b/src/bitmap.c
index dd1afe75..5cce6bfa 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -693,3 +693,48 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t
   mi_bitmap_forall_set_chunks_end();
   return false;
 }
+
+
+
+// Pairmap declarations (the implementations are still to be written):
+bool mi_pairmap_chunk_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx);
+mi_decl_nodiscard bool mi_pairmap_xset(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx);
+mi_decl_nodiscard bool mi_pairmap_xset_while_not_busy(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx);
+
+mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) {
+  size_t set_idx;
+  size_t start = tseq % MI_BFIELD_BITS;
+  size_t epoch = mi_atomic_load_acquire(&pairmap->epoch); MI_UNUSED(epoch); // todo: use the epoch to detect concurrently set bits
+  mi_bfield_t any_set = mi_bfield_rotate_right(mi_atomic_load_relaxed(&pairmap->any_set), start);
+  while (mi_bfield_find_least_bit(any_set, &set_idx)) {
+    size_t chunk_idx = 2*((set_idx + start) % MI_BFIELD_BITS);
+    {
+      // look at chunk_idx and chunk_idx+1
+      mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx];
+      mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx+1];
+      size_t cidx;
+      if (mi_pairmap_chunk_find_and_set_busy(chunk1, &cidx)) {
+        *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx;
+        mi_assert_internal(*pidx < MI_PAIRMAP_MAX_BITS);
+        return true;
+      }
+      else {
+        if (mi_pairmap_chunk_find_and_set_busy(chunk2, &cidx)) {
+          *pidx = ((chunk_idx+1) * MI_BITMAP_CHUNK_BITS) + cidx;
+          mi_assert_internal(*pidx < MI_PAIRMAP_MAX_BITS);
+          return true;
+        }
+        else if (mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk2)) {
+          // the whole pair of chunks is clear: clear the corresponding `any_set` bit
+          mi_bfield_atomic_xset(MI_BIT_CLEAR, &pairmap->any_set, chunk_idx/2);
+        }
+      }
+    }
+    start += set_idx+1;  /* so chunk_idx stays valid */
+    any_set >>= set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */
+    any_set >>= 1;
+  }
+  return false;
+}

diff --git a/src/bitmap.h b/src/bitmap.h
index 1a180924..2b4bfc25 100644
--- a/src/bitmap.h
+++ b/src/bitmap.h
@@ -41,7 +41,7 @@ typedef mi_decl_align(32) struct mi_bitmap_s {
 #define MI_BITMAP_MAX_BITS  (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS)  // 16k bits on 64bit, 8k bits on 32bit

 /* --------------------------------------------------------------------------------
-  Bitmap
+  Atomic bitmap
 -------------------------------------------------------------------------------- */

 typedef bool mi_bit_t;

@@ -89,4 +89,30 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t
 // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
 mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx );

+
+/* --------------------------------------------------------------------------------
+  Atomic bitmap for a pair of bits
+-------------------------------------------------------------------------------- */
+
+typedef mi_bfield_t mi_pair_t;
+
+#define MI_PAIR_CLEAR   (0)
+#define MI_PAIR_BUSY    (1)
+#define MI_PAIR_BUSYX   (2)
+#define MI_PAIR_SET     (3)
+
+typedef mi_decl_align(32) struct mi_pairmap_s {
+  mi_bitmap_chunk_t    chunks[2*MI_BFIELD_BITS];
+  _Atomic(mi_bfield_t) any_set;
+  _Atomic(size_t)      epoch;
+} mi_pairmap_t;
+
+#define MI_PAIRMAP_MAX_PAIRS  (MI_BITMAP_MAX_BITS)  // 16k pairs on 64bit, 8k pairs on 32bit
+#define MI_PAIRMAP_MAX_BITS   (2*MI_PAIRMAP_MAX_PAIRS)
+
+mi_decl_nodiscard bool mi_pairmap_xset(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx);
+mi_decl_nodiscard bool mi_pairmap_xset_while_not_busy(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx);
+mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx);
+
+
 #endif // MI_XBITMAP_H

diff --git a/src/free.c b/src/free.c
index f0ce8c22..42fcd07e 100644
--- a/src/free.c
+++ b/src/free.c
@@ -147,39 +147,66 @@ void mi_free(void* p) mi_attr_noexcept
   }
 }

-// return true if successful
-bool _mi_free_delayed_block(mi_block_t* block) {
-  // get segment and page
-  mi_assert_internal(block!=NULL);
-  mi_page_t* const page = mi_checked_ptr_page(block,"_mi_free_delayed_block");
-  mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page));
-
-  // Clear the no-delayed flag so delayed freeing is used again for this page.
-  // This must be done before collecting the free lists on this page -- otherwise
-  // some blocks may end up in the page `thread_free` list with no blocks in the
-  // heap `thread_delayed_free` list which may cause the page to be never freed!
-  // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
-  if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) {
-    return false;
-  }
-
-  // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count
-  _mi_page_free_collect(page, false);
-
-  // and free the block (possibly freeing the page as well since `used` is updated)
-  mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */);
-  return true;
-}

 // ------------------------------------------------------
 // Multi-threaded Free (`_mt`)
 // ------------------------------------------------------

-// Push a block that is owned by another thread on its page-local thread free
-// list or it's heap delayed free list. Such blocks are later collected by
-// the owning thread in `_mi_free_delayed_block`.
-static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block )
+static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) {
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_thread_id(page)==0);
+
+  // we own the page now..
+  // first remove it from the abandoned pages in the arena
+  mi_heap_t* const heap = mi_heap_get_default();
+  _mi_arena_page_unabandon(page,heap->tld);
+
+  // collect the thread atomic free list
+  _mi_page_free_collect(page, false);  // update `used` count
+  if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); }
+
+  if (mi_page_all_free(page)) {
+    // we can free the page directly
+    _mi_arena_page_free(page, heap->tld);
+  }
+  else {
+    // the page still has some blocks in use;
+    // reclaim it into our heap if compatible, or otherwise abandon it again
+    if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) &&  // only if reclaim-on-free is enabled
+        (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) &&      // we did not already terminate our thread (can this happen? yes, due to thread-local destructors for example (issue #944))
+        (page->subproc == heap->tld->subproc) &&                            // don't reclaim across sub-processes
+        mi_arena_page_try_reclaim(page)                                     // and we can reclaim it from the arena
+       )
+    {
+      // make it part of our heap
+      _mi_heap_page_reclaim(heap, page);
+    }
+    else {
+      // abandon it again
+      _mi_arena_page_abandon(page, heap->tld);
+    }
+  }
+}
+
+// Push a block that is owned by another thread on its page-local thread free list.
+static void mi_decl_noinline mi_free_block_delayed_mt(mi_page_t* page, mi_block_t* block)
+{
+  // push atomically on the page thread free list
+  mi_thread_free_t tf_new;
+  mi_thread_free_t tf;
+  do {
+    tf = mi_atomic_load_relaxed(&page->xthread_free);
+    mi_block_set_next(page, block, mi_tf_block(tf));
+    tf_new = mi_tf_create(block, true /* always owned: try to claim it if the page is abandoned */);
+  } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf, tf_new));
+
+  // and atomically reclaim the page if it was abandoned
+  bool reclaimed = !mi_tf_is_owned(tf);
+  if (reclaimed) { mi_free_try_reclaim_mt(page); }
+}
+
+
+/*
 // Try to put the block on either the page-local thread free list,
 // or the heap delayed free list (if this is the first non-local free in that page)
 mi_thread_free_t tfreex;
@@ -276,7 +303,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block
   // thread_delayed free list (or heap delayed free list)
   mi_free_block_delayed_mt(page,block);
 }
-
+*/

 // ------------------------------------------------------
 // Usable size
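
Note (not part of the patch itself): the heart of this change is the new `xthread_free` encoding. The four-state `mi_delayed_t` (two tag bits) is replaced by a single ownership bit in the low bit of the tagged pointer (`MI_OWNED = 0`, `MI_ABANDONED = 1`). Every push of a freed block installs a new list head with the owned bit clear, so the one thread whose CAS observes the abandoned bit becomes the unique claimant of the page and may unabandon, free, or re-abandon it. The sketch below is a minimal, self-contained C11 model of just that protocol; `page_t`, `block_t`, and `free_block_mt` are invented stand-ins (not mimalloc's types), mirroring only the tag encoding of the patch's `mi_tf_create`/`mi_tf_block`/`mi_tf_is_owned` helpers and the CAS loop in `mi_free_block_delayed_mt`.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct block_s { struct block_s* next; } block_t;
typedef uintptr_t thread_free_t;   // tagged pointer: block* | ownership bit

typedef struct page_s {
  _Atomic(thread_free_t) xthread_free;
} page_t;

static block_t* tf_block(thread_free_t tf)    { return (block_t*)(tf & ~(uintptr_t)1); }
static bool     tf_is_owned(thread_free_t tf) { return ((tf & 1) == 0); }
static thread_free_t tf_create(block_t* b, bool owned) {
  return (thread_free_t)((uintptr_t)b | (owned ? 0 : 1));
}

// Push `block` on the page-local thread free list; returns true iff this push
// also atomically claimed ownership of a previously abandoned page.
static bool free_block_mt(page_t* page, block_t* block) {
  thread_free_t tf = atomic_load_explicit(&page->xthread_free, memory_order_relaxed);
  thread_free_t tf_new;
  do {
    block->next = tf_block(tf);       // link the block to the current list head
    tf_new = tf_create(block, true);  // the new head is always marked owned
  } while (!atomic_compare_exchange_weak_explicit(&page->xthread_free, &tf, tf_new,
                                                  memory_order_release, memory_order_relaxed));
  return !tf_is_owned(tf);            // old head was abandoned: we claimed the page
}

int main(void) {
  page_t page; block_t b1, b2;
  atomic_init(&page.xthread_free, tf_create(NULL, false));  // start abandoned
  printf("first free claims the page: %d\n", (int)free_block_mt(&page, &b1));   // prints 1
  printf("second free claims the page: %d\n", (int)free_block_mt(&page, &b2));  // prints 0
  return 0;
}

Encoding "owned" as a zero bit makes the head of a live page a plain pointer value, and lets a single compare-and-swap both publish the freed block and claim an abandoned page; this is what allows `mi_free_block_delayed_mt` to call `mi_free_try_reclaim_mt` exactly once per abandonment.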