diff --git a/src/arena.c b/src/arena.c index bc885ef8..19815616 100644 --- a/src/arena.c +++ b/src/arena.c @@ -199,7 +199,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( void* p = mi_arena_slice_start(arena, slice_index); *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; - + // set the dirty bits if (arena->memid.initially_zero) { // size_t dirty_count = 0; @@ -239,7 +239,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_zero = false; } } - #endif + #endif size_t already_committed_count = 0; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); if (already_committed_count < slice_count) { @@ -247,7 +247,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); } } - } + } } else { // no need to commit, but check if already fully committed @@ -282,8 +282,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 1 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + // scale up the arena sizes exponentially every 8 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -399,7 +399,7 @@ again: if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; bool ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); - mi_lock_release(&mi_arena_reserve_lock); + mi_lock_release(&mi_arena_reserve_lock); if (ok) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); @@ -476,6 +476,19 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ +static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2) { + mi_arena_t* arena = (mi_arena_t*)arg1; + mi_subproc_t* subproc = (mi_subproc_t*)arg2; + + // found an abandoned page of the right size + // it is set busy for now so we can read safely even with concurrent mi_free reclaiming + // try to claim ownership atomically + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + if (subproc != page->subproc) return false; + if (!mi_page_try_claim_ownership(page)) return false; + return true; +} + static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { MI_UNUSED(slice_count); @@ -493,38 +506,29 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl { size_t slice_index; mi_pairmap_t* const pairmap = &arena->pages_abandoned[bin]; - while (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index)) { // todo: don't restart from scratch if we fail for some entry? - // found an abandoned page of the right size - // it is set busy for now so we can read safely even with concurrent mi_free reclaiming - // try to claim ownership atomically - mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); - if (!mi_page_try_claim_ownership(page)) { - // a concurrent free already grabbed the page. 
- // Restore the abandoned_map to make it available again (unblocking busy waiters) - mi_pairmap_set(pairmap, slice_index); - } - else { - // we got ownership, clear the abandoned entry (unblocking busy waiters) - mi_pairmap_clear(pairmap, slice_index); - mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); - _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_ptr_page(page)==page); - mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); - mi_assert_internal(mi_page_block_size(page) == block_size); - mi_assert_internal(mi_page_is_abandoned(page)); - mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(!mi_page_is_full(page)); - return page; - } - } + if (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { + // found an abandoned page of the right size + // and claimed ownership. + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); + _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); + + _mi_page_free_collect(page, false); // update `used` count + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(!mi_page_is_full(page)); + return page; + } } mi_forall_arenas_end(); return NULL; @@ -565,8 +569,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // claimed free slices: initialize the page partly - if (!memid.initially_zero) { - _mi_memzero_aligned(page, sizeof(*page)); + if (!memid.initially_zero) { + _mi_memzero_aligned(page, sizeof(*page)); } #if MI_DEBUG > 1 else { @@ -779,7 +783,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done - mi_pairmap_clear_while_not_busy(&arena->pages_abandoned[bin], slice_index); + mi_pairmap_clear_once_not_busy(&arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } @@ -999,7 +1003,7 @@ static bool mi_arena_add(mi_arena_t* 
arena, mi_arena_id_t* arena_id, mi_stats_t* mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } - + _mi_stat_counter_increase(&stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); @@ -1049,7 +1053,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB); return false; - } + } size_t bitmap_base; const size_t info_slices = mi_arena_info_slices_needed(slice_count, &bitmap_base); if (slice_count < info_slices+1) { diff --git a/src/bitmap.c b/src/bitmap.c index 2dbba52d..1aa0a822 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -995,13 +995,13 @@ mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, s // Set/clear a sequence of 2 bits that were on an even `idx` in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal((idx%2)==0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - + if (set) { // first set the chunkmap since it is a conservative approximation (increases epoch) mi_bitmap_chunkmap_set(bitmap, chunk_idx); @@ -1066,7 +1066,7 @@ static inline bool mi_bitmap_is_xset2(mi_xset_t set, mi_bitmap_t* bitmap, size_t mi_assert_internal(idx + 2 <= mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); return mi_bitmap_chunk_is_xset2(set, &bitmap->chunks[chunk_idx], cidx); } @@ -1091,13 +1091,13 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* -------------------------------------------------------------------------------- bitmap try_find_and_clear -------------------------------------------------------------------------------- */ - +/* typedef bool (mi_bitmap_find_fun_t)(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx); static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx, mi_bitmap_find_fun_t* find_fun) { if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; - + // start chunk index -- todo: can depend on the tseq to decrease contention between threads MI_UNUSED(tseq); const size_t chunk_start = 0; @@ -1105,7 +1105,7 @@ static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; // for each chunkmap entry `i` - for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) + for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { size_t i = (_i + chunk_map_start); if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; // adjust for the start position @@ -1122,50 +1122,106 @@ static inline bool 
mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } // set the chunk idx const size_t chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; - + // try to find and clear N bits in that chunk if (chunk_idx < mi_bitmap_chunk_count(bitmap)) { // we can have less chunks than in the chunkmap.. if ((*find_fun)(bitmap, n, chunk_idx, epoch, pidx)) { return true; } } - + // skip to the next bit cmap_idx_shift += cmap_idx+1; cmap >>= cmap_idx; // skip scanned bits (and avoid UB for `cmap_idx+1`) cmap >>= 1; } } - + return false; } +*/ -static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); - return true; - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return false; - } -} +#define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ + { \ + /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ + MI_UNUSED(tseq); \ + const size_t chunk_start = 0; \ + const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; \ + const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; \ + /* for each chunkmap entry `i` */ \ + for (size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { \ + size_t i = (_i + chunk_map_start); \ + if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; /* adjust for the start position */ \ + \ + const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; \ + mi_epoch_t name_epoch; \ + mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &name_epoch); \ + if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } /* rotate right for the start position (on the first iteration) */ \ + \ + uint32_t cmap_idx; /* one bit set of each chunk that may have bits set */ \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ + while (mi_bsf32(cmap, &cmap_idx)) { /* find least bit that is set */ \ + /* adjust for the start position again */ \ + if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } \ + /* set the chunk idx */ \ + const size_t name_chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; \ + /* try to find and clear N bits in that chunk */ \ + if (name_chunk_idx < mi_bitmap_chunk_count(bitmap)) { /* we can have less chunks than in the chunkmap.. 
*/ + +#define mi_bitmap_forall_chunks_end() \ + } \ + /* skip to the next bit */ \ + cmap_idx_shift += cmap_idx+1; \ + cmap >>= cmap_idx; /* skip scanned bits (and avoid UB for `cmap_idx+1`) */ \ + cmap >>= 1; \ + } \ + }} + +//static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { +// size_t cidx; +// if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { +// *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; +// mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); +// return true; +// } +// else { +// // we may find that all are cleared only on a second iteration but that is ok as +// // the chunkmap is a conservative approximation. +// if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { +// mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); +// } +// return false; +// } +//} // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); + // return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); + mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + return true; + } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + // continue + } + } + mi_bitmap_forall_chunks_end(); + return false; } /* -------------------------------------------------------------------------------- - pairmap + pairmap -------------------------------------------------------------------------------- */ void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2) { @@ -1215,10 +1271,10 @@ bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { pairmap clear while not busy -------------------------------------------------------------------------------- */ -static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). +static inline bool mi_bfield_atomic_clear2_once_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). 
mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); + const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); mi_bfield_t bnew; mi_bfield_t old = mi_atomic_load_relaxed(b); @@ -1238,32 +1294,32 @@ static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b return ((old&mask) == mask); } -static inline bool mi_bitmap_chunk_clear2_while_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_clear2_once_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_clear2_while_not_busy(&chunk->bfields[i], idx); + return mi_bfield_atomic_clear2_once_not_busy(&chunk->bfields[i], idx); } -static bool mi_bitmap_clear2_while_not_busy(mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_clear2_once_not_busy(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal((idx%2)==0); mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool cleared = mi_bitmap_chunk_clear2_while_not_busy(&bitmap->chunks[chunk_idx], cidx); + bool cleared = mi_bitmap_chunk_clear2_once_not_busy(&bitmap->chunks[chunk_idx], cidx); if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } + } return cleared; } -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { +void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { mi_bitmap_t* bitmap; size_t idx; mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - mi_bitmap_clear2_while_not_busy(bitmap, idx); + mi_bitmap_clear2_once_not_busy(bitmap, idx); } @@ -1274,9 +1330,9 @@ void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { // Atomically go from set to busy, or return false otherwise and leave the bit field as-is. static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); + const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); mi_bfield_t old; mi_bfield_t bnew; @@ -1290,49 +1346,57 @@ static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i], &idx)) { // find least 1-bit, it may be set or busy - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). 
- if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); - return true; + while (true) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]) & MI_BFIELD_LO_BIT2; // only keep MI_PAIR_SET bits + size_t idx; + if (!mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + break; // not found: continue with the next field + } + else { + mi_assert_internal((idx%2)==0); + if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); + return true; + } + // else: try this word once again } } } return false; } -static bool mi_bitmap_try_find_and_set_busy_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - MI_UNUSED(epoch); MI_UNUSED(n); - mi_assert_internal(n==2); - size_t cidx; - if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); - return true; - } - else { - return false; - } -} -static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_set_busy_at); +static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t idx_offset, size_t* ppair_idx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2) +{ + mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + { + MI_UNUSED(epoch); MI_UNUSED(n); + mi_assert_internal(n==2); + size_t cidx; + if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { + const size_t idx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal((idx%2)==0); + const size_t pair_idx = (idx + idx_offset)/2; + if (claim(pair_idx, arg1, arg2)) { // while busy, the claim function can read from the page + mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); // claimed, clear the entry + *ppair_idx = pair_idx; + return true; + } + else { + mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); // not claimed, reset the entry + // and continue + } + } + } + mi_bitmap_forall_chunks_end(); + return false; } // Used to find an abandoned page, and transition from set to busy. 
-mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) { - size_t idx = 0; - if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, &idx)) { - if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, &idx)) { - return false; - } - else { - idx += mi_bitmap_max_bits(pairmap->bitmap1); - } - } - mi_assert_internal((idx%2)==0); - *pidx = idx/2; - return true; +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pair_idx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2 ) { + if (mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, 0, pair_idx, claim, arg1, arg2)) return true; + return mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, mi_bitmap_max_bits(pairmap->bitmap1), pair_idx, claim, arg1, arg2); } diff --git a/src/bitmap.h b/src/bitmap.h index d73ee98a..ca62735b 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -13,9 +13,47 @@ Concurrent bitmap that can set/reset sequences of bits atomically #define MI_BITMAP_H /* -------------------------------------------------------------------------------- - Definitions --------------------------------------------------------------------------------- */ + Atomic bitmaps: + `mi_bfield_t`: a single machine word that can efficiently be bit counted (usually `size_t`); + each bit usually represents a single slice of MI_ARENA_SLICE_SIZE in an arena (64 KiB). + We need 16K bits to represent a 1GiB arena. + + `mi_bitmap_chunk_t`: a chunk of bfields with a total of MI_BITMAP_CHUNK_BITS (= 512) bits; + allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number + of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB). + These chunks are cache-aligned and we can use AVX2/AVX512/SVE/SVE2/etc. instructions + to scan for bits (perhaps) more efficiently. + + `mi_chunkmap_t`: for each chunk we track if it has (potentially) any bit set. + The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set. + This is used to avoid scanning every chunk (and is thus strictly an optimization). + It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out + to have no bits set. + + When we (potentially) set a bit in a chunk, we first update the chunkmap. + However, when we clear a bit in a chunk, and the chunk is indeed all clear, we + cannot safely clear the bit corresponding to the chunk in the chunkmap since it + may race with another thread setting a bit in the same chunk (and we may clear the + bit even though a bit is set in the chunk, which is not allowed). + + To fix this, the chunkmap contains 32 bits for chunks, and a 32-bit "epoch" + counter that is increased every time a bit is set. We only clear a bit if the epoch + stayed the same over our clear operation (so we know no other thread in the mean + time set a bit in any of the chunks corresponding to the chunkmap). + Since increasing the epoch and setting a bit must be atomic, we use only half-word + bits (32) (we could use 128-bit atomics if needed since modern hardware supports this) + + `mi_bitmap_t`: a bitmap with N chunks. A bitmap always has MI_BITMAP_MAX_CHUNKMAPS (=16) + and can support arenas from a few chunks up to 16 chunkmaps = 16 * 32 chunks = 16 GiB. + The `chunk_count` can be anything from 1 to the max supported by the chunkmaps, but + each chunk is always complete (512 bits, so 512 * 64KiB = 32 MiB memory areas).
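+
+    As a worked example of the sizes above (illustrative): with 64 KiB slices, one chunk covers
+    512 * 64 KiB = 32 MiB, one chunkmap covers 32 * 32 MiB = 1 GiB, and the maximum of 16
+    chunkmaps covers 16 * 1 GiB = 16 GiB (MI_BITMAP_MAX_BIT_COUNT).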
+ + For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count + and pop-count (but we think it can be adapted work reasonably well on older hardware too) +--------------------------------------------------------------------------------------------- */ + +// A word-size bit field. typedef size_t mi_bfield_t; #define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) @@ -29,16 +67,18 @@ typedef size_t mi_bfield_t; #define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) #define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) -// 512 bits on 64_bit +// A bitmap chunk contains 512 bits of bfields on 64_bit (256 on 32-bit) typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; } mi_bitmap_chunk_t; + // for now 32-bit epoch + 32-bit bit-set (note: with ABA instructions we can double this) typedef uint64_t mi_chunkmap_t; typedef uint32_t mi_epoch_t; typedef uint32_t mi_cmap_t; + #define MI_CHUNKMAP_BITS (32) // 1 chunkmap tracks 32 chunks #define MI_BITMAP_MAX_CHUNKMAPS (16) @@ -48,15 +88,18 @@ typedef uint32_t mi_cmap_t; #define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16 GiB arena #define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 1 GiB arena + +// An atomic bitmap typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_map_count; - _Atomic(size_t) chunk_count; + _Atomic(size_t) chunk_map_count; // valid chunk_map's + _Atomic(size_t) chunk_count; // total count of chunks size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; - + mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; + static inline size_t mi_bitmap_chunk_map_count(const mi_bitmap_t* bitmap) { return mi_atomic_load_relaxed(&bitmap->chunk_map_count); } @@ -72,17 +115,19 @@ static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { /* -------------------------------------------------------------------------------- - Atomic bitmap + Atomic bitmap operations -------------------------------------------------------------------------------- */ +// Many operations are generic over setting or clearing the bit sequence: we use `mi_xset_t` for this (true if setting, false if clearing) typedef bool mi_xset_t; #define MI_BIT_SET (true) #define MI_BIT_CLEAR (false) +// Required size of a bitmap to represent `bit_count` bits. size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count); -// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true // returns the size of the bitmap. size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); @@ -134,56 +179,46 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t - -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -// and false otherwise leaving the bitmask as is. 
-//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); -// -//static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); -//} -// -//static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); -//} - - -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); -// -//static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); -//} -// -//static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); -//} - - /* -------------------------------------------------------------------------------- - Atomic bitmap for a pair of bits + Atomic bitmap for a pair of bits. + + The valid pairs are CLEAR (0), SET (3), or BUSY (2). + + These bit pairs are used in the abandoned pages maps: when set, the entry has + an available page. When we scan for an available abandoned page and find an entry SET, + we first set it to BUSY, and try to claim the page atomically (since it can race + with a concurrent `mi_free` which also tries to claim the page). However, unlike `mi_free`, + we cannot be sure that a concurrent `mi_free` also didn't free (and decommit) the page + just when we got the entry. Therefore, a page can only be freed after `mi_arena_unabandon` + which (busy) waits until the BUSY flag is cleared to ensure all readers are done. + (and pair-bit operations must therefore be release_acquire). 
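+
+    As an illustrative sketch (not part of the original comment), an entry typically moves through:
+      CLEAR -> SET    when a page is abandoned into the map (`mi_pairmap_set`),
+      SET   -> BUSY   when an allocating thread finds it (`mi_pairmap_try_find_and_set_busy`),
+      BUSY  -> CLEAR  if the claim callback succeeds (the entry is taken), or
+      BUSY  -> SET    if the claim fails (the entry is restored),
+      SET   -> CLEAR  on un-abandon via `mi_pairmap_clear_once_not_busy` (which first waits for
+                      any concurrent BUSY reader to finish).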
-------------------------------------------------------------------------------- */ #define MI_PAIR_CLEAR (0) -#define MI_PAIR_BUSY (1) -#define MI_PAIR_UNUSED (2) // should never occur +#define MI_PAIR_UNUSED (1) // should never occur +#define MI_PAIR_BUSY (2) #define MI_PAIR_SET (3) +// 0b....0101010101010101 +#define MI_BFIELD_LO_BIT2 ((MI_BFIELD_LO_BIT8 << 6)|(MI_BFIELD_LO_BIT8 << 4)|(MI_BFIELD_LO_BIT8 << 2)|MI_BFIELD_LO_BIT8) + +// A pairmap manipulates pairs of bits (and consists of 2 bitmaps) typedef struct mi_pairmap_s { mi_bitmap_t* bitmap1; - mi_bitmap_t* bitmap2; + mi_bitmap_t* bitmap2; } mi_pairmap_t; - - -// initialize a pairmap to all unset; avoid a mem_zero if `already_zero` is true +// initialize a pairmap to all clear; avoid a mem_zero if `already_zero` is true void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2); bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx); -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); -mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx); +void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); + +typedef bool (mi_bitmap_claim_while_busy_fun_t)(size_t pair_index, void* arg1, void* arg2); +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1 ,void* arg2 + ); -#endif // MI_XBITMAP_H +#endif // MI_BITMAP_H diff --git a/src/free.c b/src/free.c index 70ef5d8a..1e07dbd2 100644 --- a/src/free.c +++ b/src/free.c @@ -148,15 +148,44 @@ void mi_free(void* p) mi_attr_noexcept } - // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ +static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page); + +// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) +{ + // adjust stats (after padding check and potentially recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page, block)); + + // _mi_padding_shrink(page, block, sizeof(mi_block_t)); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + size_t dbgsize = mi_usable_size(block); + if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } + _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); + #endif + + // push atomically on the page thread free list + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_block_set_next(page, block, mi_tf_block(tf_old)); + tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); + + // and atomically reclaim the page if it was abandoned + bool reclaimed = !mi_tf_is_owned(tf_old); + if (reclaimed) { + mi_free_try_reclaim_mt(page); + } +} + static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); -#if 1 // we own the page now.. 
// safe to collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count @@ -209,237 +238,8 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { // not reclaimed or free'd, unown again _mi_page_unown(page); - -#else - if (!mi_page_is_abandoned_mapped(page)) { - // singleton or OS allocated - if (mi_page_is_singleton(page)) { - // free singleton pages - #if MI_DEBUG>1 - _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_page_all_free(page)); - #endif - // we can free the page directly - _mi_arena_page_free(page); - return; - } - else { - const bool was_full = mi_page_is_full(page); - _mi_page_free_collect(page,false); // update used - if (mi_page_all_free(page)) { - // no need to unabandon as it is unmapped - _mi_arena_page_free(page); - return; - } - else if (was_full && _mi_arena_page_reabandon_full(page)) { - return; - } - else if (!mi_page_is_mostly_used(page) && _mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - if (heap != (mi_heap_t*)&_mi_heap_empty) { // we did not already terminate our thread (can this happen? - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - // make it part of our heap (no need to unabandon as is unmapped) - _mi_heap_page_reclaim(tagheap, page); - return; - } - } - } - } - } - else { - // don't reclaim pages that can be found for fresh page allocations - } - - // not reclaimed or free'd, unown again - _mi_page_unown(page); -#endif } -/* -// we own the page now.. -// safe to collect the thread atomic free list -_mi_page_free_collect(page, false); // update `used` count -if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } - -if (mi_page_all_free(page)) { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); // this must be before free'ing - // we can free the page directly - _mi_arena_page_free(page); - return; -} -else if (!mi_page_is_mostly_used(page)) { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - - if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && // only if reclaim on free is allowed - (heap != (mi_heap_t*)&_mi_heap_empty)) // we did not already terminate our thread (can this happen? - { - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - // make it part of our heap - _mi_heap_page_reclaim(tagheap, page); - return; - } - } -} - -// we cannot reclaim this page.. leave it abandoned -// todo: should re-abandon or otherwise a partly used page could never be re-used if the -// objects in it are not freed explicitly. -_mi_page_unown(page); -*/ - - -// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page, block)); - - // _mi_padding_shrink(page, block, sizeof(mi_block_t)); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - size_t dbgsize = mi_usable_size(block); - if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } - _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); - #endif - - // push atomically on the page thread free list - mi_thread_free_t tf_new; - mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); - do { - mi_block_set_next(page, block, mi_tf_block(tf_old)); - tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); - - // and atomically reclaim the page if it was abandoned - bool reclaimed = !mi_tf_is_owned(tf_old); - if (reclaimed) { - mi_free_try_reclaim_mt(page); - } -} - - /* - // Try to put the block on either the page-local thread free list, - // or the heap delayed free list (if this is the first non-local free in that page) - mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - // If this was the first non-local free, we need to push it on the heap delayed free list instead - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see 
`mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - } -} - -// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // first see if the page was abandoned and if we can reclaim it into our thread - if (mi_page_is_abandoned(page)) { - if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || - mi_page_is_singleton(page)) { // only one block, and we are free-ing it - if (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) - { - // the page is abandoned, try to reclaim it into our heap - if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); - // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - else { - if (mi_page_is_abandoned(page)) { - // mi_assert(false); - } - // mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages - } - } - } - } - - - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); - - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); - - // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - _mi_padding_shrink(page, block, sizeof(mi_block_t)); - - if (mi_page_is_huge(page)) { - mi_assert_internal(mi_page_is_singleton(page)); - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). 
- _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively - } - else { - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - #endif - } - - // and finally free the actual block by pushing it on the owning heap - // thread_delayed free list (or heap delayed free list) - mi_free_block_delayed_mt(page,block); -} -*/ // ------------------------------------------------------ // Usable size diff --git a/src/page.c b/src/page.c index e5e3f972..faef2f48 100644 --- a/src/page.c +++ b/src/page.c @@ -44,7 +44,7 @@ static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { mi_assert_internal(_mi_ptr_page(page) == page); size_t count = 0; while (head != NULL) { - mi_assert_internal((uint8_t*)head - (uint8_t*)page > MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); + mi_assert_internal((uint8_t*)head - (uint8_t*)page > (ptrdiff_t)MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); count++; head = mi_block_next(page, head); }
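Reviewer note (not part of the patch): as an aid for reviewing the new pair-bit protocol, below is a minimal stand-alone sketch of the SET -> BUSY -> CLEAR transitions that `mi_bfield_atomic_try_set_busy` and `mi_bfield_atomic_clear2_once_not_busy` implement above, written with plain C11 atomics. The `pair_*` names, the `uint64_t` bit field, and the exact memory orders are assumptions made for illustration only.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Pair values, matching the MI_PAIR_* encoding in bitmap.h: 00 clear, 10 busy, 11 set.
#define PAIR_CLEAR 0u
#define PAIR_BUSY  2u
#define PAIR_SET   3u

// Atomically transition the pair at even bit-index `idx` from SET to BUSY.
// Returns false (leaving the field unchanged) if the pair is not SET.
static bool pair_try_set_busy(_Atomic(uint64_t)* field, size_t idx) {
  const uint64_t mask      = (uint64_t)PAIR_SET  << idx;
  const uint64_t mask_busy = (uint64_t)PAIR_BUSY << idx;
  uint64_t old = atomic_load_explicit(field, memory_order_relaxed);
  do {
    if ((old & mask) != mask) return false;   // not SET (clear, or already busy)
  } while (!atomic_compare_exchange_weak_explicit(field, &old, (old & ~mask) | mask_busy,
                                                  memory_order_acq_rel, memory_order_acquire));
  return true;   // we "own" the entry while it is BUSY; a claim callback can safely read the page
}

// Wait until the pair is no longer BUSY, then clear it (the un-abandon path).
static void pair_clear_once_not_busy(_Atomic(uint64_t)* field, size_t idx) {
  const uint64_t mask      = (uint64_t)PAIR_SET  << idx;
  const uint64_t mask_busy = (uint64_t)PAIR_BUSY << idx;
  uint64_t old = atomic_load_explicit(field, memory_order_relaxed);
  for (;;) {
    if ((old & mask) == mask_busy) {          // a concurrent reader holds the entry: spin
      old = atomic_load_explicit(field, memory_order_acquire);
      continue;
    }
    if (atomic_compare_exchange_weak_explicit(field, &old, old & ~mask,
                                              memory_order_acq_rel, memory_order_acquire)) {
      return;                                 // pair is now CLEAR
    }
  }
}

A successful `pair_try_set_busy` corresponds to the scanner in `mi_bitmap_try_find_and_set_busy`: after the claim callback runs, the scanner either clears the pair (claim succeeded) or restores it to SET (claim failed).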