diff --git a/src/arena.c b/src/arena.c index d8b882d3..84db2fb0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -44,7 +44,7 @@ typedef struct mi_arena_s { bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. - mi_bitmap_t* slices_free; // is the slice free? + mi_bbitmap_t* slices_free; // is the slice free? (a binned bitmap with size classes) mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? mi_bitmap_t* slices_purge; // slices that can be purged @@ -213,7 +213,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) { size_t slice_index; - if (!mi_bitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; + if (!mi_bbitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; // claimed it! void* p = mi_arena_slice_start(arena, slice_index); @@ -267,7 +267,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (commit) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); } mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); @@ -574,7 +574,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); @@ -775,7 +775,7 @@ void _mi_arena_page_free(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); @@ -812,7 +812,7 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(!mi_page_is_singleton(page)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); @@ -867,7 +867,7 @@ void 
_mi_arena_page_unabandon(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done @@ -935,7 +935,7 @@ static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { } // and make it available to others again - bool all_inuse = mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); + bool all_inuse = mi_bbitmap_setN(arena->slices_free, slice_index, slice_count); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; } @@ -1051,8 +1051,8 @@ static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_bas if (slice_count == 0) slice_count = MI_BCHUNK_BITS; mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0); const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BCHUNK_SIZE); - const size_t bitmaps_count = 4 + MI_BIN_COUNT; // free, commit, dirty, purge, and abandonded - const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count,NULL); + const size_t bitmaps_count = 4 + MI_BIN_COUNT; // commit, dirty, purge, and abandoned + const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count, NULL) + mi_bbitmap_size(slice_count, NULL); // + free const size_t size = base_size + bitmaps_size; const size_t os_page_size = _mi_os_page_size(); @@ -1069,6 +1069,12 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { return bitmap; } +static mi_bbitmap_t* mi_arena_bbitmap_init(size_t slice_count, uint8_t** base) { + mi_bbitmap_t* bbitmap = (mi_bbitmap_t*)(*base); + *base = (*base) + mi_bbitmap_init(bbitmap, slice_count, true /* already zero */); + return bbitmap; +} + static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { @@ -1121,7 +1127,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // init bitmaps uint8_t* base = mi_arena_start(arena) + bitmap_base; - arena->slices_free = mi_arena_bitmap_init(slice_count,&base); + arena->slices_free = mi_arena_bbitmap_init(slice_count,&base); arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); arena->slices_purge = mi_arena_bitmap_init(slice_count, &base); @@ -1132,7 +1138,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_assert_internal(mi_size_of_slices(info_slices) >= (size_t)(base - mi_arena_start(arena))); // reserve our meta info (and reserve slices outside the memory area) - mi_bitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); + mi_bbitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { mi_bitmap_unsafe_setN(arena->slices_committed, 0, arena->slice_count); } @@ -1225,7 +1231,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t else if (_mi_meta_is_meta_page(start)) { c = 'm'; } else if (slice_index + bit < arena->info_slices) { c =
'i'; } // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } - else if (mi_bitmap_is_set(arena->slices_free, slice_index+bit)) { + else if (mi_bbitmap_is_setN(arena->slices_free, slice_index+bit, 1)) { if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; } else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } else { c = '.'; } @@ -1237,14 +1243,14 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { +static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, bool invert, mi_arena_t* arena) { _mi_output_message("%s:\n", header); size_t bit_count = 0; size_t bit_set_count = 0; - for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { + for (size_t i = 0; i < chunk_count && bit_count < slice_count; i++) { char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); size_t k = 0; - mi_bchunk_t* chunk = &bitmap->chunks[i]; + mi_bchunk_t* chunk = &chunks[i]; if (i<10) { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; } else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } @@ -1276,6 +1282,15 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi return bit_set_count; } +static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { + return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], invert, arena); +} + +static size_t mi_debug_show_bbitmap(const char* header, size_t slice_count, mi_bbitmap_t* bbitmap, bool invert, mi_arena_t* arena) { + return mi_debug_show_chunks(header, slice_count, mi_bbitmap_chunk_count(bbitmap), &bbitmap->chunks[0], invert, arena); +} + + void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; @@ -1288,7 +1303,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) slice_total += arena->slice_count; _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); + free_total += mi_debug_show_bbitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } if (show_committed) { mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); @@ -1391,7 +1406,7 @@ static long mi_arena_purge_delay(void) { // assumes we own the area (i.e. 
slices_free is claimed by us) static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(!arena->memid.is_pinned); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); @@ -1417,7 +1432,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ const long delay = mi_arena_purge_delay(); if (delay < 0 || _mi_preloading()) return; // is purging allowed at all? - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (delay == 0) { // purge directly mi_arena_purge(arena, slice_index, slice_count); @@ -1443,11 +1458,11 @@ typedef struct mi_purge_visit_info_s { } mi_purge_visit_info_t; static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size_t slice_count) { - if (mi_bitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { + if (mi_bbitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { // purge mi_arena_purge(arena, slice_index, slice_count); // and reset the free range - mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); + mi_bbitmap_setN(arena->slices_free, slice_index, slice_count); return true; } else { diff --git a/src/bitmap.c b/src/bitmap.c index 2734e2b2..4a0c4a60 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -477,9 +477,9 @@ static inline __m256i mi_mm256_zero(void) { static inline __m256i mi_mm256_ones(void) { return _mm256_set1_epi64x(~0); } -//static inline bool mi_mm256_is_ones(__m256i vec) { -// return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); -//} +static inline bool mi_mm256_is_ones(__m256i vec) { + return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); +} static inline bool mi_mm256_is_zero( __m256i vec) { return _mm256_testz_si256(vec,vec); } @@ -706,7 +706,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const size_t bmask = mask<<idx; mi_assert_internal(bmask>>idx == mask); - if ((b&bmask) == bmask) { // found a match + if ((b&bmask) == bmask) { // found a match if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); @@ -837,6 +837,24 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #endif } +// are all bits in a bitmap chunk set?
+static inline bool mi_bchunk_all_are_set_relaxed(mi_bchunk_t* chunk) { +#if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return mi_mm256_is_ones(vec); +#elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + // a 64b cache-line contains the entire chunk anyway so load both at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (mi_mm256_is_ones(_mm256_and_si256(vec1, vec2))); +#else + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (~mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; + } + return true; +#endif +} + static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { @@ -902,6 +920,7 @@ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { return size; } + // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true // returns the size of the bitmap size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) { @@ -915,38 +934,33 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) return size; } -// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. -void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); - // first chunk +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + const size_t total = n; + + + // start chunk and index size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; + const size_t ccount = _mi_divide_up(n, MI_BCHUNK_BITS); + + // first update the chunkmap + mi_bchunk_setN(cmap, chunk_idx, ccount, NULL); + + // first chunk size_t m = MI_BCHUNK_BITS - cidx; if (m > n) { m = n; } - mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); + mi_bchunk_setN(&chunks[chunk_idx], cidx, m, NULL); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; const size_t mid_chunks = n / MI_BCHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); - const size_t end_chunk = chunk_idx + mid_chunks; - while (chunk_idx < end_chunk) { - if ((chunk_idx % MI_BFIELD_BITS) == 0 && (chunk_idx + MI_BFIELD_BITS <= end_chunk)) { - // optimize: we can set a full bfield in the chunkmap - mi_atomic_store_relaxed( &bitmap->chunkmap.bfields[chunk_idx/MI_BFIELD_BITS], mi_bfield_all_set()); - mi_bitmap_chunkmap_set(bitmap, chunk_idx + MI_BFIELD_BITS - 1); // track the max set - chunk_idx += MI_BFIELD_BITS; - } - else { - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - chunk_idx++; - } - } + _mi_memset(&chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); + chunk_idx += mid_chunks; n -= (mid_chunks * MI_BCHUNK_BITS); } @@ -954,12 +968,15 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { if (n > 0) { mi_assert_internal(n < MI_BCHUNK_BITS); mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); + mi_bchunk_setN(&chunks[chunk_idx], 
0, n, NULL); } +} - // reset max_accessed - mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0); +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); + mi_bchunks_unsafe_setN(&bitmap->chunks[0], &bitmap->chunkmap, idx, n); } @@ -1085,7 +1102,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n #define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \ mi_assert_internal(start <= cycle); \ mi_assert_internal(start < MI_BFIELD_BITS); \ - mi_assert_internal(cycle < MI_BFIELD_BITS); \ + mi_assert_internal(cycle <= MI_BFIELD_BITS); \ mi_bfield_t _cycle_mask##SUF = mi_bfield_mask(cycle - start, start); \ size_t _bcount##SUF = mi_bfield_popcount(bfield); \ mi_bfield_t _b##SUF = bfield & _cycle_mask##SUF; /* process [start, cycle> first*/\ @@ -1250,7 +1267,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) -mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, +bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) { mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; @@ -1351,3 +1368,248 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi return true; } + + +/* -------------------------------------------------------------------------------- + binned bitmap's +-------------------------------------------------------------------------------- */ + + +size_t mi_bbitmap_size(size_t bit_count, size_t* pchunk_count) { + mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0); + bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS); + mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); + mi_assert_internal(bit_count > 0); + const size_t chunk_count = bit_count / MI_BCHUNK_BITS; + mi_assert_internal(chunk_count >= 1); + const size_t size = offsetof(mi_bbitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE); + mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); + if (pchunk_count != NULL) { *pchunk_count = chunk_count; } + return size; +} + +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap +size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero) { + size_t chunk_count; + const size_t size = mi_bbitmap_size(bit_count, &chunk_count); + if (!already_zero) { + _mi_memzero_aligned(bbitmap, size); + } + mi_atomic_store_release(&bbitmap->chunk_count, chunk_count); + mi_assert_internal(mi_atomic_load_relaxed(&bbitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); + return size; +} + +void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + mi_bchunks_unsafe_setN(&bbitmap->chunks[0], &bbitmap->chunkmap, idx, n); +} + + + +/* -------------------------------------------------------------------------------- + binned bitmap chunkmap +-------------------------------------------------------------------------------- */ + +static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) { + 
size_t oldmax = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); + if mi_unlikely(chunk_idx > oldmax) { + mi_atomic_cas_strong_relaxed(&bbitmap->chunk_max_accessed, &oldmax, chunk_idx); + } +} + +static void mi_bbitmap_chunkmap_set(mi_bbitmap_t* bbitmap, size_t chunk_idx, bool check_all_set) { + mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (check_all_set) { + if (mi_bchunk_all_are_set_relaxed(&bbitmap->chunks[chunk_idx])) { + // all slices are free in this chunk: return back to the NONE bin + mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], MI_BBIN_NONE); + } + } + mi_bchunk_set(&bbitmap->chunkmap, chunk_idx); + mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); +} + +static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_idx) { + mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + // check if the corresponding chunk is all clear + if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) return false; + // clear the chunkmap bit + mi_bchunk_clear(&bbitmap->chunkmap, chunk_idx, NULL); + // .. but a concurrent set may have happened in between our all-clear test and the clearing of the + // bit in the mask. We check again to catch this situation. + if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) { + mi_bchunk_set(&bbitmap->chunkmap, chunk_idx); + return false; + } + mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); + return true; +} + +// Assign from the NONE bin to a specific size bin +static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) { + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin); +} + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_setN, try_clearN, and is_xsetN + (used to find free pages) +-------------------------------------------------------------------------------- */ + +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! 
+bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia + + const bool were_allclear = mi_bchunk_setN(&bbitmap->chunks[chunk_idx], cidx, n, NULL); + mi_bbitmap_chunkmap_set(bbitmap, chunk_idx, true); // set after + return were_allclear; +} + + +// ------- mi_bbitmap_try_clearN --------------------------------------- + +bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) return false; + bool maybe_all_clear; + const bool cleared = mi_bchunk_try_clearN(&bbitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (cleared && maybe_all_clear) { mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } + // note: we don't set the size class for an explicit try_clearN (only used by purging) + return cleared; +} + + +// ------- mi_bbitmap_is_xset --------------------------------------- + +// Is a sequence of n bits already all set/cleared? +bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia + + return mi_bchunk_is_xsetN(set, &bbitmap->chunks[chunk_idx], cidx, n); +} + + + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_find + (used to find free pages) +-------------------------------------------------------------------------------- */ + +typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); + +// Go through the bbitmap and for every sequence of `n` set bits, call the visitor function. +// If it returns `true` stop the search. 
+static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* on_find) +{ + // we space out threads to reduce contention + const size_t cmap_max_count = _mi_divide_up(mi_bbitmap_chunk_count(bbitmap),MI_BFIELD_BITS); + const size_t chunk_acc = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); + const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS; + const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS); + + // create a mask over the chunkmap entries to iterate over them efficiently + mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); + const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); + const size_t cmap_cycle = cmap_acc+1; + const mi_bbin_t bbin = mi_bbin_of(n); + // visit bins from largest size bin up to the NONE bin + // for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL + const mi_bbin_t bin = bbin; + { + mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) + { + // don't search into non-accessed memory until we tried other size bins as well + //if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { + // break; + //} + + // and for each chunkmap entry we iterate over its bits to find the chunks + const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); + const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); + mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) + { + mi_assert_internal(eidx <= MI_BFIELD_BITS); + const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + // only in the current size class! + const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_acquire(&bbitmap->chunk_bins[chunk_idx]); + if (bin >= chunk_bin) { // || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { + mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; + size_t cidx; + if ((*on_find)(chunk, n, &cidx)) { + if (cidx==0 && chunk_bin == MI_BBIN_NONE) { // only the first determines the size bin + // this chunk is now reserved for the `bbin` size class + mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin); + } + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap)); + return true; + } + else { + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ + mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); + } + } + } + mi_bfield_cycle_iterate_end(Y); + } + mi_bfield_cycle_iterate_end(X); + } + return false; +} + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_try_find_and_clear -- used to find free pages + note: the compiler will fully inline the indirect function calls +-------------------------------------------------------------------------------- */ + +bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); +} + +bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); +} + +bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); +} + +bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); +} + +bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BCHUNK_BITS); + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); +} diff --git a/src/bitmap.h b/src/bitmap.h index 4afcdaf1..b28a09e4 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -36,7 +36,7 @@ Concurrent bitmap that can set/reset sequences of bits atomically This is used to avoid scanning every chunk. (and thus strictly an optimization) It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out to have no bits set. It is also allowed to briefly have a clear bit even if the - chunk has bits set -- as long as we guarantee that the bit will be set later on; + chunk has bits set -- as long as we guarantee that the bit will be set later on; (this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk). However, when we clear a bit in a chunk, and the chunk is indeed all clear, we @@ -236,4 +236,97 @@ bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_a // Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); +// +typedef enum mi_bbin_e { + MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free) + MI_BBIN_SMALL, // slice_count == 1 + MI_BBIN_MEDIUM, // slice_count == 8 + MI_BBIN_OTHER, // slice_count > 1, and not 8 + MI_BBIN_COUNT +} mi_bbin_t; + +static inline mi_bbin_t mi_bbin_of(size_t n) { + return (n==1 ? MI_BBIN_SMALL : (n==8 ? 
MI_BBIN_MEDIUM : MI_BBIN_OTHER)); +} + +// An atomic "binned" bitmap for the free slices where we keep chunks reserved for particular size classes +typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s { + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc + mi_bchunkmap_t chunkmap; + _Atomic(uint8_t) chunk_bins[MI_BITMAP_MAX_CHUNK_COUNT]; // 512b + mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT +} mi_bbitmap_t; + + +static inline size_t mi_bbitmap_chunk_count(const mi_bbitmap_t* bbitmap) { + return mi_atomic_load_relaxed(&((mi_bbitmap_t*)bbitmap)->chunk_count); +} + +static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) { + return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS); +} + +size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count); + + +// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap. +size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero); + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). +// Not atomic so only use if still local to a thread. +void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + +// Set a sequence of `n` bits in the bbitmap; returns `true` if atomically transitioned from all 0's to 1's +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + +// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bbitmap_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + +// Is a sequence of n bits already all set/cleared? +bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n); + +// Is a sequence of n bits already set? +// (Used to check if a memory range is already committed) +static inline bool mi_bbitmap_is_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + return mi_bbitmap_is_xsetN(MI_BIT_SET, bbitmap, idx, n); +} + +// Is a sequence of n bits already clear? +static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + return mi_bbitmap_is_xsetN(MI_BIT_CLEAR, bbitmap, idx, n); +} + + +// Try to atomically transition `n` bits from all set to all clear. Returns `true` on success. +// `n` cannot cross chunk boundaries, where `n <= MI_BCHUNK_BITS`. +bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + + +// Specialized versions for common bit sequence sizes +bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 1-bit +bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits +bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS +bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS + +// Find a sequence of `n` bits in the bbitmap with all bits set, and try to atomically clear all.
+// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard static inline bool mi_bbitmap_try_find_and_clearN(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx) { + if (n==1) return mi_bbitmap_try_find_and_clear(bbitmap, tseq, pidx); // small pages + if (n==8) return mi_bbitmap_try_find_and_clear8(bbitmap, tseq, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearX(bbitmap, tseq, pidx); // large pages + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearNX(bbitmap, tseq, n, pidx); + return mi_bbitmap_try_find_and_clearN_(bbitmap, tseq, n, pidx); +} + + #endif // MI_BITMAP_H
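
The core of the patch is the size-class binning of `slices_free`: a chunk starts in MI_BBIN_NONE, is assigned the bin of the first request that claims its slice 0 (the `cidx==0 && chunk_bin == MI_BBIN_NONE` case), is only offered to requests whose bin is at least the chunk's bin (`bin >= chunk_bin`), and returns to MI_BBIN_NONE once every slice in it is free again. The standalone sketch below models just that gating rule; the names (`bbin_t`, `bbin_of`, `chunk_usable_for`) are illustrative stand-ins, not mimalloc's types.

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

// Simplified stand-ins for the patch's mi_bbin_t size classes.
typedef enum bbin_e {
  BBIN_NONE,    // chunk not reserved yet (completely free)
  BBIN_SMALL,   // slice_count == 1
  BBIN_MEDIUM,  // slice_count == 8
  BBIN_OTHER    // any other slice count
} bbin_t;

// Mirrors mi_bbin_of: map a requested slice count to its bin.
static bbin_t bbin_of(size_t n) {
  return (n == 1 ? BBIN_SMALL : (n == 8 ? BBIN_MEDIUM : BBIN_OTHER));
}

// The gating rule from the search loop: a request may use a chunk whose bin is
// at most its own, so small requests never nibble at chunks reserved for larger
// size classes, while any request can claim a still-unreserved (NONE) chunk.
static bool chunk_usable_for(size_t slice_count, bbin_t chunk_bin) {
  return bbin_of(slice_count) >= chunk_bin;
}

int main(void) {
  printf("%d\n", chunk_usable_for(1, BBIN_MEDIUM)); // 0: a 1-slice request skips a MEDIUM chunk
  printf("%d\n", chunk_usable_for(8, BBIN_SMALL));  // 1: a larger request may still use a SMALL chunk
  printf("%d\n", chunk_usable_for(3, BBIN_NONE));   // 1: unreserved chunks are open to everyone
  return 0;
}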
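mi_bbitmap_chunkmap_try_clear keeps the chunkmap a conservative summary: it only drops a chunk's summary bit when the chunk is really all clear, and because a concurrent set can land between the all-clear test and the clear of the summary bit, it re-checks afterwards and restores the bit if needed. Below is a minimal single-word model of that order of operations using C11 atomics; `toy_bitmap_t` and `toy_chunkmap_try_clear` are toy names, not mimalloc's.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef struct toy_bitmap_s {
  _Atomic uint64_t chunk[64];   // 64 toy "chunks" (one word each instead of 512 bits)
  _Atomic uint64_t chunkmap;    // bit i set => chunk[i] may contain set bits
} toy_bitmap_t;

// Returns true if the summary bit for chunk_idx stayed cleared.
static bool toy_chunkmap_try_clear(toy_bitmap_t* bm, size_t chunk_idx) {
  const uint64_t cbit = (uint64_t)1 << chunk_idx;
  // only clear the summary bit if the chunk itself is all clear
  if (atomic_load_explicit(&bm->chunk[chunk_idx], memory_order_relaxed) != 0) return false;
  atomic_fetch_and_explicit(&bm->chunkmap, ~cbit, memory_order_acq_rel);
  // a concurrent set may have happened between the check and the clear:
  // re-check, and restore the bit so the chunkmap stays a safe over-approximation
  if (atomic_load_explicit(&bm->chunk[chunk_idx], memory_order_relaxed) != 0) {
    atomic_fetch_or_explicit(&bm->chunkmap, cbit, memory_order_acq_rel);
    return false;
  }
  return true;
}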
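The refactored mi_bchunks_unsafe_setN (now shared by mi_bitmap_unsafe_setN and mi_bbitmap_unsafe_setN) splits a possibly chunk-crossing run into a partial first chunk, a memset over whole middle chunks, and a partial last chunk. The sketch below redoes that arithmetic on a plain byte-backed bitmap; `set_bits` is a trivial stand-in for mi_bchunk_setN, the constants are illustrative, and the chunkmap update from the real function is omitted.

#include <assert.h>
#include <stddef.h>
#include <string.h>

#define CHUNK_BITS 512                         // plays the role of MI_BCHUNK_BITS
#define CHUNK_SIZE (CHUNK_BITS / 8)

typedef struct chunk_s { unsigned char bytes[CHUNK_SIZE]; } chunk_t;

// Trivial per-chunk bit setter (stand-in for mi_bchunk_setN).
static void set_bits(chunk_t* c, size_t start, size_t count) {
  for (size_t i = start; i < start + count; i++) {
    c->bytes[i / 8] |= (unsigned char)(1u << (i % 8));
  }
}

// Set `n` bits starting at absolute bit `idx`; the run may cross chunk boundaries.
static void chunks_unsafe_setN(chunk_t* chunks, size_t idx, size_t n) {
  size_t chunk_idx = idx / CHUNK_BITS;
  const size_t cidx = idx % CHUNK_BITS;

  size_t m = CHUNK_BITS - cidx;                // bits that still fit in the first chunk
  if (m > n) { m = n; }
  set_bits(&chunks[chunk_idx], cidx, m);
  chunk_idx++; n -= m;

  const size_t mid_chunks = n / CHUNK_BITS;    // whole chunks in between: memset is enough
  if (mid_chunks > 0) {
    memset(&chunks[chunk_idx], 0xFF, mid_chunks * CHUNK_SIZE);
    chunk_idx += mid_chunks;
    n -= mid_chunks * CHUNK_BITS;
  }

  if (n > 0) {                                 // trailing partial chunk
    assert(n < CHUNK_BITS);
    set_bits(&chunks[chunk_idx], 0, n);
  }
}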
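The patch re-enables mi_mm256_is_ones for the new mi_bchunk_all_are_set_relaxed. The trick: _mm256_cmpeq_epi32(vec, vec) always yields an all-ones vector (every lane equals itself), and _mm256_testc_si256(a, b) sets CF exactly when (~a & b) == 0, so testing `vec` against all-ones answers "is every bit set?" in a couple of instructions. A standalone version follows (requires -mavx2; `avx2_is_all_ones` is my name, not mimalloc's).

#include <immintrin.h>
#include <stdbool.h>
#include <stdio.h>

static inline bool avx2_is_all_ones(__m256i vec) {
  const __m256i ones = _mm256_cmpeq_epi32(vec, vec);  // all-ones regardless of vec's contents
  return _mm256_testc_si256(vec, ones) != 0;          // CF==1 iff (~vec & ones) == 0
}

int main(void) {
  printf("%d\n", avx2_is_all_ones(_mm256_set1_epi64x(-1))); // 1: every bit set
  printf("%d\n", avx2_is_all_ones(_mm256_set1_epi64x(7)));  // 0: most bits clear
  return 0;
}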
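mi_bbitmap_try_find_and_clear_generic spreads threads over the chunkmap by starting each scan at an offset derived from the caller's `tseq` and wrapping around (the mi_bfield_cycle_iterate macros). The fragment below expresses that idea as an ordinary loop over a single 64-bit field; `find_set_bit_cyclic` is a hypothetical helper and deliberately ignores the accessed-prefix masking the real macro also does.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Scan the low `cycle` bits of `field` for a set bit, starting at a per-thread
// offset (tseq % cycle) and wrapping, so concurrent searchers probe different
// chunks first and contend less.
static bool find_set_bit_cyclic(uint64_t field, size_t cycle, size_t tseq, size_t* out_idx) {
  if (cycle == 0 || cycle > 64) return false;
  const size_t start = tseq % cycle;
  for (size_t k = 0; k < cycle; k++) {
    const size_t idx = (start + k) % cycle;    // wrap around within [0, cycle)
    if ((field >> idx) & 1) { *out_idx = idx; return true; }
  }
  return false;
}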