From 26fa8be42759ac39f7b4869b4e0936bd35a8be17 Mon Sep 17 00:00:00 2001
From: Daan
Date: Wed, 19 Mar 2025 18:50:53 -0700
Subject: [PATCH] improved accounting of committed bytes (issue #1035)

---
 include/mimalloc/internal.h | 16 +++++++++++
 src/arena.c                 | 39 ++++++++++++++++---------
 src/bitmap.c                | 28 +++++++++++-------
 src/bitmap.h                |  4 +--
 src/libc.c                  | 57 +++++++++++++++++++++++++++++++++++++
 src/stats.c                 |  1 +
 6 files changed, 119 insertions(+), 26 deletions(-)

diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index 106da0d1..5b3e7e23 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -127,6 +127,7 @@ bool _mi_os_has_virtual_reserve(void);
 bool _mi_os_reset(void* addr, size_t size);
 bool _mi_os_commit(void* p, size_t size, bool* is_zero);
+bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size);
 bool _mi_os_decommit(void* addr, size_t size);
 bool _mi_os_protect(void* addr, size_t size);
 bool _mi_os_unprotect(void* addr, size_t size);
@@ -947,6 +948,21 @@ static inline size_t mi_bsr(size_t x) {
   return (x==0 ? MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x));
 }
 
+size_t _mi_popcount_generic(size_t x);
+
+static inline size_t mi_popcount(size_t x) {
+  if (x<=1) return x;
+  if (x==SIZE_MAX) return MI_SIZE_BITS;
+  #if defined(__GNUC__)
+  #if (SIZE_MAX == ULONG_MAX)
+  return __builtin_popcountl(x);
+  #else
+  return __builtin_popcountll(x);
+  #endif
+  #else
+  return _mi_popcount_generic(x);
+  #endif
+}
 
 // ---------------------------------------------------------------------------------
 // Provide our own `_mi_memcpy` for potential performance optimizations.
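As an aside, the fast paths of the new `mi_popcount` (0, 1, and SIZE_MAX) and the generic fallback are all expected to agree with a naive bit-by-bit count. The standalone sketch below is illustrative only and not part of the patch; it uses only standard C and a hypothetical `popcount_ref` helper to spell out that expected behaviour.

// Illustrative only; not part of the patch. Checks the semantics that
// mi_popcount/_mi_popcount_generic are expected to satisfy.
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

static size_t popcount_ref(size_t x) {   // trivial reference: count bits one at a time
  size_t n = 0;
  while (x != 0) { n += (x & 1); x >>= 1; }
  return n;
}

int main(void) {
  // the fast paths: 0 and 1 map to themselves, all-ones maps to the word size in bits
  assert(popcount_ref(0) == 0);
  assert(popcount_ref(1) == 1);
  assert(popcount_ref(SIZE_MAX) == sizeof(size_t) * 8);
  // an arbitrary pattern
  assert(popcount_ref(0xF0F0u) == 8);
  return 0;
}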
diff --git a/src/arena.c b/src/arena.c
index 1f6f6d9d..a7c20764 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -255,7 +255,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
 
   // set the dirty bits (todo: no need for an atomic op here?)
   if (arena->memid.initially_zero && arena->blocks_dirty != NULL) {
-    memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
+    memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL, NULL);
   }
 
   // set commit state
@@ -267,10 +267,14 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
     // commit requested, but the range may not be committed as a whole: ensure it is committed now
     memid->initially_committed = true;
     bool any_uncommitted;
-    _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
+    size_t already_committed = 0;
+    _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted, &already_committed);
     if (any_uncommitted) {
+      mi_assert_internal(already_committed < needed_bcount);
+      const size_t commit_size = mi_arena_block_size(needed_bcount);
+      const size_t stat_commit_size = commit_size - mi_arena_block_size(already_committed);
       bool commit_zero = false;
-      if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero)) {
+      if (!_mi_os_commit_ex(p, commit_size, &commit_zero, stat_commit_size)) {
         memid->initially_committed = false;
       }
       else {
@@ -280,7 +284,14 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
   }
   else {
     // no need to commit, but check if already fully committed
-    memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
+    size_t already_committed = 0;
+    memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &already_committed);
+    if (!memid->initially_committed && already_committed > 0) {
+      // partially committed: as it will be committed at some time, adjust the stats and pretend the range is fully uncommitted.
+      mi_assert_internal(already_committed < needed_bcount);
+      _mi_stat_decrease(&_mi_stats_main.committed, mi_arena_block_size(already_committed));
+      _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
+    }
   }
 
   return p;
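The key change above is that committing a claimed range charges the `committed` statistic only for blocks that were not already committed: the OS commit still covers the whole range (`commit_size`), but the stat delta is that size minus the already-committed portion (`stat_commit_size`). A minimal standalone sketch of that arithmetic follows; it is not part of the patch, and `BLOCK_SIZE` and the block counts are made up for illustration.

// Illustrative only; not part of the patch. Shows the stat delta charged when
// committing a partially committed range (names and sizes are made up).
#include <assert.h>
#include <stddef.h>

#define BLOCK_SIZE ((size_t)(64*1024))   // stand-in for mi_arena_block_size(1)

int main(void) {
  size_t needed_bcount     = 8;   // blocks being claimed
  size_t already_committed = 3;   // blocks that were committed before the claim

  size_t commit_size      = needed_bcount * BLOCK_SIZE;                    // passed to the OS commit
  size_t stat_commit_size = commit_size - already_committed * BLOCK_SIZE;  // added to the committed stat

  assert(stat_commit_size == 5 * BLOCK_SIZE);  // only the newly committed blocks are counted
  return 0;
}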
@@ -464,17 +475,19 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks)
   const size_t size = mi_arena_block_size(blocks);
   void* const p = mi_arena_block_start(arena, bitmap_idx);
   bool needs_recommit;
-  if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) {
+  size_t already_committed = 0;
+  if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx, &already_committed)) {
     // all blocks are committed, we can purge freely
+    mi_assert_internal(already_committed == blocks);
     needs_recommit = _mi_os_purge(p, size);
   }
   else {
     // some blocks are not committed -- this can happen when a partially committed block is freed
    // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
-    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory),
-    // and also undo the decommit stats (as it was already adjusted)
+    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory).
+    mi_assert_internal(already_committed < blocks);
     mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
-    needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, 0);
+    needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, mi_arena_block_size(already_committed));
   }
 
   // clear the purged blocks
@@ -508,7 +521,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t
     else {
      // already an expiration was set
     }
-    _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL);
+    _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL, NULL);
   }
 }
 
@@ -648,7 +661,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
   if (p==NULL) return;
   if (size==0) return;
   const bool all_committed = (committed_size == size);
-  const bool decommitted_size = (committed_size <= size ? size - committed_size : 0);
+  const size_t decommitted_size = (committed_size <= size ? size - committed_size : 0);
 
   // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.)
   mi_track_mem_undefined(p,size);
@@ -691,14 +704,14 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
     mi_assert_internal(arena->blocks_purge != NULL);
 
     if (!all_committed) {
-      // mark the entire range as no longer committed (so we recommit the full range when re-using)
+      // mark the entire range as no longer committed (so we will recommit the full range when re-using)
       _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx);
       mi_track_mem_noaccess(p,size);
-      if (committed_size > 0) {
+      //if (committed_size > 0) {
         // if partially committed, adjust the committed stats (is it will be recommitted when re-using)
        // in the delayed purge, we do no longer decrease the commit if the range is not marked entirely as committed.
        _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
-      }
+      //}
       // note: if not all committed, it may be that the purge will reset/decommit the entire range
       // that contains already decommitted parts. Since purge consistently uses reset or decommit that
       // works (as we should never reset decommitted parts).
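With the purge and free paths above, a partially committed range is decreased from the `committed` stat exactly once in `_mi_arena_free` (which also unclaims its committed bits), and the delayed purge then only subtracts whatever is still marked committed, so the same bytes are not subtracted twice. The toy model below is illustrative only and not part of the patch; the block size and counts are made up.

// Illustrative only; not part of the patch. A toy model of the committed-stat
// bookkeeping across a partial commit, a free, and a delayed purge.
#include <assert.h>
#include <stdint.h>

int main(void) {
  int64_t committed_stat = 0;
  const int64_t block = 64 * 1024;

  committed_stat += 3 * block;   // 3 of 8 blocks get committed while the range is in use
  committed_stat -= 3 * block;   // _mi_arena_free: decrease by committed_size, unclaim the committed bits
  committed_stat -= 0;           // delayed purge: subtract only the still-marked-committed part (now zero)

  assert(committed_stat == 0);   // no double subtraction for the same bytes
  return 0;
}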
diff --git a/src/bitmap.c b/src/bitmap.c
index 9ef784d6..50f4df2b 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -351,7 +351,7 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t
 
 // Set `count` bits at `bitmap_idx` to 1 atomically
 // Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) {
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero, size_t* already_set) {
   size_t idx = mi_bitmap_index_field(bitmap_idx);
   size_t pre_mask;
   size_t mid_mask;
@@ -359,28 +359,31 @@ bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t co
   size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
   bool all_zero = true;
   bool any_zero = false;
+  size_t one_count = 0;
   _Atomic(size_t)*field = &bitmap[idx];
   size_t prev = mi_atomic_or_acq_rel(field++, pre_mask);
-  if ((prev & pre_mask) != 0) all_zero = false;
+  if ((prev & pre_mask) != 0) { all_zero = false; one_count += mi_popcount(prev & pre_mask); }
   if ((prev & pre_mask) != pre_mask) any_zero = true;
   while (mid_count-- > 0) {
     prev = mi_atomic_or_acq_rel(field++, mid_mask);
-    if ((prev & mid_mask) != 0) all_zero = false;
+    if ((prev & mid_mask) != 0) { all_zero = false; one_count += mi_popcount(prev & mid_mask); }
     if ((prev & mid_mask) != mid_mask) any_zero = true;
   }
   if (post_mask!=0) {
     prev = mi_atomic_or_acq_rel(field, post_mask);
-    if ((prev & post_mask) != 0) all_zero = false;
+    if ((prev & post_mask) != 0) { all_zero = false; one_count += mi_popcount(prev & post_mask); }
     if ((prev & post_mask) != post_mask) any_zero = true;
   }
   if (pany_zero != NULL) { *pany_zero = any_zero; }
+  if (already_set != NULL) { *already_set = one_count; };
+  mi_assert_internal(all_zero ? one_count == 0 : one_count <= count);
   return all_zero;
 }
 
 // Returns `true` if all `count` bits were 1.
 // `any_ones` is `true` if there was at least one bit set to one.
-static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) {
+static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones, size_t* already_set) {
   size_t idx = mi_bitmap_index_field(bitmap_idx);
   size_t pre_mask;
   size_t mid_mask;
@@ -388,30 +391,33 @@ static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_field
   size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask);
   bool all_ones = true;
   bool any_ones = false;
+  size_t one_count = 0;
   mi_bitmap_field_t* field = &bitmap[idx];
   size_t prev = mi_atomic_load_relaxed(field++);
   if ((prev & pre_mask) != pre_mask) all_ones = false;
-  if ((prev & pre_mask) != 0) any_ones = true;
+  if ((prev & pre_mask) != 0) { any_ones = true; one_count += mi_popcount(prev & pre_mask); }
   while (mid_count-- > 0) {
     prev = mi_atomic_load_relaxed(field++);
     if ((prev & mid_mask) != mid_mask) all_ones = false;
-    if ((prev & mid_mask) != 0) any_ones = true;
+    if ((prev & mid_mask) != 0) { any_ones = true; one_count += mi_popcount(prev & mid_mask); }
   }
   if (post_mask!=0) {
     prev = mi_atomic_load_relaxed(field);
     if ((prev & post_mask) != post_mask) all_ones = false;
-    if ((prev & post_mask) != 0) any_ones = true;
+    if ((prev & post_mask) != 0) { any_ones = true; one_count += mi_popcount(prev & post_mask); }
   }
   if (pany_ones != NULL) { *pany_ones = any_ones; }
+  if (already_set != NULL) { *already_set = one_count; }
+  mi_assert_internal(all_ones ? one_count == count : one_count < count);
   return all_ones;
 }
 
-bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
-  return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL);
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, size_t* already_set) {
+  return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL, already_set);
 }
 
 bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) {
   bool any_ones;
-  mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones);
+  mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones, NULL);
   return any_ones;
 }
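The new `already_set` out-parameter is simply the popcount of the previous field contents restricted to the claim mask, accumulated over the pre/mid/post fields. The single-field, non-atomic sketch below shows that computation; it is illustrative only, not part of the patch, and `popcount_ref` is a stand-in for `mi_popcount`.

// Illustrative only; not part of the patch. Counts the bits of a claim mask
// that were already set before OR-ing it into one bitmap field.
#include <assert.h>
#include <stddef.h>

static size_t popcount_ref(size_t x) {
  size_t n = 0;
  while (x != 0) { n += (x & 1); x >>= 1; }
  return n;
}

int main(void) {
  size_t field = 0x0F;            // bits 0..3 already set
  size_t mask  = 0x3C;            // claim bits 2..5
  size_t prev  = field;           // what an atomic OR would have returned
  field |= mask;

  size_t already_set = popcount_ref(prev & mask);   // bits of the claim that were set before
  assert(already_set == 2);                         // bits 2 and 3 overlapped
  assert(field == 0x3F);
  return 0;
}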
diff --git a/src/bitmap.h b/src/bitmap.h
index d60668cb..60b38815 100644
--- a/src/bitmap.h
+++ b/src/bitmap.h
@@ -102,9 +102,9 @@ bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t
 
 // Set `count` bits at `bitmap_idx` to 1 atomically
 // Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit.
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero);
+bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero, size_t* already_set);
 
-bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
+bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, size_t* already_set);
 bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx);
 
 #endif

diff --git a/src/libc.c b/src/libc.c
index 1bd97aa3..52d095eb 100644
--- a/src/libc.c
+++ b/src/libc.c
@@ -275,3 +275,60 @@ int _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) {
   va_end(args);
   return written;
 }
+
+
+#if MI_SIZE_SIZE == 4
+#define mi_mask_even_bits32      (0x55555555)
+#define mi_mask_even_pairs32     (0x33333333)
+#define mi_mask_even_nibbles32   (0x0F0F0F0F)
+
+// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
+static size_t mi_byte_sum32(uint32_t x) {
+  // perform `x * 0x01010101`: the highest byte contains the sum of all bytes.
+  x += (x << 8);
+  x += (x << 16);
+  return (size_t)(x >> 24);
+}
+
+static size_t mi_popcount_generic32(uint32_t x) {
+  // first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10
+  // in other words, `a - (a>>1)`; to do this in parallel, we need to mask to prevent spilling a bit pair
+  // into the lower bit-pair:
+  x = x - ((x >> 1) & mi_mask_even_bits32);
+  // add the 2-bit pair results
+  x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32);
+  // add the 4-bit nibble results
+  x = (x + (x >> 4)) & mi_mask_even_nibbles32;
+  // each byte now has a count of its bits, we can sum them now:
+  return mi_byte_sum32(x);
+}
+
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
+  return mi_popcount_generic32(x);
+}
+
+#else
+#define mi_mask_even_bits64      (0x5555555555555555)
+#define mi_mask_even_pairs64     (0x3333333333333333)
+#define mi_mask_even_nibbles64   (0x0F0F0F0F0F0F0F0F)
+
+// sum of all the bytes in `x` if it is guaranteed that the sum < 256!
+static size_t mi_byte_sum64(uint64_t x) {
+  x += (x << 8);
+  x += (x << 16);
+  x += (x << 32);
+  return (size_t)(x >> 56);
+}
+
+static size_t mi_popcount_generic64(uint64_t x) {
+  x = x - ((x >> 1) & mi_mask_even_bits64);
+  x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64);
+  x = (x + (x >> 4)) & mi_mask_even_nibbles64;
+  return mi_byte_sum64(x);
+}
+
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
+  return mi_popcount_generic64(x);
+}
+#endif
+
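The generic popcount above is the classic SWAR reduction: count bits per 2-bit pair, then per nibble, then per byte, then sum the bytes. The standalone trace below is illustrative only and not part of the patch; it runs the same 32-bit steps on one value, uses the `x * 0x01010101` byte-sum mentioned in the comment instead of the shift-and-add form, and checks the result against a bit-by-bit count.

// Illustrative only; not part of the patch. Traces the SWAR popcount steps.
#include <assert.h>
#include <stdint.h>
#include <stddef.h>

int main(void) {
  uint32_t v = 0xF00F00F1u;        // 13 bits set
  uint32_t x = v;
  x = x - ((x >> 1) & 0x55555555u);                  // each 2-bit pair holds its own bit count
  x = (x & 0x33333333u) + ((x >> 2) & 0x33333333u);  // each nibble holds the sum of its two pairs
  x = (x + (x >> 4)) & 0x0F0F0F0Fu;                  // each byte holds the sum of its two nibbles
  size_t count = (size_t)((x * 0x01010101u) >> 24);  // sum all bytes into the top byte

  size_t ref = 0;
  for (uint32_t t = v; t != 0; t >>= 1) ref += (t & 1u);
  assert(count == ref && count == 13);
  return 0;
}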
diff --git a/src/stats.c b/src/stats.c
index 1cfc3104..6a480816 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -30,6 +30,7 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
   {
     // add atomically (for abandoned pages)
     int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
+    // if (stat == &_mi_stats_main.committed) { mi_assert_internal(current + amount >= 0); };
     mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
     if (amount > 0) {
       mi_atomic_addi64_relaxed(&stat->total,amount);
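For context, `mi_stat_update` tracks both the current value of a statistic and its peak, and the commented-out assert above is there to catch the `committed` stat going negative, which is exactly what the improved accounting is meant to prevent. The toy, non-atomic sketch below is illustrative only and not part of the patch; the type and field names are made up.

// Illustrative only; not part of the patch. A toy version of current/peak
// tracking for a committed-bytes statistic.
#include <assert.h>
#include <stdint.h>

typedef struct { int64_t current, peak; } toy_stat_t;

static void toy_stat_update(toy_stat_t* s, int64_t amount) {
  s->current += amount;
  if (s->current > s->peak) s->peak = s->current;
  assert(s->current >= 0);   // the improved accounting keeps this non-negative
}

int main(void) {
  toy_stat_t committed = { 0, 0 };
  toy_stat_update(&committed, 3 * 64 * 1024);   // commit 3 blocks
  toy_stat_update(&committed, 5 * 64 * 1024);   // commit 5 more
  toy_stat_update(&committed, -8 * 64 * 1024);  // decommit all of them, each counted once
  assert(committed.current == 0 && committed.peak == 8 * 64 * 1024);
  return 0;
}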