/* ----------------------------------------------------------------------------
Copyright (c) 2019-2024 Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
-----------------------------------------------------------------------------*/

/* ----------------------------------------------------------------------------
  Concurrent bitmap that can set/reset sequences of bits atomically
---------------------------------------------------------------------------- */

#include "mimalloc.h"
#include "mimalloc/internal.h"
#include "mimalloc/bits.h"
#include "bitmap.h"

#ifndef MI_OPT_SIMD
#define MI_OPT_SIMD  0
#endif

/* --------------------------------------------------------------------------------
  bfields
-------------------------------------------------------------------------------- */

static inline size_t mi_bfield_ctz(mi_bfield_t x) {
  return mi_ctz(x);
}

static inline size_t mi_bfield_clz(mi_bfield_t x) {
  return mi_clz(x);
}

static inline size_t mi_bfield_popcount(mi_bfield_t x) {
  return mi_popcount(x);
}

static inline mi_bfield_t mi_bfield_clear_least_bit(mi_bfield_t x) {
  return (x & (x-1));
}

// find the least significant bit that is set (i.e. count trailing zeros)
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) {
  return mi_bsf(x,idx);
}

// find the most significant bit that is set.
// return false if `x==0` (with `*idx` undefined) and true otherwise,
// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) {
  return mi_bsr(x, idx);
}

// find each set bit in a bit field `x` and clear it, until it becomes zero.
static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) {
  const bool found = mi_bfield_find_least_bit(*x, idx);
  *x = mi_bfield_clear_least_bit(*x);
  return found;
}

static inline mi_bfield_t mi_bfield_zero(void) {
  return 0;
}

static inline mi_bfield_t mi_bfield_one(void) {
  return 1;
}

static inline mi_bfield_t mi_bfield_all_set(void) {
  return ~((mi_bfield_t)0);
}

// mask of `bit_count` bits set shifted to the left by `shiftl`
static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) {
  mi_assert_internal(bit_count > 0);
  mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS);
  const mi_bfield_t mask0 = (bit_count < MI_BFIELD_BITS ? (mi_bfield_one() << bit_count)-1 : mi_bfield_all_set());
  return (mask0 << shiftl);
}


// ------- mi_bfield_atomic_set ---------------------------------------

// the `_set` functions also return the count of bits that were already set (for commit statistics)
// the `_clear` functions also return whether the new bfield is all clear or not (for the chunk_map)

// Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1
static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) {
  mi_assert_internal(idx < MI_BFIELD_BITS);
  const mi_bfield_t mask = mi_bfield_mask(1, idx);
  const mi_bfield_t old = mi_atomic_or_acq_rel(b, mask);
  return ((old&mask) == 0);
}

// Clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0.
// `all_clear` is set if the new bfield is zero.
static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) {
  mi_assert_internal(idx < MI_BFIELD_BITS);
  const mi_bfield_t mask = mi_bfield_mask(1, idx);
  mi_bfield_t old = mi_atomic_and_acq_rel(b, ~mask);
  if (all_clear != NULL) { *all_clear = ((old&~mask)==0); }
  return ((old&mask) == mask);
}

// Clear a bit but only when/once it is set. This is used by concurrent frees while
// the page is abandoned and mapped. This can incur a busy wait :-( but it should
// happen almost never (and is accounted for in the stats)
static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_t idx) {
  mi_assert_internal(idx < MI_BFIELD_BITS);
  const mi_bfield_t mask = mi_bfield_mask(1, idx);
  mi_bfield_t old = mi_atomic_load_relaxed(b);
  do {
    if mi_unlikely((old&mask) == 0) {
      old = mi_atomic_load_acquire(b);
      if ((old&mask)==0) { _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); }
      while ((old&mask)==0) { // busy wait
        mi_atomic_yield();
        old = mi_atomic_load_acquire(b);
      }
    }
  } while (!mi_atomic_cas_weak_acq_rel(b,&old, (old&~mask)));
  mi_assert_internal((old&mask)==mask);  // we should only clear when it was set
}

// Set a mask of bits atomically, and return true if the mask bits transitioned from all 0's to 1's.
// `already_set` contains the count of bits that were already set (used when committing ranges to account
// statistics correctly).
static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) {
  mi_assert_internal(mask != 0);
  mi_bfield_t old = mi_atomic_load_relaxed(b);
  while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)) {};  // try to atomically set the mask bits until success
  if (already_set!=NULL) { *already_set = mi_bfield_popcount(old&mask); }
  return ((old&mask) == 0);
}

// Clear a mask of bits atomically, and return true if the mask bits transitioned from all 1's to 0's
static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) {
  mi_assert_internal(mask != 0);
  mi_bfield_t old = mi_atomic_load_relaxed(b);
  while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)) {};  // try to atomically clear the mask bits until success
  if (all_clear != NULL) { *all_clear = ((old&~mask)==0); }
  return ((old&mask) == mask);
}

static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already_set) {
  const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_all_set());
  if (already_set!=NULL) { *already_set = mi_bfield_popcount(old); }
  return (old==0);
}

static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) {
  const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero());
  if (all_clear!=NULL) { *all_clear = true; }
  return (~old==0);
}


// ------- mi_bfield_atomic_try_clear ---------------------------------------

// Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0
// and false otherwise (leaving the bit field as is).
static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) {
  mi_assert_internal(mask != 0);
  mi_bfield_t old = mi_atomic_load_relaxed(b);
  do {
    if ((old&mask) != mask) {
      // the mask bits are no longer set
      if (all_clear != NULL) { *all_clear = (old==0); }
      return false;
    }
  } while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask));  // try to atomically clear the mask bits
  if (all_clear != NULL) { *all_clear = ((old&~mask) == 0); }
  return true;
}

// Tries to set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0)
// and `false` otherwise leaving the bfield `b` as-is.
// `all_clear` is set to true if the new bfield is zero (and false otherwise)
static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) {
  mi_assert_internal(idx < MI_BFIELD_BITS);
  const mi_bfield_t mask = mi_bfield_one()<<idx;
  return mi_bfield_atomic_try_clear_mask(b, mask, all_clear);
}

// Tries to clear a byte atomically, and returns true if the byte transitioned from 0xFF to 0.
static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) {
  mi_assert_internal(idx < MI_BFIELD_BITS);
  mi_assert_internal((idx%8)==0);
  const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<idx;
  return mi_bfield_atomic_try_clear_mask(b, mask, all_clear);
}

// Tries to clear a full bfield atomically, and returns true if all bits transitioned from all 1's to 0's
// (and false otherwise, leaving the bit field as-is).
static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) {
  return mi_bfield_atomic_try_clear_mask(b, mi_bfield_all_set(), all_clear);
}


// ------- mi_bfield_atomic_is_set ---------------------------------------

// Check if all bits corresponding to a mask are set.
static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) {
  mi_assert_internal(mask != 0);
  return ((mi_atomic_load_relaxed(b) & mask) == mask);
}

// Check if all bits corresponding to a mask are clear.
static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) {
  mi_assert_internal(mask != 0);
  return ((mi_atomic_load_relaxed(b) & mask) == 0);
}

// Check if all bits corresponding to a mask are set/cleared.
static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask) {
  mi_assert_internal(mask != 0);
  if (set) return mi_bfield_atomic_is_set_mask(b, mask);
      else return mi_bfield_atomic_is_clear_mask(b, mask);
}


/* --------------------------------------------------------------------------------
  bitmap chunks
-------------------------------------------------------------------------------- */

// ------- mi_bchunk_set ---------------------------------------

static inline bool mi_bchunk_set(mi_bchunk_t* chunk, size_t cidx) {
  mi_assert_internal(cidx < MI_BCHUNK_BITS);
  const size_t i = cidx / MI_BFIELD_BITS;
  const size_t idx = cidx % MI_BFIELD_BITS;
  return mi_bfield_atomic_set(&chunk->bfields[i], idx);
}

static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) {
  mi_assert_internal(cidx < MI_BCHUNK_BITS);
  const size_t i = cidx / MI_BFIELD_BITS;
  const size_t idx = cidx % MI_BFIELD_BITS;
  const mi_bfield_t mask = mi_bfield_mask(n, idx);
  return mi_bfield_atomic_set_mask(&chunk->bfields[i], mask, already_set);
}

static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t cidx, size_t* already_set) {
  mi_assert_internal(cidx < MI_BCHUNK_BITS);
  mi_assert_internal((cidx%MI_BFIELD_BITS)==0);
  const size_t i = cidx / MI_BFIELD_BITS;
  return mi_bfield_atomic_setX(&chunk->bfields[i], already_set);
}

// Set a sequence of `n` bits within a chunk.
// Returns true if all bits transitioned from 0 to 1 (or 1 to 0).
mi_decl_noinline static bool mi_bchunk_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_set, bool* pmaybe_all_clear) {
  mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
  mi_assert_internal(n>0);
  bool all_transition = true;
  bool maybe_all_clear = true;
  size_t total_already_set = 0;
  size_t idx   = cidx % MI_BFIELD_BITS;
  size_t field = cidx / MI_BFIELD_BITS;
  while (n > 0) {
    size_t m = MI_BFIELD_BITS - idx;   // m is the bits to xset in this field
    if (m > n) { m = n; }
    mi_assert_internal(idx + m <= MI_BFIELD_BITS);
    mi_assert_internal(field < MI_BCHUNK_FIELDS);
    const mi_bfield_t mask = mi_bfield_mask(m, idx);
    size_t already_set = 0;
    bool all_clear = false;
    const bool transition = (set ?
      mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, &already_set) :
      mi_bfield_atomic_clear_mask(&chunk->bfields[field], mask, &all_clear));
    mi_assert_internal((transition && already_set == 0) || (!transition && already_set > 0));
    all_transition = all_transition && transition;
    total_already_set += already_set;
    maybe_all_clear = maybe_all_clear && all_clear;
    // next field
    field++;
    idx = 0;
    n -= m;
  }
  if (palready_set!=NULL) { *palready_set = total_already_set; }
  if (pmaybe_all_clear!=NULL) { *pmaybe_all_clear = maybe_all_clear; }
  return all_transition;
}

static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) {
  mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS);
  if (n==1) {
    bool was_clear = mi_bchunk_set(chunk, cidx);
    if (already_set != NULL) { *already_set = !was_clear; }
    return was_clear;
  }
  if (n==MI_BFIELD_BITS) return mi_bchunk_setX(chunk, cidx, already_set);
  if (n<MI_BFIELD_BITS && (cidx%MI_BFIELD_BITS)+n <= MI_BFIELD_BITS) return mi_bchunk_setNX(chunk, cidx, n, already_set);
  return mi_bchunk_xsetN_(MI_BIT_SET, chunk, cidx, n, already_set, NULL);
}


// ------- mi_bchunk_clear ---------------------------------------

static inline bool mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* all_clear) {
  mi_assert_internal(cidx < MI_BCHUNK_BITS);
  const size_t i = cidx / MI_BFIELD_BITS;
  const size_t idx = cidx % MI_BFIELD_BITS;
  return mi_bfield_atomic_clear(&chunk->bfields[i], idx, all_clear);
}

static inline bool mi_bchunk_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* all_clear) {
  mi_assert_internal(cidx < MI_BCHUNK_BITS);
  const size_t i = cidx / MI_BFIELD_BITS;
  const size_t idx = cidx % MI_BFIELD_BITS;
  const mi_bfield_t mask = mi_bfield_mask(n, idx);
  return mi_bfield_atomic_clear_mask(&chunk->bfields[i], mask, all_clear);
}

static inline bool mi_bchunk_clearX(mi_bchunk_t* chunk, size_t cidx, bool* all_clear) {
  mi_assert_internal(cidx < MI_BCHUNK_BITS);
  mi_assert_internal((cidx%MI_BFIELD_BITS)==0);
  const size_t i = cidx / MI_BFIELD_BITS;
  return mi_bfield_atomic_clearX(&chunk->bfields[i], all_clear);
}

static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) {
  mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS);
  if (n==1) return mi_bchunk_clear(chunk, cidx, pmaybe_all_clear);
  if (n==MI_BFIELD_BITS) return mi_bchunk_clearX(chunk, cidx, pmaybe_all_clear);
  if (n<MI_BFIELD_BITS && (cidx%MI_BFIELD_BITS)+n <= MI_BFIELD_BITS) return mi_bchunk_clearNX(chunk, cidx, n, pmaybe_all_clear);
  return mi_bchunk_xsetN_(MI_BIT_CLEAR, chunk, cidx, n, NULL, pmaybe_all_clear);
}


// ------- mi_bchunk_is_xset ---------------------------------------

// Check if a sequence of `n` bits within a chunk are all set/cleared (can cross bfields).
mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) {
  mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS);
  while (n > 0) {
    size_t m = MI_BFIELD_BITS - idx;   // m is the bits to xset in this field
    if (m > n) { m = n; }
    mi_assert_internal(idx + m <= MI_BFIELD_BITS);
    mi_assert_internal(field_idx < MI_BCHUNK_FIELDS);
    const size_t mask = mi_bfield_mask(m, idx);
    if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field_idx], mask)) {
      return false;
    }
    // next field
    field_idx++;
    idx = 0;
    n -= m;
  }
  return true;
}

// Check if a sequence of `n` bits within a chunk are all set/cleared.
static inline bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; if mi_likely(n<=MI_BFIELD_BITS) { return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } else { return mi_bchunk_is_xsetN_(set, chunk, i, idx, n); } } // ------- mi_bchunk_try_clear --------------------------------------- static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { mi_assert_internal(cidx < MI_BCHUNK_BITS); mi_assert_internal(n <= MI_BFIELD_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; mi_assert_internal(idx + n <= MI_BFIELD_BITS); const size_t mask = mi_bfield_mask(n, idx); return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mask, pmaybe_all_clear); } static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { mi_assert_internal(cidx < MI_BCHUNK_BITS); mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); const size_t i = cidx / MI_BFIELD_BITS; return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); } // Try to atomically set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. // Note: this is a hard one as we need to unwind partial atomic operations // if we fail halfway.. mi_decl_noinline static bool mi_bchunk_try_clearN_(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; size_t start_idx = cidx % MI_BFIELD_BITS; size_t start_field = cidx / MI_BFIELD_BITS; size_t end_field = MI_BCHUNK_FIELDS; mi_bfield_t mask_mid = 0; mi_bfield_t mask_end = 0; bool field_is_clear; bool maybe_all_clear = true; if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = false; } // first field size_t field = start_field; size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field if (m > n) { m = n; } mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); mi_assert_internal(start_field < MI_BCHUNK_FIELDS); const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &field_is_clear)) return false; maybe_all_clear = maybe_all_clear && field_is_clear; // done? 
  n -= m;
  if (n==0) {
    if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; }
    return true;
  }

  // continue with mid fields and last field: if these fail we need to recover by restoring the previous fields

  // mid fields
  while (n >= MI_BFIELD_BITS) {
    field++;
    mi_assert_internal(field < MI_BCHUNK_FIELDS);
    mask_mid = mi_bfield_all_set();
    if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_mid, &field_is_clear)) goto restore;
    maybe_all_clear = maybe_all_clear && field_is_clear;
    n -= MI_BFIELD_BITS;
  }

  // last field
  if (n > 0) {
    mi_assert_internal(n < MI_BFIELD_BITS);
    field++;
    mi_assert_internal(field < MI_BCHUNK_FIELDS);
    end_field = field;
    mask_end = mi_bfield_mask(n, 0);
    if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_end, &field_is_clear)) goto restore;
    maybe_all_clear = maybe_all_clear && field_is_clear;
  }

  if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; }
  return true;

restore:
  // `field` is on the field that failed to clear atomically; we need to restore all previous fields
  mi_assert_internal(field > start_field);
  while (field > start_field) {
    field--;
    const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid));
    mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, NULL);
  }
  return false;
}

static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) {
  mi_assert_internal(n>0);
  if (n==MI_BFIELD_BITS) return mi_bchunk_try_clearX(chunk, cidx, maybe_all_clear);
  if (n<MI_BFIELD_BITS && (cidx%MI_BFIELD_BITS)+n <= MI_BFIELD_BITS) return mi_bchunk_try_clearNX(chunk, cidx, n, maybe_all_clear);
  return mi_bchunk_try_clearN_(chunk, cidx, n, maybe_all_clear);
}


// ------- mi_bchunk_try_find_and_clear ---------------------------------------

#if MI_OPT_SIMD && defined(__AVX2__)
static inline __m256i mi_mm256_zero(void) {
  return _mm256_setzero_si256();
}
static inline __m256i mi_mm256_ones(void) {
  return _mm256_set1_epi64x(~0);
}
static inline bool mi_mm256_is_zero(__m256i vec) {
  return _mm256_testz_si256(vec,vec);
}
static inline bool mi_mm256_is_ones(__m256i vec) {
  return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec,vec));
}
#endif

// Find the least bit that is set in a given field of a chunk and try to clear it atomically.
static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) {
  mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
  const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
  size_t idx;
  if (!allow_allset && (~b == 0)) return false;
  if (mi_bfield_find_least_bit(b, &idx)) {           // find the least bit
    if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], idx, NULL)) {  // clear it atomically
      *pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
      mi_assert_internal(*pidx < MI_BCHUNK_BITS);
      return true;
    }
  }
  return false;
}

// Find least 1-bit in a chunk and try to clear it atomically
// set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
// This is used to find free slices and abandoned pages and should be efficient.
// todo: try neon version
static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) {
  #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
  while (true) {
    const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
    const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero());  // (elem64 == 0 ? 0xFF : 0)
    const uint32_t mask = ~_mm256_movemask_epi8(vcmp);  // mask of most significant bit of each byte (so each 8 bits are all set or clear)
    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
    if (mask==0) return false;
    mi_assert_internal((_tzcnt_u32(mask)%8) == 0);  // tzcnt == 0, 8, 16, or 24
    const size_t chunk_idx = _tzcnt_u32(mask) / 8;
    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true;
    // try again
    // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
  }
  #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
  while (true) {
    size_t chunk_idx = 0;
    #if 0
    // one vector at a time
    __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
    if (mi_mm256_is_zero(vec)) {
      chunk_idx += 4;
      vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1);
    }
    const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero());  // (elem64 == 0 ?
0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 chunk_idx += _tzcnt_u32(mask) / 8; #else // a cache line is 64b so we can just as well load all at the same time const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); const __m256i cmpv = mi_mm256_zero(); const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == 0 ? 0xFF : 0) const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == 0 ? 0xFF : 0) const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte (so each 8 bits are all set or clear) const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. chunk_idx = mi_ctz(mask) / 8; #endif if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else // try first to find a field that is not all set (to reduce fragmentation) for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true; } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true; } return false; #endif } static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, size_t* pidx) { mi_assert_internal(n==1); MI_UNUSED(n); return mi_bchunk_try_find_and_clear(chunk, pidx); } #if !MI_OPT_SIMD static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); if (!allow_all_set && (~b == 0)) return false; // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F (b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 >> 7; // shift high bit to low bit size_t idx; if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); mi_assert_internal((idx%8)==0); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // unset the byte atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; } } return false; } #endif // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find medium size pages in the free blocks. 
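// (In the portable path above, `has_set8` uses the classic "find a zero byte" bit trick applied to `~b`:
//  a borrow in `~b - MI_BFIELD_LO_BIT8` can only start at a byte where `b` is 0xFF, so the least
//  significant set bit of `has_set8` always marks a genuine 0xFF byte; spurious bits can only appear in
//  bytes above a genuine 0xFF byte and are never selected since we take the least bit.)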
// todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); const __m256i cmpv = mi_mm256_ones(); const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0) const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 0xFF : 0) const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; // mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared) if (mask==0) return false; const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk const size_t chunk_idx = bidx / 8; const size_t idx = (bidx % 8)*8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; } // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } } #else // first skip allset fields to reduce fragmentation for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true; } return false; #endif } static inline bool mi_bchunk_try_find_and_clear_8(mi_bchunk_t* chunk, size_t n, size_t* pidx) { mi_assert_internal(n==8); MI_UNUSED(n); return mi_bchunk_try_find_and_clear8(chunk, pidx); } // find least bfield in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find large size pages in the free blocks. // todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); const __m256i cmpv = mi_mm256_ones(); const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (bfield == ~0 ? -1 : 0) const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (bfield == ~0 ? -1 : 0) const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; // mask is inverted, so each 8-bits are set iff the corresponding elem64 has all bits set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
    const size_t chunk_idx = _tzcnt_u64(mask) / 8;
    mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
    if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx],NULL)) {
      *pidx = chunk_idx*MI_BFIELD_BITS;
      mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS);
      return true;
    }
    // try again
    // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
  }
  #else
  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
    if (~b==0 && mi_bfield_atomic_try_clearX(&chunk->bfields[i], NULL)) {
      *pidx = i*MI_BFIELD_BITS;
      mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS);
      return true;
    }
  }
  return false;
  #endif
}

static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
  mi_assert_internal(n==MI_BFIELD_BITS); MI_UNUSED(n);
  return mi_bchunk_try_find_and_clearX(chunk, pidx);
}

// find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set,
// and try to clear them atomically.
// Currently does not cross bfield boundaries.
// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
  if (n == 0 || n > MI_BFIELD_BITS) return false;
  const mi_bfield_t mask = mi_bfield_mask(n, 0);
  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
    mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
    size_t idx;
    while (mi_bfield_find_least_bit(b, &idx)) {    // find least 1-bit
      if (idx + n > MI_BFIELD_BITS) break;
      const size_t bmask = mask<<idx;
      mi_assert_internal(bmask>>idx == mask);
      if ((b&bmask) == bmask) {                    // found a match
        if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) {
          *pidx = (i*MI_BFIELD_BITS) + idx;
          mi_assert_internal(*pidx < MI_BCHUNK_BITS);
          mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS);
          return true;
        }
        else {
          // if failed to atomically commit, reload b and try again from this position
          b = mi_atomic_load_acquire(&chunk->bfields[i]);
        }
      }
      else {
        // advance
        const size_t ones = mi_bfield_ctz(~(b>>idx));  // skip all ones (since it didn't fit the mask)
        mi_assert_internal(ones>0);
        b = b & ~mi_bfield_mask(ones, idx);            // clear the ones
      }
    }
  }
  return false;
}

// find a sequence of `n` bits in a chunk with `n < MI_BCHUNK_BITS` with all bits set,
// and try to clear them atomically.
// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
// This can cross bfield boundaries.
static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
  if (n == 0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
  const size_t skip_count = n/MI_BFIELD_BITS;
  size_t cidx;
  for (size_t i = 0; i <= MI_BCHUNK_FIELDS - skip_count; i++) {
    size_t m = n;   // bits to go

    // first field
    mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
    size_t ones = mi_bfield_clz(~b);
    cidx = i*MI_BFIELD_BITS + (MI_BFIELD_BITS - ones);  // start index
    if (ones >= m) {
      // we found enough bits!
      m = 0;
    }
    else {
      m -= ones;
      mi_assert_internal(m>0);
    }

    // keep scanning further fields?
size_t j = 1; // field count from i while (i+j < MI_BCHUNK_FIELDS) { mi_assert_internal(m > 0); b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); ones = mi_bfield_ctz(~b); if (ones >= m) { // we found enough bits m = 0; break; } else if (ones == MI_BFIELD_BITS) { // not enough yet, proceed to the next field j++; m -= MI_BFIELD_BITS; } else { // the range was not enough, start from scratch i = i + j - 1; // no need to re-scan previous fields, except the last one (with clz this time) mi_assert_internal(m>0); break; } } // did we find a range? if (m==0) { if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) { // we cleared all atomically *pidx = cidx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); return true; } // note: if we fail for a small `n` on the first field, we don't rescan that field (as `i` is incremented) } // otherwise continue searching } return false; } //static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { // if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages // if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages // if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages // if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk // if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); // return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); //} // ------- mi_bchunk_clear_once_set --------------------------------------- static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BCHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); } // ------- mi_bitmap_all_are_clear --------------------------------------- // are all bits in a bitmap chunk clear? static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return mi_mm256_is_zero(vec); #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2))); #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; } return true; #endif } // are all bits in a bitmap chunk set? 
static inline bool mi_bchunk_all_are_set_relaxed(mi_bchunk_t* chunk) { #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return mi_mm256_is_ones(vec); #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); return (mi_mm256_is_ones(_mm256_and_si256(vec1, vec2))); #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (~mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; } return true; #endif } static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { i--; mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t idx; if (mi_bsr(b, &idx)) { *pidx = (i*MI_BFIELD_BITS) + idx; return true; } } return false; } /* -------------------------------------------------------------------------------- bitmap chunkmap -------------------------------------------------------------------------------- */ static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_set(&bitmap->chunkmap, chunk_idx); } static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); // check if the corresponding chunk is all clear if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) return false; // clear the chunkmap bit mi_bchunk_clear(&bitmap->chunkmap, chunk_idx, NULL); // .. but a concurrent set may have happened in between our all-clear test and the clearing of the // bit in the mask. We check again to catch this situation. if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) { mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } return true; } /* -------------------------------------------------------------------------------- bitmap -------------------------------------------------------------------------------- */ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0); bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS); mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); mi_assert_internal(bit_count > 0); const size_t chunk_count = bit_count / MI_BCHUNK_BITS; mi_assert_internal(chunk_count >= 1); const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE); mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); if (pchunk_count != NULL) { *pchunk_count = chunk_count; } return size; } // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true // returns the size of the bitmap size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) { size_t chunk_count; const size_t size = mi_bitmap_size(bit_count, &chunk_count); if (!already_zero) { _mi_memzero_aligned(bitmap, size); } mi_atomic_store_release(&bitmap->chunk_count, chunk_count); mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); return size; } // Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) { mi_assert_internal(n>0); // start chunk and index size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; const size_t ccount = _mi_divide_up(n, MI_BCHUNK_BITS); // first update the chunkmap mi_bchunk_setN(cmap, chunk_idx, ccount, NULL); // first chunk size_t m = MI_BCHUNK_BITS - cidx; if (m > n) { m = n; } mi_bchunk_setN(&chunks[chunk_idx], cidx, m, NULL); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; const size_t mid_chunks = n / MI_BCHUNK_BITS; if (mid_chunks > 0) { _mi_memset(&chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); chunk_idx += mid_chunks; n -= (mid_chunks * MI_BCHUNK_BITS); } // last chunk if (n > 0) { mi_assert_internal(n < MI_BCHUNK_BITS); mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); mi_bchunk_setN(&chunks[chunk_idx], 0, n, NULL); } } // Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); mi_bchunks_unsafe_setN(&bitmap->chunks[0], &bitmap->chunkmap, idx, n); } // ------- mi_bitmap_xset --------------------------------------- // Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BCHUNK_BITS); const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia const bool were_allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_set); mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards return were_allclear; } // Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 1's to 0's. // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BCHUNK_BITS); const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia bool maybe_all_clear; const bool were_allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } return were_allset; } // Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { return mi_bitmap_setN(bitmap, idx, 1, NULL); } bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { return mi_bitmap_clearN(bitmap, idx, 1); } // ------- mi_bitmap_is_xset --------------------------------------- // Is a sequence of n bits already all set/cleared? 
bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BCHUNK_BITS); mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia return mi_bchunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); } /* -------------------------------------------------------------------------------- Iterate through a bfield -------------------------------------------------------------------------------- */ // Cycle iteration through a bitfield. This is used to space out threads // so there is less chance of contention. When searching for a free page we // like to first search only the accessed part (so we reuse better). This // high point is called the `cycle`. // // We then iterate through the bitfield as: // first: [start, cycle> // then : [0, start> // then : [cycle, MI_BFIELD_BITS> // // The start is determined usually as `tseq % cycle` to have each thread // start at a different spot. // - We use `popcount` to improve branch prediction (maybe not needed? can we simplify?) // - The `cycle_mask` is the part `[start, cycle>`. #define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \ mi_assert_internal(start <= cycle); \ mi_assert_internal(start < MI_BFIELD_BITS); \ mi_assert_internal(cycle <= MI_BFIELD_BITS); \ mi_bfield_t _cycle_mask##SUF = mi_bfield_mask(cycle - start, start); \ size_t _bcount##SUF = mi_bfield_popcount(bfield); \ mi_bfield_t _b##SUF = bfield & _cycle_mask##SUF; /* process [start, cycle> first*/\ while(_bcount##SUF > 0) { \ _bcount##SUF--;\ if (_b##SUF==0) { _b##SUF = bfield & ~_cycle_mask##SUF; } /* process [0,start> + [cycle, MI_BFIELD_BITS> next */ \ size_t name_idx; \ bool _found##SUF = mi_bfield_find_least_bit(_b##SUF,&name_idx); \ mi_assert_internal(_found##SUF); MI_UNUSED(_found##SUF); \ { \ #define mi_bfield_iterate_end(SUF) \ } \ _b##SUF = mi_bfield_clear_least_bit(_b##SUF); \ } \ } #define mi_bfield_cycle_iterate(bfield,tseq,cycle,name_idx,SUF) { \ const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); /* or: 0 to always search from the start? */\ mi_bfield_iterate(bfield,_start##SUF,cycle,name_idx,SUF) #define mi_bfield_cycle_iterate_end(SUF) \ mi_bfield_iterate_end(SUF); } /* -------------------------------------------------------------------------------- mi_bitmap_find (used to find free pages) -------------------------------------------------------------------------------- */ typedef bool (mi_bitmap_visit_fun_t)(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* idx, void* arg1, void* arg2); // Go through the bitmap and for every sequence of `n` set bits, call the visitor function. // If it returns `true` stop the search. 
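// An illustrative visitor (a sketch for exposition only; it is not used by mimalloc itself): it tries to
// clear one set bit in the visited chunk and reports its bitmap-wide index, much like
// `mi_bitmap_try_find_and_claim_visit` below but without the claim callback. It would be invoked as
// `mi_bitmap_find(bitmap, tseq, 1, &idx, &mi_bitmap_example_visit_find1, NULL, NULL)`.
static inline bool mi_bitmap_example_visit_find1(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) {
  mi_assert_internal(n==1); MI_UNUSED(n); MI_UNUSED(arg1); MI_UNUSED(arg2);
  size_t cidx;
  if (!mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) return false;  // keep searching other chunks
  *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;   // found one: report the bitmap-wide bit index
  return true;                                   // stop the search
}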
static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bitmap_visit_fun_t* on_find, void* arg1, void* arg2) { const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); for (size_t i = 0; i < chunkmap_max; i++) { // and for each chunkmap entry we iterate over its bits to find the chunks const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); size_t hi; if (mi_bfield_find_highest_bit(cmap_entry, &hi)) { mi_bfield_cycle_iterate(cmap_entry, tseq%8, hi+1, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) { mi_assert_internal(eidx <= MI_BFIELD_BITS); const size_t chunk_idx = i*MI_BFIELD_BITS + eidx; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) { return true; } } mi_bfield_cycle_iterate_end(Y); } } return false; } /* -------------------------------------------------------------------------------- Bitmap: try_find_and_claim -- used to allocate abandoned pages note: the compiler will fully inline the indirect function call -------------------------------------------------------------------------------- */ typedef struct mi_claim_fun_data_s { mi_arena_t* arena; mi_subproc_t* subproc; mi_heaptag_t heap_tag; } mi_claim_fun_data_t; static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) { mi_assert_internal(n==1); MI_UNUSED(n); mi_claim_fun_t* claim_fun = (mi_claim_fun_t*)arg1; mi_claim_fun_data_t* claim_data = (mi_claim_fun_data_t*)arg2; size_t cidx; if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) { const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); bool keep_set = true; if ((*claim_fun)(slice_index, claim_data->arena, claim_data->subproc, claim_data->heap_tag, &keep_set)) { // success! mi_assert_internal(!keep_set); *pidx = slice_index; return true; } else { // failed to claim it, set abandoned mapping again (unless the page was freed) if (keep_set) { const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); mi_assert_internal(wasclear); MI_UNUSED(wasclear); } } } else { // we may find that all are cleared only on a second iteration but that is ok as // the chunkmap is a conservative approximation. mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } return false; } // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) { mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; return mi_bitmap_find(bitmap, tseq, 1, pidx, &mi_bitmap_try_find_and_claim_visit, (void*)claim, &claim_data); } bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx) { const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); for (size_t i = chunkmap_max; i > 0; ) { i--; mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); size_t cmap_idx; if (mi_bsr(cmap,&cmap_idx)) { // highest chunk const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; size_t cidx; if (mi_bchunk_bsr(&bitmap->chunks[chunk_idx], &cidx)) { *idx = (chunk_idx * MI_BCHUNK_BITS) + cidx; return true; } } } return false; } // Clear a bit once it is set. 
void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx); } // Visit all set bits in a bitmap. // todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { // for all chunkmap entries const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); for(size_t i = 0; i < chunkmap_max; i++) { mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); size_t cmap_idx; // for each chunk (corresponding to a set bit in a chunkmap entry) while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; // for each chunk field mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); size_t bidx; while (mi_bfield_foreach_bit(&b, &bidx)) { const size_t idx = base_idx + bidx; if (!visit(idx, 1, arena, arg)) return false; } } } } return true; } // Visit all set bits in a bitmap but try to return ranges (within bfields) if possible. // used by purging to purge larger ranges if possible // todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { // for all chunkmap entries const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); for (size_t i = 0; i < chunkmap_max; i++) { mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); size_t cmap_idx; // for each chunk (corresponding to a set bit in a chunkmap entry) while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; // for each chunk field mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); #if MI_DEBUG > 1 const size_t bpopcount = mi_popcount(b); size_t rngcount = 0; #endif size_t bidx; while (mi_bfield_find_least_bit(b, &bidx)) { const size_t rng = mi_ctz(~(b>>bidx)); // all the set bits from bidx #if MI_DEBUG > 1 rngcount += rng; #endif mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS); const size_t idx = base_idx + bidx; mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= MI_BFIELD_BITS); mi_assert_internal((idx / MI_BCHUNK_BITS) < mi_bitmap_chunk_count(bitmap)); if (!visit(idx, rng, arena, arg)) return false; // clear rng bits in b b = b & ~mi_bfield_mask(rng, bidx); } mi_assert_internal(rngcount == bpopcount); } } } return true; } /* -------------------------------------------------------------------------------- binned bitmap's -------------------------------------------------------------------------------- */ size_t mi_bbitmap_size(size_t bit_count, size_t* pchunk_count) { mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0); bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS); mi_assert_internal(bit_count <= 
MI_BITMAP_MAX_BIT_COUNT); mi_assert_internal(bit_count > 0); const size_t chunk_count = bit_count / MI_BCHUNK_BITS; mi_assert_internal(chunk_count >= 1); const size_t size = offsetof(mi_bbitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE); mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); if (pchunk_count != NULL) { *pchunk_count = chunk_count; } return size; } // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true // returns the size of the bitmap size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero) { size_t chunk_count; const size_t size = mi_bbitmap_size(bit_count, &chunk_count); if (!already_zero) { _mi_memzero_aligned(bbitmap, size); } mi_atomic_store_release(&bbitmap->chunk_count, chunk_count); mi_assert_internal(mi_atomic_load_relaxed(&bbitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); return size; } void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); mi_bchunks_unsafe_setN(&bbitmap->chunks[0], &bbitmap->chunkmap, idx, n); } /* -------------------------------------------------------------------------------- binned bitmap chunkmap -------------------------------------------------------------------------------- */ static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) { size_t oldmax = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); if mi_unlikely(chunk_idx > oldmax) { mi_atomic_cas_strong_relaxed(&bbitmap->chunk_max_accessed, &oldmax, chunk_idx); } } static void mi_bbitmap_chunkmap_set(mi_bbitmap_t* bbitmap, size_t chunk_idx, bool check_all_set) { mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); if (check_all_set) { if (mi_bchunk_all_are_set_relaxed(&bbitmap->chunks[chunk_idx])) { // all slices are free in this chunk: return back to the NONE bin mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], MI_BBIN_NONE); } } mi_bchunk_set(&bbitmap->chunkmap, chunk_idx); mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); } static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); // check if the corresponding chunk is all clear if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) return false; // clear the chunkmap bit mi_bchunk_clear(&bbitmap->chunkmap, chunk_idx, NULL); // .. but a concurrent set may have happened in between our all-clear test and the clearing of the // bit in the mask. We check again to catch this situation. if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) { mi_bchunk_set(&bbitmap->chunkmap, chunk_idx); return false; } mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); return true; } // Assign from the NONE bin to a specific size bin static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) { mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin); } /* -------------------------------------------------------------------------------- mi_bbitmap_setN, try_clearN, and is_xsetN (used to find free pages) -------------------------------------------------------------------------------- */ // Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! 
bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BCHUNK_BITS); const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia const bool were_allclear = mi_bchunk_setN(&bbitmap->chunks[chunk_idx], cidx, n, NULL); mi_bbitmap_chunkmap_set(bbitmap, chunk_idx, true); // set after return were_allclear; } // ------- mi_bbitmap_try_clearN --------------------------------------- bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BCHUNK_BITS); mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); if (cidx + n > MI_BCHUNK_BITS) return false; bool maybe_all_clear; const bool cleared = mi_bchunk_try_clearN(&bbitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); if (cleared && maybe_all_clear) { mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } // note: we don't set the size class for an explicit try_clearN (only used by purging) return cleared; } // ------- mi_bbitmap_is_xset --------------------------------------- // Is a sequence of n bits already all set/cleared? bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BCHUNK_BITS); mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia return mi_bchunk_is_xsetN(set, &bbitmap->chunks[chunk_idx], cidx, n); } /* -------------------------------------------------------------------------------- mi_bbitmap_find (used to find free pages) -------------------------------------------------------------------------------- */ typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); // Go through the bbitmap and for every sequence of `n` set bits, call the visitor function. // If it returns `true` stop the search. // // This is used for finding free blocks and it is important to be efficient (with 2-level bitscan) // but also reduce fragmentation (through size bins). 
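// For example, on a 64-bit build (MI_BFIELD_BITS==64, MI_BCHUNK_BITS==512) one chunkmap bfield covers
// 64 chunks, i.e. 32768 slices: the first-level scan over the chunkmap only descends into chunks whose
// chunkmap bit is set, and the second-level scan then only touches the 8 bfields of a candidate chunk.
// The per-chunk size bin (`chunk_bins`) further steers differently sized requests to different chunks.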
static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* on_find) { // we space out threads to reduce contention const size_t cmap_max_count = _mi_divide_up(mi_bbitmap_chunk_count(bbitmap),MI_BFIELD_BITS); const size_t chunk_acc = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS; const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS); // create a mask over the chunkmap entries to iterate over them efficiently mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); const size_t cmap_cycle = cmap_acc+1; const mi_bbin_t bbin = mi_bbin_of(n); // visit bins from largest size bin up to the NONE bin for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL // const mi_bbin_t bin = bbin; { mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { // don't search into non-accessed memory until we tried other size bins as well if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { break; } // and for each chunkmap entry we iterate over its bits to find the chunks const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) { mi_assert_internal(eidx <= MI_BFIELD_BITS); const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); // only in the current size class! const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_relaxed(&bbitmap->chunk_bins[chunk_idx]); if // (bin >= chunk_bin) { ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; size_t cidx; if ((*on_find)(chunk, n, &cidx)) { if (cidx==0 && chunk_bin == MI_BBIN_NONE) { // only the first determines the size bin // this chunk is now reserved for the `bbin` size class mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin); } *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap)); return true; } else { /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } } } mi_bfield_cycle_iterate_end(Y); } mi_bfield_cycle_iterate_end(X); } return false; } /* -------------------------------------------------------------------------------- mi_bbitmap_try_find_and_clear -- used to find free pages note: the compiler will fully inline the indirect function calls -------------------------------------------------------------------------------- */ bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); } bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); } bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); } bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { mi_assert_internal(n<=MI_BFIELD_BITS); return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); } bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { mi_assert_internal(n<=MI_BCHUNK_BITS); return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); }
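/* --------------------------------------------------------------------------------
  Illustrative sketch: how the bitmap API above fits together for a caller that owns a
  zero-initialized, MI_BCHUNK_SIZE-aligned buffer of `mi_bitmap_size(4*MI_BCHUNK_BITS,NULL)`
  bytes. The buffer, the bit count, and the function name are assumptions of this example
  only; it is for exposition and is not called by mimalloc.
-------------------------------------------------------------------------------- */
static inline void mi_bitmap_usage_example(void* buf) {
  mi_bitmap_t* bitmap = (mi_bitmap_t*)buf;
  mi_bitmap_init(bitmap, 4*MI_BCHUNK_BITS, true /* already zero'd */);

  // mark a range as set; not atomic, so only use while the bitmap is still thread-local
  mi_bitmap_unsafe_setN(bitmap, 0, 2*MI_BCHUNK_BITS);

  // atomically clear a small sequence inside one chunk, and set it again
  if (mi_bitmap_clearN(bitmap, 8, 16)) {            // true: all 16 bits were set before
    const bool were_all_clear = mi_bitmap_setN(bitmap, 8, 16, NULL);
    mi_assert_internal(were_all_clear); MI_UNUSED(were_all_clear);
  }

  // find the highest set bit (the chunkmap is used to skip chunks that are all clear)
  size_t idx;
  if (mi_bitmap_bsr(bitmap, &idx)) {
    mi_bitmap_clear(bitmap, idx);
  }
}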