revise visiting arenas, better bitmap scanning

daanx 2024-12-07 14:03:51 -08:00
parent 70115d8b8c
commit 9631b0d4d2
2 changed files with 164 additions and 107 deletions


@@ -202,20 +202,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
   // set the dirty bits
   if (arena->memid.initially_zero) {
-    // size_t dirty_count = 0;
-    memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL);
-    //if (dirty_count>0) {
-    //  if (memid->initially_zero) {
-    //    _mi_error_message(EFAULT, "ouch1\n");
-    //  }
-    //  // memid->initially_zero = false;
-    //}
-    //else {
-    //  if (!memid->initially_zero) {
-    //    _mi_error_message(EFAULT, "ouch2\n");
-    //  }
-    //  // memid->initially_zero = true;
-    //}
+    memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL);
   }

   // set commit state
@@ -235,7 +222,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
   #if MI_DEBUG > 1
   if (memid->initially_zero) {
     if (!mi_mem_is_zero(p, mi_size_of_slices(slice_count))) {
-      _mi_error_message(EFAULT, "arena allocation was not zero-initialized!\n");
+      _mi_error_message(EFAULT, "internal error: arena allocation was not zero-initialized!\n");
       memid->initially_zero = false;
     }
   }
@@ -327,31 +314,47 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are
   return true;
 }

-#define MI_THREADS_PER_ARENA (16)
-
-#define mi_forall_arenas(req_arena_id, allow_large, tseq, var_arena_id, var_arena) \
-  { \
-  size_t _max_arena; \
-  size_t _start; \
-  if (req_arena_id == _mi_arena_id_none()) { \
-    _max_arena = mi_atomic_load_relaxed(&mi_arena_count); \
-    _start = (_max_arena <= 2 ? 0 : (tseq % (_max_arena-1))); \
-  } \
-  else { \
-    _max_arena = 1; \
-    _start = mi_arena_id_index(req_arena_id); \
-    mi_assert_internal(mi_atomic_load_relaxed(&mi_arena_count) > _start); \
-  } \
-  for (size_t i = 0; i < _max_arena; i++) { \
-    size_t _idx = i + _start; \
-    if (_idx >= _max_arena) { _idx -= _max_arena; } \
-    const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); MI_UNUSED(var_arena_id);\
-    mi_arena_t* const var_arena = mi_arena_from_index(_idx); \
-    if (var_arena != NULL && mi_arena_is_suitable(var_arena,req_arena_id,-1 /* todo: numa node */,allow_large)) \
-    {
-
-#define mi_forall_arenas_end() }}}
+#define mi_forall_arenas(req_arena_id, tseq, name_arena) \
+  { \
+  const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \
+  if (_arena_count > 0) { \
+    const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \
+    size_t _start; \
+    if (req_arena_id == _mi_arena_id_none()) { \
+      /* always start searching in an arena 1 below the max */ \
+      _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \
+    } \
+    else { \
+      _start = mi_arena_id_index(req_arena_id); \
+      mi_assert_internal(_start < _arena_count); \
+    } \
+    for (size_t _i = 0; _i < _arena_count; _i++) { \
+      size_t _idx; \
+      if (_i < _arena_cycle) { \
+        _idx = _i + _start; \
+        if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \
+      } \
+      else { \
+        _idx = _i; \
+      } \
+      mi_arena_t* const name_arena = mi_arena_from_index(_idx); \
+      if (name_arena != NULL) \
+      {
+
+#define mi_forall_arenas_end() \
+      } \
+      if (req_arena_id != _mi_arena_id_none()) break; \
+    } \
+  }}
+
+#define mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, name_arena) \
+  mi_forall_arenas(req_arena_id,tseq,name_arena) { \
+    if (mi_arena_is_suitable(name_arena, req_arena_id, -1 /* todo: numa node */, allow_large)) { \
+
+#define mi_forall_suitable_arenas_end() \
+    }} \
+  mi_forall_arenas_end()

 /* -----------------------------------------------------------
   Arena allocation
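For illustration (not part of this commit): the reworked macro cycles through the first `_arena_count - 1` arenas starting at an offset derived from the thread sequence number `tseq`, and always visits the last arena last, so different threads prefer different arenas while a newly added arena is only used as a fallback. A minimal sketch of the resulting visit order, using a hypothetical `visit_order` helper:

#include <stdio.h>
#include <stddef.h>

// hypothetical helper mirroring the index logic of mi_forall_arenas above
static void visit_order(size_t arena_count, size_t tseq) {
  if (arena_count == 0) return;
  const size_t cycle = arena_count - 1;             // rotate over all arenas except the last
  const size_t start = (cycle <= 1 ? 0 : (tseq % cycle));
  for (size_t i = 0; i < arena_count; i++) {
    size_t idx;
    if (i < cycle) {
      idx = i + start;
      if (idx >= cycle) { idx -= cycle; }           // wrap around within the cycle
    }
    else {
      idx = i;                                      // the last arena is always visited last
    }
    printf("%zu ", idx);
  }
  printf("\n");
}

int main(void) {
  visit_order(4, 0);   // prints: 0 1 2 3
  visit_order(4, 1);   // prints: 1 2 0 3
  visit_order(4, 2);   // prints: 2 0 1 3
  return 0;
}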
@@ -369,12 +372,12 @@ static mi_decl_noinline void* mi_arena_try_find_free(
   // search arena's
   const size_t tseq = tld->tseq;
-  mi_forall_arenas(req_arena_id, allow_large, tseq, arena_id, arena)
+  mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena)
   {
     void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid);
     if (p != NULL) return p;
   }
-  mi_forall_arenas_end();
+  mi_forall_suitable_arenas_end();
   return NULL;
 }
@@ -517,7 +520,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl
   // search arena's
   const bool allow_large = true;
   size_t tseq = tld->tseq;
-  mi_forall_arenas(req_arena_id, allow_large, tseq, arena_id, arena)
+  mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena)
   {
     size_t slice_index;
     mi_bitmap_t* const bitmap = arena->pages_abandoned[bin];
@@ -545,7 +548,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl
       return page;
     }
   }
-  mi_forall_arenas_end();
+  mi_forall_suitable_arenas_end();
   return NULL;
 }


@@ -42,9 +42,9 @@ static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) {
   return mi_rotr(x,r);
 }

-//static inline mi_bfield_t mi_bfield_zero(void) {
-//  return 0;
-//}
+static inline mi_bfield_t mi_bfield_zero(void) {
+  return 0;
+}

 static inline mi_bfield_t mi_bfield_one(void) {
   return 1;
@@ -64,9 +64,9 @@ static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) {
 // Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR).
 // return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise,
 // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
-static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) {
-  return mi_bfield_find_least_bit((set ? ~x : x), idx);
-}
+//static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) {
+//  return mi_bfield_find_least_bit((set ? ~x : x), idx);
+//}

 // Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1
 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) {
@@ -244,10 +244,10 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t by
 // Try to clear a full field of bits atomically, and return true all bits transitioned from all 1's to 0's.
 // and false otherwise leaving the bit field as-is.
-//static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) {
-//  mi_bfield_t old = mi_bfield_all_set();
-//  return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_zero());
-//}
+static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) {
+  mi_bfield_t old = mi_bfield_all_set();
+  return mi_atomic_cas_strong_acq_rel(b, &old, mi_bfield_zero());
+}

 // Check if all bits corresponding to a mask are set.
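Side note on the weak-to-strong CAS change above (illustration only, not commit code): a weak compare-exchange may fail spuriously even when the field still holds all ones, while the strong variant only fails when the field really did not match, which matters for callers below that simply move on to the next field rather than retrying the same one. In plain C11 atomics the operation amounts to:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// clear a full 64-bit field only if it currently has all bits set (sketch)
static bool try_clear_all(_Atomic(uint64_t)* b) {
  uint64_t expected = UINT64_MAX;
  // strong CAS: a false return means the field truly was not all ones,
  // never a spurious failure
  return atomic_compare_exchange_strong_explicit(b, &expected, 0,
                                                 memory_order_acq_rel, memory_order_acquire);
}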
@@ -514,31 +514,33 @@ static inline __m256i mi_mm256_zero(void) {
 static inline __m256i mi_mm256_ones(void) {
   return _mm256_set1_epi64x(~0);
 }
-static inline bool mi_mm256_is_ones(__m256i vec) {
-  return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec));
-}
+//static inline bool mi_mm256_is_ones(__m256i vec) {
+//  return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec));
+//}
 static inline bool mi_mm256_is_zero( __m256i vec) {
   return _mm256_testz_si256(vec,vec);
 }
 #endif

-// find least 0/1-bit in a chunk and try to set/clear it atomically
+// Find least 1-bit in a chunk and try to clear it atomically
 // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
+// This is used to find free slices and abandoned pages and should be efficient.
 // todo: try neon version
-static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk, size_t* pidx) {
+static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) {
   #if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
   while (true) {
     const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
-    const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0)
+    const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0)
     const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear)
-    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared)
+    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
     if (mask==0) return false;
     mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24
     const size_t chunk_idx = _tzcnt_u32(mask) / 8;
     mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
     size_t cidx;
-    if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear
-      if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically
+    if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit
+      if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically
         *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
         mi_assert_internal(*pidx < MI_BCHUNK_BITS);
         return true;
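For reference (not commit code): the pattern after the vector scan is always the same, reload the candidate field with a relaxed atomic load, locate its least significant 1-bit, and race to clear just that bit; if another thread wins, the outer loop rescans. A generic sketch of such a single-bit try-clear in plain C11 atomics (mimalloc's mi_bfield_atomic_try_clear also takes an out-parameter, passed as NULL above, reporting whether the whole field became zero):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// try to atomically transition bit `idx` of *b from 1 to 0 (sketch)
static bool try_clear_bit(_Atomic(uint64_t)* b, size_t idx) {
  const uint64_t mask = ((uint64_t)1 << idx);
  uint64_t old = atomic_load_explicit(b, memory_order_relaxed);
  do {
    if ((old & mask) == 0) return false;   // another thread cleared it first
  } while (!atomic_compare_exchange_weak_explicit(b, &old, old & ~mask,
                                                  memory_order_acq_rel, memory_order_relaxed));
  return true;                             // we were the ones to clear it
}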
@@ -546,39 +548,42 @@ static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk
     }
     // try again
   }
   #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   while (true) {
     size_t chunk_idx = 0;
-    #if 1
+    #if 0
+    // one vector at a time
     __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
-    if ((set ? mi_mm256_is_ones(vec) : mi_mm256_is_zero(vec))) {
+    if (mi_mm256_is_zero(vec)) {
       chunk_idx += 4;
       vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1);
     }
-    const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0)
+    const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0)
     const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear)
-    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared)
+    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
     if (mask==0) return false;
     mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24
     chunk_idx += _tzcnt_u32(mask) / 8;
     #else
+    // a cache line is 64b so we can just as well load all at the same time
     const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
     const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
-    const __m256i cmpv = (set ? mi_mm256_ones() : mi_mm256_zero());
-    const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0)
-    const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0)
+    const __m256i cmpv = mi_mm256_zero();
+    const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == 0 ? 0xFF : 0)
+    const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == 0 ? 0xFF : 0)
     const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear)
-    const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear)
+    const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte (so each 8 bits are all set or clear)
     const uint64_t mask = ((uint64_t)mask2 << 32) | mask1;
-    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared)
+    // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared)
     if (mask==0) return false;
     mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
-    const size_t chunk_idx = _tzcnt_u64(mask) / 8;
+    chunk_idx = _tzcnt_u64(mask) / 8;
     #endif
     mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
     size_t cidx;
-    if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear
-      if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically
+    if (mi_bfield_find_least_bit(b, &cidx)) { // find the bit-idx that is clear
+      if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically
         *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
         mi_assert_internal(*pidx < MI_BCHUNK_BITS);
         return true;
@@ -586,11 +591,12 @@ static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk
     }
     // try again
   }
   #else
   for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
     size_t idx;
-    if mi_unlikely(mi_bfield_find_least_to_xset(set, chunk->bfields[i], &idx)) { // find least 0-bit
-      if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx)) { // try to set it atomically
+    if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit
+      if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically
         *pidx = (i*MI_BFIELD_BITS + idx);
         mi_assert_internal(*pidx < MI_BCHUNK_BITS);
         return true;
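For reference (an assumption about the helper, not commit code): `mi_bfield_find_least_bit` is essentially a count-trailing-zeros that also reports whether the field was zero at all; with GCC/Clang builtins it would look like:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// sketch: index of the least significant 1-bit, false if x == 0
static inline bool find_least_bit(uint64_t x, size_t* idx) {
  if (x == 0) return false;
  *idx = (size_t)__builtin_ctzll(x);   // count trailing zeros
  return true;
}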
@@ -598,48 +604,49 @@ static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk
     }
   }
   return false;
   #endif
 }

-static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) {
-  return mi_bchunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx);
-}
-
-//static inline bool mi_bchunk_find_and_try_set(mi_bchunk_t* chunk, size_t* pidx) {
-//  return mi_bchunk_find_and_try_xset(MI_BIT_SET, chunk, pidx);
-//}

 // find least byte in a chunk with all bits set, and try unset it atomically
 // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
+// Used to find medium size pages in the free blocks.
 // todo: try neon version
 static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pidx) {
-  #if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
-  while(true) {
-    const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
-    const __m256i vcmp = _mm256_cmpeq_epi8(vec, mi_mm256_ones()); // (byte == ~0 ? -1 : 0)
-    const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte
-    if (mask == 0) return false;
-    const size_t i = _tzcnt_u32(mask);
-    mi_assert_internal(8*i < MI_BCHUNK_BITS);
-    const size_t chunk_idx = i / MI_BFIELD_SIZE;
-    const size_t byte_idx = i % MI_BFIELD_SIZE;
-    if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically
-      *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8);
-      mi_assert_internal(*pidx < MI_BCHUNK_BITS);
+  #if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+  while (true) {
+    // since a cache-line is 64b, load all at once
+    const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
+    const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1);
+    const __m256i cmpv = mi_mm256_ones();
+    const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0)
+    const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 0xFF : 0)
+    const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte
+    const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte
+    const uint64_t mask = ((uint64_t)mask2 << 32) | mask1;
+    // mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared)
+    if (mask==0) return false;
+    const size_t bidx = _tzcnt_u64(mask);   // byte-idx of the byte in the chunk
+    const size_t chunk_idx = bidx / 8;
+    const size_t byte_idx = bidx % 8;       // byte index of the byte in the bfield
+    mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+    if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically
+      *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx;
+      mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
       return true;
     }
     // try again
   }
   #else
   for(int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-    const mi_bfield_t x = chunk->bfields[i];
+    const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]);
     // has_set8 has low bit in each byte set if the byte in x == 0xFF
     const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) &  // high bit set if byte in x is 0xFF or < 0x7F
                                   (x  & MI_BFIELD_HI_BIT8))   // high bit set if byte in x is >= 0x80
                                  >> 7;                        // shift high bit to low bit
     size_t idx;
-    if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit
+    if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit
       mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
       mi_assert_internal((idx%8)==0);
       const size_t byte_idx = idx/8;
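For illustration (not part of the commit): the `has_set8` expression is the classic SWAR "has zero byte" test applied to `~x`, since `~x` has a zero byte exactly where `x` has an all-ones byte. The lowest flagged byte is always a genuine 0xFF byte (a borrow can only produce a spurious flag above a real match), which is all the least-bit search needs. A small standalone check, with local constants standing in for MI_BFIELD_LO_BIT8/MI_BFIELD_HI_BIT8:

#include <stdio.h>
#include <stdint.h>

int main(void) {
  const uint64_t lo = 0x0101010101010101ULL;  // MI_BFIELD_LO_BIT8
  const uint64_t hi = 0x8080808080808080ULL;  // MI_BFIELD_HI_BIT8
  const uint64_t x  = 0x00FF3700FF120080ULL;  // bytes 3 and 6 are 0xFF
  const uint64_t has_set8 = ((~x - lo) & (x & hi)) >> 7;
  printf("%016llx\n", (unsigned long long)has_set8); // prints 0001000001000000: low bit set in bytes 3 and 6
  return 0;
}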
@@ -656,14 +663,58 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid
 }

+// find least bfield in a chunk with all bits set, and try unset it atomically
+// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
+// Used to find large size pages in the free blocks.
+// todo: try neon version
+static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pidx) {
+  #if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+  while (true) {
+    // since a cache-line is 64b, load all at once
+    const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
+    const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1);
+    const __m256i cmpv = mi_mm256_ones();
+    const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (bfield == ~0 ? -1 : 0)
+    const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (bfield == ~0 ? -1 : 0)
+    const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte
+    const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte
+    const uint64_t mask = ((uint64_t)mask2 << 32) | mask1;
+    // mask is inverted, so each 8-bits are set iff the corresponding elem64 has all bits set (and thus can be cleared)
+    if (mask==0) return false;
+    mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
+    const size_t chunk_idx = _tzcnt_u64(mask) / 8;
+    mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+    if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx])) {
+      *pidx = chunk_idx*MI_BFIELD_BITS;
+      mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS);
+      return true;
+    }
+    // try again
+  }
+  #else
+  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
+    if (~b==0 && mi_bfield_atomic_try_clearX(&chunk->bfields[i])) {
+      *pidx = i*MI_BFIELD_BITS;
+      mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS);
+      return true;
+    }
+  }
+  return false;
+  #endif
+}

 // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set,
 // and try to clear them atomically.
 // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
+// (We do not cross bfield boundaries)
 static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
   if (n == 0 || n > MI_BFIELD_BITS) return false;
   const mi_bfield_t mask = mi_bfield_mask(n, 0);
   for(int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-    mi_bfield_t b = chunk->bfields[i];
+    mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
     size_t bshift = 0;
     size_t idx;
     while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit
@@ -680,8 +731,9 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t*
         return true;
       }
       else {
-        // if failed to atomically commit, try again from this position
-        b = (chunk->bfields[i] >> bshift);
+        // if failed to atomically commit, reload b and try again from this position
+        bshift -= idx;
+        b = mi_atomic_load_relaxed(&chunk->bfields[i]) >> bshift;
       }
     }
     else {
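For illustration (not commit code): within one bfield the search for `n` consecutive set bits is a skip-and-test scan, find the next 1-bit, test whether the following `n` bits are all set, otherwise skip past the run. On a failed atomic claim the field is reloaded and `bshift` is rewound by `idx` (the fix above), since the next find-least-bit iteration will advance it again. A sketch of just the scanning part, using GCC/Clang builtins:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// find `n` consecutive 1-bits in a 64-bit field; set *pidx to the start index (sketch)
static bool find_run(uint64_t field, size_t n, size_t* pidx) {
  if (n == 0 || n > 64) return false;
  const uint64_t mask = (n == 64 ? ~(uint64_t)0 : (((uint64_t)1 << n) - 1));
  uint64_t b = field;
  size_t bshift = 0;
  while (b != 0) {
    const size_t idx = (size_t)__builtin_ctzll(b);   // skip to the next 1-bit
    b >>= idx;
    bshift += idx;
    if (bshift + n > 64) return false;               // no room left for a run of n
    if ((b & mask) == mask) {                        // n consecutive bits found
      *pidx = bshift;
      return true;
    }
    const size_t run = (size_t)__builtin_ctzll(~b);  // length of the (too short) run of 1s
    b >>= run;
    bshift += run;
  }
  return false;
}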
@@ -699,11 +751,11 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t*
 // find a sequence of `n` bits in a chunk with `n < MI_BCHUNK_BITS` with all bits set,
 // and try to clear them atomically.
 // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
+// This can cross bfield boundaries.
 static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
   if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
+  // if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx);

-  // we align an a field, and require `field_count` fields to be all clear.
+  // we align at a bfield, and scan `field_count` fields
   // n >= MI_BFIELD_BITS; find a first field that is 0
   const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields
   for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++)
@@ -740,14 +792,16 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t*
         return true;
       }
     }
+    // continue
   }
   return false;
 }

 static inline bool mi_bchunk_find_and_try_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
-  if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx);
-  if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx);
+  if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx);                // small pages
+  if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx);               // medium pages
+  if (n==MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearX(chunk, pidx);  // large pages
   if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
   if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx);
   return mi_bchunk_find_and_try_clearN_(chunk, n, pidx);