mirror of https://github.com/microsoft/mimalloc.git

flexible clearN_ that can start at any index

commit 4aeb2e1005 (parent b5dfd233e9)

1 changed file with 64 additions and 31 deletions
src/bitmap.c (95 changed lines)
@@ -26,6 +26,10 @@ static inline size_t mi_bfield_ctz(mi_bfield_t x) {
   return mi_ctz(x);
 }
 
+static inline size_t mi_bfield_clz(mi_bfield_t x) {
+  return mi_clz(x);
+}
+
 static inline size_t mi_bfield_popcount(mi_bfield_t x) {
   return mi_popcount(x);
 }
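The new `mi_bfield_clz` complements the existing `mi_bfield_ctz`: the reworked scan further down applies them to the complement of a bfield, `mi_bfield_clz(~b)` and `mi_bfield_ctz(~b)`, to count how many consecutive set bits sit at the top or the bottom of a field. A minimal sketch of that complement trick, using the GCC/Clang builtins as stand-ins for `mi_clz`/`mi_ctz`; the helper names and example values are illustrative, not mimalloc code:

    #include <stdint.h>
    #include <stdio.h>

    // number of consecutive 1-bits at the most significant end of x:
    // clz(~x) counts them, but clz is undefined for an argument of 0, so guard the all-ones case.
    static unsigned leading_ones64(uint64_t x)  { return (~x == 0) ? 64u : (unsigned)__builtin_clzll(~x); }
    // number of consecutive 1-bits at the least significant end of x.
    static unsigned trailing_ones64(uint64_t x) { return (~x == 0) ? 64u : (unsigned)__builtin_ctzll(~x); }

    int main(void) {
      uint64_t b = 0xFF000000000000F0ULL;   // top byte all ones, bits 4..7 set at the bottom
      printf("leading ones:  %u\n", leading_ones64(b));    // 8
      printf("trailing ones: %u\n", trailing_ones64(b));   // 0 (bit 0 is clear)
      return 0;
    }

The guard for an all-ones field only avoids the undefined `clz(0)`/`ctz(0)` case of the raw builtins; how mimalloc's own wrappers handle that case is up to their platform-specific definitions.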
@@ -41,6 +45,15 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) {
   return mi_bsf(x,idx);
 }
 
+
+// find the most significant bit that is set.
+// return false if `x==0` (with `*idx` undefined) and true otherwise,
+// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
+static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) {
+  return mi_bsr(x, idx);
+}
+
+
 // find each set bit in a bit field `x` and clear it, until it becomes zero.
 static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) {
   const bool found = mi_bfield_find_least_bit(*x, idx);
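`mi_bfield_find_highest_bit` is the bit-scan-reverse counterpart of `mi_bfield_find_least_bit`. A portable sketch with the same contract (return false for zero, otherwise write the index of the most significant set bit), assuming a 64-bit bfield and using a builtin as a stand-in for `mi_bsr` rather than mimalloc's actual implementation:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    // false if x==0; otherwise *idx is the index of the highest set bit (0 <= *idx < 64).
    static inline bool find_highest_bit64(uint64_t x, size_t* idx) {
      if (x == 0) return false;
      *idx = 63 - (size_t)__builtin_clzll(x);   // builtin as a stand-in for mi_bsr
      return true;
    }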
@@ -598,9 +611,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
 #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   while (true) {
     // since a cache-line is 64b, load all at once
     const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
     const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1);
     const __m256i cmpv = mi_mm256_ones();
     const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0)
     const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 0xFF : 0)
     const uint32_t mask1 = _mm256_movemask_epi8(vcmp1);  // mask of most significant bit of each byte
@@ -610,7 +623,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
     if (mask==0) return false;
     const size_t bidx = _tzcnt_u64(mask);          // byte-idx of the byte in the chunk
     const size_t chunk_idx = bidx / 8;
     const size_t idx = (bidx % 8)*8;
     mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
     if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) {  // clear it atomically
       *pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
@@ -618,6 +631,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
       return true;
     }
     // try again
+    // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
   }
 #else
   // first skip allset fields to reduce fragmentation
@@ -664,6 +678,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk,
       return true;
     }
     // try again
+    // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
   }
 #else
   for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
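The note added to both SIMD paths documents why the retry loop is safe to spin on: the `_mm256_load_si256` loads are plain loads, so it is the atomic try-clear inside the loop that provides the release/acquire point which keeps the compiler and CPU from reusing stale register contents on the next iteration. The sketch below shows the same loop shape with C11 atomics instead of AVX2; `try_claim_lowest_bit` is a hypothetical helper for illustration, not part of mimalloc's API:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    // claim (clear) the lowest set bit of a shared field, retrying on contention.
    static bool try_claim_lowest_bit(_Atomic(uint64_t)* field, unsigned* bit_idx) {
      while (true) {
        uint64_t cur = atomic_load_explicit(field, memory_order_relaxed);
        if (cur == 0) return false;              // nothing left to claim
        uint64_t lsb = cur & (~cur + 1);         // isolate the lowest set bit
        // the compare-exchange is the release/acquire point of the loop:
        // on failure it re-reads `field`, so the next iteration cannot reuse a stale value.
        if (atomic_compare_exchange_weak_explicit(field, &cur, cur & ~lsb,
                                                  memory_order_acq_rel,
                                                  memory_order_acquire)) {
          *bit_idx = (unsigned)__builtin_ctzll(lsb);
          return true;
        }
      }
    }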
@@ -684,7 +699,8 @@ static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n,
 }
 
 // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set,
 // and try to clear them atomically.
+// Currently does not cross bfield boundaries.
 // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
 // (We do not cross bfield boundaries)
 mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
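The comment added here states that `mi_bchunk_try_find_and_clearNX` stays within a single bfield. One common way to locate `n` consecutive set bits inside one 64-bit word is to repeatedly AND the word with a shifted copy of itself; the sketch below illustrates that idea only, and is not necessarily the mask strategy bitmap.c itself uses:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    // find the lowest index at which `n` consecutive set bits start inside `b` (1 <= n <= 64).
    static bool find_set_run(uint64_t b, size_t n, size_t* idx) {
      uint64_t m = b;
      for (size_t k = 1; k < n && m != 0; k++) {
        m &= (m >> 1);                      // bit i stays set iff bits i..i+k of b are all set
      }
      if (m == 0) return false;
      *idx = (size_t)__builtin_ctzll(m);    // lowest qualifying start position
      return true;
    }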
@@ -732,35 +748,51 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk,
 static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
   if (n == 0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
 
-  // we align at a bfield, and scan `field_count` fields
-  // n >= MI_BFIELD_BITS; find a first field that is 0
-  const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS);  // we need this many fields
-  for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++)
+  const size_t skip_count = n/MI_BFIELD_BITS;
+  size_t cidx;
+  for (size_t i = 0; i <= MI_BCHUNK_FIELDS - skip_count; i++)
   {
-    // first pre-scan for a range of fields that are all set (up to the last one)
-    bool allset = true;
-    size_t j = 0;
-    size_t m = n;
-    do {
-      mi_assert_internal(i + j < MI_BCHUNK_FIELDS);
-      mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i+j]);
-      size_t idx;
-      if (mi_bfield_find_least_bit(~b,&idx)) {
-        if (m > idx) {
-          allset = false;
-          i += j;  // no need to look again at the previous fields
-          break;
-        }
+    size_t j = 1;  // field count from i
+    size_t m = n;  // bits to go
+
+    // first field
+    mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
+    size_t ones = mi_bfield_clz(~b);
+    cidx = i*MI_BFIELD_BITS + (MI_BFIELD_BITS - ones);  // start index
+    if (ones >= m) {
+      // we found enough bits!
+      m = 0;
+    }
+    else {
+      m -= ones;
+      mi_assert_internal(m>0);
+    }
+
+    // keep scanning further fields?
+    while (i+j < MI_BCHUNK_FIELDS) {
+      mi_assert_internal(m > 0);
+      b = mi_atomic_load_relaxed(&chunk->bfields[i+j]);
+      ones = mi_bfield_ctz(~b);
+      if (ones >= m) {
+        // we found enough bits
+        m = 0;
+        break;
+      }
+      else if (ones == MI_BFIELD_BITS) {
+        // not enough yet, proceed to the next field
+        j++;
+        m -= MI_BFIELD_BITS;
       }
       else {
-        // all bits in b were set
-        m -= MI_BFIELD_BITS;  // note: can underflow
+        // the range was not enough, start from scratch
+        i = i + j - 1;  // no need to re-scan previous fields, except the last one (with clz this time)
+        mi_assert_internal(m>0);
+        break;
       }
-    } while (++j < field_count);
+    }
 
-    // if all set, we can try to atomically clear them
-    if (allset) {
-      const size_t cidx = i*MI_BFIELD_BITS;
+    // did we find a range?
+    if (m==0) {
       if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) {
         // we cleared all atomically
         *pidx = cidx;
@@ -768,8 +800,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk,
         mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS);
         return true;
       }
+      // note: if we fail for a small `n` on the first field, we don't rescan that field (as `i` is incremented)
     }
-    // continue
+    // otherwise continue searching
   }
   return false;
 }
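Stripped of the atomics and the chunk/bfield machinery, the reworked `mi_bchunk_try_find_and_clearN_` scans for a run of `n` set bits that may start at any bit index and cross bfield boundaries: the leading ones of the first candidate field (via `clz` of the complement) fix the start index, and the trailing ones of the following fields (via `ctz` of the complement) extend the run. The sketch below mirrors that shape over a plain array of 64-bit words; the names, the simplified loop bound, and the omission of the `skip_count` and `i = i + j - 1` skip-ahead optimizations are assumptions of this illustration, not mimalloc code:

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>
    #include <stdio.h>

    #define FIELD_BITS 64

    static size_t leading_ones(uint64_t b)  { return (~b == 0) ? (size_t)FIELD_BITS : (size_t)__builtin_clzll(~b); }
    static size_t trailing_ones(uint64_t b) { return (~b == 0) ? (size_t)FIELD_BITS : (size_t)__builtin_ctzll(~b); }

    // find `n` consecutive set bits in `fields[0..field_count)`; the run may start at any
    // bit index and cross field boundaries. On success set *pidx to the starting bit index.
    static bool find_run(const uint64_t* fields, size_t field_count, size_t n, size_t* pidx) {
      if (n == 0 || n > field_count*FIELD_BITS) return false;
      for (size_t i = 0; i < field_count; i++) {
        size_t m = n;                                        // bits still needed
        // the leading ones of fields[i] determine where a candidate run can start
        size_t ones = leading_ones(fields[i]);
        size_t cidx = i*FIELD_BITS + (FIELD_BITS - ones);    // candidate start index
        if (ones >= m) { m = 0; }
        else           { m -= ones; }
        // extend the candidate with the trailing ones of the following fields
        for (size_t j = 1; m > 0 && i + j < field_count; j++) {
          ones = trailing_ones(fields[i + j]);
          if (ones >= m) { m = 0; break; }                   // found enough bits
          if (ones < FIELD_BITS) break;                      // a 0-bit interrupts the run: try the next start
          m -= FIELD_BITS;                                   // a fully set field, keep going
        }
        if (m == 0) { *pidx = cidx; return true; }
      }
      return false;
    }

    int main(void) {
      // the top 4 bits of fields[0] and the low 8 bits of fields[1] form a run of
      // 12 set bits starting at bit index 60
      uint64_t fields[2] = { 0xF000000000000000ULL, 0x00000000000000FFULL };
      size_t idx;
      if (find_run(fields, 2, 12, &idx)) {
        printf("found a run of 12 set bits at bit index %zu\n", idx);  // prints 60
      }
      return 0;
    }

Like the new code, this only considers runs that begin in the leading-ones region of a field; for runs longer than a full bfield that is sufficient, since such a run must include the top bits of some field, and the note added in the last hunk points at the small-`n` limitation.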