Fix AVX2 bug with atomics

This commit is contained in:
daanx 2024-12-13 19:59:08 -08:00
parent 216c04f8d9
commit b5dfd233e9
3 changed files with 29 additions and 40 deletions

View file

@ -117,8 +117,8 @@ if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo")
if (NOT MI_OPT_ARCH)
message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)")
endif()
else()
set(MI_OPT_ARCH OFF)
#else()
# set(MI_OPT_ARCH OFF)
endif()
if(MI_OVERRIDE)

View file

@ -143,20 +143,9 @@ static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_cle
return (~old==0);
}
// ------- mi_bfield_atomic_try_set/clear ---------------------------------------
// ------- mi_bfield_atomic_try_clear ---------------------------------------
// Tries to set a mask atomically, and returns true if the mask bits atomically transitioned from 0 to mask
// and false otherwise (leaving the bit field as is).
// `mask` must be non-zero; the set bits of `mask` need not be contiguous.
static inline bool mi_bfield_atomic_try_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) {
mi_assert_internal(mask != 0);
mi_bfield_t old = mi_atomic_load_relaxed(b);
do {
if ((old&mask) != 0) return false; // the mask bits are no longer 0
} while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits; on CAS failure `old` is reloaded with the current value (compare-exchange semantics — presumably, given `&old`; confirm against the mi_atomic macro)
return true;
}
// Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0
// and false otherwise (leaving the bit field as is).
static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) {
@ -493,12 +482,14 @@ static inline bool mi_mm256_is_zero( __m256i vec) {
static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) {
mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
size_t cidx;
// note: this must be acquire (and not relaxed), or otherwise the AVX code below can loop forever
// as the compiler won't reload the registers vec1 and vec2 from memory again.
const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]);
size_t idx;
if (!allow_allset && (~b == 0)) return false;
if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit
if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically
*pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit
if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically
*pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
mi_assert_internal(*pidx < MI_BCHUNK_BITS);
return true;
}
@ -522,6 +513,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
const size_t chunk_idx = _tzcnt_u32(mask) / 8;
if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true;
// try again
// note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
}
#elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
while (true) {
@ -556,6 +548,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
#endif
if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true;
// try again
// note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
}
#else
// try first to find a field that is not all set (to reduce fragmentation)
@ -620,7 +613,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
const size_t idx = (bidx % 8)*8;
mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically
*pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx;
*pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
return true;
}
@ -665,7 +658,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk,
mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
const size_t chunk_idx = _tzcnt_u64(mask) / 8;
mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx])) {
if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx],NULL)) {
*pidx = chunk_idx*MI_BFIELD_BITS;
mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS);
return true;
@ -804,13 +797,6 @@ static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) {
// ------- mi_bitmap_all_are_clear ---------------------------------------
// Are all bits in a bitmap chunk clear? (this uses guaranteed atomic reads)
// Scans each field with a relaxed atomic load; bails out at the first non-zero field.
static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) {
  int field = 0;
  while (field < MI_BCHUNK_FIELDS) {
    const mi_bfield_t bits = mi_atomic_load_relaxed(&chunk->bfields[field]);
    if (bits != 0) { return false; }
    field++;
  }
  return true;
}
// are all bits in a bitmap chunk clear?
static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
@ -823,7 +809,10 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2)));
#else
return mi_bchunk_all_are_clear(chunk);
for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false;
}
return true;
#endif
}

View file

@ -353,7 +353,7 @@ int main(int argc, char** argv) {
mi_debug_show_arenas(true,false,false);
#else
//mi_collect(true);
mi_debug_show_arenas(true,false,false);
//mi_debug_show_arenas(true,false,false);
// mi_stats_print(NULL);
#endif
#else