From 69ac69abac87b513674f79d1217aab00e2b6ccb8 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 00:31:08 -0800 Subject: [PATCH] wip: use epoch with 512bit chunks --- include/mimalloc/bits.h | 119 +++++--- include/mimalloc/types.h | 2 +- src/arena.c | 61 +++-- src/bitmap.c | 567 +++++++++++++++++++++++++++------------ src/bitmap.h | 65 ++++- src/free.c | 4 +- src/libc.c | 10 +- src/options.c | 2 +- 8 files changed, 574 insertions(+), 256 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 79034c2f..90d56b4f 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -145,20 +145,13 @@ typedef int32_t mi_ssize_t; size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); +uint32_t _mi_ctz_generic32(uint32_t x); static inline size_t mi_ctz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 uint64_t r; __asm volatile ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); return r; - #elif defined(__GNUC__) && MI_ARCH_ARM64 - uint64_t r; - __asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc"); - return r; - #elif defined(__GNUC__) && MI_ARCH_RISCV - size_t r; - __asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : ); - return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_tzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -168,6 +161,17 @@ static inline size_t mi_ctz(size_t x) { #else return (_BitScanForward64(&idx, x) ? (size_t)idx : 64); #endif + /* + // for arm64 and riscv, the builtin_ctz is defined for 0 as well + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + */ #elif mi_has_builtin_size(ctz) return (x!=0 ? (size_t)mi_builtin_size(ctz)(x) : MI_SIZE_BITS); #else @@ -177,18 +181,10 @@ static inline size_t mi_ctz(size_t x) { } static inline size_t mi_clz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 uint64_t r; __asm volatile ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); return r; - #elif defined(__GNUC__) && MI_ARCH_ARM64 - uint64_t r; - __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc"); - return r; - #elif defined(__GNUC__) && MI_ARCH_RISCV - size_t r; - __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : ); - return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -198,6 +194,17 @@ static inline size_t mi_clz(size_t x) { #else return (_BitScanReverse64(&idx, x) ? 63 - (size_t)idx : 64); #endif + /* + // for arm64 and riscv, the builtin_clz is defined for 0 as well + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + */ #elif mi_has_builtin_size(clz) return (x!=0 ? 
(size_t)mi_builtin_size(clz)(x) : MI_SIZE_BITS); #else @@ -206,6 +213,26 @@ static inline size_t mi_clz(size_t x) { #endif } +static inline uint32_t mi_ctz32(uint32_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 + uint32_t r; + __asm volatile ("tzcntl\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif MI_ARCH_X64 && defined(__BMI1__) + return (uint32_t)_tzcnt_u32(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + return (_BitScanForward(&idx, x) ? (uint32_t)idx : 32); + #elif mi_has_builtin(ctz) && (INT_MAX == INT32_MAX) + return (x!=0 ? (uint32_t)mi_builtin(ctz)(x) : 32); + #elif mi_has_builtin(ctzl) && (LONG_MAX == INT32_MAX) + return (x!=0 ? (uint32_t)mi_builtin(ctzl)(x) : 32); + #else + #define MI_HAS_FAST_BITSCAN 0 + return _mi_ctz_generic32(x); + #endif +} + #ifndef MI_HAS_FAST_BITSCAN #define MI_HAS_FAST_BITSCAN 1 #endif @@ -229,6 +256,22 @@ static inline bool mi_bsf(size_t x, size_t* idx) { #endif } +// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < 32`). +static inline bool mi_bsf32(uint32_t x, uint32_t* idx) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + // on x64 the carry flag is set on zero which gives better codegen + bool is_zero; + __asm ("tzcntl\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); + return !is_zero; + #else + *idx = mi_ctz32(x); + return (x!=0); + #endif +} + + // Bit scan reverse: find the most significant bit that is set // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -248,29 +291,6 @@ static inline bool mi_bsr(size_t x, size_t* idx) { } -/* -------------------------------------------------------------------------------- - find least/most significant bit position --------------------------------------------------------------------------------- */ - -// Find most significant bit index, or MI_SIZE_BITS if 0 -static inline size_t mi_find_msb(size_t x) { - #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - unsigned long i; - #if MI_SIZE_BITS==32 - return (_BitScanReverse(&i, x) ? i : 32); - #else - return (_BitScanReverse64(&i, x) ? i : 64); - #endif - #else - return (x==0 ? MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x)); - #endif -} - -// Find least significant bit index, or MI_SIZE_BITS if 0 (this equals `mi_ctz`, count trailing zero's) -static inline size_t mi_find_lsb(size_t x) { - return mi_ctz(x); -} - /* -------------------------------------------------------------------------------- rotate -------------------------------------------------------------------------------- */ @@ -288,13 +308,26 @@ static inline size_t mi_rotr(size_t x, size_t r) { return _rotr64(x,(int)r); #endif #else - // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to // avoid UB when `rshift==0`.
See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1)))); #endif } +static inline uint32_t mi_rotr32(uint32_t x, uint32_t r) { + #if mi_has_builtin(rotateright32) + return mi_builtin(rotateright32)(x, r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return _lrotr(x, (int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & 31; + return ((x >> rshift) | (x << ((-rshift) & 31))); + #endif +} + static inline size_t mi_rotl(size_t x, size_t r) { #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64) return mi_builtin(rotateleft64)(x,r); @@ -307,7 +340,7 @@ static inline size_t mi_rotl(size_t x, size_t r) { return _rotl64(x,(int)r); #endif #else - // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to // avoid UB when `rshift==0`. See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1)))); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 271c7efb..fe7e8227 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -120,7 +120,7 @@ terms of the MIT license. A copy of the license can be found in the file #endif #endif #ifndef MI_BITMAP_CHUNK_BITS_SHIFT -#define MI_BITMAP_CHUNK_BITS_SHIFT 8 // optimized for 256 bits per chunk (avx2) +#define MI_BITMAP_CHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) #endif #define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) diff --git a/src/arena.c b/src/arena.c index a713a110..cc2fe7b8 100644 --- a/src/arena.c +++ b/src/arena.c @@ -197,7 +197,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // set the dirty bits if (arena->memid.initially_zero) { - memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count, NULL); + memid->initially_zero = mi_bitmap_setN(&arena->slices_dirty, slice_index, slice_count, NULL); } // set commit state @@ -206,7 +206,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = true; bool all_already_committed; - mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count, &all_already_committed); + mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &all_already_committed); if (!all_already_committed) { bool commit_zero = false; if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { @@ -219,13 +219,13 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } else { // no need to commit, but check if already fully committed - memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count); + memid->initially_committed = mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); - if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count)); - // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); + 
mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); + if (commit) { mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); } + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + // mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); return p; } @@ -455,10 +455,10 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // found an abandoned page of the right size mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(!mi_page_is_full(page)); mi_assert_internal(mi_page_is_abandoned(page)); @@ -626,7 +626,7 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { size_t bin = _mi_bin(mi_page_block_size(page)); size_t slice_index; mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); - bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_abandoned[bin], slice_index, 1, NULL); + bool were_zero = mi_bitmap_setN(&arena->slices_abandoned[bin], slice_index, 1, NULL); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); } @@ -660,7 +660,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { // return false; // } const size_t bin = _mi_bin(page->block_size); - if (mi_bitmap_try_xsetN(MI_BIT_CLEAR, &arena->slices_abandoned[bin], slice_index, 1)) { + if (mi_bitmap_try_clear(&arena->slices_abandoned[bin], slice_index)) { // we got it atomically _mi_page_reclaim(heap, page); mi_assert_internal(!mi_page_is_abandoned(page)); @@ -668,7 +668,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { } else { if (mi_page_is_abandoned(page)) { - mi_assert(false); + // mi_assert(false); } } } @@ -748,7 +748,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi else { if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slice_index, slice_count, NULL); + mi_bitmap_clearN(&arena->slices_committed, slice_index, slice_count); mi_track_mem_noaccess(p, size); if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) @@ -764,7 +764,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } // and make it available to others again - bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_free, slice_index, slice_count, NULL); + bool all_inuse = 
mi_bitmap_setN(&arena->slices_free, slice_index, slice_count, NULL); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; } @@ -906,14 +906,14 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int } // reserve our meta info (and reserve slices outside the memory area) - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); + mi_bitmap_unsafe_setN(&arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_committed, 0, arena->slice_count); + mi_bitmap_unsafe_setN(&arena->slices_committed, 0, arena->slice_count); } else { - mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, 0, info_slices, NULL); + mi_bitmap_setN(&arena->slices_committed, 0, info_slices, NULL); } - mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, 0, info_slices, NULL); + mi_bitmap_setN(&arena->slices_dirty, 0, info_slices, NULL); return mi_arena_add(arena, arena_id, &_mi_stats_main); } @@ -973,10 +973,16 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ _mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; - for (int i = 0; i < MI_BFIELD_BITS && bit_count < slice_count; i++) { - char buf[MI_BITMAP_CHUNK_BITS + 32]; _mi_memzero(buf, sizeof(buf)); + for (int i = 0; i < MI_BITMAP_CHUNK_COUNT && bit_count < slice_count; i++) { + char buf[MI_BITMAP_CHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + if (j > 0 && (j % 4) == 0) { + buf[k++] = '\n'; + _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix); + buf[k++] = ' '; + buf[k++] = ' '; + } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; buf[k++] = ' '; } else { - _mi_memset(buf + k, ' ', MI_BFIELD_BITS); + _mi_memset(buf + k, 'o', MI_BFIELD_BITS); k += MI_BFIELD_BITS; } - bit_count += MI_BFIELD_BITS; + bit_count += MI_BFIELD_BITS; } - _mi_output_message("%s %s\n", prefix, buf); } _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count); @@ -1113,7 +1118,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, const size_t size = mi_size_of_slices(slices); void* const p = mi_arena_slice_start(arena, slice_index); bool needs_recommit; - if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slices)) { + if (mi_bitmap_is_setN(&arena->slices_committed, slice_index, slices)) { // all slices are committed, we can purge freely needs_recommit = _mi_os_purge(p, size, stats); } @@ -1128,11 +1133,11 @@ } // clear the purged slices - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slices, slice_index, NULL); + mi_bitmap_clearN(&arena->slices_purge, slice_index, slices); // update committed bitmap if (needs_recommit) { - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slices, slice_index, NULL); + mi_bitmap_clearN(&arena->slices_committed, slice_index, slices); } } diff --git a/src/bitmap.c index dd1afe75..d24a89be 100644 --- a/src/bitmap.c +++
b/src/bitmap.c @@ -44,85 +44,168 @@ static inline bool mi_bfield_find_least_to_xset(mi_bit_t set, mi_bfield_t x, siz return mi_bfield_find_least_bit((set ? ~x : x), idx); } -// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). -static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { +// Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 +static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[i], idx); -} - -static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { - mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); -} - -// Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0) -static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { +// Set/clear a sequence of `n` bits within a chunk. +// Returns true if all bits transitioned from 0 to 1 (or 1 to 0). +static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* pall_already_xset) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; @@ -164,17 +234,28 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); + all_transition = all_transition && mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset ); all_already_xset = all_already_xset && already_xset; // next field field++; idx = 0; n -= m; } - *palready_xset = all_already_xset; + if (pall_already_xset!=NULL) { *pall_already_xset = all_already_xset; } return all_transition; } + +static inline bool mi_bitmap_chunk_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_already_set) { + return mi_bitmap_chunk_xsetN(MI_BIT_SET, chunk, cidx, n, all_already_set); +} + +static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_already_clear) { + return mi_bitmap_chunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, all_already_clear); +} + + + // Check if a sequence of `n` bits within a chunk are all set/cleared.
static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); @@ -197,6 +278,38 @@ static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, siz return all_xset; } + + +static inline bool mi_bitmap_chunk_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx); +} + +static inline bool mi_bitmap_chunk_try_set(mi_bitmap_chunk_t* chunk, size_t cidx) { + return mi_bitmap_chunk_try_xset(MI_BIT_SET, chunk, cidx); +} + +static inline bool mi_bitmap_chunk_try_clear(mi_bitmap_chunk_t* chunk, size_t cidx) { + return mi_bitmap_chunk_try_xset(MI_BIT_CLEAR, chunk, cidx); +} + +static inline bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx) { + mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_xset8(set, &chunk->bfields[i], ibyte_idx); +} + +static inline bool mi_bitmap_chunk_try_set8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { + return mi_bitmap_chunk_try_xset8(MI_BIT_SET, chunk, byte_idx); +} + +static inline bool mi_bitmap_chunk_try_clear8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { + return mi_bitmap_chunk_try_xset8(MI_BIT_CLEAR, chunk, byte_idx); +} + // Try to atomically set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. @@ -252,12 +365,19 @@ restore: while( field > start_field) { field--; const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid)); - bool already_xset; - mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset); + mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, NULL); } return false; } +static inline bool mi_bitmap_chunk_try_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + return mi_bitmap_chunk_try_xsetN(MI_BIT_SET, chunk, cidx, n); +} + +static inline bool mi_bitmap_chunk_try_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + return mi_bitmap_chunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n); +} + // find least 0/1-bit in a chunk and try to set/clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. @@ -265,8 +385,8 @@ restore: static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while (true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 
0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) if (mask==0) return false; @@ -283,6 +403,46 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chu } // try again } +#elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) + while (true) { + size_t chunk_idx = 0; + #if 1 + __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + if ((set ? _mm256_test_all_ones(vec) : _mm256_testz_si256(vec,vec))) { + chunk_idx += 4; + vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1); + } + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 + chunk_idx += _tzcnt_u32(mask) / 8; + #else + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + const __m256i cmpv = (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256()); + const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) + const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. + chunk_idx = _tzcnt_u64(mask) / 8; + #endif + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear + if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } #else for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { size_t idx; @@ -302,49 +462,10 @@ static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, return mi_bitmap_chunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); } -// static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { -// return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); -// } - -/* -// find least 1-bit in a chunk and try unset it atomically -// set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success.
-// todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - while(true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ? - const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? -1 : 0) - const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) - mi_assert_internal(mask != 0); - const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - size_t cidx; - if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - // try again - } - #else - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically - *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - } - return false; - #endif +static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { + return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); } -*/ + // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. @@ -392,7 +513,8 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, } -// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically +// find a sequence of `n` bits in a chunk with all `n` (`< MI_BFIELD_BITS`!) bits set, +// and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. // todo: try avx2 and neon version // todo: allow spanning across bfield boundaries? 
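A minimal single-threaded sketch of the window search that mi_bitmap_chunk_find_and_try_clearN (next hunk) performs within one bfield; the helper name is illustrative, and the real code claims a found run atomically with a mask operation rather than just reporting it:

static bool sketch_find_setN_in_bfield(uint64_t b, size_t n, size_t* pidx) {
  const uint64_t mask = (((uint64_t)1 << n) - 1);    // n consecutive 1-bits (requires 0 < n < 64)
  for (size_t bshift = 0; bshift + n <= 64; bshift++) {
    if (((b >> bshift) & mask) == mask) {            // all n bits set at this offset?
      *pidx = bshift;                                // bit index of the run within the bfield
      return true;
    }
  }
  return false;
}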
@@ -410,7 +532,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, if ((b&mask) == mask) { // found a match mi_assert_internal( ((mask << bshift) >> bshift) == mask ); - if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i],mask<bfields); return _mm256_testz_si256( vec, vec ); + #elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + if (!_mm256_testz_si256(vec1, vec1)) return false; + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (_mm256_testz_si256(vec2, vec2)); #else - // written like this for vectorization - mi_bfield_t x = chunk->bfields[0]; - for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { - x = x | chunk->bfields[i]; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + if (chunk->bfields[i] != 0) return false; } - return (x == 0); + return true; #endif } +/* -------------------------------------------------------------------------------- + epochset (for now for 32-bit sets only) +-------------------------------------------------------------------------------- */ + +static void mi_epochset_split(mi_epochset_t es, uint32_t* bset, size_t* epoch) { + *bset = (uint32_t)es; + *epoch = (size_t)(es >> 32); +} + +static mi_epochset_t mi_epochset_join(uint32_t bset, size_t epoch) { + return ((uint64_t)epoch << 32) | bset; +} + +// setting a bit increases the epoch +static void mi_epochset_set(_Atomic(mi_epochset_t)*es, size_t idx) { + mi_assert(idx < 32); + size_t epoch; + uint32_t bset; + mi_epochset_t es_new; + mi_epochset_t es_old = mi_atomic_load_relaxed(es); + do { + mi_epochset_split(es_old, &bset, &epoch); + es_new = mi_epochset_join(bset | (MI_ZU(1)<<idx), epoch+1); + } while (!mi_atomic_cas_weak_acq_rel(es, &es_old, es_new)); +} + +// clearing a bit succeeds only if the epoch did not change (i.e. no concurrent set happened) +static bool mi_epochset_try_clear(_Atomic(mi_epochset_t)*es, size_t idx, size_t expected_epoch) { + mi_assert(idx < 32); + size_t epoch; + uint32_t bset; + mi_epochset_t es_new; + mi_epochset_t es_old = mi_atomic_load_relaxed(es); + do { + mi_epochset_split(es_old, &bset, &epoch); + if (epoch != expected_epoch) return false; + es_new = mi_epochset_join(bset & ~(MI_ZU(1)<<idx), epoch); // a clear does not increase the epoch + } while (!mi_atomic_cas_weak_acq_rel(es, &es_old, es_new)); + return true; +} + +static void mi_bitmap_anyset_set(mi_bitmap_t* bitmap, size_t chunk_idx) { + mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); + mi_epochset_set(&bitmap->any_set, chunk_idx); +} + +static bool mi_bitmap_anyset_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx, size_t epoch) { + mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); + return mi_epochset_try_clear(&bitmap->any_set, chunk_idx, epoch); +} + +static uint32_t mi_bitmap_anyset(mi_bitmap_t* bitmap, size_t* epoch) { + uint32_t bset; + mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, epoch); + return bset; +} + +static size_t mi_bitmap_epoch(mi_bitmap_t* bitmap) { + size_t epoch; + uint32_t bset; + mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, &epoch); + return epoch; +} + /* -------------------------------------------------------------------------------- bitmap -------------------------------------------------------------------------------- */ -static void mi_bitmap_update_anyset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - if (set) { - mi_bfield_atomic_xset(MI_BIT_SET, &bitmap->any_set, idx); - } - else { // clear - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR, &bitmap->any_set, idx); - } - } -} // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { @@ -485,8 +664,8 @@ } } -// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread.
+void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); @@ -495,19 +674,18 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; size_t m = MI_BITMAP_CHUNK_BITS - cidx; if (m > n) { m = n; } - bool already_xset; - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); - mi_bitmap_update_anyset(set, bitmap, chunk_idx); + mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); + mi_bitmap_anyset_set(bitmap, chunk_idx); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); + _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); const size_t end_chunk = chunk_idx + mid_chunks; while (chunk_idx < end_chunk) { - mi_bitmap_update_anyset(set, bitmap, chunk_idx); + mi_bitmap_anyset_set(bitmap, chunk_idx); chunk_idx++; } n -= (mid_chunks * MI_BITMAP_CHUNK_BITS); @@ -517,8 +695,8 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ if (n > 0) { mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); - mi_bitmap_update_anyset(set, bitmap, chunk_idx); + mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); + mi_bitmap_anyset_set(bitmap, chunk_idx); } } @@ -528,12 +706,26 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < MI_BITMAP_MAX_BITS); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - bool ok = mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); - if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } - return ok; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_try_set(&bitmap->chunks[chunk_idx], cidx); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_try_clear(&bitmap->chunks[chunk_idx], cidx); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + + + // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. 
bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { @@ -541,11 +733,23 @@ bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx%8 == 0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; - bool ok = mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); - if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } - return ok; + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! @@ -561,22 +765,32 @@ bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - - bool ok = mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); - if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } - return ok; + + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
-bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - bool local_already_xset; - if (already_xset==NULL) { already_xset = &local_already_xset; } - // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } - // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + + //if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } + //if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; @@ -584,11 +798,23 @@ bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bo mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - const bool allx = mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); - mi_bitmap_update_anyset(set, bitmap, chunk_idx); - return allx; + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); @@ -605,16 +831,18 @@ } -#define mi_bitmap_forall_set_chunks(bitmap,tseq,decl_chunk_idx) \ - { size_t _set_idx; \ - size_t _start = tseq % MI_BFIELD_BITS; \ - mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ - while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ - decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; +#define mi_bitmap_forall_set_chunks(bitmap,tseq,name_epoch,name_chunk_idx) \ + { uint32_t _bit_idx; \ + uint32_t _start = (uint32_t)(tseq % MI_EPOCHSET_BITS); \ + size_t name_epoch; \ + uint32_t _any_set = mi_bitmap_anyset(bitmap,&name_epoch); \ + _any_set = mi_rotr32(_any_set, _start); \ + while (mi_bsf32(_any_set,&_bit_idx)) { \ + size_t name_chunk_idx = (_bit_idx + _start) % MI_EPOCHSET_BITS; #define mi_bitmap_forall_set_chunks_end() \ - _start += _set_idx+1; /* so chunk_idx stays valid */ \ - _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ + _start += _bit_idx+1; /* so chunk_idx stays valid */ \ + _any_set >>= _bit_idx; /* skip scanned bits (and avoid UB with (_bit_idx+1)) */ \ _any_set >>= 1; \ } \ } @@ -623,8 +851,8 @@ // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention).
-bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_set_chunks(bitmap, tseq, epoch, chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { @@ -635,8 +863,8 @@ bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx else { // we may find that all are unset only on a second iteration but that is ok as // _any_set is a conservative approximation. - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } } } @@ -647,8 +875,8 @@ bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,tseq, epoch, chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { @@ -658,8 +886,10 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid return true; } else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. + if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } } } @@ -672,11 +902,8 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries - if (n == 0 || n > MI_BFIELD_BITS) return false; - if (n == 1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); - if (n == 8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); - - mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) + if (n == 0 || n > MI_BFIELD_BITS) return false; + mi_bitmap_forall_set_chunks(bitmap,tseq,epoch,chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { @@ -685,8 +912,10 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t return true; } else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. 
+ if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } } } diff --git a/src/bitmap.h index 1a180924..38137b0f 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -25,20 +25,26 @@ typedef size_t mi_bfield_t; #define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. #define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. +#define MI_BITMAP_CHUNK_SIZE (MI_BITMAP_CHUNK_BITS / 8) #define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) #define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) -typedef mi_decl_align(32) struct mi_bitmap_chunk_s { +// 512 bits on 64-bit +typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; } mi_bitmap_chunk_t; +// for now 32 (note: with ABA instructions we can make this 64) +#define MI_EPOCHSET_BITS (32) +#define MI_BITMAP_CHUNK_COUNT MI_EPOCHSET_BITS +typedef uint64_t mi_epochset_t; -typedef mi_decl_align(32) struct mi_bitmap_s { - mi_bitmap_chunk_t chunks[MI_BFIELD_BITS]; - _Atomic(mi_bfield_t)any_set; +typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { + mi_bitmap_chunk_t chunks[MI_BITMAP_CHUNK_COUNT]; + _Atomic(mi_epochset_t) any_set; } mi_bitmap_t; -#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit +#define MI_BITMAP_MAX_BITS (MI_BITMAP_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit /* -------------------------------------------------------------------------------- Bitmap -------------------------------------------------------------------------------- */ @@ -52,29 +58,73 @@ typedef bool mi_bit_t; void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); // Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. -void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! // If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset); +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset); + +static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_set) { + return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, all_already_set); +} + +static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_xsetN(MI_BIT_CLEAR, bitmap, idx, n, NULL); +} + // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); +} + +static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n); +} + + // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) // and false otherwise leaving the bitmask as is.
mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); +} + +static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); +} + + // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); +} + +static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); +} + // Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_SET, bitmap, idx, n); +} + +static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); +} + + // Find a set bit in a bitmap and atomically unset it. Returns true on success, // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search @@ -89,4 +139,5 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); + #endif // MI_XBITMAP_H diff --git a/src/free.c b/src/free.c index f0ce8c22..1e9fe478 100644 --- a/src/free.c +++ b/src/free.c @@ -239,9 +239,9 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block } else { if (mi_page_is_abandoned(page)) { - mi_assert(false); + // mi_assert(false); } - mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages + // mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages } } } diff --git a/src/libc.c b/src/libc.c index 05ed7b02..20e9e38b 100644 --- a/src/libc.c +++ b/src/libc.c @@ -280,7 +280,7 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) 
{ // generic trailing and leading zero count // -------------------------------------------------------- -static inline size_t mi_ctz_generic32(uint32_t x) { +uint32_t _mi_ctz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, @@ -290,7 +290,7 @@ static inline size_t mi_ctz_generic32(uint32_t x) { return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; } -static inline size_t mi_clz_generic32(uint32_t x) { +static size_t mi_clz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, @@ -319,10 +319,10 @@ size_t _mi_clz_generic(size_t x) { size_t _mi_ctz_generic(size_t x) { if (x==0) return MI_SIZE_BITS; #if (MI_SIZE_BITS <= 32) - return mi_ctz_generic32((uint32_t)x); + return _mi_ctz_generic32((uint32_t)x); #else - const size_t count = mi_ctz_generic32((uint32_t)x); + const size_t count = _mi_ctz_generic32((uint32_t)x); if (count < 32) return count; - return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + return (32 + _mi_ctz_generic32((uint32_t)(x>>32))); #endif } diff --git a/src/options.c b/src/options.c index 8cb0d216..1e64c08e 100644 --- a/src/options.c +++ b/src/options.c @@ -412,7 +412,7 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me // Define our own limited `fprintf` that avoids memory allocation. // We do this using `_mi_vsnprintf` with a limited buffer. static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { - char buf[512]; + char buf[768]; if (fmt==NULL) return; if (!mi_recurse_enter()) return; _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args);
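To summarize the scheme this patch introduces: the bitmap's any_set word packs a 32-bit "chunk may have set bits" map (low half) with a 32-bit epoch (high half) that every set increments; a clear commits only if the epoch is unchanged, so a racing set can never be wiped out. A self-contained sketch of that protocol, using standard C11 atomics in place of mimalloc's mi_atomic_* wrappers (the names below are illustrative, not the patch's API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t epochset_t;   // low 32 bits: "any set" map, high 32 bits: epoch

static epochset_t es_join(uint32_t bset, uint32_t epoch) {
  return ((uint64_t)epoch << 32) | bset;
}

// setting a bit also bumps the epoch, invalidating any in-flight clear
static void es_set(_Atomic(epochset_t)* es, unsigned idx) {
  epochset_t old = atomic_load_explicit(es, memory_order_relaxed);
  epochset_t desired;
  do {
    desired = es_join((uint32_t)old | (1u << idx), (uint32_t)(old >> 32) + 1);
  } while (!atomic_compare_exchange_weak(es, &old, desired));
}

// clearing succeeds only if no set happened since expected_epoch was observed
static bool es_try_clear(_Atomic(epochset_t)* es, unsigned idx, uint32_t expected_epoch) {
  epochset_t old = atomic_load_explicit(es, memory_order_relaxed);
  while ((uint32_t)(old >> 32) == expected_epoch) {
    epochset_t desired = es_join((uint32_t)old & ~(1u << idx), expected_epoch);
    if (atomic_compare_exchange_weak(es, &old, desired)) return true;  // committed
  }
  return false;  // a concurrent set bumped the epoch; the chunk may be non-empty again
}

This mirrors how mi_bitmap_try_find_and_clear uses the real helpers: it snapshots the epoch together with the anyset bits via mi_bitmap_anyset, scans the corresponding chunks, and calls mi_bitmap_anyset_try_clear only when a chunk still appears all-clear under that same epoch.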