From d5ed0cc71ef02b5ab986fa7ffc06b4c6e65dd622 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 14:31:43 -0800 Subject: [PATCH] various improvements --- include/mimalloc/atomic.h | 3 + include/mimalloc/bits.h | 15 ++- include/mimalloc/types.h | 6 +- src/arena.c | 52 ++++++--- src/bitmap.c | 238 +++++++++++++++++++++----------------- src/bitmap.h | 20 +++- src/free.c | 7 +- src/init.c | 2 +- src/os.c | 13 +-- src/random.c | 19 ++- 10 files changed, 223 insertions(+), 152 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index caa90cf8..3b0ff559 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -74,8 +74,11 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release)) #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel)) + +#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) +#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 3afac04d..e47d8a76 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -229,7 +229,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { unsigned long i; return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false); #else - return (x!=0 ? (*idx = mi_ctz(x), true) : false); + return (x!=0 ? (*idx = mi_ctz(x), true) : false); #endif } @@ -289,5 +289,18 @@ static inline size_t mi_rotl(size_t x, size_t r) { #endif } +static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) { + #if mi_has_builtin(rotateleft32) + return mi_builtin(rotateleft32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return _lrotl(x, (int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & 31; + return ((x << rshift) | (x >> ((-rshift) & 31))); + #endif +} + #endif // MI_BITS_H diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d507ca69..71edb397 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -334,9 +334,9 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~16KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // ~2MiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index ab74b988..24835f42 100644 --- a/src/arena.c +++ b/src/arena.c @@ -29,7 +29,8 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo ----------------------------------------------------------- */ #define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) - +#define MI_ARENA_MIN_SIZE (MI_BCHUNK_BITS * MI_ARENA_SLICE_SIZE) // 32 MiB (or 8 MiB on 32-bit) +#define MI_ARENA_MAX_SIZE (MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE) // A memory arena descriptor typedef struct mi_arena_s { @@ -105,7 +106,7 @@ size_t mi_arena_get_count(void) { mi_arena_t* mi_arena_from_index(size_t idx) { mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); + return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]); } mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { @@ -235,6 +236,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } } } + if (memid->initially_zero) { + mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE); + } + else { + mi_track_mem_undefined(p, slice_count * MI_ARENA_SLICE_SIZE); + } } else { // no need to commit, but check if already fully committed @@ -253,7 +260,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // try to reserve a fresh arena space static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { - if (_mi_preloading()) return false; // use OS only while pre loading + // if (_mi_preloading()) return false; // use OS only while pre loading if (req_arena_id != _mi_arena_id_none()) return false; const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); @@ -269,8 +276,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 1 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + // scale up the arena sizes exponentially every 4 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -278,8 +285,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = 8 * MI_ARENA_SLICE_SIZE; // hope that fits minimal bitmaps? 
- const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB + const size_t min_reserve = MI_ARENA_MIN_SIZE; + const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; } @@ -294,7 +301,17 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); + // and try to reserve the arena + int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + if (err != 0) { + // failed, try a smaller size? + const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); + if (arena_reserve > small_arena_reserve) { + // try again + err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + } + } + return (err==0); } @@ -317,12 +334,12 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are #define mi_forall_arenas(req_arena_id, tseq, name_arena) \ { \ - const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \ + const size_t _arena_count = mi_arena_get_count(); \ if (_arena_count > 0) { \ const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \ size_t _start; \ if (req_arena_id == _mi_arena_id_none()) { \ - /* always start searching in an arena 1 below the max */ \ + /* always start searching in the arena's below the max */ \ _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \ } \ else { \ @@ -333,10 +350,10 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are size_t _idx; \ if (_i < _arena_cycle) { \ _idx = _i + _start; \ - if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \ + if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate through the cycle */ \ } \ else { \ - _idx = _i; \ + _idx = _i; /* remaining arena's */ \ } \ mi_arena_t* const name_arena = mi_arena_from_index(_idx); \ if (name_arena != NULL) \ @@ -397,6 +414,9 @@ again: // did we need a specific arena? if (req_arena_id != _mi_arena_id_none()) return NULL; + // don't create arena's while preloading (todo: or should we?) + if (_mi_preloading()) return NULL; + // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; @@ -917,7 +937,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); size_t new_max_arena = 0; for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); @@ -949,7 +969,7 @@ void _mi_arena_unsafe_destroy_all(void) { // Is a pointer inside any of our arenas? 
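For reference, the reservation policy in mi_arena_reserve above (scale the request up exponentially every 4 arenas, keep the unscaled size on overflow, and retry once with a smaller fixed size if the large reservation fails) is easier to follow outside the diff. Below is a minimal standalone sketch under hypothetical names; os_reserve is a stub standing in for mi_reserve_os_memory_ex and the constants only mirror the values used in this patch.

#include <stdbool.h>
#include <stddef.h>

#define MiB ((size_t)1 << 20)
#define GiB ((size_t)1 << 30)

static int os_reserve(size_t size) { (void)size; return 0; }  // stub for mi_reserve_os_memory_ex

static size_t clampz(size_t x, size_t lo, size_t hi) {
  return (x < lo ? lo : (x > hi ? hi : x));
}

static int reserve_arena(size_t base_reserve, size_t arena_count) {
  // double the reservation every 4 existing arenas, capping the exponent at 16
  const size_t multiplier = (size_t)1 << clampz(arena_count/4, 0, 16);
  size_t reserve = base_reserve;
  if (base_reserve <= ((size_t)-1)/multiplier) {   // only scale when it cannot overflow
    reserve = multiplier * base_reserve;
  }
  int err = os_reserve(reserve);
  if (err != 0) {
    // the big reservation failed; retry once with a modest fixed size
    const size_t small_reserve = (sizeof(void*) == 4 ? 128*MiB : 1*GiB);
    if (reserve > small_reserve) { err = os_reserve(small_reserve); }
  }
  return err;
}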
bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { @@ -1175,7 +1195,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { MI_UNUSED(show_abandoned); - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; @@ -1331,7 +1351,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); if (max_arena == 0) return; // _mi_error_message(EFAULT, "purging not yet implemented\n"); diff --git a/src/bitmap.c b/src/bitmap.c index 45a82ba3..2f563066 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -14,6 +14,8 @@ Concurrent bitmap that can set/reset sequences of bits atomically #include "mimalloc/bits.h" #include "bitmap.h" +#define MI_USE_SIMD 0 + /* -------------------------------------------------------------------------------- bfields -------------------------------------------------------------------------------- */ @@ -34,9 +36,9 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } -static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { - return mi_rotr(x,r); -} +//static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { +// return mi_rotr(x,r); +//} static inline mi_bfield_t mi_bfield_zero(void) { return 0; @@ -456,7 +458,7 @@ static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t // ------- mi_bchunk_try_find_and_clear --------------------------------------- -#if defined(__AVX2__) +#if MI_USE_SIMD && defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { return _mm256_setzero_si256(); } @@ -471,12 +473,27 @@ static inline bool mi_mm256_is_zero( __m256i vec) { } #endif +static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) { + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); + size_t cidx; + if (!allow_allset && (~b == 0)) return false; + if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + return true; + } + } + return false; +} + // Find least 1-bit in a chunk and try to clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // This is used to find free slices and abandoned pages and should be efficient. 
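As an aside, mi_bchunk_try_find_and_clear_at above (together with mi_bfield_atomic_try_clear) is an instance of a common lock-free pattern: find a candidate set bit, claim it with an atomic read-modify-write, and retry if another thread won the race. A minimal sketch of that pattern with C11 atomics and a gcc/clang builtin; this is an illustration of the idea, not mimalloc's actual helper.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool try_find_and_clear_bit(_Atomic(uint64_t)* field, size_t* bit_idx) {
  uint64_t x = atomic_load_explicit(field, memory_order_relaxed);
  while (x != 0) {
    const size_t idx = (size_t)__builtin_ctzll(x);         // least significant set bit
    const uint64_t mask = (uint64_t)1 << idx;
    // atomically clear just that bit; the returned old value tells us whether we owned it
    if ((atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel) & mask) != 0) {
      *bit_idx = idx;
      return true;
    }
    x = atomic_load_explicit(field, memory_order_relaxed);  // lost the race, look again
  }
  return false;
}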
// todo: try neon version static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) @@ -485,19 +502,10 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again } - #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; #if 0 @@ -528,42 +536,50 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. chunk_idx = _tzcnt_u64(mask) / 8; #endif - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the bit-idx that is clear - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again } #else + // try first to find a field that is not all set (to reduce fragmentation) for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); - size_t idx; - if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically - *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true; + } + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true; } return false; #endif } +static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); + if (!allow_all_set && (~b == 0)) return false; + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = + ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + const size_t 
byte_idx = idx/8; + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // unset the byte atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); + return true; + } + } + return false; +} // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find medium size pages in the free blocks. // todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -588,24 +604,12 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // try again } #else + // first skip allset fields to reduce fragmentation for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]); - // has_set8 has low bit in each byte set if the byte in x == 0xFF - const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F - (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 - >> 7; // shift high bit to low bit - size_t idx; - if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit - mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); - mi_assert_internal((idx%8)==0); - const size_t byte_idx = idx/8; - if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[i],byte_idx,NULL)) { // unset the byte atomically - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); - return true; - } - // else continue - } + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; + } + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true; } return false; #endif @@ -618,7 +622,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // Used to find large size pages in the free blocks. 
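The has_set8 expression used in mi_bchunk_try_find_and_clear8_at above is the classic SWAR "haszero" test, (v - LO) & ~v & HI, applied to v = ~x so that it flags 0xFF bytes of x. Borrows can only produce spurious flags above the lowest genuine match, so taking the least-significant flagged byte is exact. A small self-contained illustration, assuming the gcc/clang __builtin_ctzll builtin:

#include <stdint.h>
#include <stdio.h>

#define LO_BIT8 0x0101010101010101ULL
#define HI_BIT8 0x8080808080808080ULL

// Return the index (0..7) of the least-significant 0xFF byte of x, or -1 if there is none.
static int lowest_all_set_byte(uint64_t x) {
  // the lowest 0xFF byte always gets its high bit set; higher flags may be false positives
  const uint64_t flags = (~x - LO_BIT8) & x & HI_BIT8;
  if (flags == 0) return -1;   // no byte of x is 0xFF
  return (int)(__builtin_ctzll(flags) / 8);
}

int main(void) {
  printf("%d\n", lowest_all_set_byte(0x00FF12FF0080FF7FULL));  // byte 1 is the lowest 0xFF byte: prints 1
  return 0;
}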
// todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -747,14 +751,14 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, } -static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages - if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages - if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); - return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); -} +//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +// if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages +// if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages +// if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages +// if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk +// if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); +// return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); +//} // ------- mi_bchunk_clear_once_set --------------------------------------- @@ -779,10 +783,10 @@ static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { // are all bits in a bitmap chunk clear? 
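A note on the clearX ("whole bfield") search above: per 64-bit field the idea boils down to a single compare-and-swap from all-ones to zero, claiming every bit at once. A minimal sketch with C11 atomics (hypothetical helper, not the mimalloc API):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// Claim all 64 bits of a field at once: succeeds only if every bit is still set.
static bool try_claim_full_field(_Atomic(uint64_t)* field) {
  uint64_t expected = UINT64_MAX;
  return atomic_compare_exchange_strong_explicit(field, &expected, (uint64_t)0,
                                                 memory_order_acq_rel, memory_order_relaxed);
}

On failure, expected is updated with the field's current contents, which a caller could inspect before falling back to one of the finer-grained searches.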
static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return mi_mm256_is_zero(vec); - #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); @@ -796,9 +800,17 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { bitmap chunkmap -------------------------------------------------------------------------------- */ +static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) { + size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); + if mi_unlikely(chunk_idx > oldmax) { + mi_atomic_cas_strong_relaxed(&bitmap->chunk_max_accessed, &oldmax, chunk_idx); + } +} + static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); } static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { @@ -813,11 +825,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } - // record the max clear - size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); - do { - if mi_likely(chunk_idx <= oldmax) break; - } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); + mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); return true; } @@ -894,6 +902,9 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); } + + // reset max_accessed + mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0); } @@ -1027,31 +1038,27 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \ - const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS ); \ - const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\ - const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ - const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ + const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \ + const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? 
*/ \ + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \ + const size_t chunkmap_max_acc = _mi_divide_up(chunk_max_acc,MI_BFIELD_BITS); \ + const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ - for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ + for (size_t _i = 0; _i < chunkmap_max; _i++) { \ size_t i; \ - if (_i < chunkmap_hi_bfield) { \ - i = _i + chunkmap_start; /* first the chunks up to chunk_hi */ \ - if (i >= chunkmap_hi_bfield) { i -= chunkmap_hi_bfield; } /* rotate */ \ + if (_i < chunkmap_max_acc) { /* first the chunks up to chunk_max_accessed */ \ + i = _i + chunkmap_start; \ + if (i >= chunkmap_max_acc) { i -= chunkmap_max_acc; } /* rotate */ \ } \ - else { i = _i; } /* the rest of the chunks above chunk_hi_idx */ \ + else { i = _i; } /* the rest of the chunks above chunk_max_accessed */ \ const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ - size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0 && chunkmap_start_idx > 0) { \ - cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \ - cmap_idx_shift = chunkmap_start_idx; \ - } \ + /* todo: space out threads within a chunkmap (2GiB) as well? */ \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ size_t cmap_idx; \ while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ /* set the chunk idx */ \ size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ - mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \ /* try to find and clear N bits in that chunk */ \ { @@ -1064,28 +1071,45 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } \ }} -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS) -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) -{ - // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); - mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) - { - size_t cidx; - if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); - return true; - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - } - } - mi_bitmap_forall_chunks_end(); - return false; + +#define mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NSUF, NPAR) { \ + mi_bitmap_forall_chunks(bitmap, tseq, _chunk_idx) { \ + size_t _cidx; \ + if mi_likely(mi_bchunk_try_find_and_clear##NSUF(&bitmap->chunks[_chunk_idx] NPAR, &_cidx)) { \ + *pidx = (_chunk_idx * MI_BCHUNK_BITS) + _cidx; \ + return true; \ + } \ + else { \ + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ \ + mi_bitmap_chunkmap_try_clear(bitmap, _chunk_idx); \ + } \ + } \ + mi_bitmap_forall_chunks_end(); \ + return false; \ +} + +#define COMMA , + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, , ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, 8, ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, X, ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NX, COMMA n); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, N_, COMMA n); } diff --git a/src/bitmap.h b/src/bitmap.h index 40c4df42..b26791cc 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -91,8 +91,8 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT @@ -172,9 +172,23 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n } +// Specialized versions for common bit sequence sizes +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS + // Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
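The mi_bitmap_forall_chunks_try_find_and_clear macro above stamps out the five mi_bitmap_try_find_and_clear* functions by pasting a name suffix (NSUF) and forwarding an optional extra argument (NPAR), with `#define COMMA ,` smuggling a comma through the macro expansion. A tiny self-contained example of the same preprocessor technique, using hypothetical names:

#include <stdio.h>

#define COMMA ,
#define MAKE_CALLER(NSUF, NPAR) \
  static int call##NSUF(int x) { return work##NSUF(x NPAR); }

static int work(int x)         { return x + 1; }
static int workN(int x, int n) { return x + n; }

MAKE_CALLER(, )           // defines call(x)  -> work(x)
MAKE_CALLER(N, COMMA 10)  // defines callN(x) -> workN(x, 10)

int main(void) {
  printf("%d %d\n", call(1), callN(1));  // prints "2 11"
  return 0;
}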
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); +mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { + if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages + if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); + return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); +} // Called once a bit is cleared to see if the memory slice can be claimed. diff --git a/src/free.c b/src/free.c index d45507e7..0da0332e 100644 --- a/src/free.c +++ b/src/free.c @@ -217,8 +217,11 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 2. if the page is not too full, we can try to reclaim it for ourselves + // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && - !mi_page_is_used_at_frac(page,8)) + !mi_page_is_used_at_frac(page,4) + // && !mi_page_is_abandoned_mapped(page) + ) { // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again @@ -247,7 +250,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!mi_page_is_used_at_frac(page, 4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!mi_page_is_used_at_frac(page,4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arena_page_try_reabandon_to_mapped(page)) { diff --git a/src/init.c b/src/init.c index 2396f594..2070405d 100644 --- a/src/init.c +++ b/src/init.c @@ -96,7 +96,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -#define MI_MEMID_STATIC {{{0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } +#define MI_MEMID_STATIC {{{NULL,0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, diff --git a/src/os.c b/src/os.c index b913fb1c..55f7428e 100644 --- a/src/os.c +++ b/src/os.c @@ -203,10 +203,9 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); - // try a direct allocation if the alignment is below the default, or if larger than 1/64 fraction of the size (to avoid waste). - const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/64); + // try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size. 
+  const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8);
-
-  // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD)
   void* p = NULL;
   if (try_direct_alloc) {
     p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
@@ -233,8 +232,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
     if (p == NULL) return NULL;
     // set p to the aligned part in the full region
-    // note: this is dangerous on Windows as VirtualFree needs the actual base pointer
-    // this is handled though by having the `base` field in the memid's
+    // note: on Windows VirtualFree needs the actual base pointer
+    // this is handled by having the `base` field in the memid.
     *base = p; // remember the base
     p = _mi_align_up_ptr(p, alignment);
@@ -361,7 +360,7 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size,
   if (newsize != NULL) *newsize = 0;
   if (size == 0 || addr == NULL) return NULL;
-  // page align conservatively within the range
+  // page align conservatively within the range, or liberally straddling pages outside the range
   void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size())
     : mi_align_down_ptr(addr, _mi_os_page_size()));
   void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
@@ -472,7 +471,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset)
     return needs_recommit;
   }
   else {
-    if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed
+    if (allow_reset) { // this is sometimes not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory)
       _mi_os_reset(p, size);
     }
     return false; // needs no recommit
diff --git a/src/random.c b/src/random.c
index 4fc8b2f8..35e2718a 100644
--- a/src/random.c
+++ b/src/random.c
@@ -7,7 +7,6 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc.h"
 #include "mimalloc/internal.h"
 #include "mimalloc/prim.h"    // _mi_prim_random_buf
-#include <string.h>           // memset

 /* ----------------------------------------------------------------------------
 We use our own PRNG to keep predictable performance of random number generation
@@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil
 (gcc x64 has no register spills, and clang 6+ uses SSE instructions)
 -----------------------------------------------------------------------------*/

-static inline uint32_t rotl(uint32_t x, uint32_t shift) {
-  return (x << shift) | (x >> (32 - shift));
-}
-
 static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16);
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12);
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8);
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7);
 }

 static void chacha_block(mi_random_ctx_t* ctx)
@@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
   // since we only use chacha for randomness (and not encryption) we
   // do not _need_ to read 32-bit values as little endian but we do anyways
   // just for being compatible :-)
-  memset(ctx, 0, sizeof(*ctx));
+  _mi_memzero(ctx, sizeof(*ctx));
   for (size_t i = 0; i < 4; i++) {
     const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
     ctx->input[i] = read32(sigma,i);
   }
@@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
 }

 static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
-  memset(ctx_new, 0, sizeof(*ctx_new));
+  _mi_memzero(ctx_new, sizeof(*ctx_new));
   _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
   ctx_new->input[12] = 0;
   ctx_new->input[13] = 0;
@@ -160,7 +155,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim
 uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
   uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random
-  x ^= _mi_prim_clock_now();
+  x ^= _mi_prim_clock_now();  // and do a few randomization steps
   uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
   for (uintptr_t i = 0; i < max; i++) {