various improvements

daanx 2024-12-09 14:31:43 -08:00
parent 68ac94c1ba
commit d5ed0cc71e
10 changed files with 223 additions and 152 deletions

View file

@@ -74,8 +74,11 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
 #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
 #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
 #define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
 #define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
 #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
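
The new `mi_atomic_cas_weak_relaxed`/`mi_atomic_cas_strong_relaxed` variants pass `relaxed` for both the success and the failure ordering. As a rough illustration (assuming the C11 `<stdatomic.h>` backend; the helper name below is made up for the example, not a mimalloc symbol):

```c
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

// Sketch only: roughly what a "weak CAS, relaxed/relaxed" helper expands to
// when mi_atomic(...) maps onto C11 atomics (MSVC uses a different backend).
static inline bool cas_weak_relaxed(_Atomic(size_t)* p, size_t* expected, size_t desired) {
  return atomic_compare_exchange_weak_explicit(p, expected, desired,
                                               memory_order_relaxed,   // ordering on success
                                               memory_order_relaxed);  // ordering on failure
}
```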

View file

@@ -229,7 +229,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) {
   unsigned long i;
   return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false);
 #else
   return (x!=0 ? (*idx = mi_ctz(x), true) : false);
 #endif
 }
@@ -289,5 +289,18 @@ static inline size_t mi_rotl(size_t x, size_t r) {
 #endif
 }
+
+static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) {
+#if mi_has_builtin(rotateleft32)
+  return mi_builtin(rotateleft32)(x,r);
+#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+  return _lrotl(x, (int)r);
+#else
+  // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
+  // avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
+  const unsigned int rshift = (unsigned int)(r) & 31;
+  return ((x << rshift) | (x >> ((-rshift) & 31)));
+#endif
+}
+
 #endif // MI_BITS_H
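
The pure-C fallback is the standard rotate idiom that avoids an undefined shift by 32 when the rotate count is zero. A small standalone check (illustrative, not part of the commit):

```c
#include <stdint.h>
#include <stdio.h>
#include <assert.h>

// Portable rotate-left; `(-r) & 31` equals `32 - r` for r in 1..31 but yields 0
// when r == 0, so the right shift is never by 32 (which would be UB in C).
static uint32_t rotl32(uint32_t x, uint32_t r) {
  r &= 31;
  return (x << r) | (x >> ((-r) & 31));
}

int main(void) {
  assert(rotl32(0x80000001u, 1) == 0x00000003u);
  assert(rotl32(0xDEADBEEFu, 0) == 0xDEADBEEFu);  // no UB for a zero shift
  printf("ok\n");
  return 0;
}
```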

View file

@@ -334,9 +334,9 @@ typedef struct mi_page_s {
 // The max object size are checked to not waste more than 12.5% internally over the page sizes.
 // (Except for large pages since huge objects are allocated in 4MiB chunks)
-#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4)   // ~16KiB
-#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB
-#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2)   // ~2MiB
+#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8)   // < 8 KiB
+#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB
+#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2)   // < 2 MiB
 #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE)

View file

@@ -29,7 +29,8 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo
 ----------------------------------------------------------- */
 #define MI_ARENA_BIN_COUNT (MI_BIN_COUNT)
+#define MI_ARENA_MIN_SIZE (MI_BCHUNK_BITS * MI_ARENA_SLICE_SIZE)          // 32 MiB (or 8 MiB on 32-bit)
+#define MI_ARENA_MAX_SIZE (MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE)
 // A memory arena descriptor
 typedef struct mi_arena_s {
@@ -105,7 +106,7 @@ size_t mi_arena_get_count(void) {
 mi_arena_t* mi_arena_from_index(size_t idx) {
   mi_assert_internal(idx < mi_arena_get_count());
-  return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]);
+  return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]);
 }
 mi_arena_t* mi_arena_from_id(mi_arena_id_t id) {
@@ -235,6 +236,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
       }
     }
   }
+  if (memid->initially_zero) {
+    mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE);
+  }
+  else {
+    mi_track_mem_undefined(p, slice_count * MI_ARENA_SLICE_SIZE);
+  }
 }
 else {
   // no need to commit, but check if already fully committed
@@ -253,7 +260,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
 // try to reserve a fresh arena space
 static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id)
 {
-  if (_mi_preloading()) return false; // use OS only while pre loading
+  // if (_mi_preloading()) return false; // use OS only while pre loading
   if (req_arena_id != _mi_arena_id_none()) return false;
   const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
@@ -269,8 +276,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
   arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE);
   if (arena_count >= 1 && arena_count <= 128) {
-    // scale up the arena sizes exponentially every 8 entries
-    const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16);
+    // scale up the arena sizes exponentially every 4 entries
+    const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16);
     size_t reserve = 0;
     if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) {
       arena_reserve = reserve;
@@ -278,8 +285,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
   }
   // check arena bounds
-  const size_t min_reserve = 8 * MI_ARENA_SLICE_SIZE; // hope that fits minimal bitmaps?
-  const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB
+  const size_t min_reserve = MI_ARENA_MIN_SIZE;
+  const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB
   if (arena_reserve < min_reserve) {
     arena_reserve = min_reserve;
   }
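
With the scaling step changed from every 8 to every 4 arenas, the reserve size now doubles twice as often (still capped at a 2^16 multiplier and by MI_ARENA_MAX_SIZE). A small illustration of how the multiplier grows; this is a standalone sketch, not mimalloc code, with `_mi_clamp` modeled by a local helper:

```c
#include <stdio.h>
#include <stddef.h>

// Illustration only: the reservation multiplier when scaling "every 4 entries".
static size_t clamp(size_t v, size_t lo, size_t hi) {
  return (v < lo ? lo : (v > hi ? hi : v));
}

int main(void) {
  for (size_t arena_count = 1; arena_count <= 20; arena_count++) {
    const size_t multiplier = (size_t)1 << clamp(arena_count/4, 0, 16);
    printf("arena_count=%2zu -> multiplier=%zu\n", arena_count, multiplier);
  }
  // counts 1-3 give 1x, 4-7 give 2x, 8-11 give 4x, ... capped at 65536x
  return 0;
}
```
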
@@ -294,7 +301,17 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
   if (mi_option_get(mi_option_arena_eager_commit) == 2)      { arena_commit = _mi_os_has_overcommit(); }
   else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
-  return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0);
+  // and try to reserve the arena
+  int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
+  if (err != 0) {
+    // failed, try a smaller size?
+    const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB);
+    if (arena_reserve > small_arena_reserve) {
+      // try again
+      err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
+    }
+  }
+  return (err==0);
 }
@@ -317,12 +334,12 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are
 #define mi_forall_arenas(req_arena_id, tseq, name_arena) \
   { \
-  const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \
+  const size_t _arena_count = mi_arena_get_count(); \
   if (_arena_count > 0) { \
     const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \
     size_t _start; \
     if (req_arena_id == _mi_arena_id_none()) { \
-      /* always start searching in an arena 1 below the max */ \
+      /* always start searching in the arena's below the max */ \
       _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \
     } \
     else { \
@@ -333,10 +350,10 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are
     size_t _idx; \
     if (_i < _arena_cycle) { \
       _idx = _i + _start; \
-      if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \
+      if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate through the cycle */ \
     } \
     else { \
-      _idx = _i; \
+      _idx = _i; /* remaining arena's */ \
     } \
     mi_arena_t* const name_arena = mi_arena_from_index(_idx); \
     if (name_arena != NULL) \
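
The rotation in `mi_forall_arenas` visits the first `_arena_cycle` arenas round-robin, starting at a thread-dependent offset, and then the remaining arenas in plain order. A tiny standalone model of that index walk (names are illustrative, not mimalloc code):

```c
#include <stdio.h>
#include <stddef.h>

// Visit `count` slots: the first `cycle` slots round-robin starting at `start`,
// then the rest in order -- mirroring the index rotation in mi_forall_arenas.
static void visit(size_t count, size_t cycle, size_t start) {
  for (size_t i = 0; i < count; i++) {
    size_t idx;
    if (i < cycle) {
      idx = i + start;
      if (idx >= cycle) idx -= cycle;   // rotate within the cycle
    }
    else {
      idx = i;                          // remaining slots above the cycle
    }
    printf("%zu ", idx);
  }
  printf("\n");
}

int main(void) {
  visit(6, 5, 2);  // prints: 2 3 4 0 1 5
  return 0;
}
```
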
@@ -397,6 +414,9 @@ again:
   // did we need a specific arena?
   if (req_arena_id != _mi_arena_id_none()) return NULL;
+  // don't create arena's while preloading (todo: or should we?)
+  if (_mi_preloading()) return NULL;
+
   // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?)
   if (mi_lock_try_acquire(&mi_arena_reserve_lock)) {
     mi_arena_id_t arena_id = 0;
@@ -917,7 +937,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
 // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
 // for dynamic libraries that are unloaded and need to release all their allocated memory.
 static void mi_arenas_unsafe_destroy(void) {
-  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  const size_t max_arena = mi_arena_get_count();
   size_t new_max_arena = 0;
   for (size_t i = 0; i < max_arena; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
@@ -949,7 +969,7 @@ void _mi_arena_unsafe_destroy_all(void) {
 // Is a pointer inside any of our arenas?
 bool _mi_arena_contains(const void* p) {
-  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  const size_t max_arena = mi_arena_get_count();
   for (size_t i = 0; i < max_arena; i++) {
     mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
     if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) {
@@ -1175,7 +1195,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi
 void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept {
   MI_UNUSED(show_abandoned);
-  size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count);
+  size_t max_arenas = mi_arena_get_count();
   size_t free_total = 0;
   size_t slice_total = 0;
   //size_t abandoned_total = 0;
@@ -1331,7 +1351,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_
 static void mi_arenas_try_purge(bool force, bool visit_all) {
   if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled
-  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  const size_t max_arena = mi_arena_get_count();
   if (max_arena == 0) return;
   // _mi_error_message(EFAULT, "purging not yet implemented\n");

View file

@@ -14,6 +14,8 @@ Concurrent bitmap that can set/reset sequences of bits atomically
 #include "mimalloc/bits.h"
 #include "bitmap.h"
+#define MI_USE_SIMD 0
+
 /* --------------------------------------------------------------------------------
   bfields
 -------------------------------------------------------------------------------- */
@@ -34,9 +36,9 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) {
   return mi_bsf(x,idx);
 }
-static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) {
-  return mi_rotr(x,r);
-}
+//static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) {
+//  return mi_rotr(x,r);
+//}
 static inline mi_bfield_t mi_bfield_zero(void) {
   return 0;
@@ -456,7 +458,7 @@ static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t
 // ------- mi_bchunk_try_find_and_clear ---------------------------------------
-#if defined(__AVX2__)
+#if MI_USE_SIMD && defined(__AVX2__)
 static inline __m256i mi_mm256_zero(void) {
   return _mm256_setzero_si256();
 }
@@ -471,12 +473,27 @@ static inline bool mi_mm256_is_zero( __m256i vec) {
 }
 #endif
+
+static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) {
+  mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+  const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
+  size_t cidx;
+  if (!allow_allset && (~b == 0)) return false;
+  if (mi_bfield_find_least_bit(b, &cidx)) {           // find the least bit
+    if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) {  // clear it atomically
+      *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
+      mi_assert_internal(*pidx < MI_BCHUNK_BITS);
+      return true;
+    }
+  }
+  return false;
+}
+
 // Find least 1-bit in a chunk and try to clear it atomically
 // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
 // This is used to find free slices and abandoned pages and should be efficient.
 // todo: try neon version
 static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
   while (true) {
     const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
     const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0)
@@ -485,19 +502,10 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
     if (mask==0) return false;
     mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24
     const size_t chunk_idx = _tzcnt_u32(mask) / 8;
-    mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
-    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
-    size_t cidx;
-    if (mi_bfield_find_least_bit(b, &cidx)) {           // find the least bit
-      if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) {  // clear it atomically
-        *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
-        mi_assert_internal(*pidx < MI_BCHUNK_BITS);
-        return true;
-      }
-    }
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
     // try again
   }
-#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   while (true) {
     size_t chunk_idx = 0;
 #if 0
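
The AVX2 fast paths (currently disabled by `MI_USE_SIMD 0`) locate the first non-zero 64-bit lane with a compare/movemask/tzcnt sequence. A standalone sketch of the same idiom, not taken from the commit (compile with `-mavx2 -mbmi`):

```c
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

// Compare all 64-bit lanes against zero, collapse the result into a byte mask,
// and use the trailing-zero count to find the first non-zero lane (0..3, or -1).
static int first_nonzero_lane(__m256i v) {
  const __m256i vcmp = _mm256_cmpeq_epi64(v, _mm256_setzero_si256()); // 0xFF..FF where a lane is zero
  const uint32_t mask = ~(uint32_t)_mm256_movemask_epi8(vcmp);        // 8 bits set per non-zero lane
  if (mask == 0) return -1;
  return (int)(_tzcnt_u32(mask) / 8);  // tzcnt is 0, 8, 16, or 24
}

int main(void) {
  const __m256i v = _mm256_set_epi64x(7, 0, 5, 0);  // lanes (low to high): 0, 5, 0, 7
  printf("%d\n", first_nonzero_lane(v));            // prints 1
  return 0;
}
```
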
@@ -528,42 +536,50 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
     mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
     chunk_idx = _tzcnt_u64(mask) / 8;
 #endif
-    mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
-    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
-    size_t cidx;
-    if (mi_bfield_find_least_bit(b, &cidx)) {           // find the bit-idx that is clear
-      if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) {  // clear it atomically
-        *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
-        mi_assert_internal(*pidx < MI_BCHUNK_BITS);
-        return true;
-      }
-    }
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
     // try again
   }
 #else
+  // try first to find a field that is not all set (to reduce fragmentation)
   for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-    const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
-    size_t idx;
-    if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit
-      if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically
-        *pidx = (i*MI_BFIELD_BITS + idx);
-        mi_assert_internal(*pidx < MI_BCHUNK_BITS);
-        return true;
-      }
-    }
+    if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true;
+  }
+  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true;
   }
   return false;
 #endif
 }
+
+static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) {
+  const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
+  if (!allow_all_set && (~b == 0)) return false;
+  // has_set8 has low bit in each byte set if the byte in x == 0xFF
+  const mi_bfield_t has_set8 =
+    ((~b - MI_BFIELD_LO_BIT8) &      // high bit set if byte in x is 0xFF or < 0x7F
+     (b  & MI_BFIELD_HI_BIT8))       // high bit set if byte in x is >= 0x80
+     >> 7;                           // shift high bit to low bit
+  size_t idx;
+  if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit
+    mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
+    mi_assert_internal((idx%8)==0);
+    const size_t byte_idx = idx/8;
+    if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) {  // unset the byte atomically
+      *pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
+      mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
+      return true;
+    }
+  }
+  return false;
+}
+
 // find least byte in a chunk with all bits set, and try unset it atomically
 // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
 // Used to find medium size pages in the free blocks.
 // todo: try neon version
 static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   while (true) {
     // since a cache-line is 64b, load all at once
     const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
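
The `has_set8` expression in the new helper is the classic SWAR trick: complement the word and apply the usual "has-zero-byte" test, which exactly flags the bytes that are all ones. A small standalone check (illustrative constants mirroring the `MI_BFIELD_LO_BIT8`/`MI_BFIELD_HI_BIT8` masks, 64-bit fields assumed):

```c
#include <stdint.h>
#include <stdio.h>

#define LO_BIT8 UINT64_C(0x0101010101010101)  // low bit of each byte
#define HI_BIT8 UINT64_C(0x8080808080808080)  // high bit of each byte

// Returns a mask with the low bit of byte k set iff byte k of x equals 0xFF.
static uint64_t has_set8(uint64_t x) {
  return (((~x - LO_BIT8) & (x & HI_BIT8)) >> 7);
}

int main(void) {
  const uint64_t x = 0x00FF12FF345678FF;  // bytes 0, 4 and 6 are 0xFF
  printf("%016llx\n", (unsigned long long)has_set8(x));  // prints 0001000100000001
  return 0;
}
```
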
@@ -588,24 +604,12 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
     // try again
   }
 #else
+  // first skip allset fields to reduce fragmentation
   for(int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-    const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]);
-    // has_set8 has low bit in each byte set if the byte in x == 0xFF
-    const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) &      // high bit set if byte in x is 0xFF or < 0x7F
-                                  (x  & MI_BFIELD_HI_BIT8))       // high bit set if byte in x is >= 0x80
-                                  >> 7;                           // shift high bit to low bit
-    size_t idx;
-    if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit
-      mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
-      mi_assert_internal((idx%8)==0);
-      const size_t byte_idx = idx/8;
-      if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[i],byte_idx,NULL)) {  // unset the byte atomically
-        *pidx = (i*MI_BFIELD_BITS) + idx;
-        mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
-        return true;
-      }
-      // else continue
-    }
+    if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true;
+  }
+  for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+    if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true;
   }
   return false;
 #endif
@@ -618,7 +622,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
 // Used to find large size pages in the free blocks.
 // todo: try neon version
 static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   while (true) {
     // since a cache-line is 64b, load all at once
     const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
@@ -747,14 +751,14 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk,
 }
-static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
-  if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx);           // small pages
-  if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx);          // medium pages
-  if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx);  // large pages
-  if (n == 0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
-  if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx);
-  return mi_bchunk_try_find_and_clearN_(chunk, n, pidx);
-}
+//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+//  if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx);           // small pages
+//  if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx);          // medium pages
+//  if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx);  // large pages
+//  if (n == 0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
+//  if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx);
+//  return mi_bchunk_try_find_and_clearN_(chunk, n, pidx);
+//}
 
 // ------- mi_bchunk_clear_once_set ---------------------------------------
@@ -779,10 +783,10 @@ static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) {
 // are all bits in a bitmap chunk clear?
 static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
   const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
   return mi_mm256_is_zero(vec);
-#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
   // a 64b cache-line contains the entire chunk anyway so load both at once
   const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
   const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
@@ -796,9 +800,17 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
   bitmap chunkmap
 -------------------------------------------------------------------------------- */
+
+static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) {
+  size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed);
+  if mi_unlikely(chunk_idx > oldmax) {
+    mi_atomic_cas_strong_relaxed(&bitmap->chunk_max_accessed, &oldmax, chunk_idx);
+  }
+}
+
 static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) {
   mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap));
   mi_bchunk_set(&bitmap->chunkmap, chunk_idx);
+  mi_bitmap_chunkmap_set_max(bitmap, chunk_idx);
 }
 
 static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) {
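
The new `mi_bitmap_chunkmap_set_max` replaces the old retry loop with a single best-effort CAS using the freshly added relaxed variant. In plain C11 atomics the pattern looks roughly like this (sketch only; losing a race merely leaves a slightly stale hint, which is fine since the value is only used to bound the search):

```c
#include <stdatomic.h>
#include <stddef.h>

// Best-effort "bump the maximum": one relaxed strong CAS, no retry loop.
static void set_max_hint(_Atomic(size_t)* max, size_t value) {
  size_t oldmax = atomic_load_explicit(max, memory_order_relaxed);
  if (value > oldmax) {
    atomic_compare_exchange_strong_explicit(max, &oldmax, value,
                                            memory_order_relaxed, memory_order_relaxed);
  }
}
```
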
@@ -813,11 +825,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx)
     mi_bchunk_set(&bitmap->chunkmap, chunk_idx);
     return false;
   }
-  // record the max clear
-  size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
-  do {
-    if mi_likely(chunk_idx <= oldmax) break;
-  } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));
+  mi_bitmap_chunkmap_set_max(bitmap, chunk_idx);
   return true;
 }
@@ -894,6 +902,9 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
     mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL);
     mi_bitmap_chunkmap_set(bitmap, chunk_idx);
   }
+
+  // reset max_accessed
+  mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0);
 }
@@ -1027,31 +1038,27 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
   { \
   /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \
   MI_UNUSED(tseq); \
-  const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \
-  const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS ); \
-  const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\
-  const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \
-  const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \
+  const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \
+  const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? */ \
+  const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \
+  const size_t chunkmap_max_acc = _mi_divide_up(chunk_max_acc,MI_BFIELD_BITS); \
+  const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \
  /* for each chunkmap entry `i` */ \
-  for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \
+  for (size_t _i = 0; _i < chunkmap_max; _i++) { \
     size_t i; \
-    if (_i < chunkmap_hi_bfield) { \
-      i = _i + chunkmap_start; /* first the chunks up to chunk_hi */ \
-      if (i >= chunkmap_hi_bfield) { i -= chunkmap_hi_bfield; } /* rotate */ \
+    if (_i < chunkmap_max_acc) { /* first the chunks up to chunk_max_accessed */ \
+      i = _i + chunkmap_start; \
+      if (i >= chunkmap_max_acc) { i -= chunkmap_max_acc; } /* rotate */ \
     } \
-    else { i = _i; } /* the rest of the chunks above chunk_hi_idx */ \
+    else { i = _i; } /* the rest of the chunks above chunk_max_accessed */ \
     const size_t chunk_idx0 = i*MI_BFIELD_BITS; \
     mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \
-    size_t cmap_idx_shift = 0; /* shift through the cmap */ \
-    if (_i == 0 && chunkmap_start_idx > 0) { \
-      cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \
-      cmap_idx_shift = chunkmap_start_idx; \
-    } \
+    /* todo: space out threads within a chunkmap (2GiB) as well? */ \
+    size_t cmap_idx_shift = 0; /* shift through the cmap */ \
    size_t cmap_idx; \
    while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \
      /* set the chunk idx */ \
      size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \
+      mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \
      /* try to find and clear N bits in that chunk */ \
      {
@@ -1064,28 +1071,45 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
   } \
   }}
 
-// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all.
-// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
-// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS)
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx)
-{
-  // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
-  mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx)
-  {
-    size_t cidx;
-    if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) {
-      *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
-      mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap));
-      return true;
-    }
-    else {
-      // we may find that all are cleared only on a second iteration but that is ok as
-      // the chunkmap is a conservative approximation.
-      mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx);
-    }
-  }
-  mi_bitmap_forall_chunks_end();
-  return false;
-}
+#define mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NSUF, NPAR) { \
+  mi_bitmap_forall_chunks(bitmap, tseq, _chunk_idx) { \
+    size_t _cidx; \
+    if mi_likely(mi_bchunk_try_find_and_clear##NSUF(&bitmap->chunks[_chunk_idx] NPAR, &_cidx)) { \
+      *pidx = (_chunk_idx * MI_BCHUNK_BITS) + _cidx; \
+      return true; \
+    } \
+    else { \
+      /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ \
+      mi_bitmap_chunkmap_try_clear(bitmap, _chunk_idx); \
+    } \
+  } \
+  mi_bitmap_forall_chunks_end(); \
+  return false; \
+}
+
+#define COMMA ,
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) {
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, , );
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) {
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, 8, );
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) {
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, X, );
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) {
+  mi_assert_internal(n<=MI_BFIELD_BITS);
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NX, COMMA n);
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) {
+  mi_assert_internal(n<=MI_BCHUNK_BITS);
+  mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, N_, COMMA n);
+}
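
The `NSUF`/`NPAR` parameters use token pasting to select the per-chunk helper, and `COMMA` is the usual trick for smuggling an extra argument through a macro (a bare `,` would be parsed as an argument separator). A self-contained illustration of the same technique with hypothetical names:

```c
#include <stdio.h>

// Paste a suffix onto a function name and optionally pass one extra argument.
#define COMMA ,

static int find1(int x)        { return x + 1; }
static int findN(int x, int n) { return x + n; }

#define DISPATCH(x, SUF, PAR)  find##SUF(x PAR)

int main(void) {
  printf("%d\n", DISPATCH(10, 1, ));         // expands to find1(10)
  printf("%d\n", DISPATCH(10, N, COMMA 5));  // expands to findN(10, 5)
  return 0;
}
```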

View file

@@ -91,8 +91,8 @@ typedef mi_bchunk_t mi_bchunkmap_t;
 // An atomic bitmap
 typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s {
   _Atomic(size_t) chunk_count;        // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
-  _Atomic(size_t) chunk_max_clear;    // max chunk index that was once cleared
+  _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set
   size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc
   mi_bchunkmap_t chunkmap;
   mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
@@ -172,9 +172,23 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n
 }
 
+// Specialized versions for common bit sequence sizes
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx);  // 1-bit
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS
+
 // Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all.
 // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx);
+mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) {
+  if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx);           // small pages
+  if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx);          // medium pages
+  if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx);  // large pages
+  if (n == 0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
+  if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx);
+  return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx);
+}
 
 // Called once a bit is cleared to see if the memory slice can be claimed.

View file

@@ -217,8 +217,11 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) {
   }
 
   // 2. if the page is not too full, we can try to reclaim it for ourselves
+  // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit.
   if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 &&
-      !mi_page_is_used_at_frac(page,8))
+      !mi_page_is_used_at_frac(page,4)
+      // && !mi_page_is_abandoned_mapped(page)
+     )
   {
     // the page has still some blocks in use (but not too many)
     // reclaim in our heap if compatible, or otherwise abandon again
@@ -247,7 +250,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) {
   }
 
   // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations
-  if (!mi_page_is_used_at_frac(page, 4) &&  // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page
+  if (!mi_page_is_used_at_frac(page,4) &&   // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page
       !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA &&
       _mi_arena_page_try_reabandon_to_mapped(page))
   {

View file

@@ -96,7 +96,7 @@ const mi_page_t _mi_page_empty = {
 // may lead to allocation itself on some platforms)
 // --------------------------------------------------------
-#define MI_MEMID_STATIC {{{0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC }
+#define MI_MEMID_STATIC {{{NULL,0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC }
 
 mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   NULL,

View file

@@ -203,10 +203,9 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
   if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
   size = _mi_align_up(size, _mi_os_page_size());
-  // try a direct allocation if the alignment is below the default, or if larger than 1/64 fraction of the size (to avoid waste).
-  const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/64);
-  // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD)
+  // try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size.
+  const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8);
   void* p = NULL;
   if (try_direct_alloc) {
     p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
@@ -233,8 +232,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
   if (p == NULL) return NULL;
 
   // set p to the aligned part in the full region
-  // note: this is dangerous on Windows as VirtualFree needs the actual base pointer
-  // this is handled though by having the `base` field in the memid's
+  // note: on Windows VirtualFree needs the actual base pointer
+  // this is handledby having the `base` field in the memid.
   *base = p; // remember the base
   p = _mi_align_up_ptr(p, alignment);
@@ -361,7 +360,7 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size,
   if (newsize != NULL) *newsize = 0;
   if (size == 0 || addr == NULL) return NULL;
 
-  // page align conservatively within the range
+  // page align conservatively within the range, or liberally straddling pages outside the range
   void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size())
                               : mi_align_down_ptr(addr, _mi_os_page_size()));
   void* end   = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
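
Conservative alignment shrinks the range to the whole pages inside it, while the liberal variant grows it to every page the range touches. A quick standalone illustration (4 KiB page size assumed, not mimalloc code):

```c
#include <stdint.h>
#include <inttypes.h>
#include <stdio.h>

#define PAGE ((uintptr_t)4096)  // page size assumed for the illustration

static uintptr_t align_down(uintptr_t a) { return a & ~(PAGE - 1); }
static uintptr_t align_up(uintptr_t a)   { return align_down(a + PAGE - 1); }

int main(void) {
  const uintptr_t addr = 0x1234, size = 0x3000;
  // conservative: only pages that lie fully inside [addr, addr+size)
  printf("conservative: [0x%" PRIxPTR ", 0x%" PRIxPTR ")\n", align_up(addr), align_down(addr + size));
  // liberal: every page that the range touches
  printf("liberal:      [0x%" PRIxPTR ", 0x%" PRIxPTR ")\n", align_down(addr), align_up(addr + size));
  return 0;
}
```
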
@@ -472,7 +471,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset)
     return needs_recommit;
   }
   else {
-    if (allow_reset) {  // this can sometimes be not allowed if the range is not fully committed
+    if (allow_reset) {  // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory)
       _mi_os_reset(p, size);
     }
     return false;  // needs no recommit

View file

@@ -7,7 +7,6 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc.h"
 #include "mimalloc/internal.h"
 #include "mimalloc/prim.h"    // _mi_prim_random_buf
-#include <string.h>           // memset
 
 /* ----------------------------------------------------------------------------
 We use our own PRNG to keep predictable performance of random number generation
@@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil
 (gcc x64 has no register spills, and clang 6+ uses SSE instructions)
 -----------------------------------------------------------------------------*/
 
-static inline uint32_t rotl(uint32_t x, uint32_t shift) {
-  return (x << shift) | (x >> (32 - shift));
-}
-
 static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
-  x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
-  x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16);
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12);
+  x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8);
+  x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7);
 }
 
 static void chacha_block(mi_random_ctx_t* ctx)
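
The quarter round itself is unchanged apart from calling the shared `mi_rotl32`; since every rotation count used here (16, 12, 8, 7) is non-zero, the output is identical to the old local `rotl`. For reference, a standalone version of the same quarter round (it can be validated against the quarter-round test vector in RFC 8439, section 2.1.1):

```c
#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

static uint32_t rotl32(uint32_t x, uint32_t r) {
  r &= 31;
  return (x << r) | (x >> ((-r) & 31));
}

// ChaCha quarter round on four words of the 16-word state.
static void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
  x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 16);
  x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 12);
  x[a] += x[b]; x[d] = rotl32(x[d] ^ x[a], 8);
  x[c] += x[d]; x[b] = rotl32(x[b] ^ x[c], 7);
}

int main(void) {
  uint32_t x[16] = { 0x11111111, 0x01020304, 0x9b8d6f43, 0x01234567 };  // example inputs
  qround(x, 0, 1, 2, 3);
  printf("%08x %08x %08x %08x\n", x[0], x[1], x[2], x[3]);
  return 0;
}
```
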
@@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
   // since we only use chacha for randomness (and not encryption) we
   // do not _need_ to read 32-bit values as little endian but we do anyways
   // just for being compatible :-)
-  memset(ctx, 0, sizeof(*ctx));
+  _mi_memzero(ctx, sizeof(*ctx));
   for (size_t i = 0; i < 4; i++) {
     const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
     ctx->input[i] = read32(sigma,i);
@@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
 }
 
 static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
-  memset(ctx_new, 0, sizeof(*ctx_new));
+  _mi_memzero(ctx_new, sizeof(*ctx_new));
   _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
   ctx_new->input[12] = 0;
   ctx_new->input[13] = 0;
@@ -160,7 +155,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim
 uintptr_t _mi_os_random_weak(uintptr_t extra_seed) {
   uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed;  // ASLR makes the address random
   x ^= _mi_prim_clock_now();
   // and do a few randomization steps
   uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1;
   for (uintptr_t i = 0; i < max; i++) {