mirror of https://github.com/microsoft/mimalloc.git (synced 2025-05-06 23:39:31 +03:00)

Commit d5ed0cc71e (parent 68ac94c1ba): various improvements

10 changed files with 223 additions and 152 deletions
@@ -74,8 +74,11 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release))
 #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel))

+#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
 #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
 #define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
+#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed))
 #define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed))
 #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
@@ -289,5 +289,18 @@ static inline size_t mi_rotl(size_t x, size_t r) {
 #endif
 }

+static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) {
+#if mi_has_builtin(rotateleft32)
+return mi_builtin(rotateleft32)(x,r);
+#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+return _lrotl(x, (int)r);
+#else
+// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
+// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
+const unsigned int rshift = (unsigned int)(r) & 31;
+return ((x << rshift) | (x >> ((-rshift) & 31)));
+#endif
+}
+
 #endif // MI_BITS_H
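The UB-free rotate in the new mi_rotl32 can be checked in isolation. The sketch below is not part of the commit; it mirrors the portable fallback branch and shows that a rotate count of 0, which would make the naive `x >> (32 - r)` undefined, stays well defined because `(-0) & 31 == 0`.

#include <assert.h>
#include <stdint.h>

// same pattern as the portable branch of mi_rotl32 above
static uint32_t rotl32(uint32_t x, uint32_t r) {
  const unsigned int rshift = (unsigned int)r & 31;
  return (x << rshift) | (x >> ((-rshift) & 31));  // (-0)&31 == 0, so r==0 is not UB
}

int main(void) {
  assert(rotl32(0x80000001u, 0)  == 0x80000001u);  // r == 0: identity, no undefined shift
  assert(rotl32(0x80000001u, 1)  == 0x00000003u);  // top bit wraps around to bit 0
  assert(rotl32(0x12345678u, 32) == 0x12345678u);  // counts are taken mod 32
  return 0;
}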
@@ -334,9 +334,9 @@ typedef struct mi_page_s {

 // The max object size are checked to not waste more than 12.5% internally over the page sizes.
 // (Except for large pages since huge objects are allocated in 4MiB chunks)
-#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~16KiB
+#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB
-#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB
+#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB
-#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // ~2MiB
+#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB
 #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE)
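A hedged worked example (the page and header sizes below are assumed, not taken from the commit): capping the small-page object size at one eighth of the usable page means at least eight objects always fit, so the unusable tail at the end of a page stays under one object, i.e. below 1/8 = 12.5% of the page — which is what tightening the divisor from /4 to /8 buys for small pages.

#include <assert.h>
#include <stddef.h>

int main(void) {
  const size_t small_page = 64 * 1024;      // assumed MI_SMALL_PAGE_SIZE
  const size_t info_size  = 128;            // assumed MI_PAGE_INFO_SIZE
  const size_t usable     = small_page - info_size;
  const size_t max_obj    = usable / 8;     // the new /8 bound (< 8 KiB)
  for (size_t objsize = 16; objsize <= max_obj; objsize += 16) {
    const size_t tail = usable % objsize;   // space left after packing same-size objects
    assert(tail < max_obj);                 // tail waste stays below 1/8 of the page
  }
  return 0;
}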
src/arena.c (52 changed lines)
@@ -29,7 +29,8 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo
 ----------------------------------------------------------- */

 #define MI_ARENA_BIN_COUNT (MI_BIN_COUNT)
+#define MI_ARENA_MIN_SIZE (MI_BCHUNK_BITS * MI_ARENA_SLICE_SIZE) // 32 MiB (or 8 MiB on 32-bit)
+#define MI_ARENA_MAX_SIZE (MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE)

 // A memory arena descriptor
 typedef struct mi_arena_s {
@@ -105,7 +106,7 @@ size_t mi_arena_get_count(void) {

 mi_arena_t* mi_arena_from_index(size_t idx) {
 mi_assert_internal(idx < mi_arena_get_count());
-return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]);
+return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]);
 }

 mi_arena_t* mi_arena_from_id(mi_arena_id_t id) {
@@ -235,6 +236,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
 }
 }
 }
+if (memid->initially_zero) {
+mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE);
+}
+else {
+mi_track_mem_undefined(p, slice_count * MI_ARENA_SLICE_SIZE);
+}
 }
 else {
 // no need to commit, but check if already fully committed
@@ -253,7 +260,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(
 // try to reserve a fresh arena space
 static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id)
 {
-if (_mi_preloading()) return false; // use OS only while pre loading
+// if (_mi_preloading()) return false; // use OS only while pre loading
 if (req_arena_id != _mi_arena_id_none()) return false;

 const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
@@ -269,8 +276,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
 arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE);

 if (arena_count >= 1 && arena_count <= 128) {
-// scale up the arena sizes exponentially every 8 entries
+// scale up the arena sizes exponentially every 4 entries
-const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16);
+const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16);
 size_t reserve = 0;
 if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) {
 arena_reserve = reserve;
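A hedged illustration of the new scaling (the base reserve below is an arbitrary assumed value, and the local clamp stands in for _mi_clamp): with the divisor changed from 8 to 4, the reserve size now doubles every four existing arenas instead of every eight.

#include <stdio.h>
#include <stddef.h>

static size_t clamp(size_t x, size_t lo, size_t hi) { return (x < lo ? lo : (x > hi ? hi : x)); }

int main(void) {
  const size_t base_reserve_mib = 64;  // assumed base arena reserve in MiB, for illustration only
  for (size_t arena_count = 1; arena_count <= 16; arena_count++) {
    const size_t multiplier = (size_t)1 << clamp(arena_count/4, 0, 16);
    printf("arenas=%2zu  multiplier=%3zu  reserve=%5zu MiB\n",
           arena_count, multiplier, multiplier * base_reserve_mib);
  }
  return 0;
}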
@@ -278,8 +285,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
 }

 // check arena bounds
-const size_t min_reserve = 8 * MI_ARENA_SLICE_SIZE; // hope that fits minimal bitmaps?
+const size_t min_reserve = MI_ARENA_MIN_SIZE;
-const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB
+const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB
 if (arena_reserve < min_reserve) {
 arena_reserve = min_reserve;
 }
@@ -294,7 +301,17 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re
 if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); }
 else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }

-return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0);
+// and try to reserve the arena
+int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
+if (err != 0) {
+// failed, try a smaller size?
+const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB);
+if (arena_reserve > small_arena_reserve) {
+// try again
+err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id);
+}
+}
+return (err==0);
 }

@@ -317,12 +334,12 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are

 #define mi_forall_arenas(req_arena_id, tseq, name_arena) \
 { \
-const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \
+const size_t _arena_count = mi_arena_get_count(); \
 if (_arena_count > 0) { \
 const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \
 size_t _start; \
 if (req_arena_id == _mi_arena_id_none()) { \
-/* always start searching in an arena 1 below the max */ \
+/* always start searching in the arena's below the max */ \
 _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \
 } \
 else { \
@@ -333,10 +350,10 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are
 size_t _idx; \
 if (_i < _arena_cycle) { \
 _idx = _i + _start; \
-if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \
+if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate through the cycle */ \
 } \
 else { \
-_idx = _i; \
+_idx = _i; /* remaining arena's */ \
 } \
 mi_arena_t* const name_arena = mi_arena_from_index(_idx); \
 if (name_arena != NULL) \
@@ -397,6 +414,9 @@ again:
 // did we need a specific arena?
 if (req_arena_id != _mi_arena_id_none()) return NULL;

+// don't create arena's while preloading (todo: or should we?)
+if (_mi_preloading()) return NULL;
+
 // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?)
 if (mi_lock_try_acquire(&mi_arena_reserve_lock)) {
 mi_arena_id_t arena_id = 0;
@@ -917,7 +937,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi
 // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
 // for dynamic libraries that are unloaded and need to release all their allocated memory.
 static void mi_arenas_unsafe_destroy(void) {
-const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+const size_t max_arena = mi_arena_get_count();
 size_t new_max_arena = 0;
 for (size_t i = 0; i < max_arena; i++) {
 mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
@@ -949,7 +969,7 @@ void _mi_arena_unsafe_destroy_all(void) {

 // Is a pointer inside any of our arenas?
 bool _mi_arena_contains(const void* p) {
-const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+const size_t max_arena = mi_arena_get_count();
 for (size_t i = 0; i < max_arena; i++) {
 mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]);
 if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) {
@@ -1175,7 +1195,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi

 void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept {
 MI_UNUSED(show_abandoned);
-size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count);
+size_t max_arenas = mi_arena_get_count();
 size_t free_total = 0;
 size_t slice_total = 0;
 //size_t abandoned_total = 0;
@@ -1331,7 +1351,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_
 static void mi_arenas_try_purge(bool force, bool visit_all) {
 if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled

-const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+const size_t max_arena = mi_arena_get_count();
 if (max_arena == 0) return;

 // _mi_error_message(EFAULT, "purging not yet implemented\n");
src/bitmap.c (224 changed lines)
@@ -14,6 +14,8 @@ Concurrent bitmap that can set/reset sequences of bits atomically
 #include "mimalloc/bits.h"
 #include "bitmap.h"

+#define MI_USE_SIMD 0
+
 /* --------------------------------------------------------------------------------
 bfields
 -------------------------------------------------------------------------------- */
@@ -34,9 +36,9 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) {
 return mi_bsf(x,idx);
 }

-static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) {
+//static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) {
-return mi_rotr(x,r);
+// return mi_rotr(x,r);
-}
+//}

 static inline mi_bfield_t mi_bfield_zero(void) {
 return 0;
@@ -456,7 +458,7 @@ static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t

 // ------- mi_bchunk_try_find_and_clear ---------------------------------------

-#if defined(__AVX2__)
+#if MI_USE_SIMD && defined(__AVX2__)
 static inline __m256i mi_mm256_zero(void) {
 return _mm256_setzero_si256();
 }
@@ -471,12 +473,27 @@ static inline bool mi_mm256_is_zero( __m256i vec) {
 }
 #endif

+static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) {
+mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
+size_t cidx;
+if (!allow_allset && (~b == 0)) return false;
+if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit
+if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically
+*pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
+mi_assert_internal(*pidx < MI_BCHUNK_BITS);
+return true;
+}
+}
+return false;
+}
+
 // Find least 1-bit in a chunk and try to clear it atomically
 // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
 // This is used to find free slices and abandoned pages and should be efficient.
 // todo: try neon version
 static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
 while (true) {
 const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
 const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0)
@@ -485,19 +502,10 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
 if (mask==0) return false;
 mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24
 const size_t chunk_idx = _tzcnt_u32(mask) / 8;
-mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
-const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
-size_t cidx;
-if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit
-if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically
-*pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
-mi_assert_internal(*pidx < MI_BCHUNK_BITS);
-return true;
-}
-}
+if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
 // try again
 }
-#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
 while (true) {
 size_t chunk_idx = 0;
 #if 0
@@ -528,42 +536,50 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
 mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , ..
 chunk_idx = _tzcnt_u64(mask) / 8;
 #endif
-mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
-const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
-size_t cidx;
-if (mi_bfield_find_least_bit(b, &cidx)) { // find the bit-idx that is clear
-if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically
-*pidx = (chunk_idx*MI_BFIELD_BITS) + cidx;
-mi_assert_internal(*pidx < MI_BCHUNK_BITS);
-return true;
-}
-}
+if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
 // try again
 }
 #else
+// try first to find a field that is not all set (to reduce fragmentation)
 for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
-size_t idx;
-if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit
-if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically
-*pidx = (i*MI_BFIELD_BITS + idx);
-mi_assert_internal(*pidx < MI_BCHUNK_BITS);
-return true;
-}
-}
+if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true;
 }
+for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true;
 }
 return false;
 #endif
 }

+static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) {
+const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
+if (!allow_all_set && (~b == 0)) return false;
+// has_set8 has low bit in each byte set if the byte in x == 0xFF
+const mi_bfield_t has_set8 =
+((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F
+(b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80
+>> 7; // shift high bit to low bit
+size_t idx;
+if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit
+mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
+mi_assert_internal((idx%8)==0);
+const size_t byte_idx = idx/8;
+if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // unset the byte atomically
+*pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
+mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
+return true;
+}
+}
+return false;
+}
+
 // find least byte in a chunk with all bits set, and try unset it atomically
 // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
 // Used to find medium size pages in the free blocks.
 // todo: try neon version
 static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
 while (true) {
 // since a cache-line is 64b, load all at once
 const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
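The has_set8 expression in the new mi_bchunk_try_find_and_clear8_at is the classic "byte equals 0xFF" detector: it produces a 1 at the low bit of every byte position whose byte is all ones (the lowest flagged position is always exact), so a least-significant-bit scan finds the first fully-set byte; the new allow_allset/allow_all_set parameter then lets callers first skip fields that are completely set to reduce fragmentation. A hedged standalone check follows; the lo/hi constants are assumed 64-bit equivalents of MI_BFIELD_LO_BIT8 and MI_BFIELD_HI_BIT8.

#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint64_t lo = 0x0101010101010101ULL;   // low bit of every byte
  const uint64_t hi = 0x8080808080808080ULL;   // high bit of every byte
  const uint64_t b  = 0x00FF7F80FF000000ULL;   // bytes 3 and 6 (counting from the low end) are 0xFF
  const uint64_t has_set8 = ((~b - lo) & (b & hi)) >> 7;
  assert(has_set8 == ((1ULL << (3*8)) | (1ULL << (6*8))));  // flags land at bits 24 and 48
  return 0;
}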
@@ -588,24 +604,12 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
 // try again
 }
 #else
+// first skip allset fields to reduce fragmentation
 for(int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]);
-// has_set8 has low bit in each byte set if the byte in x == 0xFF
-const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F
-(x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80
->> 7; // shift high bit to low bit
-size_t idx;
-if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit
-mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
-mi_assert_internal((idx%8)==0);
-const size_t byte_idx = idx/8;
-if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[i],byte_idx,NULL)) { // unset the byte atomically
-*pidx = (i*MI_BFIELD_BITS) + idx;
-mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
-return true;
-}
-// else continue
-}
+if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true;
 }
+for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
+if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true;
 }
 return false;
 #endif
@@ -618,7 +622,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
 // Used to find large size pages in the free blocks.
 // todo: try neon version
 static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
 while (true) {
 // since a cache-line is 64b, load all at once
 const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
@@ -747,14 +751,14 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk,
 }

-static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
-if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages
+// if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages
-if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages
+// if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages
-if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages
+// if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages
-if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
+// if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
-if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx);
+// if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx);
-return mi_bchunk_try_find_and_clearN_(chunk, n, pidx);
+// return mi_bchunk_try_find_and_clearN_(chunk, n, pidx);
-}
+//}

 // ------- mi_bchunk_clear_once_set ---------------------------------------
@@ -779,10 +783,10 @@ static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) {

 // are all bits in a bitmap chunk clear?
 static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
+#if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256)
 const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
 return mi_mm256_is_zero(vec);
-#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+#elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)
 // a 64b cache-line contains the entire chunk anyway so load both at once
 const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
 const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1);
@@ -796,9 +800,17 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) {
 bitmap chunkmap
 -------------------------------------------------------------------------------- */

+static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) {
+size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed);
+if mi_unlikely(chunk_idx > oldmax) {
+mi_atomic_cas_strong_relaxed(&bitmap->chunk_max_accessed, &oldmax, chunk_idx);
+}
+}
+
 static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) {
 mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap));
 mi_bchunk_set(&bitmap->chunkmap, chunk_idx);
+mi_bitmap_chunkmap_set_max(bitmap, chunk_idx);
 }

 static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) {
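The new mi_bitmap_chunkmap_set_max is a best-effort monotonic maximum: a single compare-and-swap is enough because chunk_max_accessed only steers where later searches start, so losing a race merely leaves a slightly smaller (still valid) hint. A hedged C11 sketch of the same pattern, using stdatomic in place of the mi_atomic wrappers:

#include <stdatomic.h>
#include <stddef.h>
#include <stdio.h>

static _Atomic size_t chunk_max_accessed = 0;

static void set_max(size_t chunk_idx) {
  size_t oldmax = atomic_load_explicit(&chunk_max_accessed, memory_order_relaxed);
  if (chunk_idx > oldmax) {
    // no retry loop: a lost race just keeps an older (still valid) hint
    atomic_compare_exchange_strong_explicit(&chunk_max_accessed, &oldmax, chunk_idx,
                                            memory_order_relaxed, memory_order_relaxed);
  }
}

int main(void) {
  set_max(3); set_max(7); set_max(5);
  printf("max accessed hint: %zu\n", (size_t)atomic_load(&chunk_max_accessed));  // prints 7
  return 0;
}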
@@ -813,11 +825,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx)
 mi_bchunk_set(&bitmap->chunkmap, chunk_idx);
 return false;
 }
-// record the max clear
-size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
-do {
-if mi_likely(chunk_idx <= oldmax) break;
-} while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));
+mi_bitmap_chunkmap_set_max(bitmap, chunk_idx);
 return true;
 }
@@ -894,6 +902,9 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) {
 mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL);
 mi_bitmap_chunkmap_set(bitmap, chunk_idx);
 }

+// reset max_accessed
+mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0);
 }
@@ -1027,31 +1038,27 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
 { \
 /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \
 MI_UNUSED(tseq); \
-const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \
-const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS ); \
-const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\
+const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \
+const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? */ \
+const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \
+const size_t chunkmap_max_acc = _mi_divide_up(chunk_max_acc,MI_BFIELD_BITS); \
 const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \
-const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \
 /* for each chunkmap entry `i` */ \
-for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \
+for (size_t _i = 0; _i < chunkmap_max; _i++) { \
 size_t i; \
-if (_i < chunkmap_hi_bfield) { \
+if (_i < chunkmap_max_acc) { /* first the chunks up to chunk_max_accessed */ \
-i = _i + chunkmap_start; /* first the chunks up to chunk_hi */ \
+i = _i + chunkmap_start; \
-if (i >= chunkmap_hi_bfield) { i -= chunkmap_hi_bfield; } /* rotate */ \
+if (i >= chunkmap_max_acc) { i -= chunkmap_max_acc; } /* rotate */ \
 } \
-else { i = _i; } /* the rest of the chunks above chunk_hi_idx */ \
+else { i = _i; } /* the rest of the chunks above chunk_max_accessed */ \
 const size_t chunk_idx0 = i*MI_BFIELD_BITS; \
 mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \
+/* todo: space out threads within a chunkmap (2GiB) as well? */ \
 size_t cmap_idx_shift = 0; /* shift through the cmap */ \
-if (_i == 0 && chunkmap_start_idx > 0) { \
-cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \
-cmap_idx_shift = chunkmap_start_idx; \
-} \
 size_t cmap_idx; \
 while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \
 /* set the chunk idx */ \
 size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \
-mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \
 /* try to find and clear N bits in that chunk */ \
 {
@@ -1064,28 +1071,45 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
 } \
 }}

-// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all.
-// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
-// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS)
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx)
-{
-// const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
-mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx)
-{
-size_t cidx;
-if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) {
-*pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
-mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap));
-return true;
+#define mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NSUF, NPAR) { \
+mi_bitmap_forall_chunks(bitmap, tseq, _chunk_idx) { \
+size_t _cidx; \
+if mi_likely(mi_bchunk_try_find_and_clear##NSUF(&bitmap->chunks[_chunk_idx] NPAR, &_cidx)) { \
+*pidx = (_chunk_idx * MI_BCHUNK_BITS) + _cidx; \
+return true; \
+} \
+else { \
+/* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ \
+mi_bitmap_chunkmap_try_clear(bitmap, _chunk_idx); \
+} \
+} \
+mi_bitmap_forall_chunks_end(); \
+return false; \
 }
-else {
-// we may find that all are cleared only on a second iteration but that is ok as
-// the chunkmap is a conservative approximation.
-mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx);
+
+#define COMMA ,
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) {
+mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, , );
 }

+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) {
+mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, 8, );
 }
-mi_bitmap_forall_chunks_end();
-return false;
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) {
+mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, X, );
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) {
+mi_assert_internal(n<=MI_BFIELD_BITS);
+mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NX, COMMA n);
+}
+
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) {
+mi_assert_internal(n<=MI_BCHUNK_BITS);
+mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, N_, COMMA n);
 }
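The mi_bitmap_forall_chunks_try_find_and_clear macro above leans on two preprocessor tricks: pasting a suffix (NSUF) onto the chunk-level function name, and smuggling an optional extra argument through NPAR, where COMMA expands to a literal comma only after argument collection so it does not split the macro arguments. A hedged toy illustration — the do_find* names are hypothetical stand-ins, not mimalloc functions:

#include <stdio.h>

static int do_find  (int chunk, int* idx)        { (void)chunk; *idx = 1; return 1; }
static int do_find8 (int chunk, int* idx)        { (void)chunk; *idx = 8; return 1; }
static int do_findNX(int chunk, int n, int* idx) { (void)chunk; *idx = n; return 1; }

#define COMMA ,
#define CALL(NSUF, NPAR)  do_find##NSUF(chunk NPAR, &idx)

int main(void) {
  int chunk = 0, idx = 0;
  CALL( , );           printf("plain: idx=%d\n", idx);  // expands to do_find(chunk, &idx)
  CALL(8, );           printf("8:     idx=%d\n", idx);  // expands to do_find8(chunk, &idx)
  CALL(NX, COMMA 5);   printf("NX:    idx=%d\n", idx);  // expands to do_findNX(chunk, 5, &idx)
  return 0;
}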
src/bitmap.h (18 changed lines)
@@ -92,7 +92,7 @@ typedef mi_bchunk_t mi_bchunkmap_t;
 // An atomic bitmap
 typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s {
 _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
-_Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared
+_Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set
 size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc
 mi_bchunkmap_t chunkmap;
 mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
@@ -172,9 +172,23 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n
 }

+// Specialized versions for common bit sequence sizes
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS
+mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS
+
 // Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all.
 // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx);
+mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) {
+if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages
+if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages
+if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages
+if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
+if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx);
+return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx);
+}

 // Called once a bit is cleared to see if the memory slice can be claimed.
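The inline dispatcher added to bitmap.h maps a requested run length onto one of the specialized searches. A hedged sketch of that mapping follows; the bfield and chunk widths are assumed to be 64 and 512 bits, the usual 64-bit configuration.

#include <stdio.h>
#include <stddef.h>

#define BFIELD_BITS 64    // assumed MI_BFIELD_BITS
#define BCHUNK_BITS 512   // assumed MI_BCHUNK_BITS

static const char* pick(size_t n) {
  if (n == 1)                    return "try_find_and_clear (small pages)";
  if (n == 8)                    return "try_find_and_clear8 (medium pages)";
  if (n == BFIELD_BITS)          return "try_find_and_clearX (large pages)";
  if (n == 0 || n > BCHUNK_BITS) return "fail: more than a chunk";
  if (n < BFIELD_BITS)           return "try_find_and_clearNX";
  return "try_find_and_clearN_";
}

int main(void) {
  const size_t ns[] = { 1, 8, 13, 64, 200, 1024 };
  for (size_t i = 0; i < sizeof(ns)/sizeof(ns[0]); i++) {
    printf("n=%4zu -> %s\n", ns[i], pick(ns[i]));
  }
  return 0;
}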
@@ -217,8 +217,11 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) {
 }

 // 2. if the page is not too full, we can try to reclaim it for ourselves
+// note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit.
 if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 &&
-!mi_page_is_used_at_frac(page,8))
+!mi_page_is_used_at_frac(page,4)
+// && !mi_page_is_abandoned_mapped(page)
+)
 {
 // the page has still some blocks in use (but not too many)
 // reclaim in our heap if compatible, or otherwise abandon again
@@ -96,7 +96,7 @@ const mi_page_t _mi_page_empty = {
 // may lead to allocation itself on some platforms)
 // --------------------------------------------------------

-#define MI_MEMID_STATIC {{{0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC }
+#define MI_MEMID_STATIC {{{NULL,0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC }

 mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
 NULL,
src/os.c (13 changed lines)
@@ -203,10 +203,9 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
 if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL;
 size = _mi_align_up(size, _mi_os_page_size());

-// try a direct allocation if the alignment is below the default, or if larger than 1/64 fraction of the size (to avoid waste).
+// try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size.
-const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/64);
+const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8);

-// try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD)
 void* p = NULL;
 if (try_direct_alloc) {
 p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero);
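The relaxed 1/8 threshold decides when an aligned request is attempted directly with an alignment hint rather than by over-allocating and carving out an aligned region: direct allocation is used when the alignment is already covered by the OS granularity, or when it is so large relative to the size that over-allocating by the alignment would waste a big fraction of the mapping. A hedged illustration (the 64 KiB granularity is an assumed value):

#include <stdbool.h>
#include <stdio.h>
#include <stddef.h>

static bool try_direct(size_t size, size_t alignment, size_t granularity) {
  return (alignment <= granularity || alignment > size/8);
}

int main(void) {
  const size_t granularity = 64 * 1024;  // assumed allocation granularity
  printf("%d\n", try_direct(1u << 20, 4096,     granularity));  // 1: alignment within OS granularity
  printf("%d\n", try_direct(1u << 20, 256*1024, granularity));  // 1: alignment > size/8, over-allocating would waste too much
  printf("%d\n", try_direct(8u << 20, 256*1024, granularity));  // 0: fall back to over-allocate and align inside
  return 0;
}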
@@ -233,8 +232,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit
 if (p == NULL) return NULL;

 // set p to the aligned part in the full region
-// note: this is dangerous on Windows as VirtualFree needs the actual base pointer
+// note: on Windows VirtualFree needs the actual base pointer
-// this is handled though by having the `base` field in the memid's
+// this is handled by having the `base` field in the memid.
 *base = p; // remember the base
 p = _mi_align_up_ptr(p, alignment);
@@ -361,7 +360,7 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size,
 if (newsize != NULL) *newsize = 0;
 if (size == 0 || addr == NULL) return NULL;

-// page align conservatively within the range
+// page align conservatively within the range, or liberally straddling pages outside the range
 void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size())
 : mi_align_down_ptr(addr, _mi_os_page_size()));
 void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size())
@@ -472,7 +471,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset)
 return needs_recommit;
 }
 else {
-if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed
+if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory)
 _mi_os_reset(p, size);
 }
 return false; // needs no recommit
src/random.c (17 changed lines)
@@ -7,7 +7,6 @@ terms of the MIT license. A copy of the license can be found in the file
 #include "mimalloc.h"
 #include "mimalloc/internal.h"
 #include "mimalloc/prim.h" // _mi_prim_random_buf
-#include <string.h> // memset

 /* ----------------------------------------------------------------------------
 We use our own PRNG to keep predictable performance of random number generation
@@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil
 (gcc x64 has no register spills, and clang 6+ uses SSE instructions)
 -----------------------------------------------------------------------------*/

-static inline uint32_t rotl(uint32_t x, uint32_t shift) {
-return (x << shift) | (x >> (32 - shift));
-}
-
 static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) {
-x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16);
+x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16);
-x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12);
+x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12);
-x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8);
+x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8);
-x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7);
+x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7);
 }

 static void chacha_block(mi_random_ctx_t* ctx)
@@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
 // since we only use chacha for randomness (and not encryption) we
 // do not _need_ to read 32-bit values as little endian but we do anyways
 // just for being compatible :-)
-memset(ctx, 0, sizeof(*ctx));
+_mi_memzero(ctx, sizeof(*ctx));
 for (size_t i = 0; i < 4; i++) {
 const uint8_t* sigma = (uint8_t*)"expand 32-byte k";
 ctx->input[i] = read32(sigma,i);
@@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no
 }

 static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) {
-memset(ctx_new, 0, sizeof(*ctx_new));
+_mi_memzero(ctx_new, sizeof(*ctx_new));
 _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input));
 ctx_new->input[12] = 0;
 ctx_new->input[13] = 0;