roll back to old arena cache as it seems to do better on AMD

daan 2020-05-02 11:57:33 -07:00
parent 66e5484c1c
commit 18d697a1e6

@@ -127,57 +127,88 @@ static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t*
 /* -----------------------------------------------------------
   Arena cache
 ----------------------------------------------------------- */
-#define MI_CACHE_FIELDS (8)
-#define MI_CACHE_MAX    (MI_BITMAP_FIELD_BITS*MI_CACHE_FIELDS)   // 512 on 64-bit
+#define MI_CACHE_MAX    (256)
+#define MI_MAX_NUMA     (4)
+
+#define MI_SLOT_IN_USE  ((void*)1)

 typedef struct mi_cache_slot_s {
-  void*       p;
-  size_t      memid;
-  mi_msecs_t  expire;
-  bool        is_committed;   // TODO: use bit from p to reduce size?
+  volatile _Atomic(void*) p;
+  volatile size_t         memid;
+  volatile mi_msecs_t     expire;
+  volatile bool           is_committed;
+  volatile bool           is_large;
 } mi_cache_slot_t;

-static mi_cache_slot_t cache[MI_CACHE_MAX];   // = 0
+static mi_cache_slot_t cache[MI_MAX_NUMA][MI_CACHE_MAX];   // = 0
+static volatile _Atomic(uintptr_t) cache_count[MI_MAX_NUMA];   // = 0

-#define BITS_SET()  (UINTPTR_MAX)
-static mi_bitmap_field_t cache_available[MI_CACHE_FIELDS]       = { MI_INIT8(BITS_SET) };   // zero bit = available!
-static mi_bitmap_field_t cache_available_large[MI_CACHE_FIELDS] = { MI_INIT8(BITS_SET) };
-static mi_bitmap_field_t cache_inuse[MI_CACHE_FIELDS];   // zero bit = free
+typedef union mi_cache_count_u {
+  uintptr_t value;
+  struct {
+    int16_t  count;   // at most `count` elements in the cache
+#if MI_INTPTR_SIZE > 4
+    uint32_t epoch;   // each push/pop increase this
+#else
+    uint16_t epoch;
+#endif
+  } x;
+} mi_cache_count_t;

 static void* mi_cache_pop(int numa_node, size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld) {
   // only segment blocks
   if (size != MI_SEGMENT_SIZE || alignment > MI_SEGMENT_ALIGN) return NULL;

-  // numa node determines start field
-  size_t start_field = 0;
-  if (numa_node > 0) {
-    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
-    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
+  // set numa range
+  int numa_min = numa_node;
+  int numa_max = numa_min;
+  if (numa_node < 0) {
+    numa_min = 0;
+    numa_max = _mi_os_numa_node_count() % MI_MAX_NUMA;
+  }
+  else {
+    if (numa_node >= MI_MAX_NUMA) numa_node %= MI_MAX_NUMA;
+    numa_min = numa_max = numa_node;
   }

-  // find an available slot
-  mi_bitmap_index_t bitidx = 0;
-  bool claimed = false;
-  if (*large) {   // large allowed?
-    claimed = mi_bitmap_try_find_from_claim(cache_available_large, MI_CACHE_FIELDS, start_field, 1, &bitidx);
-    if (claimed) *large = true;
-  }
-  if (!claimed) {
-    claimed = mi_bitmap_try_find_from_claim(cache_available, MI_CACHE_FIELDS, start_field, 1, &bitidx);
-    if (claimed) *large = false;
-  }
-  if (!claimed) return NULL;
-
-  // found a slot
-  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
-  void* p = slot->p;
+  // find a free slot
+  mi_cache_slot_t* slot;
+  for (int n = numa_min; n <= numa_max; n++) {
+    mi_cache_count_t top = { 0 };
+    top.value = mi_atomic_read_relaxed(&cache_count[n]);
+    int16_t count = top.x.count;
+    for (int16_t i = count - 1; i >= 0; i--) {
+      slot = &cache[n][i];
+      void* p = mi_atomic_read_ptr_relaxed(mi_cache_slot_t, &slot->p);
+      if (p == NULL) {
+        if (count > 0) { count = i; }
+      }
+      else if (p > MI_SLOT_IN_USE) {   // not NULL or 1
+        if (count >= 0 && count < top.x.count) {   // new lower bound?
+          mi_cache_count_t newtop = { 0 };
+          newtop.x.count = count;
+          newtop.x.epoch = top.x.epoch + 1;
+          mi_atomic_cas_strong(&cache_count[n], newtop.value, top.value);   // it's fine to not succeed; just causes longer scans
+        }
+        count = -1;   // don't try to set lower bound again
+        if (mi_atomic_cas_ptr_weak(mi_cache_slot_t, &slot->p, MI_SLOT_IN_USE, p)) {
+          // claimed
+          if (!*large && slot->is_large) {
+            // back out again
+            mi_atomic_write_ptr(mi_cache_slot_t, &slot->p, p);   // make it available again
+          }
+          else {
+            // keep it
             *memid = slot->memid;
+            *large = slot->is_large;
             *is_zero = false;
             bool committed = slot->is_committed;
-  slot->p = NULL;
-  slot->expire = 0;
+            mi_atomic_write_ptr(mi_cache_slot_t, &slot->p, NULL);   // set it free
             if (*commit && !committed) {
               bool commit_zero;
               _mi_os_commit(p, MI_SEGMENT_SIZE, &commit_zero, tld->stats);
@@ -186,75 +217,87 @@ static void* mi_cache_pop(int numa_node, size_t size, size_t alignment, bool* co
             else {
               *commit = committed;
             }
-  // mark the slot as free again
-  mi_assert_internal(mi_bitmap_is_claimed(cache_inuse, MI_CACHE_FIELDS, 1, bitidx));
-  mi_bitmap_unclaim(cache_inuse, MI_CACHE_FIELDS, 1, bitidx);
             return p;
+          }
+        }
+      }
+    }
+  }
+  return NULL;
 }

 static void mi_cache_purge(mi_os_tld_t* tld) {
-  UNUSED(tld);
+  // TODO: for each numa node instead?
+  // if (mi_option_get(mi_option_arena_reset_delay) == 0) return;
+
   mi_msecs_t now = _mi_clock_now();
-  size_t idx = (_mi_random_shuffle((uintptr_t)now) % MI_CACHE_MAX);   // random start
-  size_t purged = 0;
-  for (size_t visited = 0; visited < MI_CACHE_FIELDS; visited++, idx++) {   // probe just N slots
-    if (idx >= MI_CACHE_MAX) idx = 0;   // wrap
-    mi_cache_slot_t* slot = &cache[idx];
-    if (slot->expire != 0 && now >= slot->expire) {   // racy read
-      // seems expired, first claim it from available
-      purged++;
-      mi_bitmap_index_t bitidx = mi_bitmap_index_create_from_bit(idx);
-      if (mi_bitmap_claim(cache_available, MI_CACHE_FIELDS, 1, bitidx, NULL)) {
-        // was available, we claimed it
-        if (slot->expire != 0 && now >= slot->expire) {   // safe read
-          // still expired, decommit it
-          slot->expire = 0;
-          mi_assert_internal(slot->is_committed && mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
+  int numa_node = _mi_os_numa_node(NULL);
+  if (numa_node > MI_MAX_NUMA) numa_node %= MI_MAX_NUMA;
+  mi_cache_slot_t* slot;
+  int purged = 0;
+  mi_cache_count_t top = { 0 };
+  top.value = mi_atomic_read_relaxed(&cache_count[numa_node]);
+  for (int i = 0; i < top.x.count; i++) {
+    slot = &cache[numa_node][i];
+    void* p = mi_atomic_read_ptr_relaxed(mi_cache_slot_t, &slot->p);
+    if (p > MI_SLOT_IN_USE && !slot->is_committed && !slot->is_large) {
+      mi_msecs_t expire = slot->expire;
+      if (expire != 0 && now >= expire) {
+        // expired, try to claim it
+        if (mi_atomic_cas_ptr_weak(mi_cache_slot_t, &slot->p, MI_SLOT_IN_USE, p)) {
+          // claimed! test again
+          if (slot->is_committed && !slot->is_large && now >= slot->expire) {
             _mi_abandoned_await_readers();   // wait until safe to decommit
-          _mi_os_decommit(slot->p, MI_SEGMENT_SIZE, tld->stats);
+            _mi_os_decommit(p, MI_SEGMENT_SIZE, tld->stats);
             slot->is_committed = false;
           }
-        mi_bitmap_unclaim(cache_available, MI_CACHE_FIELDS, 1, bitidx);   // make it available again for a pop
+          // and unclaim again
+          mi_atomic_write_ptr(mi_cache_slot_t, &slot->p, p);
+          purged++;
+          if (purged >= 4) break;   // limit to at most 4 decommits per push
+        }
       }
-      if (purged > 4) break;   // bound to no more than 4 purge tries per push
     }
   }
 }

 static bool mi_cache_push(void* start, size_t size, size_t memid, bool is_committed, bool is_large, mi_os_tld_t* tld)
 {
+  mi_cache_purge(tld);
+
   // only for segment blocks
   if (size != MI_SEGMENT_SIZE || ((uintptr_t)start % MI_SEGMENT_ALIGN) != 0) return false;

-  // numa node determines start field
+  // try to add it to the cache
   int numa_node = _mi_os_numa_node(NULL);
-  size_t start_field = 0;
-  if (numa_node > 0) {
-    start_field = (MI_CACHE_FIELDS / _mi_os_numa_node_count())*numa_node;
-    if (start_field >= MI_CACHE_FIELDS) start_field = 0;
-  }
-
-  // purge expired entries
-  mi_cache_purge(tld);
-
-  // find an available slot
-  mi_bitmap_index_t bitidx;
-  bool claimed = mi_bitmap_try_find_from_claim(cache_inuse, MI_CACHE_FIELDS, start_field, 1, &bitidx);
-  if (!claimed) return false;
-  mi_assert_internal(mi_bitmap_is_claimed(cache_available, MI_CACHE_FIELDS, 1, bitidx));
-  mi_assert_internal(mi_bitmap_is_claimed(cache_available_large, MI_CACHE_FIELDS, 1, bitidx));
-
-  // set the slot
-  mi_cache_slot_t* slot = &cache[mi_bitmap_index_bit(bitidx)];
-  slot->p = start;
-  slot->memid = memid;
-  slot->expire = 0;
-  slot->is_committed = is_committed;
-  if (is_committed && !is_large) {
-    long delay = mi_option_get(mi_option_arena_reset_delay);
-    if (delay == 0) {
-      _mi_abandoned_await_readers();   // wait until safe to decommit
-      _mi_os_decommit(start, size, tld->stats);
-      slot->is_committed = false;
+  if (numa_node > MI_MAX_NUMA) numa_node %= MI_MAX_NUMA;
+  mi_cache_slot_t* slot;
+  mi_cache_count_t top = { 0 };
+  top.value = mi_atomic_read_relaxed(&cache_count[numa_node]);
+  for (int16_t i = top.x.count; i < MI_CACHE_MAX; i++) {
+    slot = &cache[numa_node][i];
+    void* p = mi_atomic_read_ptr_relaxed(mi_cache_slot_t, &slot->p);
+    if (p == NULL) {   // free slot
+      if (mi_atomic_cas_ptr_weak(mi_cache_slot_t, &slot->p, MI_SLOT_IN_USE, NULL)) {
+        // claimed!
+        // first try to increase the top bound
+        mi_cache_count_t newtop = { 0 };
+        newtop.x.count = i+1;
+        newtop.x.epoch = top.x.epoch + 1;
+        while (!mi_atomic_cas_strong(&cache_count[numa_node], newtop.value, top.value)) {
+          top.value = mi_atomic_read_relaxed(&cache_count[numa_node]);
+          if (top.x.count > newtop.x.count) break;   // another push max'd it
+          newtop.x.epoch = top.x.epoch + 1;          // otherwise try again
+        }
+
+        // set the slot
+        slot->expire = 0;
+        slot->is_committed = is_committed;
+        slot->memid = memid;
+        slot->is_large = is_large;
+        if (is_committed) {
+          long delay = mi_option_get(mi_option_arena_reset_delay);
+          if (delay == 0 && !is_large) {
+            _mi_abandoned_await_readers();   // wait until safe to decommit
+            _mi_os_decommit(start, size, tld->stats);
+            slot->is_committed = false;
@@ -263,10 +306,12 @@ static bool mi_cache_push(void* start, size_t size, size_t memid, bool is_commit
             slot->expire = _mi_clock_now() + delay;
           }
         }
-  // make it available
-  mi_bitmap_unclaim((is_large ? cache_available_large : cache_available), MI_CACHE_FIELDS, 1, bitidx);
+        mi_atomic_write_ptr(mi_cache_slot_t, &slot->p, start);   // and make it available
         return true;
+      }
+    }
+  }
+  return false;
 }
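
Note on the restored design: a slot's pointer doubles as its state word. NULL means the slot is free, MI_SLOT_IN_USE ((void*)1) marks it as transiently claimed, and any other value is a cached block that a pop may take. Every transition goes through a single compare-and-swap on that pointer, so at most one thread at a time reads or rewrites the other slot fields. The sketch below shows the protocol in isolation; it uses plain C11 atomics and hypothetical names (slot_t, slot_pop, slot_push), not mimalloc's mi_atomic_* wrappers:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define SLOT_IN_USE ((void*)1)   // sentinel: slot is claimed by some thread

typedef struct slot_s {
  _Atomic(void*) p;       // NULL = free, SLOT_IN_USE = claimed, else a cached block
  size_t         memid;   // only read/written while holding the claim
} slot_t;

// try to take the cached block out of `slot`; returns it, or NULL on any race
static void* slot_pop(slot_t* slot, size_t* memid) {
  void* p = atomic_load_explicit(&slot->p, memory_order_relaxed);
  if (p == NULL || p == SLOT_IN_USE) return NULL;       // empty or busy
  if (!atomic_compare_exchange_weak_explicit(&slot->p, &p, SLOT_IN_USE,
        memory_order_acquire, memory_order_relaxed)) {
    return NULL;                                        // lost the race; caller scans on
  }
  *memid = slot->memid;                                 // safe: we hold the claim
  atomic_store_explicit(&slot->p, NULL, memory_order_release);   // mark the slot free
  return p;
}

// try to stash `block` in a free slot; false means the caller tries the next slot
static bool slot_push(slot_t* slot, void* block, size_t memid) {
  void* expected = NULL;
  if (!atomic_compare_exchange_weak_explicit(&slot->p, &expected, SLOT_IN_USE,
        memory_order_acquire, memory_order_relaxed)) {
    return false;                                       // slot not free
  }
  slot->memid = memid;                                  // fill fields while claimed
  atomic_store_explicit(&slot->p, block, memory_order_release);  // publish the block
  return true;
}

The commit's mi_cache_pop does one extra step this sketch omits: if the claimed slot turns out to hold a large-page block the caller cannot accept, it writes the old pointer back to release the claim and keeps scanning.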
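The per-node cache_count word packs a count (an upper bound on the occupied slots) together with an epoch that is bumped on every update. Because a compare-and-swap must match both halves, a thread holding a stale count cannot reinstate an old bound after concurrent pushes and pops have moved it (the usual ABA guard), and as the comment in mi_cache_pop notes, a failed CAS is harmless and only lengthens later scans. A minimal standalone illustration, assuming a 64-bit uintptr_t and hypothetical names:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef union cache_count_u {   // assumes 64-bit uintptr_t; the real code shrinks the epoch on 32-bit
  uintptr_t value;
  struct {
    int16_t  count;   // current top bound of the slot array
    uint32_t epoch;   // bumped on every update
  } x;
} cache_count_t;

static _Atomic(uintptr_t) cache_count_word;

// try to raise the top bound to `new_count`; returns the bound that won
static int16_t raise_top(int16_t new_count) {
  cache_count_t top = { .value = atomic_load_explicit(&cache_count_word, memory_order_relaxed) };
  for (;;) {
    if (top.x.count >= new_count) return top.x.count;   // another thread raised it further
    cache_count_t newtop = { 0 };
    newtop.x.count = new_count;
    newtop.x.epoch = top.x.epoch + 1;                   // always advance the epoch
    if (atomic_compare_exchange_weak_explicit(&cache_count_word, &top.value, newtop.value,
          memory_order_acq_rel, memory_order_relaxed)) {
      return new_count;
    }
    // a failed CAS reloaded top.value; loop and retry against the fresh word
  }
}

int main(void) {
  printf("top is now %d\n", (int)raise_top(5));
  printf("top is now %d\n", (int)raise_top(3));   // stays 5
  return 0;
}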
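Finally, expire implements delayed decommit: a committed block pushed into the cache is stamped with now + mi_option_arena_reset_delay (a zero delay decommits immediately), and mi_cache_purge later decommits expired blocks, capped at four per call so a push never stalls on OS work. A rough single-threaded sketch of that policy with hypothetical names; the real code wraps both steps in the claim/unclaim CAS shown above:

#include <stdbool.h>
#include <stdint.h>

typedef int64_t msecs_t;

typedef struct cslot_s {
  void*   p;              // cached block (ignored here)
  msecs_t expire;         // 0 = no decommit pending
  bool    is_committed;
} cslot_t;

// on push: schedule (or perform) the decommit of a committed block
static void schedule_decommit(cslot_t* slot, msecs_t now, long delay) {
  slot->expire = 0;
  if (!slot->is_committed) return;
  if (delay == 0) {
    slot->is_committed = false;   // stands in for an immediate _mi_os_decommit
  }
  else {
    slot->expire = now + delay;   // a later purge decommits once this passes
  }
}

// on purge: decommit at most `budget` expired slots so the caller stays cheap
static int purge_expired(cslot_t* slots, int n, msecs_t now, int budget) {
  int purged = 0;
  for (int i = 0; i < n && purged < budget; i++) {
    cslot_t* s = &slots[i];
    if (s->is_committed && s->expire != 0 && now >= s->expire) {
      s->is_committed = false;    // stands in for _mi_os_decommit
      s->expire = 0;
      purged++;
    }
  }
  return purged;
}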