From 72d8608333ac776d6615798e19ff1858cd471ba9 Mon Sep 17 00:00:00 2001
From: daan
Date: Mon, 15 Jul 2019 17:35:43 -0700
Subject: [PATCH] avoid thread over-allocation on initial region allocations

---
 ide/vs2017/mimalloc.vcxproj |  6 ++--
 include/mimalloc-atomic.h   | 21 ++++++------
 src/memory.c                | 65 ++++++++++++++++++++++---------------
 3 files changed, 54 insertions(+), 38 deletions(-)

diff --git a/ide/vs2017/mimalloc.vcxproj b/ide/vs2017/mimalloc.vcxproj
index 3f7c9ae1..bb1818b0 100644
--- a/ide/vs2017/mimalloc.vcxproj
+++ b/ide/vs2017/mimalloc.vcxproj
@@ -95,6 +95,7 @@
       ../../include
       MI_DEBUG=3;%(PreprocessorDefinitions);
       Default
+      false
@@ -112,6 +113,7 @@
       ../../include
       MI_DEBUG=3;%(PreprocessorDefinitions);
       Default
+      false
@@ -140,7 +142,7 @@
       %(PreprocessorDefinitions);NDEBUG
       AssemblyAndSourceCode
       $(IntDir)
-      true
+      false
       false
       AnySuitable
       Neither
@@ -171,7 +173,7 @@
       %(PreprocessorDefinitions);NDEBUG
       AssemblyAndSourceCode
       $(IntDir)
-      true
+      false
       false
       AnySuitable
       Neither
diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h
index 1b6cb0f4..d504634c 100644
--- a/include/mimalloc-atomic.h
+++ b/include/mimalloc-atomic.h
@@ -42,14 +42,22 @@ static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exch
 // Atomically read a value
 static inline uintptr_t mi_atomic_read(volatile uintptr_t* p);
 
-// Atomically read a pointer
-static inline void* mi_atomic_read_ptr(volatile void** p);
-
 // Atomically write a value
 static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x);
 
+// Atomically read a pointer
+static inline void* mi_atomic_read_ptr(volatile void** p) {
+  return (void*)mi_atomic_read( (volatile uintptr_t*)p );
+}
+
 static inline void mi_atomic_yield(void);
 
+
+// Atomically write a pointer
+static inline void mi_atomic_write_ptr(volatile void** p, void* x) {
+  mi_atomic_write((volatile uintptr_t*)p, (uintptr_t)x );
+}
+
 // Atomically compare and exchange a pointer; returns `true` if successful.
 static inline bool mi_atomic_compare_exchange_ptr(volatile void** p, void* newp, void* compare) {
   return mi_atomic_compare_exchange((volatile uintptr_t*)p, (uintptr_t)newp, (uintptr_t)compare);
@@ -99,9 +107,6 @@ static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exch
 static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
   return *p;
 }
-static inline void* mi_atomic_read_ptr(volatile void** p) {
-  return (void*)(*p);
-}
 static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
   *p = x;
 }
@@ -171,10 +176,6 @@ static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
   MI_USING_STD
   return atomic_load_explicit((volatile atomic_uintptr_t*)p, memory_order_relaxed);
 }
-static inline void* mi_atomic_read_ptr(volatile void** p) {
-  MI_USING_STD
-  return atomic_load_explicit((volatile _Atomic(void*)*)p, memory_order_relaxed);
-}
 static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
   MI_USING_STD
   return atomic_store_explicit((volatile atomic_uintptr_t*)p, x, memory_order_relaxed);
diff --git a/src/memory.c b/src/memory.c
index 6a72e2e0..030541a6 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -7,13 +7,16 @@ terms of the MIT license. A copy of the license can be found in the file
 /* ----------------------------------------------------------------------------
 This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
-and the segment and huge object allocation by mimalloc. In contrast to the
-rest of mimalloc, this uses thread-shared "regions" that are accessed using
-atomic operations. We need this layer because of:
+and the segment and huge object allocation by mimalloc. There may be multiple
+implementations of this (one could be the identity, going directly to the OS;
+another could be a simple cache, etc.), but the current one uses large "regions".
+In contrast to the rest of mimalloc, the "regions" are shared between threads and
+need to be accessed using atomic operations.
+We need this memory layer between mimalloc and the raw OS calls because:
 1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
-   to reuse memory
+   to reuse memory effectively.
 2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
-   an OS allocation/free is still too expensive relative to the accesses in that
+   an OS allocation/free is still (much) too expensive relative to the accesses in that
    object :-( (`malloc-large` tests this). This means we need a cheaper way to
    reuse memory.
 3. This layer can help with a NUMA aware allocation in the future.
@@ -34,7 +37,7 @@ Possible issues:
 #include <string.h>  // memset
 
-// Internal OS interface
+// Internal raw OS interface
 size_t _mi_os_large_page_size();
 bool _mi_os_protect(void* addr, size_t size);
 bool _mi_os_unprotect(void* addr, size_t size);
@@ -76,7 +79,7 @@ typedef struct mem_region_s {
 
 static mem_region_t regions[MI_REGION_MAX];
 static volatile size_t regions_count = 0;        // allocated regions
-static volatile uintptr_t region_next_idx = 0;
+static volatile uintptr_t region_next_idx = 0;   // good place to start searching
 
/* ----------------------------------------------------------------------------
@@ -105,6 +108,8 @@ static size_t mi_good_commit_size(size_t size) {
 Commit from a region
 -----------------------------------------------------------------------------*/
 
+#define ALLOCATING ((void*)1)
+
 // Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`.
 // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
 // if the blocks were successfully claimed, so ensure they are initialized to NULL/SIZE_MAX before the call.
@@ -115,9 +120,25 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
   mi_assert_internal((mask & mi_atomic_read(&region->map)) == mask); // ensure the region is reserved
-  void* start = mi_atomic_read_ptr(&region->start);
-  if (start == NULL) {
+  void* start;
+  do {
+    start = mi_atomic_read_ptr(&region->start);
+    if (start == NULL) {
+      start = ALLOCATING; // try to start allocating
+    }
+    else if (start == ALLOCATING) {
+      mi_atomic_yield(); // another thread is already allocating... wait it out
+      continue;
+    }
+  } while( start == ALLOCATING && !mi_atomic_compare_exchange_ptr(&region->start, ALLOCATING, NULL) );
+  mi_assert_internal(start != NULL);
+
+  // allocate the region if needed
+  if (start == ALLOCATING) {
     start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld);
+    // set the new allocation (or NULL on failure) -- this releases any waiting threads.
+    mi_atomic_write_ptr(&region->start, start);
+    if (start == NULL) { // failure to allocate from the OS! unclaim the blocks and fail
       size_t map;
@@ -126,22 +147,14 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
       } while (!mi_atomic_compare_exchange(&region->map, map & ~mask, map));
       return false;
     }
-    // set the newly allocated region
-    if (mi_atomic_compare_exchange_ptr(&region->start, start, NULL)) {
-      // update the region count
-      mi_atomic_increment(&regions_count);
-    }
-    else {
-      // failed, another thread allocated just before us, free our allocated memory
-      // TODO: should we keep the allocated memory and assign it to some other region?
-      _mi_os_free(start, MI_REGION_SIZE, tld->stats);
-      start = mi_atomic_read_ptr(&region->start);
-    }
+
+    // update the region count if this is a new max idx.
+    mi_atomic_compare_exchange(&regions_count, idx+1, idx);
   }
+  mi_assert_internal(start != NULL && start != ALLOCATING);
+  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
 
   // Commit the blocks to memory
-  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
-  mi_assert_internal(start != NULL);
   void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
   if (commit && !mi_option_is_enabled(mi_option_eager_region_commit)) {
     _mi_os_commit(blocks_start, mi_good_commit_size(size), tld->stats); // only commit needed size (unless using large OS pages)
@@ -174,7 +187,7 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc
   bitidx = 0;
   do {
     // skip ones
-    while ((m&1) == 1) { bitidx++; m>>=1; }
+    while ((m&1) != 0) { bitidx++; m>>=1; }
     // count zeros
     mi_assert_internal((m&1)==0);
     size_t zeros = 1;
@@ -315,14 +328,14 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
   // reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large
   if (!mi_option_is_enabled(mi_option_large_os_pages)) {
     if (mi_option_is_enabled(mi_option_eager_region_commit)) {
-      //_mi_os_reset(p, size, stats); // 10x slowdown on malloc-large
+      //_mi_os_reset(p, size, stats);
     }
     else {
-      //_mi_os_decommit(p, size, stats); // 17x slowdown on malloc-large
+      //_mi_os_decommit(p, size, stats);
     }
   }
-  // TODO: should we free empty regions?
+  // TODO: should we free empty regions? currently this is only done in _mi_mem_collect.
   // this frees up virtual address space which
   // might be useful on 32-bit systems?
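
The heart of this patch is the claim-with-sentinel handshake on `region->start`: a NULL slot is claimed by CAS-ing it to the ALLOCATING sentinel, exactly one winner performs the expensive OS allocation, and every other thread yields until the real pointer (or NULL, on failure) is published. The sketch below shows the same handshake in isolation. It is illustrative only: it uses C11 <stdatomic.h> and POSIX sched_yield() in place of the mi_atomic_* wrappers and mi_atomic_yield(), and the names slot_get_or_alloc and os_alloc are hypothetical, not mimalloc APIs.

  #include <stdatomic.h>
  #include <stddef.h>
  #include <sched.h>

  #define CLAIMED ((void*)1)           // sentinel, analogous to ALLOCATING above

  static _Atomic(void*) slot;          // starts out NULL, like region->start

  extern void* os_alloc(size_t size);  // hypothetical stand-in for _mi_os_alloc_aligned

  void* slot_get_or_alloc(size_t size) {
    void* p = atomic_load(&slot);
    for (;;) {
      if (p == NULL) {
        void* expected = NULL;
        // Try to claim the slot; exactly one thread wins this CAS.
        if (atomic_compare_exchange_strong(&slot, &expected, CLAIMED)) {
          p = os_alloc(size);      // only the winner pays for the OS call
          atomic_store(&slot, p);  // publish the result; NULL (failure) also releases waiters
          return p;
        }
        p = expected;              // lost the race; `expected` now holds the current value
      }
      else if (p == CLAIMED) {
        sched_yield();             // another thread is allocating; wait it out
        p = atomic_load(&slot);
      }
      else {
        return p;                  // already allocated
      }
    }
  }

Publishing NULL on failure matters: it unblocks the waiters and leaves the slot claimable again, which is why the patch writes the allocation result back unconditionally before checking it for NULL ("this releases any waiting threads").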
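The second behavioral change is easier to miss. The old code bumped regions_count with an unconditional increment and, on a lost race, had to free the region it had just allocated. With the sentinel handshake each region's start is published exactly once, so the count can instead be advanced by a compare-and-swap that fires only when region idx is the next uncounted one (note that mi_atomic_compare_exchange takes (ptr, exchange, compare), so the patch's call reads "set to idx+1 if currently idx"). regions_count thus becomes a monotone lower bound rather than an exact tally. A freestanding sketch of the same idea, with illustrative names:

  #include <stdatomic.h>
  #include <stddef.h>

  static _Atomic(size_t) regions_count;  // lower bound on the number of initialized regions

  // Called at most once per region, after its start pointer is published.
  static void note_region_initialized(size_t idx) {
    size_t expected = idx;
    // Advance only if `idx` is the next uncounted region. If regions are
    // published out of order the count briefly lags, which is acceptable
    // when it is used as a search bound rather than an exact size.
    atomic_compare_exchange_strong(&regions_count, &expected, idx + 1);
  }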
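For context on the `(m&1) != 0` hunk (a readability change only): the surrounding loop scans the region's bitmap for a run of `blocks` contiguous zero bits, i.e. free blocks. Below is a simplified single-word version of that scan, without the atomic claiming, rollback, and index wrap-around of the real mi_region_alloc_blocks; it is an illustrative sketch, not mimalloc code.

  #include <stdbool.h>
  #include <stddef.h>
  #include <stdint.h>

  // Find `blocks` contiguous 0-bits in `map`; on success store the starting
  // bit position in *bitidx and return true.
  static bool find_zero_run(uintptr_t map, size_t blocks, size_t* bitidx) {
    const size_t nbits = sizeof(uintptr_t) * 8;
    uintptr_t m = map;
    size_t bit = 0;
    while (bit + blocks <= nbits) {
      while ((m & 1) != 0) { bit++; m >>= 1; }    // skip ones
      size_t zeros = 0;                           // count zeros
      while (zeros < blocks && bit + zeros < nbits && ((m >> zeros) & 1) == 0) {
        zeros++;
      }
      if (zeros >= blocks) { *bitidx = bit; return true; }
      bit += zeros; m >>= zeros;                  // run too short; skip past it
    }
    return false;
  }

For example, find_zero_run(0x17, 2, &i) skips the ones at bits 0-2, rejects the lone zero at bit 3, and succeeds at bit 5 (0x17 is binary 10111).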