avoid thread over-allocation on initial region allocations

daan 2019-07-15 17:35:43 -07:00
parent 7ea9cf8d1f
commit 72d8608333
3 changed files with 54 additions and 38 deletions

@@ -95,6 +95,7 @@
 <AdditionalIncludeDirectories>../../include</AdditionalIncludeDirectories>
 <PreprocessorDefinitions>MI_DEBUG=3;%(PreprocessorDefinitions);</PreprocessorDefinitions>
 <CompileAs>Default</CompileAs>
+<SupportJustMyCode>false</SupportJustMyCode>
 </ClCompile>
 <Lib>
 <AdditionalLibraryDirectories>
@@ -112,6 +113,7 @@
 <AdditionalIncludeDirectories>../../include</AdditionalIncludeDirectories>
 <PreprocessorDefinitions>MI_DEBUG=3;%(PreprocessorDefinitions);</PreprocessorDefinitions>
 <CompileAs>Default</CompileAs>
+<SupportJustMyCode>false</SupportJustMyCode>
 </ClCompile>
 <PostBuildEvent>
 <Command>
@@ -140,7 +142,7 @@
 <PreprocessorDefinitions>%(PreprocessorDefinitions);NDEBUG</PreprocessorDefinitions>
 <AssemblerOutput>AssemblyAndSourceCode</AssemblerOutput>
 <AssemblerListingLocation>$(IntDir)</AssemblerListingLocation>
-<WholeProgramOptimization>true</WholeProgramOptimization>
+<WholeProgramOptimization>false</WholeProgramOptimization>
 <BufferSecurityCheck>false</BufferSecurityCheck>
 <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
 <FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>
@@ -171,7 +173,7 @@
 <PreprocessorDefinitions>%(PreprocessorDefinitions);NDEBUG</PreprocessorDefinitions>
 <AssemblerOutput>AssemblyAndSourceCode</AssemblerOutput>
 <AssemblerListingLocation>$(IntDir)</AssemblerListingLocation>
-<WholeProgramOptimization>true</WholeProgramOptimization>
+<WholeProgramOptimization>false</WholeProgramOptimization>
 <BufferSecurityCheck>false</BufferSecurityCheck>
 <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
 <FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>

@@ -42,14 +42,22 @@ static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exch
 // Atomically read a value
 static inline uintptr_t mi_atomic_read(volatile uintptr_t* p);
-// Atomically read a pointer
-static inline void* mi_atomic_read_ptr(volatile void** p);
 // Atomically write a value
 static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x);
+// Atomically read a pointer
+static inline void* mi_atomic_read_ptr(volatile void** p) {
+  return (void*)mi_atomic_read( (volatile uintptr_t*)p );
+}
 static inline void mi_atomic_yield(void);
+// Atomically write a pointer
+static inline void mi_atomic_write_ptr(volatile void** p, void* x) {
+  mi_atomic_write((volatile uintptr_t*)p, (uintptr_t)x );
+}
 // Atomically compare and exchange a pointer; returns `true` if successful.
 static inline bool mi_atomic_compare_exchange_ptr(volatile void** p, void* newp, void* compare) {
   return mi_atomic_compare_exchange((volatile uintptr_t*)p, (uintptr_t)newp, (uintptr_t)compare);
@@ -99,9 +107,6 @@ static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exch
 static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
   return *p;
 }
-static inline void* mi_atomic_read_ptr(volatile void** p) {
-  return (void*)(*p);
-}
 static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
   *p = x;
 }
@@ -171,10 +176,6 @@ static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
   MI_USING_STD
   return atomic_load_explicit((volatile atomic_uintptr_t*)p, memory_order_relaxed);
 }
-static inline void* mi_atomic_read_ptr(volatile void** p) {
-  MI_USING_STD
-  return atomic_load_explicit((volatile _Atomic(void*)*)p, memory_order_relaxed);
-}
 static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
   MI_USING_STD
   return atomic_store_explicit((volatile atomic_uintptr_t*)p, x, memory_order_relaxed);
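
The net effect of this header change is that the pointer atomics are no longer defined per backend (note the two removed `mi_atomic_read_ptr` definitions above): `mi_atomic_read_ptr`, and the new `mi_atomic_write_ptr`, are now thin casts over the `uintptr_t` primitives, written once. A minimal standalone sketch of the same pattern, using C11 atomics directly and hypothetical `my_*` names instead of mimalloc's wrappers:

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

// Backend-specific primitives: only the uintptr_t versions need a real implementation.
static inline uintptr_t my_atomic_read(volatile uintptr_t* p) {
  return atomic_load_explicit((volatile atomic_uintptr_t*)p, memory_order_relaxed);
}
static inline void my_atomic_write(volatile uintptr_t* p, uintptr_t x) {
  atomic_store_explicit((volatile atomic_uintptr_t*)p, x, memory_order_relaxed);
}

// Pointer variants are generic wrappers: a cast, not a second backend implementation.
static inline void* my_atomic_read_ptr(volatile void** p) {
  return (void*)my_atomic_read((volatile uintptr_t*)p);
}
static inline void my_atomic_write_ptr(volatile void** p, void* x) {
  my_atomic_write((volatile uintptr_t*)p, (uintptr_t)x);
}

int main(void) {
  static volatile void* slot = NULL;
  static int value = 42;
  my_atomic_write_ptr(&slot, &value);
  printf("%d\n", *(int*)my_atomic_read_ptr(&slot));  // prints 42
  return 0;
}

With this shape, a new atomics backend only needs the `uintptr_t` read/write/compare-exchange primitives; the pointer variants come along for free, which is what makes the `mi_atomic_write_ptr` used by memory.c below cheap to add.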

@@ -7,13 +7,16 @@ terms of the MIT license. A copy of the license can be found in the file
 /* ----------------------------------------------------------------------------
 This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
-and the segment and huge object allocation by mimalloc. In contrast to the
-rest of mimalloc, this uses thread-shared "regions" that are accessed using
-atomic operations. We need this layer because of:
+and the segment and huge object allocation by mimalloc. There may be multiple
+implementations of this (one could be the identity going directly to the OS,
+another could be a simple cache etc), but the current one uses large "regions".
+In contrast to the rest of mimalloc, the "regions" are shared between threads and
+need to be accessed using atomic operations.
+We need this memory layer between the raw OS calls because of:
 1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
-   to reuse memory
+   to reuse memory effectively.
 2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
-   an OS allocation/free is still too expensive relative to the accesses in that
+   an OS allocation/free is still (much) too expensive relative to the accesses in that
 object :-( (`mallloc-large` tests this). This means we need a cheaper way to
 reuse memory.
 3. This layer can help with a NUMA aware allocation in the future.
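
A "region" here is a large chunk of OS memory carved into segment-sized blocks, with a shared bitmap tracking which blocks are in use; the rest of this diff manipulates each region's `map` bitmap and `start` pointer. A hypothetical, simplified descriptor to make that concrete (the field names `map` and `start` mirror the code below, while the sizes are illustrative assumptions only, not mimalloc's actual constants):

#include <stddef.h>
#include <stdint.h>

// Illustrative sizes only.
#define SKETCH_SEGMENT_SIZE       ((size_t)4*1024*1024)     // one block per segment
#define SKETCH_BLOCKS_PER_REGION  (8*sizeof(uintptr_t))     // one bit per block in `map`
#define SKETCH_REGION_SIZE        (SKETCH_BLOCKS_PER_REGION * SKETCH_SEGMENT_SIZE)

typedef struct sketch_region_s {
  volatile uintptr_t map;   // bitmap: bit i set means block i is claimed
  volatile void*     start; // OS memory backing the whole region; NULL until first reserved
} sketch_region_t;

// Once `start` is set, a block is addressed by its bit index.
static inline void* sketch_block_at(sketch_region_t* region, size_t bitidx) {
  return (uint8_t*)region->start + bitidx * SKETCH_SEGMENT_SIZE;
}

Threads claim blocks by setting bits in `map` with compare-and-swap; what this commit changes is which thread gets to initialize `start` the first time a fresh region is used.
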
@@ -34,7 +37,7 @@ Possible issues:
 #include <string.h> // memset
-// Internal OS interface
+// Internal raw OS interface
 size_t _mi_os_large_page_size();
 bool _mi_os_protect(void* addr, size_t size);
 bool _mi_os_unprotect(void* addr, size_t size);
@@ -76,7 +79,7 @@ typedef struct mem_region_s {
 static mem_region_t regions[MI_REGION_MAX];
 static volatile size_t regions_count = 0; // allocated regions
-static volatile uintptr_t region_next_idx = 0;
+static volatile uintptr_t region_next_idx = 0; // good place to start searching
 /* ----------------------------------------------------------------------------
@@ -105,6 +108,8 @@ static size_t mi_good_commit_size(size_t size) {
 Commit from a region
 -----------------------------------------------------------------------------*/
+#define ALLOCATING ((void*)1)
 // Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`.
 // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
 // if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
@@ -115,9 +120,25 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
   mi_assert_internal((mask & mi_atomic_read(&region->map)) == mask);
   // ensure the region is reserved
-  void* start = mi_atomic_read_ptr(&region->start);
-  if (start == NULL) {
+  void* start;
+  do {
+    start = mi_atomic_read_ptr(&region->start);
+    if (start == NULL) {
+      start = ALLOCATING; // try to start allocating
+    }
+    else if (start == ALLOCATING) {
+      mi_atomic_yield(); // another thead is already allocating.. wait it out
+      continue;
+    }
+  } while( start == ALLOCATING && !mi_atomic_compare_exchange_ptr(&region->start, ALLOCATING, NULL) );
+  mi_assert_internal(start != NULL);
+  // allocate the region if needed
+  if (start == ALLOCATING) {
     start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld);
+    // set the new allocation (or NULL on failure) -- this releases any waiting threads.
+    mi_atomic_write_ptr(&region->start, start);
     if (start == NULL) {
       // failure to allocate from the OS! unclaim the blocks and fail
       size_t map;
@@ -126,22 +147,14 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
       } while (!mi_atomic_compare_exchange(&region->map, map & ~mask, map));
       return false;
     }
-    // set the newly allocated region
-    if (mi_atomic_compare_exchange_ptr(&region->start, start, NULL)) {
-      // update the region count
-      mi_atomic_increment(&regions_count);
-    }
-    else {
-      // failed, another thread allocated just before us, free our allocated memory
-      // TODO: should we keep the allocated memory and assign it to some other region?
-      _mi_os_free(start, MI_REGION_SIZE, tld->stats);
-      start = mi_atomic_read_ptr(&region->start);
-    }
+    // update the region count if this is a new max idx.
+    mi_atomic_compare_exchange(&regions_count, idx+1, idx);
   }
+  mi_assert_internal(start != NULL && start != ALLOCATING);
+  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
   // Commit the blocks to memory
-  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
-  mi_assert_internal(start != NULL);
   void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
   if (commit && !mi_option_is_enabled(mi_option_eager_region_commit)) {
     _mi_os_commit(blocks_start, mi_good_commit_size(size), tld->stats); // only commit needed size (unless using large OS pages)
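
The heart of the change is the claim protocol on `region->start` above: instead of letting every racing thread call `_mi_os_alloc_aligned` and then freeing the losers' allocations (the removed `_mi_os_free` branch), the first thread publishes the `ALLOCATING` sentinel with a compare-and-swap and the others spin on `mi_atomic_yield` until the real pointer (or NULL on failure) is written back. A standalone sketch of that pattern, with hypothetical names and plain C11 atomics in place of mimalloc's wrappers:

#include <stdatomic.h>
#include <stddef.h>
#include <sched.h>   // sched_yield() (POSIX), standing in for mi_atomic_yield()

#define CLAIMING ((void*)1)   // sentinel, playing the role of ALLOCATING above

// One-shot, thread-safe initialization of `*slot` via `alloc()`:
// exactly one thread calls alloc(); everyone else waits for the published result.
static void* claim_or_wait(_Atomic(void*)* slot, void* (*alloc)(void)) {
  for (;;) {
    void* cur = atomic_load(slot);
    if (cur == NULL) {
      void* expected = NULL;
      // try to become the allocating thread by installing the sentinel
      if (atomic_compare_exchange_strong(slot, &expected, CLAIMING)) {
        void* p = alloc();       // may be NULL on out-of-memory
        atomic_store(slot, p);   // publish the result (or NULL); releases the waiters
        return p;
      }
      // lost the race: loop and observe what the winner published
    }
    else if (cur == CLAIMING) {
      sched_yield();             // another thread is allocating; wait it out
    }
    else {
      return cur;                // already initialized
    }
  }
}

Before this change, two threads hitting a fresh region at the same time would each allocate MI_REGION_SIZE from the OS and one allocation was immediately freed again; with the sentinel, at most one OS allocation is attempted per region, which is the "thread over-allocation on initial region allocations" the commit title refers to. As in the diff, publishing NULL on failure leaves the slot retryable later.
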
@@ -174,7 +187,7 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc
   bitidx = 0;
   do {
     // skip ones
-    while ((m&1) == 1) { bitidx++; m>>=1; }
+    while ((m&1) != 0) { bitidx++; m>>=1; }
     // count zeros
     mi_assert_internal((m&1)==0);
     size_t zeros = 1;
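
For context, this loop scans a region's bitmap for a run of free (zero) bits; the only change is spelling the skip-ones test as `(m&1) != 0`. A hedged standalone sketch of such a scan (a hypothetical helper, not mimalloc's exact `mi_region_alloc_blocks` logic), finding `count` consecutive zero bits in a single word:

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Find `count` consecutive zero bits in `map`, scanning from bit 0 upward.
// On success, writes the index of the first free bit to `*bitidx` and returns true.
// (The real code additionally claims the found bits atomically with compare-and-swap.)
static bool find_zero_run(uintptr_t map, size_t count, size_t* bitidx) {
  const size_t nbits = 8*sizeof(uintptr_t);
  uintptr_t m = map;
  size_t idx = 0;
  while (idx + count <= nbits) {
    // skip ones
    while ((m & 1) != 0) { idx++; m >>= 1; }
    if (idx + count > nbits) break;
    // count zeros
    size_t zeros = 0;
    while (zeros < count && idx + zeros < nbits && (m & 1) == 0) { zeros++; m >>= 1; }
    if (zeros == count) { *bitidx = idx; return true; }
    idx += zeros;  // run too short; resume the scan after it
  }
  return false;
}

The `region_next_idx` hint annotated earlier in this diff ("good place to start searching") serves a similar purpose one level up: start the region search where space was last found instead of always from region 0.
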
@@ -315,14 +328,14 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
+  // reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large
   if (!mi_option_is_enabled(mi_option_large_os_pages)) {
     if (mi_option_is_enabled(mi_option_eager_region_commit)) {
-      //_mi_os_reset(p, size, stats); // 10x slowdown on malloc-large
+      //_mi_os_reset(p, size, stats);
     }
     else {
-      //_mi_os_decommit(p, size, stats); // 17x slowdown on malloc-large
+      //_mi_os_decommit(p, size, stats);
     }
   }
-  // TODO: should we free empty regions?
+  // TODO: should we free empty regions? currently only done _mi_mem_collect.
   // this frees up virtual address space which
   // might be useful on 32-bit systems?