diff --git a/CMakeLists.txt b/CMakeLists.txt index 3ea4e607..39a671a0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -82,7 +82,7 @@ endif() if(MI_USE_CXX MATCHES "ON") message(STATUS "Use the C++ compiler to compile (MI_USE_CXX=ON)") set_source_files_properties(${mi_sources} PROPERTIES LANGUAGE CXX ) - set_source_files_properties(src/static.c test/test-api.c test/test-stress.c PROPERTIES LANGUAGE CXX ) + set_source_files_properties(src/static.c test/test-api.c PROPERTIES LANGUAGE CXX ) endif() # Compiler flags diff --git a/src/memory.c b/src/memory.c index 732aa687..2eb49f41 100644 --- a/src/memory.c +++ b/src/memory.c @@ -11,24 +11,24 @@ and the segment and huge object allocation by mimalloc. There may be multiple implementations of this (one could be the identity going directly to the OS, another could be a simple cache etc), but the current one uses large "regions". In contrast to the rest of mimalloc, the "regions" are shared between threads and -need to be accessed using atomic operations. +need to be accessed using atomic operations. We need this memory layer between the raw OS calls because of: 1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order to reuse memory effectively. 2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of an OS allocation/free is still (much) too expensive relative to the accesses in that - object :-( (`mallloc-large` tests this). This means we need a cheaper way to + object :-( (`malloc-large` tests this). This means we need a cheaper way to reuse memory. 3. This layer can help with a NUMA aware allocation in the future. Possible issues: -- (2) can potentially be addressed too with a small cache per thread which is much +- (2) can potentially be addressed too with a small cache per thread which is much simpler. Generally though that requires shrinking of huge pages, and may overuse - memory per thread. (and is not compatible with `sbrk`). 
-- Since the current regions are per-process, we need atomic operations to + memory per thread. (and is not compatible with `sbrk`). +- Since the current regions are per-process, we need atomic operations to claim blocks which may be contended - In the worst case, we need to search the whole region map (16KiB for 256GiB) - linearly. At what point will direct OS calls be faster? Is there a way to + linearly. At what point will direct OS calls be faster? Is there a way to do this better without adding too much complexity? -----------------------------------------------------------------------------*/ #include "mimalloc.h" @@ -100,7 +100,7 @@ static uintptr_t mi_region_block_mask(size_t blocks, size_t bitidx) { // Return a rounded commit/reset size such that we don't fragment large OS pages into small ones. static size_t mi_good_commit_size(size_t size) { if (size > (SIZE_MAX - _mi_os_large_page_size())) return size; - return _mi_align_up(size, _mi_os_large_page_size()); + return _mi_align_up(size, _mi_os_large_page_size()); } // Return if a pointer points into a region reserved by us. @@ -121,11 +121,11 @@ Commit from a region #define ALLOCATING ((void*)1) -// Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`. +// Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`. // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. +// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. // (not being able to claim is not considered an error so check for `p != NULL` afterwards). 
-static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld) +static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bitidx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld) { size_t mask = mi_region_block_mask(blocks,bitidx); mi_assert_internal(mask != 0); @@ -142,21 +142,21 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // another thead is already allocating.. wait it out // note: the wait here is not great (but should not happen often). Another // strategy might be to just allocate another region in parallel. This tends - // to be bad for benchmarks though as these often start many threads at the + // to be bad for benchmarks though as these often start many threads at the // same time leading to the allocation of too many regions. (Still, this might // be the most performant and it's ok on 64-bit virtual memory with over-commit.) - mi_atomic_yield(); + mi_atomic_yield(); continue; - } + } } while( start == ALLOCATING && !mi_atomic_compare_exchange_ptr(®ion->start, ALLOCATING, NULL) ); mi_assert_internal(start != NULL); // allocate the region if needed - if (start == ALLOCATING) { + if (start == ALLOCATING) { start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld); // set the new allocation (or NULL on failure) -- this releases any waiting threads. mi_atomic_write_ptr(®ion->start, start); - + if (start == NULL) { // failure to allocate from the OS! unclaim the blocks and fail size_t map; @@ -167,7 +167,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit } // update the region count if this is a new max idx. 
- mi_atomic_compare_exchange(®ions_count, idx+1, idx); + mi_atomic_compare_exchange(®ions_count, idx+1, idx); } mi_assert_internal(start != NULL && start != ALLOCATING); mi_assert_internal(start == mi_atomic_read_ptr(®ion->start)); @@ -218,15 +218,15 @@ static inline size_t mi_bsr(uintptr_t x) { } #endif -// Allocate `blocks` in a `region` at `idx` of a given `size`. +// Allocate `blocks` in a `region` at `idx` of a given `size`. // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. +// if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. // (not being able to claim is not considered an error so check for `p != NULL` afterwards). -static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld) +static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld) { mi_assert_internal(p != NULL && id != NULL); mi_assert_internal(blocks < MI_REGION_MAP_BITS); - + const uintptr_t mask = mi_region_block_mask(blocks, 0); const size_t bitidx_max = MI_REGION_MAP_BITS - blocks; uintptr_t map = mi_atomic_read(®ion->map); @@ -237,16 +237,16 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc size_t bitidx = 0; // otherwise start at 0 #endif uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx - + // scan linearly for a free range of zero bits while(bitidx <= bitidx_max) { if ((map & m) == 0) { // are the mask bits free at bitidx? - mi_assert_internal((m >> bitidx) == mask); // no overflow? + mi_assert_internal((m >> bitidx) == mask); // no overflow? 
uintptr_t newmap = map | m; mi_assert_internal((newmap^map) >> bitidx == mask); if (!mi_atomic_compare_exchange(®ion->map, newmap, map)) { // no success, another thread claimed concurrently.. keep going - map = mi_atomic_read(®ion->map); + map = mi_atomic_read(®ion->map); continue; } else { @@ -261,19 +261,19 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc size_t shift = (blocks == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); mi_assert_internal(shift > 0 && shift <= blocks); #else - size_t shift = 1; + size_t shift = 1; #endif bitidx += shift; - m <<= shift; + m <<= shift; } } // no error, but also no bits found - return true; + return true; } // Try to allocate `blocks` in a `region` at `idx` of a given `size`. Does a quick check before trying to claim. // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written -// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. +// if the blocks were successfully claimed so ensure they are initialized to NULL/0 before the call. // (not being able to claim is not considered an error so check for `p != NULL` afterwards). 
static bool mi_region_try_alloc_blocks(size_t idx, size_t blocks, size_t size, bool commit, void** p, size_t* id, mi_os_tld_t* tld) { @@ -321,7 +321,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t* for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, &p, id, tld)) return NULL; // error - if (p != NULL) break; + if (p != NULL) break; } if (p == NULL) { @@ -361,10 +361,10 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { if (size==0) return; if (id == SIZE_MAX) { // was a direct OS allocation, pass through - _mi_os_free(p, size, stats); + _mi_os_free(p, size, stats); } else { - // allocated in a region + // allocated in a region mi_assert_internal(size <= MI_REGION_MAX_ALLOC_SIZE); if (size > MI_REGION_MAX_ALLOC_SIZE) return; // we can align the size up to page size (as we allocate that way too) // this ensures we fully commit/decommit/reset @@ -377,29 +377,29 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) { mem_region_t* region = ®ions[idx]; mi_assert_internal((mi_atomic_read(®ion->map) & mask) == mask ); // claimed? void* start = mi_atomic_read_ptr(®ion->start); - mi_assert_internal(start != NULL); + mi_assert_internal(start != NULL); void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE); mi_assert_internal(blocks_start == p); // not a pointer in our area? mi_assert_internal(bitidx + blocks <= MI_REGION_MAP_BITS); if (blocks_start != p || bitidx + blocks > MI_REGION_MAP_BITS) return; // or `abort`? // decommit (or reset) the blocks to reduce the working set. - // TODO: implement delayed decommit/reset as these calls are too expensive + // TODO: implement delayed decommit/reset as these calls are too expensive // if the memory is reused soon. 
// reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large if (!mi_option_is_enabled(mi_option_large_os_pages)) { if (mi_option_is_enabled(mi_option_eager_region_commit)) { - //_mi_os_reset(p, size, stats); + //_mi_os_reset(p, size, stats); } - else { - //_mi_os_decommit(p, size, stats); + else { + //_mi_os_decommit(p, size, stats); } } // TODO: should we free empty regions? currently only done _mi_mem_collect. // this frees up virtual address space which // might be useful on 32-bit systems? - + // and unclaim uintptr_t map; uintptr_t newmap; @@ -418,7 +418,7 @@ void _mi_mem_collect(mi_stats_t* stats) { // free every region that has no segments in use. for (size_t i = 0; i < regions_count; i++) { mem_region_t* region = ®ions[i]; - if (mi_atomic_read(®ion->map) == 0 && region->start != NULL) { + if (mi_atomic_read(®ion->map) == 0 && region->start != NULL) { // if no segments used, try to claim the whole region uintptr_t m; do { @@ -427,7 +427,7 @@ void _mi_mem_collect(mi_stats_t* stats) { if (m == 0) { // on success, free the whole region if (region->start != NULL) _mi_os_free((void*)region->start, MI_REGION_SIZE, stats); - // and release + // and release region->start = 0; mi_atomic_write(®ion->map,0); } diff --git a/src/os.c b/src/os.c index d0c33deb..bee5ac64 100644 --- a/src/os.c +++ b/src/os.c @@ -81,8 +81,8 @@ static size_t mi_os_good_alloc_size(size_t size, size_t alignment) { #if defined(_WIN32) // We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. // So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) -// Same for DiscardVirtualMemory -typedef PVOID(__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MEM_EXTENDED_PARAMETER*, ULONG); +// Same for DiscardVirtualMemory. 
(hide MEM_EXTENDED_PARAMETER to compile with older SDKs) +typedef PVOID(__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, /* MEM_EXTENDED_PARAMETER* */ void*, ULONG); typedef DWORD(__stdcall *PDiscardVirtualMemory)(PVOID,SIZE_T); static PVirtualAlloc2 pVirtualAlloc2 = NULL; static PDiscardVirtualMemory pDiscardVirtualMemory = NULL; @@ -307,8 +307,8 @@ static void* mi_unix_mmap(size_t size, size_t try_alignment, int protect_flags) // try large OS page allocation p = mi_unix_mmapx(size, try_alignment, protect_flags, lflags, lfd); if (p == MAP_FAILED) { - mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations - p = NULL; // and fall back to regular mmap + mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations + p = NULL; // and fall back to regular mmap } } } @@ -320,7 +320,7 @@ static void* mi_unix_mmap(size_t size, size_t try_alignment, int protect_flags) } #if defined(MADV_HUGEPAGE) // Many Linux systems don't allow MAP_HUGETLB but they support instead - // transparent huge pages (TPH). It is not required to call `madvise` with MADV_HUGE + // transparent huge pages (TPH). It is not required to call `madvise` with MADV_HUGEPAGE // though since properly aligned allocations will already use large pages if available // in that case -- in particular for our large regions (in `memory.c`). // However, some systems only allow TPH if called with explicit `madvise`, so