From 2c1da9d194bfdef08d7ed10538d22b67c205e5d6 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 11 Aug 2019 09:03:52 -0700 Subject: [PATCH 1/6] fix scalar initialization of page flags --- src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index 905df96a..f73ca6cf 100644 --- a/src/init.c +++ b/src/init.c @@ -17,7 +17,7 @@ const mi_page_t _mi_page_empty = { #if MI_SECURE 0, #endif - 0, {0}, // used, flags + 0, 0, // used, flags NULL, 0, 0, 0, NULL, NULL, NULL #if (MI_INTPTR_SIZE==8 && MI_SECURE==0) From 7b105c4810e5f029043bc5441839cf15426687f9 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 11 Aug 2019 10:31:00 -0700 Subject: [PATCH 2/6] improve layout of page for x64 --- CMakeLists.txt | 2 +- include/mimalloc-types.h | 4 ++-- src/alloc.c | 16 ++++++++-------- src/init.c | 12 ++++++------ 4 files changed, 17 insertions(+), 17 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 8b37e579..3ea4e607 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU") if(CMAKE_C_COMPILER_ID MATCHES "GNU") list(APPEND mi_cflags -Wno-invalid-memory-model) list(APPEND mi_cflags -fvisibility=hidden) - list(APPEND mi_cflags -fbranch-target-load-optimize ) + list(APPEND mi_cflags -fbranch-target-load-optimize) endif() endif() diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index db5b52cc..eca5c2eb 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -182,7 +182,7 @@ typedef struct mi_page_s { // improve page index calculation #if (MI_INTPTR_SIZE==8 && MI_SECURE==0) - void* padding[1]; // 12 words on 64-bit + // void* padding[1]; // 12 words on 64-bit #elif MI_INTPTR_SIZE==4 // void* padding[1]; // 12 words on 32-bit #endif @@ -383,7 +383,7 @@ typedef struct mi_segments_tld_s { } mi_segments_tld_t; // OS thread local data -typedef struct mi_os_tld_s { +typedef struct mi_os_tld_s { mi_stats_t* stats; // points to tld stats } 
mi_os_tld_t; diff --git a/src/alloc.c b/src/alloc.c index d0fd28cb..7ad7c552 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -72,7 +72,7 @@ extern inline void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcep void* p; if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { p = mi_heap_malloc_small(heap, size); - } + } else { p = _mi_malloc_generic(heap, size); } @@ -200,16 +200,16 @@ static void mi_decl_noinline mi_free_generic(const mi_segment_t* segment, mi_pag // Free a block void mi_free(void* p) mi_attr_noexcept -{ +{ #if (MI_DEBUG>0) if (mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0)) { _mi_error_message("trying to free an invalid (unaligned) pointer: %p\n", p); return; } #endif - + const mi_segment_t* const segment = _mi_ptr_segment(p); - if (segment == NULL) return; // checks for (p==NULL) + if (segment == NULL) return; // checks for (p==NULL) #if (MI_DEBUG>0) if (mi_unlikely(!mi_is_in_heap_region(p))) { @@ -224,8 +224,8 @@ void mi_free(void* p) mi_attr_noexcept return; } #endif - - mi_page_t* page = _mi_segment_page_of(segment, p); + + mi_page_t* const page = _mi_segment_page_of(segment, p); #if (MI_STAT>1) mi_heap_t* heap = mi_heap_get_default(); @@ -236,11 +236,11 @@ void mi_free(void* p) mi_attr_noexcept // huge page stat is accounted for in `_mi_page_retire` #endif - uintptr_t tid = _mi_thread_id(); + const uintptr_t tid = _mi_thread_id(); if (mi_likely(tid == page->flags)) { // if equal, the thread id matches and it is not a full page, nor has aligned blocks // local, and not full or aligned mi_block_t* block = (mi_block_t*)p; - mi_block_set_next(page, block, page->local_free); + mi_block_set_next(page, block, page->local_free); page->local_free = block; page->used--; if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); } diff --git a/src/init.c b/src/init.c index f73ca6cf..ad82853e 100644 --- a/src/init.c +++ b/src/init.c @@ -12,7 +12,7 @@ terms of the MIT license. 
A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, false, false, false, 0, 0, + 0, false, false, false, 0, 0, NULL, // free #if MI_SECURE 0, @@ -21,7 +21,7 @@ const mi_page_t _mi_page_empty = { NULL, 0, 0, 0, NULL, NULL, NULL #if (MI_INTPTR_SIZE==8 && MI_SECURE==0) - , { NULL } + // , { NULL } #endif }; @@ -421,7 +421,7 @@ static void mi_process_load(void) { // show message from the redirector (if present) const char* msg = NULL; mi_allocator_init(&msg); - if (msg != NULL) _mi_verbose_message(msg); + if (msg != NULL) _mi_verbose_message(msg); } // Initialize the process; called by thread_init or the process loader @@ -433,7 +433,7 @@ void mi_process_init(void) mi_attr_noexcept { // when using dynamic linking with interpose. mi_heap_t* h = _mi_heap_default; _mi_process_is_initialized = true; - + _mi_heap_main.thread_id = _mi_thread_id(); _mi_verbose_message("process init: 0x%zx\n", _mi_heap_main.thread_id); uintptr_t random = _mi_random_init(_mi_heap_main.thread_id) ^ (uintptr_t)h; @@ -442,7 +442,7 @@ void mi_process_init(void) mi_attr_noexcept { #endif _mi_heap_main.random = _mi_random_shuffle(random); mi_process_setup_auto_thread_done(); - _mi_os_init(); + _mi_os_init(); #if (MI_DEBUG) _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif @@ -480,7 +480,7 @@ static void mi_process_done(void) { __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) { UNUSED(reserved); UNUSED(inst); - if (reason==DLL_PROCESS_ATTACH) { + if (reason==DLL_PROCESS_ATTACH) { mi_process_load(); } else if (reason==DLL_THREAD_DETACH) { From 2e924150ae29c4312dc9ca730ff9b70827a8b48e Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 11 Aug 2019 11:30:24 -0700 Subject: [PATCH 3/6] further layout improvement for msvc code generation --- include/mimalloc-types.h | 6 +++--- src/init.c | 4 ++-- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git 
a/include/mimalloc-types.h b/include/mimalloc-types.h index eca5c2eb..9beeb195 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -167,9 +167,9 @@ typedef struct mi_page_s { #if MI_SECURE uintptr_t cookie; // random cookie to encode the free lists #endif - size_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) mi_page_flags_t flags; // threadid:62 | has_aligned:1 | in_full:1 - + size_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) volatile uintptr_t thread_freed; // at least this number of blocks are in `thread_free` volatile mi_thread_free_t thread_free; // list of deferred free blocks freed by other threads @@ -182,7 +182,7 @@ typedef struct mi_page_s { // improve page index calculation #if (MI_INTPTR_SIZE==8 && MI_SECURE==0) - // void* padding[1]; // 12 words on 64-bit + void* padding[1]; // 12 words on 64-bit #elif MI_INTPTR_SIZE==4 // void* padding[1]; // 12 words on 32-bit #endif diff --git a/src/init.c b/src/init.c index ad82853e..0261e26b 100644 --- a/src/init.c +++ b/src/init.c @@ -17,11 +17,11 @@ const mi_page_t _mi_page_empty = { #if MI_SECURE 0, #endif - 0, 0, // used, flags + 0, 0, // flags, used NULL, 0, 0, 0, NULL, NULL, NULL #if (MI_INTPTR_SIZE==8 && MI_SECURE==0) - // , { NULL } + , { NULL } #endif }; From 0fd898315c2e535169726e95cdf303bba2a5d567 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 11 Aug 2019 12:15:13 -0700 Subject: [PATCH 4/6] per thread region search index --- include/mimalloc-types.h | 1 + src/init.c | 2 +- src/memory.c | 7 ++++--- 3 files changed, 6 insertions(+), 4 deletions(-) diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 9beeb195..67ad8516 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -384,6 +384,7 @@ typedef struct mi_segments_tld_s { // OS thread local data typedef struct mi_os_tld_s { + 
size_t region_idx; // start point for next allocation mi_stats_t* stats; // points to tld stats } mi_os_tld_t; diff --git a/src/init.c b/src/init.c index 0261e26b..77ce4aad 100644 --- a/src/init.c +++ b/src/init.c @@ -97,7 +97,7 @@ static mi_tld_t tld_main = { 0, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments - { tld_main_stats }, // os + { 0, tld_main_stats }, // os { MI_STATS_NULL } // stats }; diff --git a/src/memory.c b/src/memory.c index ccd810b3..cf341105 100644 --- a/src/memory.c +++ b/src/memory.c @@ -79,7 +79,6 @@ typedef struct mem_region_s { static mem_region_t regions[MI_REGION_MAX]; static volatile size_t regions_count = 0; // allocated regions -static volatile uintptr_t region_next_idx = 0; // good place to start searching /* ---------------------------------------------------------------------------- @@ -180,7 +179,6 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit } // and return the allocation - mi_atomic_write(®ion_next_idx,idx); // next search from here *p = blocks_start; *id = (idx*MI_REGION_MAP_BITS) + bitidx; return true; @@ -267,7 +265,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t* // find a range of free blocks void* p = NULL; size_t count = mi_atomic_read(®ions_count); - size_t idx = mi_atomic_read(®ion_next_idx); + size_t idx = tld->region_idx; // start index is per-thread to reduce contention for (size_t visited = 0; visited < count; visited++, idx++) { if (idx >= count) idx = 0; // wrap around if (!mi_region_try_alloc_blocks(idx, blocks, size, commit, &p, id, tld)) return NULL; // error @@ -286,6 +284,9 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool commit, size_t* // we could not find a place to allocate, fall back to the os directly p = _mi_os_alloc_aligned(size, alignment, commit, tld); } + else { + tld->region_idx = idx; // next start of search + } mi_assert_internal( p == NULL || 
(uintptr_t)p % alignment == 0); return p; From 3a75a9d5be4b3aa0f177b5e898f969ae092b0bcf Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 11 Aug 2019 13:03:00 -0700 Subject: [PATCH 5/6] use bit-scan instruction to speed up region search --- src/memory.c | 42 ++++++++++++++++++++++++++++++++++++------ 1 file changed, 36 insertions(+), 6 deletions(-) diff --git a/src/memory.c b/src/memory.c index cf341105..3ac6f1cb 100644 --- a/src/memory.c +++ b/src/memory.c @@ -184,6 +184,27 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit return true; } +// Use bit scan forward to quickly find the first zero bit if it is available +#if defined(_MSC_VER) +#define MI_HAVE_BSF +#include <intrin.h> +static inline size_t mi_bsf(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + #if (MI_INTPTR_SIZE==8) + _BitScanForward64(&idx, x); + #else + _BitScanForward(&idx, x); + #endif + return idx; +} +#elif defined(__GNUC__) || defined(__clang__) +#define MI_HAVE_BSF +static inline size_t mi_bsf(uintptr_t x) { + return (x==0 ? 8*MI_INTPTR_SIZE : __builtin_ctz(x)); +} +#endif + // Allocate `blocks` in a `region` at `idx` of a given `size`. // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written // if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call. 
@@ -192,14 +213,20 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc { mi_assert_internal(p != NULL && id != NULL); mi_assert_internal(blocks < MI_REGION_MAP_BITS); - - const uintptr_t mask = mi_region_block_mask(blocks,0); + + const uintptr_t mask = mi_region_block_mask(blocks, 0); const size_t bitidx_max = MI_REGION_MAP_BITS - blocks; - - // scan linearly for a free range of zero bits uintptr_t map = mi_atomic_read(®ion->map); - uintptr_t m = mask; // the mask shifted by bitidx - for(size_t bitidx = 0; bitidx <= bitidx_max; bitidx++, m <<= 1) { + + #ifdef MI_HAVE_BSF + size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible + #else + size_t bitidx = 0; // otherwise start at 0 + #endif + uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx + + // scan linearly for a free range of zero bits + while(bitidx <= bitidx_max) { if ((map & m) == 0) { // are the mask bits free at bitidx? mi_assert_internal((m >> bitidx) == mask); // no overflow? 
uintptr_t newmap = map | m; @@ -214,6 +241,9 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc return mi_region_commit_blocks(region, idx, bitidx, blocks, size, commit, p, id, tld); } } + // on to the next bit + bitidx++; + m <<= 1; } // no error, but also no bits found return true; From feb7b2c160f5a4ef8c77c5e93614188bdd0f4324 Mon Sep 17 00:00:00 2001 From: daan Date: Sun, 11 Aug 2019 13:49:12 -0700 Subject: [PATCH 6/6] improve bit range skipping in region search --- src/memory.c | 40 +++++++++++++++++++++++++++++++--------- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/src/memory.c b/src/memory.c index 3ac6f1cb..732aa687 100644 --- a/src/memory.c +++ b/src/memory.c @@ -186,7 +186,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit // Use bit scan forward to quickly find the first zero bit if it is available #if defined(_MSC_VER) -#define MI_HAVE_BSF +#define MI_HAVE_BITSCAN #include <intrin.h> static inline size_t mi_bsf(uintptr_t x) { if (x==0) return 8*MI_INTPTR_SIZE; @@ -198,10 +198,23 @@ static inline size_t mi_bsf(uintptr_t x) { #endif return idx; } +static inline size_t mi_bsr(uintptr_t x) { + if (x==0) return 8*MI_INTPTR_SIZE; + DWORD idx; + #if (MI_INTPTR_SIZE==8) + _BitScanReverse64(&idx, x); + #else + _BitScanReverse(&idx, x); + #endif + return idx; +} #elif defined(__GNUC__) || defined(__clang__) -#define MI_HAVE_BSF +#define MI_HAVE_BITSCAN static inline size_t mi_bsf(uintptr_t x) { - return (x==0 ? 8*MI_INTPTR_SIZE : __builtin_ctz(x)); + return (x==0 ? 8*MI_INTPTR_SIZE : __builtin_ctzl(x)); } +static inline size_t mi_bsr(uintptr_t x) { + return (x==0 ? 
8*MI_INTPTR_SIZE : (8*MI_INTPTR_SIZE - 1) - __builtin_clzl(x)); } #endif @@ -218,12 +231,12 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc const size_t bitidx_max = MI_REGION_MAP_BITS - blocks; uintptr_t map = mi_atomic_read(®ion->map); - #ifdef MI_HAVE_BSF + #ifdef MI_HAVE_BITSCAN size_t bitidx = mi_bsf(~map); // quickly find the first zero bit if possible #else size_t bitidx = 0; // otherwise start at 0 #endif - uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx + uintptr_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx // scan linearly for a free range of zero bits while(bitidx <= bitidx_max) { @@ -233,7 +246,8 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc mi_assert_internal((newmap^map) >> bitidx == mask); if (!mi_atomic_compare_exchange(®ion->map, newmap, map)) { // no success, another thread claimed concurrently.. keep going - map = mi_atomic_read(®ion->map); + map = mi_atomic_read(®ion->map); + continue; } else { // success, we claimed the bits @@ -241,9 +255,17 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc return mi_region_commit_blocks(region, idx, bitidx, blocks, size, commit, p, id, tld); } } - // on to the next bit - bitidx++; - m <<= 1; + else { + // on to the next bit range + #ifdef MI_HAVE_BITSCAN + size_t shift = (blocks == 1 ? 1 : mi_bsr(map & m) - bitidx + 1); + mi_assert_internal(shift > 0 && shift <= blocks); + #else + size_t shift = 1; + #endif + bitidx += shift; + m <<= shift; + } } // no error, but also no bits found return true;