From cc809b0cd4b99a564b00224cb2e66e4d881f62cd Mon Sep 17 00:00:00 2001
From: Daan
Date: Mon, 18 Mar 2024 01:40:03 -0700
Subject: [PATCH] take 16 bits from used field to create a fast unalign path

---
 include/mimalloc/internal.h |  2 +-
 include/mimalloc/types.h    | 24 +++++++++++++-----------
 src/alloc.c                 |  2 +-
 src/free.c                  | 17 +++++++++++------
 src/init.c                  | 18 ++++++++++--------
 src/page.c                  | 12 +++++++++++-
 6 files changed, 47 insertions(+), 28 deletions(-)

diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index 96f3922e..72544c3d 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -202,7 +202,7 @@ void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, siz
 void*       _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept;
 mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p);
 bool        _mi_free_delayed_block(mi_block_t* block);
-void        _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept;  // for runtime integration
+void        _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept;  // for runtime integration
 void        _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size);
 
 // "libc.c"
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index 049e68e7..c624e5b4 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -273,7 +273,7 @@ typedef uintptr_t mi_thread_free_t;
 //    and 12 are still good for address calculation)
 // - To limit the structure size, the `xblock_size` is 32-bits only; for
 //   blocks > MI_HUGE_BLOCK_SIZE the size is determined from the segment page size
-// - `thread_free` uses the bottom bits as a delayed-free flags to optimize
+// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize
 //   concurrent frees where only the first concurrent free adds to the owning
 //   heap `thread_delayed_free` list (see `alloc.c:mi_free_block_mt`).
 //   The invariant is that no-delayed-free is only set if there is
@@ -295,19 +295,21 @@ typedef struct mi_page_s {
   uint8_t               retire_expire:7;   // expiration count for retired blocks
 
   mi_block_t*           free;              // list of available free blocks (`malloc` allocates from this list)
-  uint32_t              used;              // number of blocks in use (including blocks in `thread_free`)
-  uint32_t              xblock_size;       // size available in each block (always `>0`)
   mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
- 
+  uint16_t              used;              // number of blocks in use (including blocks in `thread_free`)
+  uint8_t               block_size_shift;  // if not zero, then `(1 << block_size_shift == block_size)` (used for quick block start finding for aligned pointers)
+  uint8_t               block_offset_adj;  // if not zero, then `(page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0)` (used for quick block start finding for aligned pointers)
+  uint32_t              xblock_size;       // size available in each block (always `>0`)
+
   #if (MI_ENCODE_FREELIST || MI_PADDING)
   uintptr_t             keys[2];           // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
-  #endif 
+  #endif
 
   _Atomic(mi_thread_free_t) xthread_free;  // list of deferred free blocks freed by other threads
   _Atomic(uintptr_t)        xheap;
- 
-  struct mi_page_s*     next;              // next page owned by this thread with the same `block_size`
-  struct mi_page_s*     prev;              // previous page owned by this thread with the same `block_size`
+
+  struct mi_page_s*     next;              // next page owned by the heap with the same `block_size`
+  struct mi_page_s*     prev;              // previous page owned by the heap with the same `block_size`
 } mi_page_t;
 
 
@@ -386,8 +388,8 @@ typedef struct mi_segment_s {
   uintptr_t           cookie;              // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie`
 
   // layout like this to optimize access in `mi_free`
-  size_t              page_shift;          // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
   _Atomic(mi_threadid_t) thread_id;        // unique id of the thread owning this segment
+  size_t              page_shift;          // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`).
   mi_page_kind_t      page_kind;           // kind of pages: small, medium, large, or huge
   mi_page_t           pages[1];            // up to `MI_SMALL_PAGES_PER_SEGMENT` pages
 } mi_segment_t;
@@ -446,8 +448,6 @@ typedef struct mi_padding_s {
 // A heap owns a set of pages.
 struct mi_heap_s {
   mi_tld_t*             tld;
-  mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
-  mi_page_queue_t       pages[MI_BIN_FULL + 1];              // queue of pages for each size class (or "bin")
   _Atomic(mi_block_t*)  thread_delayed_free;
   mi_threadid_t         thread_id;                           // thread this heap belongs too
   mi_arena_id_t         arena_id;                            // arena id if the heap belongs to a specific arena (or 0)
@@ -459,6 +459,8 @@ struct mi_heap_s {
   size_t                page_retired_max;                    // largest retired index into the `pages` array.
   mi_heap_t*            next;                                // list of heaps per thread
   bool                  no_reclaim;                          // `true` if this heap should not reclaim abandoned pages
+  mi_page_t*            pages_free_direct[MI_PAGES_DIRECT];  // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size.
+  mi_page_queue_t       pages[MI_BIN_FULL + 1];              // queue of pages for each size class (or "bin")
 };
 
 
diff --git a/src/alloc.c b/src/alloc.c
index 76d68d13..3a38a226 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -37,8 +37,8 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz
   }
   mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
   // pop from the free list
-  page->used++;
   page->free = mi_block_next(page, block);
+  page->used++;
   mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
   #if MI_DEBUG>3
   if (page->free_is_zero) {
diff --git a/src/free.c b/src/free.c
index 7761cb6a..d0fcf133 100644
--- a/src/free.c
+++ b/src/free.c
@@ -249,25 +249,30 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
 // Adjust a block that was allocated aligned, to the actual start of the block in the page.
 mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p) {
   mi_assert_internal(page!=NULL && p!=NULL);
-  const size_t diff   = (uint8_t*)p - _mi_page_start(segment, page, NULL);
-  const size_t adjust = (diff % mi_page_block_size(page));
+  const size_t diff   = (mi_likely(page->block_offset_adj != 0)
+                           ? (uint8_t*)p - (uint8_t*)page - 8*(page->block_offset_adj-1)
+                           : (uint8_t*)p - _mi_page_start(segment, page, NULL));
+
+  const size_t adjust = (mi_likely(page->block_size_shift != 0)
+                           ? diff & (((size_t)1 << page->block_size_shift) - 1)
+                           : diff % mi_page_block_size(page));
   return (mi_block_t*)((uintptr_t)p - adjust);
 }
 
 // free a local pointer
-static void mi_decl_noinline mi_free_generic_local(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept {
+static void mi_decl_noinline mi_free_generic_local(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept {
   mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(segment, page, p) : (mi_block_t*)p);
   mi_free_block_local(page, block, true);
 }
 
 // free a pointer owned by another thread
-static void mi_decl_noinline mi_free_generic_mt(const mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept {
+static void mi_decl_noinline mi_free_generic_mt(mi_segment_t* segment, mi_page_t* page, void* p) mi_attr_noexcept {
   mi_block_t* const block = _mi_page_ptr_unalign(segment, page, p); // don't check `has_aligned` flag to avoid a race (issue #865)
   mi_free_block_mt(segment, page, block);
 }
 
 // generic free (for runtime integration)
-void mi_decl_noinline _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
+void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept {
   if (is_local) mi_free_generic_local(segment,page,p);
   else mi_free_generic_mt(segment,page,p);
 }
@@ -469,7 +474,7 @@ static mi_decl_noinline void mi_free_block_mt(mi_segment_t* segment, mi_page_t*
 // ------------------------------------------------------
 
 // Bytes available in a block
-mi_decl_noinline static size_t mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept {
+static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_segment_t* segment, const mi_page_t* page, const void* p) mi_attr_noexcept {
   const mi_block_t* block = _mi_page_ptr_unalign(segment, page, p);
   const size_t size = mi_page_usable_size_of(page, block);
   const ptrdiff_t adjust = (uint8_t*)p - (uint8_t*)block;
diff --git a/src/init.c b/src/init.c
index 7ec6e01e..11471760 100644
--- a/src/init.c
+++ b/src/init.c
@@ -21,9 +21,11 @@ const mi_page_t _mi_page_empty = {
   false,      // is_zero
   0,          // retire_expire
   NULL,       // free
-  0,          // used
-  0,          // xblock_size
   NULL,       // local_free
+  0,          // used
+  0,          // block size shift
+  0,          // block offset adj
+  0,          // xblock_size
   #if (MI_PADDING || MI_ENCODE_FREELIST)
   { 0, 0 },
   #endif
@@ -93,8 +95,6 @@ const mi_page_t _mi_page_empty = {
 
 mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   NULL,
-  MI_SMALL_PAGES_EMPTY,
-  MI_PAGE_QUEUES_EMPTY,
   MI_ATOMIC_VAR_INIT(NULL),
   0,                // tid
   0,                // cookie
@@ -104,7 +104,9 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
   0,                // page count
   MI_BIN_FULL, 0,   // page retired min/max
   NULL,             // next
-  false
+  false,
+  MI_SMALL_PAGES_EMPTY,
+  MI_PAGE_QUEUES_EMPTY
 };
 
 
@@ -130,8 +132,6 @@ static mi_tld_t tld_main = {
 
 mi_heap_t _mi_heap_main = {
   &tld_main,
-  MI_SMALL_PAGES_EMPTY,
-  MI_PAGE_QUEUES_EMPTY,
   MI_ATOMIC_VAR_INIT(NULL),
   0,                // thread id
   0,                // initial cookie
@@ -141,7 +141,9 @@ mi_heap_t _mi_heap_main = {
   0,                // page count
   MI_BIN_FULL, 0,   // page retired min/max
   NULL,             // next heap
-  false             // can reclaim
+  false,            // can reclaim
+  MI_SMALL_PAGES_EMPTY,
+  MI_PAGE_QUEUES_EMPTY
 };
 
 bool _mi_process_is_initialized = false;  // set to `true` in `mi_process_init`.
diff --git a/src/page.c b/src/page.c
index 5fefc3b5..5930a430 100644
--- a/src/page.c
+++ b/src/page.c
@@ -660,7 +660,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
   mi_page_set_heap(page, heap);
   size_t page_size;
   const void* page_start = _mi_segment_page_start(segment, page, block_size, &page_size, NULL);
-  MI_UNUSED(page_start);
   mi_track_mem_noaccess(page_start,page_size);
   page->xblock_size = (block_size < MI_HUGE_BLOCK_SIZE ? (uint32_t)block_size : MI_HUGE_BLOCK_SIZE);
   mi_assert_internal(page_size / block_size < (1L<<16));
@@ -677,6 +676,15 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
     mi_assert_expensive(!page->is_zero_init || mi_mem_is_zero(page_start, page_size));
   }
   #endif
+  if (_mi_is_power_of_two(block_size) && block_size > 0) {
+    page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size));
+  }
+  const ptrdiff_t start_offset = (uint8_t*)page_start - (uint8_t*)page;
+  const ptrdiff_t start_adjust = start_offset % block_size;
+  if (start_offset >= 0 && (start_adjust % 8) == 0 && (start_adjust/8) < 255) {
+    page->block_offset_adj = (uint8_t)((start_adjust/8) + 1);
+  }
+
 
   mi_assert_internal(page->capacity == 0);
   mi_assert_internal(page->free == NULL);
@@ -690,6 +698,8 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi
   mi_assert_internal(page->keys[0] != 0);
   mi_assert_internal(page->keys[1] != 0);
   #endif
+  mi_assert_internal(page->block_size_shift == 0 || (block_size == (1UL << page->block_size_shift)));
+  mi_assert_internal(page->block_offset_adj == 0 || (((uint8_t*)page_start - (uint8_t*)page - 8*(page->block_offset_adj-1))) % block_size == 0);
   mi_assert_expensive(mi_page_is_valid_init(page));
 
   // initialize an initial free list
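
For reference, below is a self-contained sketch (illustrative only, not part of the patch) of the fast unalign path that the two new page fields enable. `sketch_page_t`, `sketch_ptr_unalign`, and the buffer layout in `main` are hypothetical stand-ins for `mi_page_t`, `_mi_page_ptr_unalign`, and a real mimalloc page; only the `block_size_shift`/`block_offset_adj` fields and the diff/adjust computation mirror the change above.

  #include <stddef.h>
  #include <stdint.h>
  #include <stdio.h>

  // Hypothetical stand-in for the fields mi_page_t gains in this patch.
  typedef struct sketch_page_s {
    uint8_t        block_size_shift;  // nonzero => block_size == (1 << block_size_shift)
    uint8_t        block_offset_adj;  // nonzero => (page_start - (uint8_t*)page - 8*(block_offset_adj-1)) % block_size == 0
    uint32_t       block_size;        // stands in for xblock_size / mi_page_block_size()
    const uint8_t* page_start;        // slow-path fallback, stands in for _mi_page_start()
  } sketch_page_t;

  // Mirror of the new unalign logic: the fast path is two subtractions and a
  // mask; the slow path reads the page start and uses a modulo.
  static void* sketch_ptr_unalign(const sketch_page_t* page, const void* p) {
    const size_t diff = (page->block_offset_adj != 0)
      ? (size_t)((const uint8_t*)p - (const uint8_t*)page - 8*(page->block_offset_adj - 1))
      : (size_t)((const uint8_t*)p - page->page_start);
    const size_t adjust = (page->block_size_shift != 0)
      ? (diff & (((size_t)1 << page->block_size_shift) - 1))
      : (diff % page->block_size);
    return (void*)((uintptr_t)p - adjust);
  }

  int main(void) {
    // Contrived layout: page meta-data at the start of the buffer,
    // 64-byte blocks starting 128 bytes in.
    static union { sketch_page_t pg; uint8_t bytes[4096]; } u;
    sketch_page_t* page    = &u.pg;
    page->block_size       = 64;
    page->block_size_shift = 6;              // 64 == 1 << 6
    page->page_start       = u.bytes + 128;
    page->block_offset_adj = (128 / 8) + 1;  // 128 is 8-aligned and a multiple of 64
    const void* interior = u.bytes + 128 + 3*64 + 17;   // points inside the 4th block
    void* block = sketch_ptr_unalign(page, interior);
    printf("block starts at offset %d\n", (int)((uint8_t*)block - u.bytes));  // prints 320
    return 0;
  }

Shrinking `used` to 16 bits is what frees the space for the two new 8-bit fields; the pre-existing assertion `page_size / block_size < (1L<<16)` visible in the `mi_page_init` hunk is what makes a 16-bit block count sufficient.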