From 659a9dd51d1d02b620ea569d62fdda76dcb60c38 Mon Sep 17 00:00:00 2001
From: daanx
Date: Fri, 6 Dec 2024 22:37:59 -0800
Subject: [PATCH] fix page info size and order; atomic page flags

---
 CMakeLists.txt              |  2 +-
 include/mimalloc/atomic.h   |  4 +-
 include/mimalloc/internal.h | 24 ++++++++---
 include/mimalloc/types.h    | 81 +++++++++++++++----------------------
 src/arena.c                 | 36 ++++++++---------
 src/bitmap.c                |  6 +--
 src/free.c                  |  5 ++-
 src/init.c                  | 10 ++---
 src/os.c                    |  4 +-
 test/test-stress.c          |  4 +-
 10 files changed, 87 insertions(+), 89 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c04aea8..1a4cc1f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -360,7 +360,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM
     list(APPEND mi_cflags_dynamic -ftls-model=initial-exec)
     message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)")
   else()
-    list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2)
+    list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2 -O2)
   endif()
 endif()
 if(MI_OVERRIDE)
diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h
index 3a0d4892..caa90cf8 100644
--- a/include/mimalloc/atomic.h
+++ b/include/mimalloc/atomic.h
@@ -80,10 +80,12 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_atomic_cas_strong_acq_rel(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
 
 #define mi_atomic_add_relaxed(p,x)      mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
-#define mi_atomic_sub_relaxed(p,x)      mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_add_acq_rel(p,x)      mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_sub_relaxed(p,x)      mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_sub_acq_rel(p,x)      mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_and_relaxed(p,x)      mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_and_acq_rel(p,x)      mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_or_relaxed(p,x)       mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_or_acq_rel(p,x)       mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
 
 #define mi_atomic_increment_relaxed(p)  mi_atomic_add_relaxed(p,(uintptr_t)1)
diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index d9c2cd6e..ad7c41c6 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -667,7 +667,8 @@ static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) {
 
 
 static inline bool mi_page_is_huge(const mi_page_t* page) {
-  return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN));
+  return (page->block_size > MI_LARGE_MAX_OBJ_SIZE ||
+          (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page));
 }
 
 
@@ -727,20 +728,33 @@ static inline bool _mi_page_unown(mi_page_t* page) {
 //-----------------------------------------------------------
 // Page flags
 //-----------------------------------------------------------
+static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) {
+  return mi_atomic_load_acquire(&page->xflags);
+}
+
+static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) {
+  if (set) {
+    mi_atomic_or_acq_rel(&page->xflags, newflag);
+  }
+  else {
+    mi_atomic_and_acq_rel(&page->xflags, ~newflag);
+  }
+}
+
 static inline bool mi_page_is_in_full(const mi_page_t* page) {
-  return page->flags.x.in_full;
+  return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0);
 }
 
 static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
-  page->flags.x.in_full = in_full;
+  mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE);
 }
 
 static inline bool mi_page_has_aligned(const mi_page_t* page) {
-  return page->flags.x.has_aligned;
+  return ((mi_page_flags(page) & MI_PAGE_HAS_ALIGNED) != 0);
 }
 
 static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
-  page->flags.x.has_aligned = has_aligned;
+  mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED);
 }
 
 /* -------------------------------------------------------------------
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index d78dbc59..5dfbb808 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -111,17 +111,17 @@ terms of the MIT license. A copy of the license can be found in the file
 
 // Sizes are for 64-bit
 #ifndef MI_ARENA_SLICE_SHIFT
-#ifdef MI_SMALL_PAGE_SHIFT  // compatibility
+#ifdef MI_SMALL_PAGE_SHIFT  // compatibility
 #define MI_ARENA_SLICE_SHIFT  MI_SMALL_PAGE_SHIFT
 #else
 #define MI_ARENA_SLICE_SHIFT  (13 + MI_SIZE_SHIFT)  // 64 KiB (32 KiB on 32-bit)
 #endif
 #endif
 #ifndef MI_BCHUNK_BITS_SHIFT
-#define MI_BCHUNK_BITS_SHIFT  (6 + MI_SIZE_SHIFT)   // optimized for 512 bits per chunk (avx512)
+#define MI_BCHUNK_BITS_SHIFT  (6 + MI_SIZE_SHIFT)   // optimized for 512 bits per chunk (avx512)
 #endif
 
-#define MI_BCHUNK_BITS        (1 << MI_BCHUNK_BITS_SHIFT)
+#define MI_BCHUNK_BITS        (1 << MI_BCHUNK_BITS_SHIFT)
 #define MI_ARENA_SLICE_SIZE   (MI_ZU(1) << MI_ARENA_SLICE_SHIFT)
 #define MI_ARENA_SLICE_ALIGN  (MI_ARENA_SLICE_SIZE)
@@ -167,8 +167,8 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
 
 typedef struct mi_memid_os_info {
   void* base;       // actual base address of the block (used for offset aligned allocations)
-  size_t alignment; // alignment at allocation
   size_t size;      // allocated full size
+  // size_t alignment; // alignment at allocation
 } mi_memid_os_info_t;
 
 typedef struct mi_memid_arena_info {
@@ -224,26 +224,11 @@ typedef enum mi_owned_e {
 } mi_owned_t;
 
-// The `in_full` and `has_aligned` page flags are put in a union to efficiently
-// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
-#if !MI_TSAN
-typedef union mi_page_flags_s {
-  uint8_t full_aligned;
-  struct {
-    uint8_t in_full : 1;
-    uint8_t has_aligned : 1;
-  } x;
-} mi_page_flags_t;
-#else
-// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
-typedef union mi_page_flags_s {
-  uint32_t full_aligned;
-  struct {
-    uint8_t in_full;
-    uint8_t has_aligned;
-  } x;
-} mi_page_flags_t;
-#endif
+// The `in_full` and `has_aligned` page flags are put in the same field
+// to efficiently test if both are false (`full_aligned == 0`) in the `mi_free` routine.
+#define MI_PAGE_IN_FULL_QUEUE  MI_ZU(0x01)
+#define MI_PAGE_HAS_ALIGNED    MI_ZU(0x02)
+typedef size_t mi_page_flags_t;
 
 // Thread free list.
 // We use the bottom bit of the pointer for `mi_owned_t` flags
@@ -280,35 +265,33 @@ typedef struct mi_subproc_s mi_subproc_t;
 // the owning heap `thread_delayed_free` list. This guarantees that pages
 // will be freed correctly even if only other threads free blocks.
 typedef struct mi_page_s {
-  _Atomic(mi_threadid_t)xthread_id;         // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned)
+  _Atomic(mi_threadid_t) xthread_id;        // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned)
 
-  mi_block_t* free;                         // list of available free blocks (`malloc` allocates from this list)
-  uint16_t used;                            // number of blocks in use (including blocks in `thread_free`)
-  uint16_t capacity;                        // number of blocks committed (must be the first field for proper zero-initialisation)
-  uint16_t reserved;                        // number of blocks reserved in memory
-  uint8_t block_size_shift;                 // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
-  uint8_t heap_tag;                         // tag of the owning heap, used to separate heaps by object type
+  mi_block_t* free;                         // list of available free blocks (`malloc` allocates from this list)
+  uint16_t used;                            // number of blocks in use (including blocks in `thread_free`)
+  uint16_t capacity;                        // number of blocks committed (must be the first field for proper zero-initialisation)
+  uint16_t reserved;                        // number of blocks reserved in memory
+  uint8_t block_size_shift;                 // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
+  uint8_t retire_expire;                    // expiration count for retired blocks
 
-  mi_page_flags_t flags;                    // `in_full` and `has_aligned` flags (8 bits)
-  uint8_t free_is_zero:1;                   // `true` if the blocks in the free list are zero initialized
-  uint8_t retire_expire:7;                  // expiration count for retired blocks
-  // padding
-
-  mi_block_t* local_free;                   // list of deferred free blocks by this thread (migrates to `free`)
-  size_t block_size;                        // size available in each block (always `>0`)
-  uint8_t* page_start;                      // start of the blocks
+  mi_block_t* local_free;                   // list of deferred free blocks by this thread (migrates to `free`)
+  _Atomic(mi_thread_free_t) xthread_free;   // list of deferred free blocks freed by other threads
+  _Atomic(mi_page_flags_t) xflags;          // `in_full` and `has_aligned` flags
+  size_t block_size;                        // size available in each block (always `>0`)
+  uint8_t* page_start;                      // start of the blocks
+  uint8_t heap_tag;                         // tag of the owning heap, used to separate heaps by object type
+  bool free_is_zero;                        // `true` if the blocks in the free list are zero initialized
+  // padding
 #if (MI_ENCODE_FREELIST || MI_PADDING)
-  uintptr_t keys[2];                        // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
+  uintptr_t keys[2];                        // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
 #endif
-  _Atomic(mi_thread_free_t) xthread_free;   // list of deferred free blocks freed by other threads
-
-  mi_heap_t* heap;                          // heap this threads belong to.
-  struct mi_page_s* next;                   // next page owned by the heap with the same `block_size`
-  struct mi_page_s* prev;                   // previous page owned by the heap with the same `block_size`
-  mi_subproc_t* subproc;                    // sub-process of this heap
-  mi_memid_t memid;                         // provenance of the page memory
+  mi_heap_t* heap;                          // heap this threads belong to.
+  struct mi_page_s* next;                   // next page owned by the heap with the same `block_size`
+  struct mi_page_s* prev;                   // previous page owned by the heap with the same `block_size`
+  mi_subproc_t* subproc;                    // sub-process of this heap
+  mi_memid_t memid;                         // provenance of the page memory
 } mi_page_t;
 
 
@@ -317,10 +300,10 @@ typedef struct mi_page_s {
 // ------------------------------------------------------
 
 #define MI_PAGE_ALIGN                MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map.
-#define MI_PAGE_MIN_BLOCK_ALIGN      (32)                 // minimal block alignment in a page
+#define MI_PAGE_MIN_BLOCK_ALIGN      (64)                 // minimal block alignment in a page
 #define MI_PAGE_MAX_OVERALLOC_ALIGN  MI_ARENA_SLICE_SIZE  // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation
 
-#if MI_DEBUG && MI_SIZE_SIZE == 8
+#if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8
 #define MI_PAGE_INFO_SIZE  ((MI_INTPTR_SHIFT+2)*MI_PAGE_MIN_BLOCK_ALIGN)  // >= sizeof(mi_page_t)
 #else
 #define MI_PAGE_INFO_SIZE  ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN)  // >= sizeof(mi_page_t)
diff --git a/src/arena.c b/src/arena.c
index 2c215264..45697081 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -483,7 +483,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* a
   mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index);
   // can we claim ownership?
   if (!mi_page_try_claim_ownership(page)) {
-    // there was a concurrent free ..
+    // there was a concurrent free ..
     // we need to keep it in the abandoned map as the free will call `mi_arena_page_unabandon`,
     // and wait for readers (us!) to finish. This is why it is very important to set the abandoned
     // bit again (or otherwise the unabandon will never stop waiting).
@@ -596,7 +596,9 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz
     }
   }
   #endif
-  mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN));
+  if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) {
+    _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small\n");
+  };
   const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE);
   const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size);
   mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX);
@@ -1126,28 +1128,22 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) {
   return bit_set_count;
 }
 
-static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) {
-  _mi_output_message("%s%s:\n", prefix, header);
+static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) {
+  _mi_output_message("%s:\n", header);
   size_t bit_count = 0;
   size_t bit_set_count = 0;
   for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) {
     char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf));
     size_t k = 0;
     mi_bchunk_t* chunk = &bitmap->chunks[i];
-
-    if (i<10)  { buf[k++] = ' '; }
-    if (i<100) { itoa((int)i, buf+k, 10); k += (i < 10 ? 1 : 2); }
-    buf[k++] = ' ';
+    if (i<10)        { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; }
+    else if (i<100)  { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; }
+    else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); }
+
     for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
       if (j > 0 && (j % 4) == 0) {
-        buf[k++] = '\n';
-        _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix);
-        buf[k++] = ' ';
-        buf[k++] = ' ';
-        buf[k++] = ' ';
-        buf[k++] = ' ';
-        buf[k++] = ' ';
+        buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5;
       }
       if (bit_count < slice_count) {
         mi_bfield_t bfield = chunk->bfields[j];
@@ -1164,9 +1160,9 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_
       }
       bit_count += MI_BFIELD_BITS;
     }
-    _mi_output_message("%s %s\n", prefix, buf);
+    _mi_output_message(" %s\n", buf);
   }
-  _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count);
+  _mi_output_message(" total ('x'): %zu\n", bit_set_count);
   return bit_set_count;
 }
 
@@ -1183,12 +1179,12 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge)
     slice_total += arena->slice_count;
     _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : ""));
     if (show_inuse) {
-      free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, arena->slices_free, true);
+      free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true);
     }
-    mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, arena->slices_committed, false);
+    mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false);
     // todo: abandoned slices
     if (show_purge) {
-      purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, arena->slices_purge, false);
+      purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false);
     }
   }
   if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total);
diff --git a/src/bitmap.c b/src/bitmap.c
index 15401d8d..2ef692cb 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -805,10 +805,10 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx)
     return false;
   }
   // record the max clear
-  size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
+  /*size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
   do {
     if mi_likely(chunk_idx <= oldmax) break;
-  } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));
+  } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));*/
   return true;
 }
 
@@ -1046,7 +1046,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
 { \
   /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \
   MI_UNUSED(tseq); \
-  const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); /* mi_bitmap_chunk_count(bitmap) */ \
+  /* const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); */ /* mi_bitmap_chunk_count(bitmap) */ \
   const size_t chunk_start = 0; /* (chunk_max <= 1 ? 0 : (tseq % chunk_max)); */ /* space out threads */ \
   const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \
   const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \
diff --git a/src/free.c b/src/free.c
index 0ff4bf60..afb23838 100644
--- a/src/free.c
+++ b/src/free.c
@@ -163,8 +163,9 @@ void mi_free(void* p) mi_attr_noexcept
   if mi_unlikely(page==NULL) return;
 
   const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page));
+  const mi_page_flags_t flags = mi_page_flags(page);
   if mi_likely(is_local) {     // thread-local free?
-    if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
+    if mi_likely(flags == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
       // thread-local, aligned, and not a full page
       mi_block_t* const block = (mi_block_t*)p;
       mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
@@ -176,7 +177,7 @@ void mi_free(void* p) mi_attr_noexcept
   }
   else {
     // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap)
-    if mi_likely(page->flags.full_aligned == 0) {
+    if mi_likely(flags == 0) {
       // blocks are aligned (and not a full page)
       mi_block_t* const block = (mi_block_t*)p;
       mi_free_block_mt(page,block);
diff --git a/src/init.c b/src/init.c
index 5d4a775a..4fbd50ed 100644
--- a/src/init.c
+++ b/src/init.c
@@ -20,21 +20,21 @@ const mi_page_t _mi_page_empty = {
   0,                     // capacity
   0,                     // reserved capacity
   0,                     // block size shift
-  0,                     // heap tag
-  { 0 },                 // flags
-  false,                 // is_zero
   0,                     // retire_expire
   NULL,                  // local_free
+  MI_ATOMIC_VAR_INIT(0), // xthread_free
+  MI_ATOMIC_VAR_INIT(0), // xflags
   0,                     // block_size
   NULL,                  // page_start
+  0,                     // heap tag
+  false,                 // is_zero
   #if (MI_PADDING || MI_ENCODE_FREELIST)
   { 0, 0 },
   #endif
-  MI_ATOMIC_VAR_INIT(0), // xthread_free
   NULL,                  // xheap
   NULL, NULL,            // next, prev
   NULL,                  // subproc
-  { {{ NULL, 0, 0}}, false, false, false, MI_MEM_NONE } // memid
+  { {{ NULL, 0}}, false, false, false, MI_MEM_NONE }    // memid
 };
 
 #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
diff --git a/src/os.c b/src/os.c
index c7f464c0..156a655b 100644
--- a/src/os.c
+++ b/src/os.c
@@ -128,7 +128,7 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me
     // different base? (due to alignment)
     if (memid.mem.os.base != base) {
       mi_assert(memid.mem.os.base <= addr);
-      mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr);
+      // mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr);
       base = memid.mem.os.base;
       if (memid.mem.os.size==0) { csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); }
     }
@@ -305,7 +305,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
   if (p != NULL) {
     *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large);
     memid->mem.os.base = os_base;
-    memid->mem.os.alignment = alignment;
+    // memid->mem.os.alignment = alignment;
     memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base);  // todo: return from prim_alloc_aligned
   }
   return p;
diff --git a/test/test-stress.c b/test/test-stress.c
index d5f106d5..d46c2484 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -40,7 +40,7 @@ static int ITER = 20;
 static int THREADS = 8;
 static int SCALE = 10;
 static int ITER = 10;
-#elif 0
+#elif 1
 static int THREADS = 4;
 static int SCALE = 100;
 static int ITER = 10;
@@ -347,6 +347,8 @@ int main(int argc, char** argv) {
   mi_collect(true);
   mi_debug_show_arenas(true,true,false);
   #endif
+  mi_collect(true);
+  mi_debug_show_arenas(true, true, false);
   // mi_stats_print(NULL);
 #else
   mi_stats_print(NULL); // so we see rss/commit/elapsed
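
The core of this patch is the switch from the `mi_page_flags_s` bit-field union to a single atomic `xflags` word. Below is a minimal, self-contained C11 sketch of that scheme for illustration only: it is not part of the commit, it uses `<stdatomic.h>` directly instead of mimalloc's `mi_atomic_*` wrappers, and the `page_t`, `page_flags`, `page_flags_set`, and `page_is_fast_freeable` names are stand-ins for the real `mi_page_t` helpers.

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stddef.h>

  #define PAGE_IN_FULL_QUEUE ((size_t)0x01)   // mirrors MI_PAGE_IN_FULL_QUEUE
  #define PAGE_HAS_ALIGNED   ((size_t)0x02)   // mirrors MI_PAGE_HAS_ALIGNED

  typedef struct page_s {
    _Atomic(size_t) xflags;                   // both flags share one word
  } page_t;

  static inline size_t page_flags(page_t* page) {
    // one acquire load reads both flags (as in mi_page_flags)
    return atomic_load_explicit(&page->xflags, memory_order_acquire);
  }

  static inline void page_flags_set(page_t* page, bool set, size_t flag) {
    // set with an atomic OR, clear with an atomic AND of the complement
    if (set) { atomic_fetch_or_explicit(&page->xflags, flag, memory_order_acq_rel); }
    else     { atomic_fetch_and_explicit(&page->xflags, ~flag, memory_order_acq_rel); }
  }

  static inline bool page_is_fast_freeable(page_t* page) {
    // fast-path test: a single `flags == 0` check covers "not in the full
    // queue" and "has no aligned blocks" at once
    return (page_flags(page) == 0);
  }

Because both flags live in one word, the `mi_free` hunk above can load the flags once into a local `flags` and branch on `flags == 0`, where it previously read `page->flags.full_aligned`.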
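
The `mi_debug_show_bitmap` hunk in src/arena.c also replaces the non-standard `itoa` call with manual emission of up to three decimal digits for the chunk index. The following standalone sketch shows that digit logic in isolation; `fmt_idx` and the `main` driver are illustrative names only and do not appear in the patch.

  #include <stdio.h>

  // writes `i` (for i < 1000) into buf as exactly three characters,
  // left-aligned and space padded, like the new index code in mi_debug_show_bitmap
  static size_t fmt_idx(size_t i, char* buf) {
    size_t k = 0;
    if (i < 10)        { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; }
    else if (i < 100)  { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; }
    else if (i < 1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); }
    return k;
  }

  int main(void) {
    char buf[4];
    size_t n = fmt_idx(42, buf);
    buf[n] = '\0';
    printf("[%s]\n", buf);   // prints "[42 ]"
    return 0;
  }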