From 0be44b2b0fd300726e1f69f5e6053093744658d9 Mon Sep 17 00:00:00 2001
From: Sergiy Kuryata
Date: Tue, 26 Nov 2024 16:32:54 -0800
Subject: [PATCH] Add global counters for segments and allocation stats

---
 include/mimalloc.h          |  10 ++
 include/mimalloc/internal.h |   7 ++
 include/mimalloc/types.h    |  13 +++
 src/alloc.c                 |   5 +
 src/arena.c                 |  59 ++++++++++-
 src/free.c                  |   6 +-
 src/heap.c                  |   3 +-
 src/init.c                  |   4 +
 src/options.c               |   2 +-
 src/segment.c               |  14 ++-
 src/stats.c                 | 188 ++++++++++++++++++++++++++++++++++++
 11 files changed, 304 insertions(+), 7 deletions(-)

diff --git a/include/mimalloc.h b/include/mimalloc.h
index 73f82093..7871a4ed 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -373,6 +373,16 @@ mi_decl_nodiscard mi_decl_export size_t mi_option_get_size(mi_option_t option);
 mi_decl_export void mi_option_set(mi_option_t option, long value);
 mi_decl_export void mi_option_set_default(mi_option_t option, long value);
 
+typedef struct mi_allocation_counter_s {
+  size_t counter;
+  size_t block_size; // size of the allocation block related to this counter (for example, the block that caused a new segment to be allocated)
+} mi_allocation_counter_t;
+
+mi_decl_export bool mi_get_segment_stats(size_t* abandoned, size_t* reclaimed, size_t* reclaim_failed, size_t* allocated, size_t* freed,
+  mi_allocation_counter_t* allocated_segments, int allocated_segments_count,
+  mi_allocation_counter_t* free_space_in_segments, int free_space_in_segments_count,
+  mi_allocation_counter_t* allocated_memory, int allocated_memory_count);
+
 
 // -------------------------------------------------------------------------------------------------------
 // "mi" prefixed implementations of various posix, Unix, Windows, and C++ allocation functions.
diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index aad8a92b..05c092d7 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -181,6 +181,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool o
 bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
 size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append);
 void _mi_deferred_free(mi_heap_t* heap, bool force);
+mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page);
 
 void _mi_page_free_collect(mi_page_t* page,bool force);
 void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page);   // callback from segments
@@ -1017,5 +1018,11 @@ static inline void _mi_memzero_aligned(void* dst, size_t n) {
 }
 #endif
 
+void mi_segment_increment_alloc_stats(size_t block_size);
+void mi_segment_increment_freed_stats(void);
+void mi_segment_increment_reclaimed_stats(void);
+void mi_segment_increment_reclaim_failed_stats(void);
+void mi_allocation_stats_increment(size_t block_size);
+void mi_allocation_stats_decrement(size_t block_size);
 
 #endif
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index dbc4f403..f3f43f08 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -503,6 +503,7 @@ typedef struct mi_page_queue_s {
   mi_page_t* first;
   mi_page_t* last;
   size_t block_size;
+  size_t allocationCount;
 } mi_page_queue_t;
 
 #define MI_BIN_FULL (MI_BIN_HUGE+1)
@@ -646,6 +647,18 @@ typedef struct mi_stats_s {
 #endif
 } mi_stats_t;
 
+typedef struct mi_segment_alloc_counter_s {
+  _Atomic(size_t) counter;
+  size_t block_size; // size of the allocation block that caused a new segment to be allocated
+} mi_segment_alloc_counter_t;
+
+typedef struct mi_segment_stats_s {
+  _Atomic(size_t) reclaimed_count;
+  _Atomic(size_t) reclaim_failed_count;
+  _Atomic(size_t) allocated_count;
+  _Atomic(size_t) freed_count;
+  mi_segment_alloc_counter_t alloc_stats[MI_BIN_HUGE+1];
+} mi_segment_stats_t;
 
 void _mi_stat_increase(mi_stat_count_t* stat, size_t amount);
 void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount);
diff --git a/src/alloc.c b/src/alloc.c
index 86aaae75..9b9a6c01 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -39,6 +39,11 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_
   // pop from the free list
   page->free = mi_block_next(page, block);
   page->used++;
+
+  mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
+  pq->allocationCount++;
+  mi_allocation_stats_increment(page->block_size);
+
   mi_assert_internal(page->free == NULL || _mi_ptr_page(page->free) == page);
   #if MI_DEBUG>3
   if (page->free_is_zero) {
diff --git a/src/arena.c b/src/arena.c
index 7704ecc3..c51ec843 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -825,7 +825,14 @@ void _mi_arena_segment_mark_abandoned(mi_segment_t* segment)
 // start a cursor at a randomized arena
 void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_arena_field_cursor_t* current) {
   const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
-  current->start = (max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena));
+
+  if (heap != NULL) {
+    current->start = (max_arena == 0 ? 0 : (mi_arena_id_t)( _mi_heap_random_next(heap) % max_arena));
+  }
+  else {
+    current->start = 0;
+  }
+
   current->count = 0;
   current->bitmap_idx = 0;
   current->free_space_mask = MI_FREE_SPACE_MASK_ANY;
@@ -896,6 +903,56 @@ mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* pr
   return NULL;
 }
 
+// NOTE: This function has a RACE CONDITION. It accesses abandoned segments WITHOUT clearing the abandoned bit.
+// This can result in touching a segment object that has been freed and cause a crash.
+// This function is strictly for experimental purposes, to be able to calculate free space in segments quickly
+// without performing numerous interlocked operations while traversing through ALL abandoned segments.
+// It should be deleted after the experiment is done.
+size_t _mi_arena_segment_abandoned_free_space_stats_next(mi_arena_field_cursor_t* previous)
+{
+  const int max_arena = (int)mi_atomic_load_relaxed(&mi_arena_count);
+  if (max_arena <= 0 || mi_atomic_load_relaxed(&abandoned_count) == 0) return MI_FREE_SPACE_MASK_ALL;
+
+  int count = previous->count;
+  size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx);
+  size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx) + 1;
+  // visit arena's (from previous)
+  for (; count < max_arena; count++, field_idx = 0, bit_idx = 0) {
+    mi_arena_id_t arena_idx = previous->start + count;
+    if (arena_idx >= max_arena) { arena_idx = arena_idx % max_arena; } // wrap around
+    mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]);
+    if (arena != NULL) {
+      // visit the abandoned fields (starting at previous_idx)
+      for ( ; field_idx < arena->field_count; field_idx++, bit_idx = 0) {
+        size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]);
+        if mi_unlikely(field != 0) { // skip zero fields quickly
+          // visit each set bit in the field (todo: maybe use `ctz` here?)
+          for ( ; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) {
+            // pre-check if the bit is set
+            size_t mask = ((size_t)1 << bit_idx);
+            if mi_unlikely((field & mask) == mask) {
+              mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx);
+
+              // *** THIS CAN CAUSE A CRASH: the segment can be freed while we access its fields.
+              mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx);
+              size_t free_space_mask = mi_atomic_load_relaxed(&segment->free_space_mask) & MI_FREE_SPACE_MASK_ANY;
+
+              previous->bitmap_idx = bitmap_idx;
+              previous->count = count;
+
+              return free_space_mask;
+            }
+          }
+        }
+      }
+    }
+  }
+  // no more found
+  previous->bitmap_idx = 0;
+  previous->count = 0;
+  return MI_FREE_SPACE_MASK_ALL;
+}
+
 
 /* -----------------------------------------------------------
   Add an arena.
diff --git a/src/free.c b/src/free.c
index ed3c7d08..4b2eeac6 100644
--- a/src/free.c
+++ b/src/free.c
@@ -34,7 +34,10 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool
   // checks
   if mi_unlikely(mi_check_is_double_free(page, block)) return;
   mi_check_padding(page, block);
-  if (track_stats) { mi_stat_free(page, block); }
+  if (track_stats) {
+    mi_stat_free(page, block);
+    mi_allocation_stats_decrement(page->block_size);
+  }
   #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN
   if (!mi_page_is_huge(page)) {   // huge page content may be already decommitted
     memset(block, MI_DEBUG_FREED, mi_page_block_size(page));
@@ -261,6 +264,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg
   // adjust stats (after padding check and potentially recursive `mi_free` above)
   mi_stat_free(page, block);    // stat_free may access the padding
   mi_track_free_size(block, mi_page_usable_size_of(page,block));
+  mi_allocation_stats_decrement(page->block_size);
 
   // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection
   _mi_padding_shrink(page, block, sizeof(mi_block_t));
diff --git a/src/heap.c b/src/heap.c
index 3932bc9e..c52721dd 100644
--- a/src/heap.c
+++ b/src/heap.c
@@ -665,7 +665,6 @@ static mi_segment_t* mi_heap_get_segment_to_drop(mi_heap_t* heap) {
 }
 
 const mi_slice_t* mi_segment_slices_end(const mi_segment_t* segment);
-mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page);
 
 // Visit all pages in a segment
 static mi_decl_noinline void mi_segment_visit_pages(mi_heap_t* heap, mi_segment_t* segment, heap_page_visitor_fun* fn, void* arg1)
@@ -744,7 +743,7 @@ void mi_heap_drop_segment_if_required(mi_heap_t* heap, size_t alloc_block_size)
 {
   size_t targetSegmentCount = mi_option_get_size(mi_option_max_segments_per_heap);
   if ((targetSegmentCount > 0) &&
-      (alloc_block_size <= MI_MEDIUM_OBJ_SIZE_MAX) &&
+      (alloc_block_size <= MI_LARGE_OBJ_SIZE_MAX) &&
      (heap->tld->segments.count >= targetSegmentCount)) {
 
     mi_heap_drop_segment(heap, targetSegmentCount);
diff --git a/src/init.c b/src/init.c
index 61062a6c..afd39dba 100644
--- a/src/init.c
+++ b/src/init.c
@@ -413,6 +413,9 @@ size_t _mi_current_thread_count(void) {
   return mi_atomic_load_relaxed(&thread_count);
 }
 
+size_t _mi_get_next_thread_partition_id(void);
+extern mi_decl_thread size_t _mi_current_thread_partitionId;
+
 // This is called from the `mi_malloc_generic`
 void mi_thread_init(void) mi_attr_noexcept
 {
@@ -424,6 +427,7 @@ void mi_thread_init(void) mi_attr_noexcept
   // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called)
   if (_mi_thread_heap_init()) return;  // returns true if already initialized
+  _mi_current_thread_partitionId = _mi_get_next_thread_partition_id();
   _mi_stat_increase(&_mi_stats_main.threads, 1);
   mi_atomic_increment_relaxed(&thread_count);
   //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
diff --git a/src/options.c b/src/options.c
index 774ce577..fc3348ba 100644
--- a/src/options.c
+++ b/src/options.c
@@ -94,7 +94,7 @@ static mi_option_desc_t options[_mi_option_last] =
   { 0, UNINIT, MI_OPTION(disallow_arena_alloc) },     // 1 = do not use arena's for allocation (except if using specific arena id's)
   { 400, UNINIT, MI_OPTION(retry_on_oom) },           // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries.
   { 8, UNINIT, MI_OPTION(max_segments_per_heap) },    // max number of segments that heap can own.
-  { 2000, UNINIT, MI_OPTION(heap_collect_abandoned_interval) }, // max number of segments that heap can own.
+  { 2000, UNINIT, MI_OPTION(heap_collect_abandoned_interval) }, // delay (ms) in between collecting abandoned segments when a heap drops excessive segments.
 };
 
 static void mi_option_init(mi_option_desc_t* desc);
diff --git a/src/segment.c b/src/segment.c
index 6850e555..18e7d3ed 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -981,6 +981,7 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t
 
   // return it to the OS
   mi_segment_os_free(segment, tld);
+  mi_segment_increment_freed_stats();
 }
 
 
@@ -1367,7 +1368,8 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
       // the segment due to concurrent frees (in which case `NULL` is returned).
       mi_segment_t* segmentToReturn = mi_segment_reclaim(segment, heap, block_size, reclaimed, tld);
       if (segmentToReturn != NULL) {
-        return segmentToReturn;
+        mi_segment_increment_reclaimed_stats();
+        return segmentToReturn;
       }
     }
     else if (segment->abandoned_visits > 3 && is_suitable && mi_option_get_size(mi_option_max_segments_per_heap) == 0) {
@@ -1380,6 +1382,8 @@ static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t needed_slice
       _mi_arena_segment_mark_abandoned(segment);
     }
   }
+
+  mi_segment_increment_reclaim_failed_stats();
   return NULL;
 }
 
@@ -1431,7 +1435,12 @@ static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t needed_
     return segment;
   }
   // 2. otherwise allocate a fresh segment
-  return mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL);
+  segment = mi_segment_alloc(0, 0, heap->arena_id, tld, os_tld, NULL);
+  if (segment != NULL) {
+    mi_segment_increment_alloc_stats(block_size);
+  }
+
+  return segment;
 }
 
 
@@ -1482,6 +1491,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment,
   segment->thread_id = 0; // huge segments are immediately abandoned
   #endif
 
+  mi_segment_increment_alloc_stats(size);
   // for huge pages we initialize the block_size as we may
   // overallocate to accommodate large alignments.
   size_t psize;
diff --git a/src/stats.c b/src/stats.c
index a9364027..8b6e23b1 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -15,6 +15,192 @@ terms of the MIT license. A copy of the license can be found in the file
 #pragma warning(disable:4204)  // non-constant aggregate initializer
 #endif
 
+// --------------------------------------------------------
+// Segment statistics
+// --------------------------------------------------------
+mi_segment_stats_t _mi_global_segment_stats;
+
+void mi_init_segment_stats(void)
+{
+  _mi_global_segment_stats.reclaimed_count = 0;
+  _mi_global_segment_stats.reclaim_failed_count = 0;
+  _mi_global_segment_stats.allocated_count = 0;
+  _mi_global_segment_stats.freed_count = 0;
+
+  static_assert((MI_BIN_HUGE + 1) == sizeof(_mi_global_segment_stats.alloc_stats) / sizeof(_mi_global_segment_stats.alloc_stats[0]));
+  for (int i = 0; i <= MI_BIN_HUGE; i++)
+  {
+    size_t block_size = _mi_bin_size((uint8_t)i);
+
+    _mi_global_segment_stats.alloc_stats[i].counter = 0;
+    _mi_global_segment_stats.alloc_stats[i].block_size = block_size;
+  }
+
+  // (MI_FREE_SPACE_MASK_BIT_COUNT-1) combines multiple block sizes. Set it to INT32_MAX to distinguish it from the rest.
+  _mi_global_segment_stats.alloc_stats[MI_FREE_SPACE_MASK_BIT_COUNT - 1].block_size = INT32_MAX;
+}
+
+
+void mi_segment_increment_alloc_stats(size_t block_size)
+{
+  uint8_t page_queue_index = _mi_bin(block_size);
+
+  mi_atomic_increment_relaxed(&_mi_global_segment_stats.alloc_stats[page_queue_index].counter);
+  mi_atomic_increment_relaxed(&_mi_global_segment_stats.allocated_count);
+}
+
+void mi_segment_increment_freed_stats(void)
+{
+  mi_atomic_increment_relaxed(&_mi_global_segment_stats.freed_count);
+}
+
+void mi_segment_increment_reclaimed_stats(void)
+{
+  mi_atomic_increment_relaxed(&_mi_global_segment_stats.reclaimed_count);
+}
+
+void mi_segment_increment_reclaim_failed_stats(void)
+{
+  mi_atomic_increment_relaxed(&_mi_global_segment_stats.reclaim_failed_count);
+}
+
+// --------------------------------------------------------
+// Partitioned counter to avoid contention in interlocked operations
+// --------------------------------------------------------
+_Atomic(size_t) _mi_next_counter_partition_id;
+#define NUMBER_OF_PARTITIONS 32
+
+size_t _mi_get_next_thread_partition_id(void)
+{
+  return mi_atomic_increment_relaxed(&_mi_next_counter_partition_id) % NUMBER_OF_PARTITIONS;
+}
+
+mi_decl_thread size_t _mi_current_thread_partitionId = 0;
+
+// Implements a counter that has its value partitioned into a set of buckets (in separate cache lines)
+// to reduce contention when the value of the counter is updated.
+typedef struct mi_decl_cache_align mi_partitioned_counter_value_s
+{
+  _Atomic(int64_t) counter_value;
+} mi_partitioned_counter_value_t;
+
+typedef struct mi_partitioned_counter_s
+{
+  mi_partitioned_counter_value_t counter_partitions[NUMBER_OF_PARTITIONS];
+} mi_partitioned_counter_t;
+
+void mi_partitioned_counter_increment(mi_partitioned_counter_t* counter, size_t value)
+{
+  mi_atomic_add_relaxed(&counter->counter_partitions[_mi_current_thread_partitionId].counter_value, value);
+}
+
+void mi_partitioned_counter_decrement(mi_partitioned_counter_t* counter, size_t value)
+{
+  mi_atomic_sub_relaxed(&counter->counter_partitions[_mi_current_thread_partitionId].counter_value, value);
+}
+
+int64_t mi_partitioned_counter_get_value(mi_partitioned_counter_t* counter)
+{
+  int64_t total = 0;
+
+  for (int i = 0; i < NUMBER_OF_PARTITIONS; i++)
+  {
+    total += mi_atomic_load_relaxed(&counter->counter_partitions[i].counter_value);
+  }
+
+  // concurrent increments/decrements can make the sum transiently negative; clamp to zero
+  int64_t retVal = total;
+  if (retVal < 0)
+  {
+    retVal = 0;
+  }
+
+  return retVal;
+}
+
+mi_partitioned_counter_t _mi_allocated_memory[MI_BIN_HUGE+1];
+
+void mi_allocation_stats_increment(size_t block_size)
+{
+  uint8_t binIndex = _mi_bin(block_size);
+  mi_partitioned_counter_increment(&_mi_allocated_memory[binIndex], block_size);
+}
+
+void mi_allocation_stats_decrement(size_t block_size)
+{
+  uint8_t binIndex = _mi_bin(block_size);
+  mi_partitioned_counter_decrement(&_mi_allocated_memory[binIndex], block_size);
+}
+
+size_t _mi_arena_segment_abandoned_free_space_stats_next(mi_arena_field_cursor_t* previous);
+void mi_segment_update_free_space_stats(mi_allocation_counter_t* free_space_in_segments)
+{
+  mi_arena_field_cursor_t current;
+  size_t free_space_mask = 0;
+
+  _mi_arena_field_cursor_init(NULL, &current);
+  while ((free_space_mask = _mi_arena_segment_abandoned_free_space_stats_next(&current)) != MI_FREE_SPACE_MASK_ALL) {
+
+    int bit_index = 0;
+    while (free_space_mask != 0) {
+      if ((free_space_mask & 1) != 0) {
+        free_space_in_segments[bit_index].counter++;
+      }
+
+      free_space_mask = free_space_mask >> 1;
+      bit_index++;
+    }
+  }
+}
+
+void mi_update_allocated_memory_stats(mi_allocation_counter_t* allocated_memory, int allocated_memory_count)
+{
+  for (int i = 0; i < allocated_memory_count; i++) {
+    allocated_memory[i].counter = mi_partitioned_counter_get_value(&_mi_allocated_memory[i]);
+  }
+}
+
+bool mi_get_segment_stats(size_t* abandoned, size_t* reclaimed, size_t* reclaim_failed, size_t* allocated, size_t* freed,
+  mi_allocation_counter_t* allocated_segments, int allocated_segments_count,
+  mi_allocation_counter_t* free_space_in_segments, int free_space_in_segments_count,
+  mi_allocation_counter_t* allocated_memory, int allocated_memory_count)
+{
+  int stat_count = sizeof(_mi_global_segment_stats.alloc_stats) / sizeof(_mi_global_segment_stats.alloc_stats[0]);
+
+  if ((allocated_segments == NULL) || (allocated_segments_count != stat_count)) {
+    return false;
+  }
+
+  if ((free_space_in_segments == NULL) || (free_space_in_segments_count != stat_count)) {
+    return false;
+  }
+
+  if ((allocated_memory == NULL) || (allocated_memory_count != stat_count)) {
+    return false;
+  }
+
+  *abandoned = _mi_arena_segment_abandoned_count();
+  *reclaimed = mi_atomic_load_relaxed(&_mi_global_segment_stats.reclaimed_count);
+  *reclaim_failed = mi_atomic_load_relaxed(&_mi_global_segment_stats.reclaim_failed_count);
+  *allocated = mi_atomic_load_relaxed(&_mi_global_segment_stats.allocated_count);
+  *freed = mi_atomic_load_relaxed(&_mi_global_segment_stats.freed_count);
+
+  for (int i = 0; i < stat_count; i++) {
+    allocated_segments[i].counter = mi_atomic_load_relaxed(&_mi_global_segment_stats.alloc_stats[i].counter);
+    allocated_segments[i].block_size = _mi_global_segment_stats.alloc_stats[i].block_size;
+
+    free_space_in_segments[i].counter = 0;
+    free_space_in_segments[i].block_size = allocated_segments[i].block_size;
+
+    allocated_memory[i].counter = 0;
+    allocated_memory[i].block_size = allocated_segments[i].block_size;
+  }
+
+  mi_segment_update_free_space_stats(free_space_in_segments);
+  mi_update_allocated_memory_stats(allocated_memory, allocated_memory_count);
+
+  return true;
+}
+
 /* -----------------------------------------------------------
   Statistics operations
 ----------------------------------------------------------- */
@@ -388,6 +574,8 @@ void mi_stats_reset(void) mi_attr_noexcept {
   if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); }
   memset(&_mi_stats_main, 0, sizeof(mi_stats_t));
   if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); };
+
+  mi_init_segment_stats();
 }
 
 void mi_stats_merge(void) mi_attr_noexcept {
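// --------------------------------------------------------
// Illustrative usage sketch (not part of the patch above): polling the new
// mi_get_segment_stats() API from a monitoring thread. All three arrays must
// have exactly MI_BIN_HUGE+1 entries; MI_STAT_BIN_COUNT below is a placeholder
// for that value (74 in current mimalloc sources), not an existing mimalloc
// constant, so adjust it to the build being instrumented.
// --------------------------------------------------------
#include <stdio.h>
#include <mimalloc.h>

#define MI_STAT_BIN_COUNT 74   // assumption: equals MI_BIN_HUGE+1 for this build

static void print_segment_stats(void) {
  size_t abandoned, reclaimed, reclaim_failed, allocated, freed;
  mi_allocation_counter_t allocated_segments[MI_STAT_BIN_COUNT];
  mi_allocation_counter_t free_space[MI_STAT_BIN_COUNT];
  mi_allocation_counter_t allocated_memory[MI_STAT_BIN_COUNT];

  // mi_get_segment_stats() returns false when an array size does not match the
  // internal bin count, so a mismatch is reported instead of silently truncated.
  if (!mi_get_segment_stats(&abandoned, &reclaimed, &reclaim_failed, &allocated, &freed,
                            allocated_segments, MI_STAT_BIN_COUNT,
                            free_space, MI_STAT_BIN_COUNT,
                            allocated_memory, MI_STAT_BIN_COUNT)) {
    fprintf(stderr, "segment stats unavailable (bin count mismatch?)\n");
    return;
  }

  printf("segments: allocated=%zu freed=%zu abandoned=%zu reclaimed=%zu reclaim_failed=%zu\n",
         allocated, freed, abandoned, reclaimed, reclaim_failed);
  for (int i = 0; i < MI_STAT_BIN_COUNT; i++) {
    if (allocated_segments[i].counter > 0 || allocated_memory[i].counter > 0 || free_space[i].counter > 0) {
      printf("  bin %2d (block size %zu): segments=%zu, ~live bytes=%zu, abandoned segments with free space=%zu\n",
             i, allocated_segments[i].block_size, allocated_segments[i].counter,
             allocated_memory[i].counter, free_space[i].counter);
    }
  }
}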
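// --------------------------------------------------------
// Standalone sketch (not mimalloc code) of the partitioned-counter pattern the
// patch uses: each thread bumps its own cache-line-aligned slot, selected by a
// round-robin partition id, and readers sum all slots for an approximate total.
// Written against plain C11 <stdatomic.h>/<threads.h> so it compiles on its own;
// the names below (worker, read_total, ...) are made up for the example.
// --------------------------------------------------------
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <threads.h>

#define PARTITIONS 32
#define THREADS    4

typedef struct { _Alignas(64) _Atomic int64_t value; } partition_t;  // one cache line per slot

static partition_t    counter[PARTITIONS];
static _Atomic size_t next_partition = 0;
static thread_local size_t my_partition;  // assigned once per thread, like _mi_current_thread_partitionId

static int worker(void* arg) {
  (void)arg;
  my_partition = atomic_fetch_add(&next_partition, 1) % PARTITIONS;
  for (int i = 0; i < 100000; i++) {
    // relaxed is enough: only the eventual sum matters, not ordering between threads
    atomic_fetch_add_explicit(&counter[my_partition].value, 16, memory_order_relaxed);
  }
  return 0;
}

static int64_t read_total(void) {
  int64_t total = 0;
  for (int i = 0; i < PARTITIONS; i++) {
    total += atomic_load_explicit(&counter[i].value, memory_order_relaxed);
  }
  // clamp, mirroring mi_partitioned_counter_get_value: with concurrent decrements
  // a reader can observe a transiently negative sum
  return (total < 0) ? 0 : total;
}

int main(void) {
  thrd_t t[THREADS];
  for (int i = 0; i < THREADS; i++) thrd_create(&t[i], worker, NULL);
  for (int i = 0; i < THREADS; i++) thrd_join(t[i], NULL);
  printf("total = %lld (expected %lld)\n", (long long)read_total(), (long long)(THREADS * 100000) * 16);
  return 0;
}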
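// --------------------------------------------------------
// Sketch (assumes the two options exist in this fork, as shown in src/options.c
// above; upstream mimalloc may not have them): tuning the segment-dropping
// behaviour that feeds these statistics from application code.
// --------------------------------------------------------
#include <mimalloc.h>

static void configure_segment_dropping(void) {
  // cap on the number of segments a heap may own before it starts dropping them
  // (0 disables dropping entirely, see mi_heap_drop_segment_if_required in src/heap.c)
  mi_option_set(mi_option_max_segments_per_heap, 16);
  // delay, in milliseconds, between collections of abandoned segments when a heap
  // drops excessive segments
  mi_option_set(mi_option_heap_collect_abandoned_interval, 500);
}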