diff --git a/src/heap.c b/src/heap.c index 2cde5fb0..be2800c1 100644 --- a/src/heap.c +++ b/src/heap.c @@ -528,46 +528,83 @@ void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page) { } +static void mi_get_fast_divisor(size_t divisor, uint64_t* magic, size_t* shift) { + mi_assert_internal(divisor > 0 && divisor <= UINT32_MAX); + *shift = 64 - mi_clz(divisor - 1); + *magic = ((((uint64_t)1 << 32) * (((uint64_t)1 << *shift) - divisor)) / divisor + 1); +} + +static size_t mi_fast_divide(size_t n, uint64_t magic, size_t shift) { + mi_assert_internal(n <= UINT32_MAX); + return ((((uint64_t)n * magic) >> 32) + n) >> shift; +} + bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg) { mi_assert(area != NULL); if (area==NULL) return true; mi_assert(page != NULL); if (page == NULL) return true; - _mi_page_free_collect(page,true); + _mi_page_free_collect(page,true); // collect both thread_delayed and local_free mi_assert_internal(page->local_free == NULL); if (page->used == 0) return true; - const size_t bsize = mi_page_block_size(page); - const size_t ubsize = mi_page_usable_block_size(page); // without padding - size_t psize; - uint8_t* pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + size_t psize; + uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + mi_heap_t* const heap = mi_page_heap(page); + const size_t bsize = mi_page_block_size(page); + const size_t ubsize = mi_page_usable_block_size(page); // without padding + // optimize page with one block if (page->capacity == 1) { - // optimize page with one block mi_assert_internal(page->used == 1 && page->free == NULL); return visitor(mi_page_heap(page), area, pstart, ubsize, arg); } + mi_assert(bsize <= UINT32_MAX); + + // optimize full pages + if (page->used == page->capacity) { + uint8_t* block = pstart; + for (size_t i = 0; i < page->capacity; i++) { + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } + return true; + } // create a bitmap of free blocks. #define MI_MAX_BLOCKS (MI_SMALL_PAGE_SIZE / sizeof(void*)) - uintptr_t free_map[MI_MAX_BLOCKS / sizeof(uintptr_t)]; - memset(free_map, 0, sizeof(free_map)); + uintptr_t free_map[MI_MAX_BLOCKS / MI_INTPTR_BITS]; + const uintptr_t bmapsize = _mi_divide_up(page->capacity, MI_INTPTR_BITS); + memset(free_map, 0, bmapsize * sizeof(intptr_t)); + if (page->capacity % MI_INTPTR_BITS != 0) { + // mark left-over bits at the end as free + size_t shift = (page->capacity % MI_INTPTR_BITS); + uintptr_t mask = (UINTPTR_MAX << shift); + free_map[bmapsize - 1] = mask; + } + + // fast repeated division by the block size + uint64_t magic; + size_t shift; + mi_get_fast_divisor(bsize, &magic, &shift); #if MI_DEBUG>1 size_t free_count = 0; #endif - for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page,block)) { + for (mi_block_t* block = page->free; block != NULL; block = mi_block_next(page, block)) { #if MI_DEBUG>1 free_count++; #endif mi_assert_internal((uint8_t*)block >= pstart && (uint8_t*)block < (pstart + psize)); size_t offset = (uint8_t*)block - pstart; mi_assert_internal(offset % bsize == 0); - size_t blockidx = offset / bsize; // Todo: avoid division? - mi_assert_internal( blockidx < MI_MAX_BLOCKS); - size_t bitidx = (blockidx / sizeof(uintptr_t)); - size_t bit = blockidx - (bitidx * sizeof(uintptr_t)); + mi_assert_internal(offset <= UINT32_MAX); + size_t blockidx = mi_fast_divide(offset, magic, shift); + mi_assert_internal(blockidx == offset / bsize); + mi_assert_internal(blockidx < MI_MAX_BLOCKS); + size_t bitidx = (blockidx / MI_INTPTR_BITS); + size_t bit = blockidx - (bitidx * MI_INTPTR_BITS); free_map[bitidx] |= ((uintptr_t)1 << bit); } mi_assert_internal(page->capacity == (free_count + page->used)); @@ -576,19 +613,30 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_ #if MI_DEBUG>1 size_t used_count = 0; #endif - for (size_t i = 0; i < page->capacity; i++) { - size_t bitidx = (i / sizeof(uintptr_t)); - size_t bit = i - (bitidx * sizeof(uintptr_t)); - uintptr_t m = free_map[bitidx]; - if (bit == 0 && m == UINTPTR_MAX) { - i += (sizeof(uintptr_t) - 1); // skip a run of free blocks + uint8_t* block = pstart; + for (size_t i = 0; i < bmapsize; i++) { + if (free_map[i] == 0) { + // every block is in use + for (size_t j = 0; j < MI_INTPTR_BITS; j++) { + #if MI_DEBUG>1 + used_count++; + #endif + if (!visitor(heap, area, block, ubsize, arg)) return false; + block += bsize; + } } - else if ((m & ((uintptr_t)1 << bit)) == 0) { - #if MI_DEBUG>1 - used_count++; - #endif - uint8_t* block = pstart + (i * bsize); - if (!visitor(mi_page_heap(page), area, block, ubsize, arg)) return false; + else { + // visit the used blocks in the mask + uintptr_t m = ~free_map[i]; + while (m != 0) { + #if MI_DEBUG>1 + used_count++; + #endif + size_t bitidx = mi_ctz(m); + if (!visitor(heap, area, block + (bitidx * bsize), ubsize, arg)) return false; + m &= m - 1; // clear least significant bit + } + block += bsize * MI_INTPTR_BITS; } } mi_assert_internal(page->used == used_count); diff --git a/test/test-stress.c b/test/test-stress.c index 0368007a..f82b9743 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -129,6 +129,16 @@ static void free_items(void* p) { custom_free(p); } +/* +static bool visit_blocks(const mi_heap_t* heap, const mi_heap_area_t* area, void* block, size_t block_size, void* arg) { + (void)(heap); (void)(area); + size_t* total = (size_t*)arg; + if (block != NULL) { + total += block_size; + } + return true; +} +*/ static void stress(intptr_t tid) { //bench_start_thread(); @@ -173,6 +183,10 @@ static void stress(intptr_t tid) { data[data_idx] = q; } } + // walk the heap + // size_t total = 0; + // mi_heap_visit_blocks(mi_heap_get_default(), true, visit_blocks, &total); + // free everything that is left for (size_t i = 0; i < retain_top; i++) { free_items(retained[i]);