diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 3af664cf..45307c15 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -91,21 +91,31 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE) #define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE) -#define MI_MEDIUM_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/8) // 64kb on 64-bit -#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/8) // 512kb on 64-bit +#define MI_MEDIUM_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128kb on 64-bit +#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/4) // 1mb on 64-bit #define MI_LARGE_WSIZE_MAX (MI_LARGE_SIZE_MAX>>MI_INTPTR_SHIFT) -// Maximum number of size classes. (spaced exponentially in 16.7% increments) -#define MI_BIN_HUGE (64U) - // Minimal alignment necessary. On most platforms 16 bytes are needed // due to SSE registers for example. This must be at least `MI_INTPTR_SIZE` #define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) -#if (MI_LARGE_WSIZE_MAX > 131072) +#define MI_BIN4 // selects the 40-bin size-class table; without it the 70-bin table is used +#ifdef MI_BIN4 +// Maximum number of size classes. (spaced exponentially in 25% increments) +#define MI_BIN_HUGE (40U) + +#if (MI_LARGE_WSIZE_MAX > 524287) #error "define more bins" #endif +#else +// Maximum number of size classes.
(spaced exponentially in 12.5% increments) +#define MI_BIN_HUGE (70U) + +#if (MI_LARGE_WSIZE_MAX > 393216) +#error "define more bins" +#endif +#endif typedef uintptr_t mi_encoded_t; @@ -172,10 +182,10 @@ typedef struct mi_page_s { bool is_reset:1; // `true` if the page memory was reset bool is_committed:1; // `true` if the page virtual memory is committed - // layout like this to optimize access in `mi_malloc` and `mi_free` + // layout like this to optimize access in `mi_malloc` and `mi_free` uint16_t capacity; // number of blocks committed uint16_t reserved; // number of blocks reserved in memory - // 16 bits padding + // 16 bits padding mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) #if MI_SECURE uintptr_t cookie; // random cookie to encode the free lists diff --git a/src/alloc.c b/src/alloc.c index fe9d5fb0..bfb37d19 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -57,6 +57,7 @@ extern inline void* mi_malloc_small(size_t size) mi_attr_noexcept { return mi_heap_malloc_small(mi_get_default_heap(), size); } + // zero initialized small block void* mi_zalloc_small(size_t size) mi_attr_noexcept { void* p = mi_malloc_small(size); @@ -71,7 +72,7 @@ extern inline void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcep void* p; if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { p = mi_heap_malloc_small(heap, size); - } + } else { p = _mi_malloc_generic(heap, size); } @@ -235,11 +236,11 @@ void mi_free(void* p) mi_attr_noexcept // huge page stat is accounted for in `_mi_page_retire` #endif - // adjust if it might be an un-aligned block uintptr_t tid = _mi_thread_id(); - if (mi_likely(tid == page->flags.value)) { // local, and not full or aligned + if (mi_likely(tid == page->flags.value)) { + // local, and not full or aligned mi_block_t* block = (mi_block_t*)p; - mi_block_set_next(page, block, page->local_free); // note: moving this write earlier does not matter for performance + mi_block_set_next(page, block, page->local_free); // link the freed block in front of the page-local free list
page->local_free = block; page->used--; if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); } diff --git a/src/init.c b/src/init.c index 13ed9561..1ea510b2 100644 --- a/src/init.c +++ b/src/init.c @@ -32,24 +32,37 @@ const mi_page_t _mi_page_empty = { // Empty page queues for every bin #define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) } +#ifdef MI_BIN4 #define MI_PAGE_QUEUES_EMPTY \ { QNULL(1), \ - QNULL(1), QNULL(2), QNULL(3), QNULL(4), QNULL(5), QNULL(6), QNULL(7), QNULL(8), \ - QNULL(10), QNULL(12), QNULL(14), QNULL(16), QNULL(20), QNULL(24), QNULL(28), QNULL(32), \ - QNULL(40), QNULL(48), QNULL(56), QNULL(64), QNULL(80), QNULL(96), QNULL(112), QNULL(128), \ - QNULL(160), QNULL(192), QNULL(224), QNULL(256), QNULL(320), QNULL(384), QNULL(448), QNULL(512), \ - QNULL(640), QNULL(768), QNULL(896), QNULL(1024), QNULL(1280), QNULL(1536), QNULL(1792), QNULL(2048), \ - QNULL(2560), QNULL(3072), QNULL(3584), QNULL(4096), QNULL(5120), QNULL(6144), QNULL(7168), QNULL(8192), \ - QNULL(10240), QNULL(12288), QNULL(14336), QNULL(16384), QNULL(20480), QNULL(24576), QNULL(28672), QNULL(32768), \ - QNULL(40960), QNULL(49152), QNULL(57344), QNULL(65536), QNULL(81920), QNULL(98304), QNULL(114688), \ - QNULL(MI_LARGE_WSIZE_MAX + 1 /*131072, Huge queue */), \ + QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8: index of the last bin in this row */ \ + QNULL( 11), QNULL( 15), QNULL( 23), QNULL( 31), QNULL( 47), QNULL( 63), QNULL( 95), QNULL( 127), /* 16 */ \ + QNULL( 191), QNULL( 255), QNULL( 383), QNULL( 511), QNULL( 767), QNULL( 1023), QNULL( 1535), QNULL( 2047), /* 24 */ \ + QNULL( 3071), QNULL( 4095), QNULL( 6143), QNULL( 8191), QNULL( 12287), QNULL( 16383), QNULL( 24575), QNULL( 32767), /* 32 */ \ + QNULL( 49151), QNULL( 65535), QNULL( 98303), QNULL(131071), QNULL(196607), QNULL(262143), QNULL(393215), /* 39 */ \ + QNULL(MI_LARGE_WSIZE_MAX + 1 /* 524287, Huge queue */), \ QNULL(MI_LARGE_WSIZE_MAX + 2) /* Full queue */ } +#else // without MI_BIN4: four bins per power of two above 8 words +#define
MI_PAGE_QUEUES_EMPTY \ + { QNULL(1), \ + QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8: index of the last bin in this row */ \ + QNULL( 10), QNULL( 12), QNULL( 14), QNULL( 16), QNULL( 20), QNULL( 24), QNULL( 28), QNULL( 32), /* 16 */ \ + QNULL( 40), QNULL( 48), QNULL( 56), QNULL( 64), QNULL( 80), QNULL( 96), QNULL( 112), QNULL( 128), /* 24 */ \ + QNULL( 160), QNULL( 192), QNULL( 224), QNULL( 256), QNULL( 320), QNULL( 384), QNULL( 448), QNULL( 512), /* 32 */ \ + QNULL( 640), QNULL( 768), QNULL( 896), QNULL( 1024), QNULL( 1280), QNULL( 1536), QNULL( 1792), QNULL( 2048), /* 40 */ \ + QNULL( 2560), QNULL( 3072), QNULL( 3584), QNULL( 4096), QNULL( 5120), QNULL( 6144), QNULL( 7168), QNULL( 8192), /* 48 */ \ + QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ + QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ + QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), /* 69 */ \ + QNULL(MI_LARGE_WSIZE_MAX + 1 /* 393216, Huge queue */), \ + QNULL(MI_LARGE_WSIZE_MAX + 2) /* Full queue */ } +#endif #define MI_STAT_COUNT_NULL() {0,0,0,0} // Empty statistics #if MI_STAT>1 -#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT64(MI_STAT_COUNT_NULL) } +#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) } #else #define MI_STAT_COUNT_END_NULL() #endif @@ -97,8 +110,8 @@ static mi_tld_t tld_main = { 0, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments - { 0, NULL, NULL, 0, tld_main_stats }, // os - { MI_STATS_NULL } // stats + { 0, NULL, NULL, 0, tld_main_stats }, // os + { MI_STATS_NULL } // stats }; mi_heap_t _mi_heap_main = { diff --git a/src/page-queue.c b/src/page-queue.c index fd388113..69ebcc75 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -97,7 +97,7 @@ uint8_t
_mi_bsr(uintptr_t x) { // Returns MI_BIN_HUGE if the size is too large. // We use `wsize` for the size in "machine word sizes", // i.e. byte size == `wsize*sizeof(void*)`. -inline uint8_t _mi_bin(size_t size) { +extern inline uint8_t _mi_bin(size_t size) { size_t wsize = _mi_wsize_from_size(size); uint8_t bin; if (wsize <= 1) { @@ -120,16 +120,21 @@ inline uint8_t _mi_bin(size_t size) { bin = MI_BIN_HUGE; } else { - #if defined(MI_ALIGN4W) + #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif + #ifdef MI_BIN4 + uint8_t b = mi_bsr32((uint32_t)wsize); // index of the highest set bit of the word size + bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3; // highest bit plus the bit below it pick the bin (two bins per power of two); `+ 3` makes wsize 9 land in bin 9, right after the 8 exact bins + #else wsize--; // find the highest bit uint8_t b = mi_bsr32((uint32_t)wsize); - // and use the top 3 bits to determine the bin (~16% worst internal fragmentation). + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3; + #endif } mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE); return bin; diff --git a/src/page.c b/src/page.c index 9be0372d..e6be8df6 100644 --- a/src/page.c +++ b/src/page.c @@ -385,7 +385,7 @@ void _mi_page_retire(mi_page_t* page) { // is the only page left with free blocks. It is not clear // how to check this efficiently though... for now we just check // if its neighbours are almost fully used.
- if (mi_likely(page->block_size <= MI_SMALL_SIZE_MAX)) { + if (mi_likely(page->block_size <= MI_MEDIUM_SIZE_MAX)) { if (mi_page_mostly_used(page->prev) && mi_page_mostly_used(page->next)) { _mi_stat_counter_increase(&_mi_stats_main.page_no_retire,1); return; // dont't retire after all @@ -722,10 +722,10 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept // call potential deferred free routines _mi_deferred_free(heap, false); - + // free delayed frees from other threads _mi_heap_delayed_free(heap); - + // huge allocation? mi_page_t* page; if (mi_unlikely(size > MI_LARGE_SIZE_MAX)) { diff --git a/src/segment.c b/src/segment.c index f2fd09ad..736345bf 100644 --- a/src/segment.c +++ b/src/segment.c @@ -236,8 +236,8 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se // The thread local segment cache is limited to be at most 1/8 of the peak size of segments in use, -// and no more than 2. -#define MI_SEGMENT_CACHE_MAX (2) +// and no more than 4 (`MI_SEGMENT_CACHE_MAX`).
+#define MI_SEGMENT_CACHE_MAX (4) #define MI_SEGMENT_CACHE_FRACTION (8) // note: returned segment may be partially reset @@ -708,16 +708,20 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld /* ----------------------------------------------------------- Page allocation and free ----------------------------------------------------------- */ +static bool mi_is_good_fit(size_t bsize, size_t size) { + // good fit if no more than 25% wasted. NOTE(review): `size - (size % bsize)` is the space covered by whole blocks, not the waste (`size % bsize`); for 0 < bsize < size it always exceeds size/2, so this never returns true as written. Left unchanged because enabling it changes which page kind is selected - confirm against the page layout code before fixing + return (bsize > 0 && size > 0 && bsize < size && (size - (size % bsize)) < (size/4)); +} mi_page_t* _mi_segment_page_alloc(size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_page_t* page; - if (block_size <= (MI_SMALL_PAGE_SIZE/4)) { + if (block_size <= MI_SMALL_SIZE_MAX || mi_is_good_fit(block_size,MI_SMALL_PAGE_SIZE)) { page = mi_segment_small_page_alloc(tld,os_tld); } - else if (block_size <= (MI_MEDIUM_PAGE_SIZE/4)) { + else if (block_size <= MI_MEDIUM_SIZE_MAX || mi_is_good_fit(block_size, MI_MEDIUM_PAGE_SIZE)) { page = mi_segment_medium_page_alloc(tld, os_tld); } - else if (block_size < (MI_LARGE_SIZE_MAX - sizeof(mi_segment_t))) { + else if (block_size <= MI_LARGE_SIZE_MAX || mi_is_good_fit(block_size, MI_LARGE_PAGE_SIZE - sizeof(mi_segment_t))) { page = mi_segment_large_page_alloc(tld, os_tld); } else { diff --git a/test/main-override-static.c b/test/main-override-static.c index 6ddf4f37..83aa388a 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -6,8 +6,154 @@ #include #include // redefines malloc etc. +#include <stdint.h> +#include <stdio.h> + +#define MI_INTPTR_SIZE 8 +#define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE) + +#define MI_BIN_HUGE 100 +//#define MI_ALIGN2W + +// Bit scan reverse: return the index of the highest bit.
+static inline uint8_t mi_bsr32(uint32_t x); + +#if defined(_MSC_VER) +#include <windows.h> +#include <intrin.h> +static inline uint8_t mi_bsr32(uint32_t x) { + unsigned long idx; // `_BitScanReverse` stores the bit index through an `unsigned long*` + _BitScanReverse(&idx, x); + return (uint8_t)idx; +} +#elif defined(__GNUC__) || defined(__clang__) +static inline uint8_t mi_bsr32(uint32_t x) { + return (uint8_t)(31 - __builtin_clz(x)); // `__builtin_clz(0)` is undefined, so `x` must be non-zero +} +#else +static inline uint8_t mi_bsr32(uint32_t x) { + // de Bruijn multiplication (the reference that followed "see" is missing from this copy of the patch) + static const uint8_t debruijn[32] = { + 31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12, + 30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13, + }; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + x++; + return debruijn[(uint32_t)(x*UINT32_C(0x076be629)) >> 27]; +} +#endif + +// Bit scan reverse: return the index of the highest bit. +uint8_t _mi_bsr(uintptr_t x) { + if (x == 0) return 0; + #if MI_INTPTR_SIZE==8 + uint32_t hi = (uint32_t)(x >> 32); + return (hi == 0 ? mi_bsr32((uint32_t)x) : (uint8_t)(32 + mi_bsr32(hi))); + #elif MI_INTPTR_SIZE==4 + return mi_bsr32(x); + #else + # error "define bsr for non-32 or 64-bit platforms" + #endif +} + +static inline size_t _mi_wsize_from_size(size_t size) { + return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); +} + +// Return the bin for a given field size. +// Returns MI_BIN_HUGE if the size is too large. +// We use `wsize` for the size in "machine word sizes", +// i.e. byte size == `wsize*sizeof(void*)`.
+extern inline uint8_t _mi_bin8(size_t size) { // fine-grained variant: four bins per power of two above 8 words + size_t wsize = _mi_wsize_from_size(size); + uint8_t bin; + if (wsize <= 1) { + bin = 1; + } + #if defined(MI_ALIGN4W) + else if (wsize <= 4) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } + #elif defined(MI_ALIGN2W) + else if (wsize <= 8) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } + #else + else if (wsize <= 8) { + bin = (uint8_t)wsize; + } + #endif + else if (wsize > MI_LARGE_WSIZE_MAX) { + bin = MI_BIN_HUGE; + } + else { + #if defined(MI_ALIGN4W) + if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes + #endif + wsize--; + // find the highest bit + uint8_t b = mi_bsr32((uint32_t)wsize); + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). + // - adjust with 3 because we do not round the first 8 sizes + // which each get an exact bin + bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3; + } + return bin; +} + +extern inline uint8_t _mi_bin4(size_t size) { // coarse variant: two bins per power of two above 8 words + size_t wsize = _mi_wsize_from_size(size); + uint8_t bin; + if (wsize <= 1) { + bin = 1; + } + #if defined(MI_ALIGN4W) + else if (wsize <= 4) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } + #elif defined(MI_ALIGN2W) + else if (wsize <= 8) { + bin = (uint8_t)((wsize+1)&~1); // round to double word sizes + } + #else + else if (wsize <= 8) { + bin = (uint8_t)wsize; + } + #endif + else if (wsize > MI_LARGE_WSIZE_MAX) { + bin = MI_BIN_HUGE; + } + else { + uint8_t b = mi_bsr32((uint32_t)wsize); // index of the highest set bit + bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3; // highest bit plus the bit below it pick the bin; `+ 3` continues after the 8 exact bins + } + return bin; +} + +void mi_bins(void) { // print one `QNULL(...)` table entry per `_mi_bin4` size class + printf(" QNULL(1), /* 0 */ \\\n "); + size_t last_bin = 0; + for (size_t size = 1; size < (MI_INTPTR_SIZE*MI_LARGE_WSIZE_MAX); size++) { + size_t bin = _mi_bin4(size); + if (bin != last_bin) { + size_t wsize = (size-1)/sizeof(intptr_t); // word size of the last byte size that still mapped to `last_bin` + // printf("size: %6zd, wsize: %6d, bin: %6zd\n", size - 1, (size-1)/sizeof(intptr_t), last_bin); +
printf("QNULL(%6zu), ", wsize); + if (last_bin%8 == 0) printf("/* %zu */ \\\n ", last_bin); + last_bin = bin; + } + } +} + + + int main() { mi_version(); + mi_bins(); void* p1 = malloc(78); void* p2 = malloc(24); free(p1);