add good-fit for allowing larger blocks in smaller segments

This commit is contained in:
daan 2019-08-09 11:18:38 -07:00
parent 5e56b40fe6
commit 442bad9190
7 changed files with 214 additions and 35 deletions

View file

@ -91,21 +91,31 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE)
#define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE)
#define MI_MEDIUM_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/8) // 64kb on 64-bit
#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/8) // 512kb on 64-bit
#define MI_MEDIUM_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 64kb on 64-bit
#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/4) // 512kb on 64-bit
#define MI_LARGE_WSIZE_MAX (MI_LARGE_SIZE_MAX>>MI_INTPTR_SHIFT)
// Maximum number of size classes. (spaced exponentially in 16.7% increments)
#define MI_BIN_HUGE (64U)
// Minimal alignment necessary. On most platforms 16 bytes are needed
// due to SSE registers for example. This must be at least `MI_INTPTR_SIZE`
#define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t)
#if (MI_LARGE_WSIZE_MAX > 131072)
#define MI_BIN4
#ifdef MI_BIN4
// Maximum number of size classes. (spaced exponentially in 25% increments)
#define MI_BIN_HUGE (40U)
#if (MI_LARGE_WSIZE_MAX > 524287)
#error "define more bins"
#endif
#else
// Maximum number of size classes. (spaced exponentially in 12.5% increments)
#define MI_BIN_HUGE (70U)
#if (MI_LARGE_WSIZE_MAX > 393216)
#error "define more bins"
#endif
#endif
typedef uintptr_t mi_encoded_t;
@ -172,10 +182,10 @@ typedef struct mi_page_s {
bool is_reset:1; // `true` if the page memory was reset
bool is_committed:1; // `true` if the page virtual memory is committed
// layout like this to optimize access in `mi_malloc` and `mi_free`
// layout like this to optimize access in `mi_malloc` and `mi_free`
uint16_t capacity; // number of blocks committed
uint16_t reserved; // number of blocks reserved in memory
// 16 bits padding
// 16 bits padding
mi_block_t* free; // list of available free blocks (`malloc` allocates from this list)
#if MI_SECURE
uintptr_t cookie; // random cookie to encode the free lists

View file

@ -57,6 +57,7 @@ extern inline void* mi_malloc_small(size_t size) mi_attr_noexcept {
return mi_heap_malloc_small(mi_get_default_heap(), size);
}
// zero initialized small block
void* mi_zalloc_small(size_t size) mi_attr_noexcept {
void* p = mi_malloc_small(size);
@ -71,7 +72,7 @@ extern inline void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcep
void* p;
if (mi_likely(size <= MI_SMALL_SIZE_MAX)) {
p = mi_heap_malloc_small(heap, size);
}
}
else {
p = _mi_malloc_generic(heap, size);
}
@ -235,11 +236,11 @@ void mi_free(void* p) mi_attr_noexcept
// huge page stat is accounted for in `_mi_page_retire`
#endif
// adjust if it might be an un-aligned block
uintptr_t tid = _mi_thread_id();
if (mi_likely(tid == page->flags.value)) { // local, and not full or aligned
if (mi_likely(tid == page->flags.value)) {
// local, and not full or aligned
mi_block_t* block = (mi_block_t*)p;
mi_block_set_next(page, block, page->local_free); // note: moving this write earlier does not matter for performance
mi_block_set_next(page, block, page->local_free);
page->local_free = block;
page->used--;
if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); }

View file

@ -32,24 +32,37 @@ const mi_page_t _mi_page_empty = {
// Empty page queues for every bin
#define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) }
#ifdef MI_BIN4
#define MI_PAGE_QUEUES_EMPTY \
{ QNULL(1), \
QNULL(1), QNULL(2), QNULL(3), QNULL(4), QNULL(5), QNULL(6), QNULL(7), QNULL(8), \
QNULL(10), QNULL(12), QNULL(14), QNULL(16), QNULL(20), QNULL(24), QNULL(28), QNULL(32), \
QNULL(40), QNULL(48), QNULL(56), QNULL(64), QNULL(80), QNULL(96), QNULL(112), QNULL(128), \
QNULL(160), QNULL(192), QNULL(224), QNULL(256), QNULL(320), QNULL(384), QNULL(448), QNULL(512), \
QNULL(640), QNULL(768), QNULL(896), QNULL(1024), QNULL(1280), QNULL(1536), QNULL(1792), QNULL(2048), \
QNULL(2560), QNULL(3072), QNULL(3584), QNULL(4096), QNULL(5120), QNULL(6144), QNULL(7168), QNULL(8192), \
QNULL(10240), QNULL(12288), QNULL(14336), QNULL(16384), QNULL(20480), QNULL(24576), QNULL(28672), QNULL(32768), \
QNULL(40960), QNULL(49152), QNULL(57344), QNULL(65536), QNULL(81920), QNULL(98304), QNULL(114688), \
QNULL(MI_LARGE_WSIZE_MAX + 1 /*131072, Huge queue */), \
QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \
QNULL( 11), QNULL( 15), QNULL( 23), QNULL( 31), QNULL( 47), QNULL( 63), QNULL( 95), QNULL( 127), /* 16 */ \
QNULL( 191), QNULL( 255), QNULL( 383), QNULL( 511), QNULL( 767), QNULL( 1023), QNULL( 1535), QNULL( 2047), /* 24 */ \
QNULL( 3071), QNULL( 4095), QNULL( 6143), QNULL( 8191), QNULL( 12287), QNULL( 16383), QNULL( 24575), QNULL( 32767), /* 32 */ \
QNULL( 49151), QNULL( 65535), QNULL( 98303), QNULL(131071), QNULL(196607), QNULL(262143), QNULL(393215), /* 39 */ \
QNULL(MI_LARGE_WSIZE_MAX + 1 /* 524287, Huge queue */), \
QNULL(MI_LARGE_WSIZE_MAX + 2) /* Full queue */ }
#else
#define MI_PAGE_QUEUES_EMPTY \
{ QNULL(1), \
QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \
QNULL( 10), QNULL( 12), QNULL( 14), QNULL( 16), QNULL( 20), QNULL( 24), QNULL( 28), QNULL( 32), /* 16 */ \
QNULL( 40), QNULL( 48), QNULL( 56), QNULL( 64), QNULL( 80), QNULL( 96), QNULL( 112), QNULL( 128), /* 24 */ \
QNULL( 160), QNULL( 192), QNULL( 224), QNULL( 256), QNULL( 320), QNULL( 384), QNULL( 448), QNULL( 512), /* 32 */ \
QNULL( 640), QNULL( 768), QNULL( 896), QNULL( 1024), QNULL( 1280), QNULL( 1536), QNULL( 1792), QNULL( 2048), /* 40 */ \
QNULL( 2560), QNULL( 3072), QNULL( 3584), QNULL( 4096), QNULL( 5120), QNULL( 6144), QNULL( 7168), QNULL( 8192), /* 48 */ \
QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \
QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \
QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), /* 69 */ \
QNULL(MI_LARGE_WSIZE_MAX + 1 /* 393216, Huge queue */), \
QNULL(MI_LARGE_WSIZE_MAX + 2) /* Full queue */ }
#endif
#define MI_STAT_COUNT_NULL() {0,0,0,0}
// Empty statistics
#if MI_STAT>1
#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT64(MI_STAT_COUNT_NULL) }
#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) }
#else
#define MI_STAT_COUNT_END_NULL()
#endif
@ -97,8 +110,8 @@ static mi_tld_t tld_main = {
0,
&_mi_heap_main,
{ { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments
{ 0, NULL, NULL, 0, tld_main_stats }, // os
{ MI_STATS_NULL } // stats
{ 0, NULL, NULL, 0, tld_main_stats }, // os
{ MI_STATS_NULL } // stats
};
mi_heap_t _mi_heap_main = {

View file

@ -97,7 +97,7 @@ uint8_t _mi_bsr(uintptr_t x) {
// Returns MI_BIN_HUGE if the size is too large.
// We use `wsize` for the size in "machine word sizes",
// i.e. byte size == `wsize*sizeof(void*)`.
inline uint8_t _mi_bin(size_t size) {
extern inline uint8_t _mi_bin(size_t size) {
size_t wsize = _mi_wsize_from_size(size);
uint8_t bin;
if (wsize <= 1) {
@ -120,16 +120,21 @@ inline uint8_t _mi_bin(size_t size) {
bin = MI_BIN_HUGE;
}
else {
#if defined(MI_ALIGN4W)
#if defined(MI_ALIGN4W)
if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes
#endif
#ifdef MI_BIN4
uint8_t b = mi_bsr32((uint32_t)wsize);
bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3;
#else
wsize--;
// find the highest bit
uint8_t b = mi_bsr32((uint32_t)wsize);
// and use the top 3 bits to determine the bin (~16% worst internal fragmentation).
// and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
// - adjust with 3 because we use do not round the first 8 sizes
// which each get an exact bin
bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
#endif
}
mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE);
return bin;

View file

@ -385,7 +385,7 @@ void _mi_page_retire(mi_page_t* page) {
// is the only page left with free blocks. It is not clear
// how to check this efficiently though... for now we just check
// if its neighbours are almost fully used.
if (mi_likely(page->block_size <= MI_SMALL_SIZE_MAX)) {
if (mi_likely(page->block_size <= MI_MEDIUM_SIZE_MAX)) {
if (mi_page_mostly_used(page->prev) && mi_page_mostly_used(page->next)) {
_mi_stat_counter_increase(&_mi_stats_main.page_no_retire,1);
return; // dont't retire after all
@ -722,10 +722,10 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept
// call potential deferred free routines
_mi_deferred_free(heap, false);
// free delayed frees from other threads
_mi_heap_delayed_free(heap);
// huge allocation?
mi_page_t* page;
if (mi_unlikely(size > MI_LARGE_SIZE_MAX)) {

View file

@ -236,8 +236,8 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se
// The thread local segment cache is limited to be at most 1/8 of the peak size of segments in use,
// and no more than 2.
#define MI_SEGMENT_CACHE_MAX (2)
// and no more than 4.
#define MI_SEGMENT_CACHE_MAX (4)
#define MI_SEGMENT_CACHE_FRACTION (8)
// note: returned segment may be partially reset
@ -708,16 +708,20 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld
/* -----------------------------------------------------------
Page allocation and free
----------------------------------------------------------- */
static bool mi_is_good_fit(size_t bsize, size_t size) {
// good fit if no more than 25% wasted
return (bsize > 0 && size > 0 && bsize < size && (size - (size % bsize)) < (size/4));
}
mi_page_t* _mi_segment_page_alloc(size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) {
mi_page_t* page;
if (block_size <= (MI_SMALL_PAGE_SIZE/4)) {
if (block_size <= MI_SMALL_SIZE_MAX || mi_is_good_fit(block_size,MI_SMALL_PAGE_SIZE)) {
page = mi_segment_small_page_alloc(tld,os_tld);
}
else if (block_size <= (MI_MEDIUM_PAGE_SIZE/4)) {
else if (block_size <= MI_MEDIUM_SIZE_MAX || mi_is_good_fit(block_size, MI_MEDIUM_PAGE_SIZE)) {
page = mi_segment_medium_page_alloc(tld, os_tld);
}
else if (block_size < (MI_LARGE_SIZE_MAX - sizeof(mi_segment_t))) {
else if (block_size < MI_LARGE_SIZE_MAX || mi_is_good_fit(block_size, MI_LARGE_PAGE_SIZE - sizeof(mi_segment_t))) {
page = mi_segment_large_page_alloc(tld, os_tld);
}
else {

View file

@ -6,8 +6,154 @@
#include <mimalloc.h>
#include <mimalloc-override.h> // redefines malloc etc.
#include <stdint.h>
#include <stdbool.h>
#define MI_INTPTR_SIZE 8
#define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE)
#define MI_BIN_HUGE 100
//#define MI_ALIGN2W
// Bit scan reverse: return the index of the highest bit.
static inline uint8_t mi_bsr32(uint32_t x);
#if defined(_MSC_VER)
#include <windows.h>
#include <intrin.h>
static inline uint8_t mi_bsr32(uint32_t x) {
uint32_t idx;
_BitScanReverse((DWORD*)&idx, x);
return idx;
}
#elif defined(__GNUC__) || defined(__clang__)
static inline uint8_t mi_bsr32(uint32_t x) {
return (31 - __builtin_clz(x));
}
#else
static inline uint8_t mi_bsr32(uint32_t x) {
// de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
static const uint8_t debruijn[32] = {
31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12,
30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13,
};
x |= x >> 1;
x |= x >> 2;
x |= x >> 4;
x |= x >> 8;
x |= x >> 16;
x++;
return debruijn[(x*0x076be629) >> 27];
}
#endif
// Bit scan reverse: return the index of the highest bit.
uint8_t _mi_bsr(uintptr_t x) {
if (x == 0) return 0;
#if MI_INTPTR_SIZE==8
uint32_t hi = (x >> 32);
return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi));
#elif MI_INTPTR_SIZE==4
return mi_bsr32(x);
#else
# error "define bsr for non-32 or 64-bit platforms"
#endif
}
static inline size_t _mi_wsize_from_size(size_t size) {
return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
}
// Return the bin for a given field size.
// Returns MI_BIN_HUGE if the size is too large.
// We use `wsize` for the size in "machine word sizes",
// i.e. byte size == `wsize*sizeof(void*)`.
extern inline uint8_t _mi_bin8(size_t size) {
size_t wsize = _mi_wsize_from_size(size);
uint8_t bin;
if (wsize <= 1) {
bin = 1;
}
#if defined(MI_ALIGN4W)
else if (wsize <= 4) {
bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
}
#elif defined(MI_ALIGN2W)
else if (wsize <= 8) {
bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
}
#else
else if (wsize <= 8) {
bin = (uint8_t)wsize;
}
#endif
else if (wsize > MI_LARGE_WSIZE_MAX) {
bin = MI_BIN_HUGE;
}
else {
#if defined(MI_ALIGN4W)
if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes
#endif
wsize--;
// find the highest bit
uint8_t b = mi_bsr32((uint32_t)wsize);
// and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation).
// - adjust with 3 because we use do not round the first 8 sizes
// which each get an exact bin
bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
}
return bin;
}
extern inline uint8_t _mi_bin4(size_t size) {
size_t wsize = _mi_wsize_from_size(size);
uint8_t bin;
if (wsize <= 1) {
bin = 1;
}
#if defined(MI_ALIGN4W)
else if (wsize <= 4) {
bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
}
#elif defined(MI_ALIGN2W)
else if (wsize <= 8) {
bin = (uint8_t)((wsize+1)&~1); // round to double word sizes
}
#else
else if (wsize <= 8) {
bin = (uint8_t)wsize;
}
#endif
else if (wsize > MI_LARGE_WSIZE_MAX) {
bin = MI_BIN_HUGE;
}
else {
uint8_t b = mi_bsr32((uint32_t)wsize);
bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3;
}
return bin;
}
void mi_bins() {
printf(" QNULL(1), /* 0 */ \\\n ");
size_t last_bin = 0;
for (size_t size = 1; size < (MI_INTPTR_SIZE*MI_LARGE_WSIZE_MAX); size++) {
size_t bin = _mi_bin4(size);
if (bin != last_bin) {
size_t wsize = (size-1)/sizeof(intptr_t);
// printf("size: %6zd, wsize: %6d, bin: %6zd\n", size - 1, (size-1)/sizeof(intptr_t), last_bin);
printf("QNULL(%6zd), ", wsize);
if (last_bin%8 == 0) printf("/* %i */ \\\n ", last_bin);
last_bin = bin;
}
}
}
int main() {
mi_version();
mi_bins();
void* p1 = malloc(78);
void* p2 = malloc(24);
free(p1);