diff --git a/ide/vs2017/mimalloc-override.vcxproj b/ide/vs2017/mimalloc-override.vcxproj index f41b2efc..7d452b55 100644 --- a/ide/vs2017/mimalloc-override.vcxproj +++ b/ide/vs2017/mimalloc-override.vcxproj @@ -35,7 +35,6 @@ DynamicLibrary false v141 - true DynamicLibrary @@ -46,7 +45,6 @@ DynamicLibrary false v141 - true @@ -70,25 +68,25 @@ $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc + mimalloc-override $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc + mimalloc-override $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc + mimalloc-override $(SolutionDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(SolutionDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ .dll - mimalloc + mimalloc-override @@ -100,15 +98,17 @@ MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false - CompileAsCpp + Default - ../../bin/mimalloc-redirect32.lib;%(AdditionalDependencies) + %(AdditionalDependencies) Default + DllEntry + false @@ -121,15 +121,17 @@ MI_SHARED_LIB;MI_SHARED_LIB_EXPORT;MI_MALLOC_OVERRIDE;%(PreprocessorDefinitions); MultiThreadedDebugDLL false - CompileAsCpp + Default - ../../bin/mimalloc-redirect.lib;%(AdditionalDependencies) + %(AdditionalDependencies) Default + DllEntry + false COPY /Y $(SolutionDir)..\..\bin\mimalloc-redirect.dll $(OutputPath) @@ -152,15 +154,17 @@ $(IntDir) false MultiThreadedDLL - CompileAsCpp + Default true true - ../../bin/mimalloc-redirect32.lib;%(AdditionalDependencies) + %(AdditionalDependencies) Default + DllEntry + false @@ -177,15 +181,17 @@ $(IntDir) false MultiThreadedDLL - CompileAsCpp + Default true true - ../../bin/mimalloc-redirect.lib;%(AdditionalDependencies) + %(AdditionalDependencies) Default + DllEntry + false COPY /Y $(SolutionDir)..\..\bin\mimalloc-redirect.dll $(OutputPath) @@ -208,6 +214,7 @@ false false + true true diff --git a/ide/vs2017/mimalloc-override.vcxproj.filters b/ide/vs2017/mimalloc-override.vcxproj.filters index ffabddac..df0bf5ed 100644 --- a/ide/vs2017/mimalloc-override.vcxproj.filters +++ b/ide/vs2017/mimalloc-override.vcxproj.filters @@ -67,5 +67,8 @@ Source Files + + Source Files + \ No newline at end of file diff --git a/ide/vs2017/mimalloc-test-stress.vcxproj b/ide/vs2017/mimalloc-test-stress.vcxproj index e8cc5045..b8267d0b 100644 --- a/ide/vs2017/mimalloc-test-stress.vcxproj +++ b/ide/vs2017/mimalloc-test-stress.vcxproj @@ -67,19 +67,19 @@ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ diff 
--git a/ide/vs2017/mimalloc-test.vcxproj b/ide/vs2017/mimalloc-test.vcxproj index c1539aeb..27c7bb6e 100644 --- a/ide/vs2017/mimalloc-test.vcxproj +++ b/ide/vs2017/mimalloc-test.vcxproj @@ -67,19 +67,19 @@ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ - $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ + $(ProjectDir)..\..\out\msvc-$(Platform)\$(Configuration)\ $(ProjectDir)..\..\out\msvc-$(Platform)\$(ProjectName)\$(Configuration)\ diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index e261dba2..ad9b3ecf 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -307,13 +307,23 @@ static inline bool mi_page_all_used(mi_page_t* page) { static inline bool mi_page_mostly_used(const mi_page_t* page) { if (page==NULL) return true; uint16_t frac = page->reserved / 8U; - return (page->reserved - page->used + page->thread_freed < frac); + return (page->reserved - page->used + page->thread_freed <= frac); } static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; } +static inline uintptr_t mi_page_thread_id(const mi_page_t* page) { + return (page->flags.xthread_id << MI_PAGE_FLAGS_BITS); +} + +static inline void mi_page_init_flags(mi_page_t* page, uintptr_t thread_id) { + page->flags.value = 0; + page->flags.xthread_id = (thread_id >> MI_PAGE_FLAGS_BITS); + mi_assert(page->flags.value == thread_id); +} + // ------------------------------------------------------------------- // Encoding/Decoding the free list next pointers // ------------------------------------------------------------------- diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 5c14ffd4..4002c12c 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -91,19 +91,19 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_MEDIUM_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_MEDIUM_PAGE_SIZE) #define MI_LARGE_PAGES_PER_SEGMENT (MI_SEGMENT_SIZE/MI_LARGE_PAGE_SIZE) -#define MI_MEDIUM_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/8) // 64kb on 64-bit -#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/8) // 512kb on 64-bit +#define MI_MEDIUM_SIZE_MAX (MI_MEDIUM_PAGE_SIZE/4) // 128kb on 64-bit +#define MI_LARGE_SIZE_MAX (MI_LARGE_PAGE_SIZE/4) // 1Mb on 64-bit #define MI_LARGE_WSIZE_MAX (MI_LARGE_SIZE_MAX>>MI_INTPTR_SHIFT) -// Maximum number of size classes. (spaced exponentially in 16.7% increments) -#define MI_BIN_HUGE (64U) - // Minimal alignment necessary. On most platforms 16 bytes are needed // due to SSE registers for example. This must be at least `MI_INTPTR_SIZE` #define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) -#if (MI_LARGE_WSIZE_MAX > 131072) +// Maximum number of size classes. 
(spaced exponentially in 12.5% increments) +#define MI_BIN_HUGE (70U) + +#if (MI_LARGE_WSIZE_MAX > 393216) #error "define more bins" #endif @@ -123,14 +123,26 @@ typedef enum mi_delayed_e { } mi_delayed_t; +// Use the lowest two bits of a thread id for the `in_full` and `has_aligned` flags +// This allows a single test in `mi_free` to check for unlikely cases +// (namely, non-local free, aligned free, or freeing in a full page) +#define MI_PAGE_FLAGS_BITS (2) +#define MI_PAGE_FLAGS_TID_BITS (MI_INTPTR_SIZE*8 - MI_PAGE_FLAGS_BITS) typedef union mi_page_flags_u { - uint16_t value; + uintptr_t value; struct { - bool has_aligned; - bool in_full; + #ifdef MI_BIG_ENDIAN + uintptr_t xthread_id : MI_PAGE_FLAGS_TID_BITS; + #endif + uintptr_t in_full : 1; + uintptr_t has_aligned : 1; + #ifndef MI_BIG_ENDIAN + uintptr_t xthread_id : MI_PAGE_FLAGS_TID_BITS; + #endif }; } mi_page_flags_t; + // Thread free list. // We use the bottom 2 bits of the pointer for mi_delayed_t flags typedef uintptr_t mi_thread_free_t; @@ -161,15 +173,15 @@ typedef struct mi_page_s { bool is_committed:1; // `true` if the page virtual memory is committed // layout like this to optimize access in `mi_malloc` and `mi_free` - mi_page_flags_t flags; uint16_t capacity; // number of blocks committed uint16_t reserved; // number of blocks reserved in memory - + // 16 bits padding mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) #if MI_SECURE uintptr_t cookie; // random cookie to encode the free lists #endif size_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) + mi_page_flags_t flags; // threadid:62 | has_aligned:1 | in_full:1 mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) volatile uintptr_t thread_freed; // at least this number of blocks are in `thread_free` @@ -182,10 +194,10 @@ typedef struct mi_page_s { struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` // improve page index calculation -#if MI_INTPTR_SIZE==8 - //void* padding[1]; // 12 words on 64-bit +#if (MI_INTPTR_SIZE==8 && MI_SECURE==0) + void* padding[1]; // 12 words on 64-bit #elif MI_INTPTR_SIZE==4 - void* padding[1]; // 12 words on 32-bit + // void* padding[1]; // 12 words on 32-bit #endif } mi_page_t; @@ -215,7 +227,7 @@ typedef struct mi_segment_s { // layout like this to optimize access in `mi_free` size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). 
- uintptr_t thread_id; // unique id of the thread owning this segment + volatile uintptr_t thread_id; // unique id of the thread owning this segment mi_page_kind_t page_kind; // kind of pages: small, large, or huge mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages } mi_segment_t; @@ -324,12 +336,12 @@ typedef struct mi_stats_s { mi_stat_count_t pages_abandoned; mi_stat_count_t pages_extended; mi_stat_count_t mmap_calls; - mi_stat_count_t mmap_right_align; - mi_stat_count_t mmap_ensure_aligned; mi_stat_count_t commit_calls; mi_stat_count_t threads; mi_stat_count_t huge; mi_stat_count_t malloc; + mi_stat_count_t segments_cache; + mi_stat_counter_t page_no_retire; mi_stat_counter_t searches; #if MI_STAT>1 mi_stat_count_t normal[MI_BIN_HUGE+1]; diff --git a/src/alloc-override-win.c b/src/alloc-override-win.c new file mode 100644 index 00000000..d1d51b9a --- /dev/null +++ b/src/alloc-override-win.c @@ -0,0 +1,714 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc-internal.h" + +#if !defined(_WIN32) +#error "this file should only be included on Windows" +#endif + +#include <windows.h> +#include <psapi.h> + +#include <stdlib.h> // getenv +#include <string.h> // strstr + + +/* +To override the C runtime `malloc` on Windows we need to patch the allocation +functions at runtime initialization. Unfortunately we can never patch before the +runtime initializes itself, because as soon as we call `GetProcAddress` on the +runtime module (a DLL or EXE in Windows speak), it will first load and initialize +(by the OS calling `DllMain` on it). + +This means that some things might already be allocated by the C runtime itself +(and possibly other DLL's) before we get to resolve runtime addresses. This is +no problem if everyone unwinds in order: when we unload, we unpatch and restore +the original crt `free` routines and crt malloc'd memory is freed correctly. + +But things go wrong if such early CRT alloc'd memory is freed or re-allocated +_after_ we patch, but _before_ we unload (and unpatch), or if any memory allocated +by us is freed after we unpatched. + +There are two tricky situations to deal with: + +1. The Thread Local Storage (TLS): when the main thread stops it will call registered + callbacks on TLS entries (allocated by `FlsAlloc`). This is done by the OS + before any DLL's are unloaded. Unfortunately, the C runtime registers such + TLS entries with CRT allocated memory which is freed in the callback. + +2. Inside the CRT: + a. Some variables might get initialized by patched allocated + blocks but freed during CRT unloading after we unpatched + (like temporary file buffers). + b. Some blocks are allocated at CRT startup and freed by the CRT (like the + environment storage). + c. And some blocks are allocated by the CRT and then reallocated + while patched, and finally freed after unpatching! This + happens with the `atexit` functions for example to grow the array + of registered functions. + +In principle situation 2 is hopeless: since we cannot patch before CRT initialization, +we can never be sure how to free or reallocate a pointer during CRT unloading.
+However, in practice there is a good solution: when terminating, we just patch +the reallocation and free routines to no-ops -- we are winding down anyway! This leaves +just the reallocation problem of CRT alloc'd memory once we are patched. Here, a study of the +CRT reveals that there seem to be just three such situations: + +1. When registering `atexit` routines (to grow the exit function table), +2. When calling `_setmaxstdio` (to grow the file handle table), +3. and `_popen`/`_wpopen` (to grow handle pairs). These turn out not to be + a problem as they are NULL initialized. + +We fix these by providing wrappers: + +1. We first register a _global_ `atexit` routine ourselves (`mi_patches_atexit`) before patching, + and then patch the `_crt_atexit` function to implement our own global exit list (and the + same for `_crt_at_quick_exit`). All module local lists are no problem since they are always fully + (un)patched from initialization to end. We can register in the global list by dynamically + getting the global `_crt_atexit` entry from `ucrtbase.dll`. + +2. `_setmaxstdio` is _detoured_: we patch it by a stub that unpatches first, + calls the original routine and repatches again. + +That leaves us to reliably shut down and enter "termination mode": + +1. Using our trick to get the global exit list entry point, we register an exit function `mi_patches_atexit` + that first executes all our home-brew list of exit functions, and then enters a _termination_ + phase that patches realloc/free variants with no-ops. Patching later again with special no-ops for + `free` also improves efficiency during the program run since no flags need to be checked. + +2. That is not quite good enough yet: after the exit routines registered after ours on the + global exit list (by the CRT) have executed, + the OS starts to unwind the TLS callbacks, and we would like callbacks registered after our + DLL was loaded to still run in patched mode. So, we also allocate a TLS entry when our DLL is loaded and when its + callback is called, we re-enable the original patches again. Since TLS is destroyed in FIFO order + this runs any callbacks in later DLL's in patched mode. + +3. Finally the DLL's get unloaded by the OS in order (still patched) until our DLL gets unloaded + and then we start a termination phase again, and patch realloc/free with no-ops for good this time. + +*/
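A minimal sketch of the patching mechanism this comment describes, for x86-64 only and assuming the replacement lies within 2GB of the victim; `patch` and `saved` are illustrative names of our own, while the real logic lives in `mi_jump_write` and `mi_patch_apply` below:

#include <windows.h>
#include <stdbool.h>
#include <stdint.h>
#include <string.h>

#define JUMP_SIZE 5                        // E9 + rel32

static uint8_t saved[JUMP_SIZE];           // original prologue bytes, for unpatching

static bool patch(void* victim, void* replacement) {
  DWORD prot;
  // make the function prologue writable
  if (!VirtualProtect(victim, JUMP_SIZE, PAGE_EXECUTE_READWRITE, &prot)) return false;
  memcpy(saved, victim, JUMP_SIZE);        // save so we can restore later
  uint8_t* p = (uint8_t*)victim;
  p[0] = 0xE9;                             // jmp rel32 to the replacement
  *((int32_t*)&p[1]) = (int32_t)((uint8_t*)replacement - p - JUMP_SIZE);
  VirtualProtect(victim, JUMP_SIZE, prot, &prot);
  return true;
}

Unpatching is the reverse: copy `saved` back under the same `VirtualProtect` dance, which is exactly what `mi_jump_restore` does in the real code.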
+ +static int __cdecl mi_setmaxstdio(int newmax); + +// ------------------------------------------------------ +// Microsoft allocation extensions +// ------------------------------------------------------ + + +typedef size_t mi_nothrow_t; + +static void mi_free_nothrow(void* p, mi_nothrow_t tag) { + UNUSED(tag); + mi_free(p); +} + +// Versions of `free`, `realloc`, `recalloc`, `expand` and `msize` +// that are used during termination and are no-ops. +static void mi_free_term(void* p) { + UNUSED(p); +} + +static void mi_free_size_term(void* p, size_t size) { + UNUSED(size); + UNUSED(p); +} + +static void mi_free_nothrow_term(void* p, mi_nothrow_t tag) { + UNUSED(tag); + UNUSED(p); +} + +static void* mi_realloc_term(void* p, size_t newsize) { + UNUSED(p); UNUSED(newsize); + return NULL; +} + +static void* mi__recalloc_term(void* p, size_t newcount, size_t newsize) { + UNUSED(p); UNUSED(newcount); UNUSED(newsize); + return NULL; +} + +static void* mi__expand_term(void* p, size_t newsize) { + UNUSED(p); UNUSED(newsize); + return NULL; +} + +static size_t mi__msize_term(void* p) { + UNUSED(p); + return 0; +} + + +static void* mi__malloc_dbg(size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return _malloc_base(size); +} + +static void* mi__calloc_dbg(size_t count, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return _calloc_base(count, size); +} + +static void* mi__realloc_dbg(void* p, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return _realloc_base(p, size); +} + +static void mi__free_dbg(void* p, int block_type) { + UNUSED(block_type); + _free_base(p); +} + + +// the `recalloc`,`expand`, and `msize` don't have base versions and thus need a separate term version + +static void* mi__recalloc_dbg(void* p, size_t count, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return mi_recalloc(p, count, size); +} + +static void* mi__expand_dbg(void* p, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return mi__expand(p, size); +} + +static size_t mi__msize_dbg(void* p, int block_type) { + UNUSED(block_type); + return mi_usable_size(p); +} + +static void* mi__recalloc_dbg_term(void* p, size_t count, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return mi__recalloc_term(p, count, size); +} + +static void* mi__expand_dbg_term(void* p, size_t size, int block_type, const char* fname, int line) { + UNUSED(block_type); UNUSED(fname); UNUSED(line); + return mi__expand_term(p, size); +} + +static size_t mi__msize_dbg_term(void* p, int block_type) { + UNUSED(block_type); + return mi__msize_term(p); +} + + +// ------------------------------------------------------ +// implement our own global atexit handler +// ------------------------------------------------------ +typedef void (cbfun_t)(void); +typedef int (atexit_fun_t)(cbfun_t* fn); +typedef uintptr_t encoded_t; + +typedef struct exit_list_s { + encoded_t functions; // encoded pointer to array of encoded function pointers + size_t count; + size_t capacity; +} exit_list_t; + +#define MI_EXIT_INC (64) + +static exit_list_t atexit_list = { 0, 0, 0 }; +static exit_list_t at_quick_exit_list = { 0, 0, 0 }; +static CRITICAL_SECTION atexit_lock; + +// encode/decode function pointers with a random canary for security +static encoded_t canary; + +static inline void *decode(encoded_t x) { + return (void*)(x^canary); +} + +static inline encoded_t encode(void* p) { + return ((uintptr_t)p ^ canary); +} + + +static void init_canary() +{ + canary = _mi_random_init(0); + atexit_list.functions = at_quick_exit_list.functions = encode(NULL); +} + + +// initialize the list +static void mi_initialize_atexit(void) {
InitializeCriticalSection(&atexit_lock); + init_canary(); +} + +// register an exit function +static int mi_register_atexit(exit_list_t* list, cbfun_t* fn) { + if (fn == NULL) return EINVAL; + EnterCriticalSection(&atexit_lock); + encoded_t* functions = (encoded_t*)decode(list->functions); + if (list->count >= list->capacity) { // at first `functions == decode(0) == NULL` + encoded_t* newf = (encoded_t*)mi_recalloc(functions, list->capacity + MI_EXIT_INC, sizeof(cbfun_t*)); + if (newf != NULL) { + list->capacity += MI_EXIT_INC; + list->functions = encode(newf); + functions = newf; + } + } + int result; + if (list->count < list->capacity && functions != NULL) { + functions[list->count] = encode(fn); + list->count++; + result = 0; // success + } + else { + result = ENOMEM; + } + LeaveCriticalSection(&atexit_lock); + return result; +} + +// Register a global `atexit` function +static int mi_atexit(cbfun_t* fn) { + return mi_register_atexit(&atexit_list,fn); +} + +static int mi_at_quick_exit(cbfun_t* fn) { + return mi_register_atexit(&at_quick_exit_list,fn); +} + +static int mi_register_onexit(void* table, cbfun_t* fn) { + // TODO: how can we distinguish a quick_exit from atexit? + return mi_atexit(fn); +} + +// Execute exit functions in a list +static void mi_execute_exit_list(exit_list_t* list) { + // copy and zero the list structure + EnterCriticalSection(&atexit_lock); + exit_list_t clist = *list; + memset(list,0,sizeof(*list)); + LeaveCriticalSection(&atexit_lock); + + // now execute the functions outside of the lock + encoded_t* functions = (encoded_t*)decode(clist.functions); + if (functions != NULL) { + for (size_t i = clist.count; i > 0; i--) { // careful with unsigned count down.. + cbfun_t* fn = (cbfun_t*)decode(functions[i-1]); + if (fn==NULL) break; // corrupted! 
+ fn(); + } + mi_free(functions); + } +} + + + +// ------------------------------------------------------ +// Jump assembly instructions for patches +// ------------------------------------------------------ + +#if defined(_M_IX86) || defined(_M_X64) + +#define MI_JUMP_SIZE 14 // at most 2+4+8 for a long jump or 1+5 for a short one + +typedef struct mi_jump_s { + uint8_t opcodes[MI_JUMP_SIZE]; +} mi_jump_t; + +void mi_jump_restore(void* current, const mi_jump_t* saved) { + memcpy(current, &saved->opcodes, MI_JUMP_SIZE); +} + +void mi_jump_write(void* current, void* target, mi_jump_t* save) { + if (save != NULL) { + memcpy(&save->opcodes, current, MI_JUMP_SIZE); + } + uint8_t* opcodes = ((mi_jump_t*)current)->opcodes; + ptrdiff_t diff = (uint8_t*)target - (uint8_t*)current; + uint32_t ofs32 = (uint32_t)diff; + #ifdef _M_X64 + uint64_t ofs64 = (uint64_t)diff; + if (ofs64 != (uint64_t)ofs32) { + // use long jump + opcodes[0] = 0xFF; + opcodes[1] = 0x25; + *((uint32_t*)&opcodes[2]) = 0; + *((uint64_t*)&opcodes[6]) = (uint64_t)target; + } + else + #endif + { + // use short jump + opcodes[0] = 0xE9; + *((uint32_t*)&opcodes[1]) = ofs32 - 5 /* size of the short jump instruction */; + } +} + +#elif defined(_M_ARM64) + +#define MI_JUMP_SIZE 16 + +typedef struct mi_jump_s { + uint8_t opcodes[MI_JUMP_SIZE]; +} mi_jump_t; + +void mi_jump_restore(void* current, const mi_jump_t* saved) { + memcpy(current, &saved->opcodes, MI_JUMP_SIZE); +} + +void mi_jump_write(void* current, void* target, mi_jump_t* save) { + if (save != NULL) { + memcpy(&save->opcodes, current, MI_JUMP_SIZE); + } + uint8_t* opcodes = ((mi_jump_t*)current)->opcodes; + uint64_t diff = (uint8_t*)target - (uint8_t*)current; + + // 0x50 0x00 0x00 0x58 ldr x16, .+8 # load PC relative +8 + // 0x00 0x02 0x3F 0xD6 blr x16 # and jump + //
+ //
+ static const uint8_t jump_opcodes[8] = { 0x50, 0x00, 0x00, 0x58, 0x00, 0x02, 0x3F, 0xD6 }; + memcpy(&opcodes[0], jump_opcodes, sizeof(jump_opcodes)); + *((uint64_t*)&opcodes[8]) = (uint64_t)target; +} + +#else +#error "define jump instructions for this platform" +#endif + + +// ------------------------------------------------------ +// Patches +// ------------------------------------------------------ +typedef enum patch_apply_e { + PATCH_NONE, + PATCH_TARGET, + PATCH_TARGET_TERM +} patch_apply_t; + +#define MAX_ENTRIES 4 // maximum number of patched entry points (like `malloc` in ucrtbase and msvcrt) + +typedef struct mi_patch_s { + const char* name; // name of the function to patch + void* target; // the address of the new target (never NULL) + void* target_term; // the address of the target during termination (or NULL) + patch_apply_t applied; // what target has been applied? + void* originals[MAX_ENTRIES]; // the resolved addresses of the function (or NULLs) + mi_jump_t saves[MAX_ENTRIES]; // the saved instructions in case it was applied +} mi_patch_t; + +#define MI_PATCH_NAME3(name,target,term) { name, &target, &term, PATCH_NONE, {NULL,NULL,NULL,NULL} } +#define MI_PATCH_NAME2(name,target) { name, &target, NULL, PATCH_NONE, {NULL,NULL,NULL,NULL} } +#define MI_PATCH3(name,target,term) MI_PATCH_NAME3(#name, target, term) +#define MI_PATCH2(name,target) MI_PATCH_NAME2(#name, target) +#define MI_PATCH1(name) MI_PATCH2(name,mi_##name) + +static mi_patch_t patches[] = { + // we implement our own global exit handler (as the CRT versions do a realloc internally) + //MI_PATCH2(_crt_atexit, mi_atexit), + //MI_PATCH2(_crt_at_quick_exit, mi_at_quick_exit), + MI_PATCH2(_setmaxstdio, mi_setmaxstdio), + MI_PATCH2(_register_onexit_function, mi_register_onexit), + + // override higher level atexit functions so we can implement at_quick_exit correctly + MI_PATCH2(atexit, mi_atexit), + MI_PATCH2(at_quick_exit, mi_at_quick_exit), + + // regular entries + MI_PATCH2(malloc, mi_malloc), + MI_PATCH2(calloc, mi_calloc), + MI_PATCH3(realloc, mi_realloc,mi_realloc_term), + MI_PATCH3(free, mi_free,mi_free_term), + + // extended api + MI_PATCH2(_strdup, mi_strdup), + MI_PATCH2(_strndup, mi_strndup), + MI_PATCH3(_expand, mi__expand,mi__expand_term), + MI_PATCH3(_recalloc, mi_recalloc,mi__recalloc_term), + MI_PATCH3(_msize, mi_usable_size,mi__msize_term), + + // base versions + MI_PATCH2(_malloc_base, mi_malloc), + MI_PATCH2(_calloc_base, mi_calloc), + MI_PATCH3(_realloc_base, mi_realloc,mi_realloc_term), + MI_PATCH3(_free_base, mi_free,mi_free_term), + + // these base versions are in the crt but without import records + MI_PATCH_NAME3("_recalloc_base", mi_recalloc,mi__recalloc_term), + MI_PATCH_NAME3("_msize_base", mi_usable_size,mi__msize_term), + + // debug + MI_PATCH2(_malloc_dbg, mi__malloc_dbg), + MI_PATCH2(_realloc_dbg, mi__realloc_dbg), + MI_PATCH2(_calloc_dbg, mi__calloc_dbg), + MI_PATCH2(_free_dbg, mi__free_dbg), + + MI_PATCH3(_expand_dbg, mi__expand_dbg, mi__expand_dbg_term), + MI_PATCH3(_recalloc_dbg, mi__recalloc_dbg, mi__recalloc_dbg_term), + MI_PATCH3(_msize_dbg, mi__msize_dbg, mi__msize_dbg_term), + +#if 0 + // override new/delete variants for efficiency (?)
+#ifdef _WIN64 + // 64 bit new/delete + MI_PATCH_NAME2("??2@YAPEAX_K@Z", mi_new), + MI_PATCH_NAME2("??_U@YAPEAX_K@Z", mi_new), + MI_PATCH_NAME3("??3@YAXPEAX@Z", mi_free, mi_free_term), + MI_PATCH_NAME3("??_V@YAXPEAX@Z", mi_free, mi_free_term), + MI_PATCH_NAME3("??3@YAXPEAX_K@Z", mi_free_size, mi_free_size_term), // delete sized + MI_PATCH_NAME3("??_V@YAXPEAX_K@Z", mi_free_size, mi_free_size_term), // delete sized + MI_PATCH_NAME2("??2@YAPEAX_KAEBUnothrow_t@std@@@Z", mi_new), + MI_PATCH_NAME2("??_U@YAPEAX_KAEBUnothrow_t@std@@@Z", mi_new), + MI_PATCH_NAME3("??3@YAXPEAXAEBUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term), + MI_PATCH_NAME3("??_V@YAXPEAXAEBUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term), + + +#else + // 32 bit new/delete + MI_PATCH_NAME2("??2@YAPAXI@Z", mi_new), + MI_PATCH_NAME2("??_U@YAPAXI@Z", mi_new), + MI_PATCH_NAME3("??3@YAXPAX@Z", mi_free, mi_free_term), + MI_PATCH_NAME3("??_V@YAXPAX@Z", mi_free, mi_free_term), + MI_PATCH_NAME3("??3@YAXPAXI@Z", mi_free_size, mi_free_size_term), // delete sized + MI_PATCH_NAME3("??_V@YAXPAXI@Z", mi_free_size, mi_free_size_term), // delete sized + + MI_PATCH_NAME2("??2@YAPAXIABUnothrow_t@std@@@Z", mi_new), + MI_PATCH_NAME2("??_U@YAPAXIABUnothrow_t@std@@@Z", mi_new), + MI_PATCH_NAME3("??3@YAXPAXABUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term), + MI_PATCH_NAME3("??_V@YAXPAXABUnothrow_t@std@@@Z", mi_free_nothrow, mi_free_nothrow_term), + +#endif +#endif + { NULL, NULL, NULL, PATCH_NONE, {NULL,NULL,NULL,NULL} } +}; + + +// Apply a patch +static bool mi_patch_apply(mi_patch_t* patch, patch_apply_t apply) +{ + if (patch->originals[0] == NULL) return true; // unresolved + if (apply == PATCH_TARGET_TERM && patch->target_term == NULL) apply = PATCH_TARGET; // avoid re-applying non-term variants + if (patch->applied == apply) return false; + + for (int i = 0; i < MAX_ENTRIES; i++) { + void* original = patch->originals[i]; + if (original == NULL) break; // no more + + DWORD protect = PAGE_READWRITE; + if (!VirtualProtect(original, MI_JUMP_SIZE, PAGE_EXECUTE_READWRITE, &protect)) return false; + if (apply == PATCH_NONE) { + mi_jump_restore(original, &patch->saves[i]); + } + else { + void* target = (apply == PATCH_TARGET ? patch->target : patch->target_term); + mi_assert_internal(target != NULL); + if (target != NULL) mi_jump_write(original, target, &patch->saves[i]); + } + VirtualProtect(original, MI_JUMP_SIZE, protect, &protect); + } + patch->applied = apply; + return true; +} + +// Apply all patches +static bool _mi_patches_apply(patch_apply_t apply, patch_apply_t* previous) { + static patch_apply_t current = PATCH_NONE; + if (previous != NULL) *previous = current; + if (current == apply) return true; + current = apply; + bool ok = true; + for (size_t i = 0; patches[i].name != NULL; i++) { + if (!mi_patch_apply(&patches[i], apply)) ok = false; + } + return ok; +} + +// Export the following three functions just in case +// a user needs that level of control. 
+ +// Disable all patches +mi_decl_export void mi_patches_disable(void) { + _mi_patches_apply(PATCH_NONE, NULL); +} + +// Enable all patches normally +mi_decl_export bool mi_patches_enable(void) { + return _mi_patches_apply( PATCH_TARGET, NULL ); +} + +// Enable all patches in termination phase where free is a no-op +mi_decl_export bool mi_patches_enable_term(void) { + return _mi_patches_apply(PATCH_TARGET_TERM, NULL); +} + +// ------------------------------------------------------ +// Stub for _setmaxstdio +// ------------------------------------------------------ + +static int __cdecl mi_setmaxstdio(int newmax) { + patch_apply_t previous; + _mi_patches_apply(PATCH_NONE, &previous); // disable patches + int result = _setmaxstdio(newmax); // call original function (that calls original CRT recalloc) + _mi_patches_apply(previous,NULL); // and re-enable patches + return result; +} + + +// ------------------------------------------------------ +// Resolve addresses dynamically +// ------------------------------------------------------ + +// Try to resolve patches for a given module (DLL) +static void mi_module_resolve(const char* fname, HMODULE mod, int priority) { + // see if any patches apply + for (size_t i = 0; patches[i].name != NULL; i++) { + mi_patch_t* patch = &patches[i]; + if (patch->applied == PATCH_NONE) { + // find an available entry + int i = 0; + while (i < MAX_ENTRIES && patch->originals[i] != NULL) i++; + if (i < MAX_ENTRIES) { + void* addr = GetProcAddress(mod, patch->name); + if (addr != NULL) { + // found it! set the address + patch->originals[i] = addr; + _mi_trace_message(" found %s at %s!%p (entry %i)\n", patch->name, fname, addr, i); + } + } + } + } +} + +#define MIMALLOC_NAME "mimalloc-override.dll" +#define UCRTBASE_NAME "ucrtbase.dll" +#define UCRTBASED_NAME "ucrtbased.dll" + +// Resolve addresses of all patches by inspecting the loaded modules +static atexit_fun_t* crt_atexit = NULL; +static atexit_fun_t* crt_at_quick_exit = NULL; + + +static bool mi_patches_resolve(void) { + // get all loaded modules + HANDLE process = GetCurrentProcess(); // always -1, no need to release + DWORD needed = 0; + HMODULE modules[400]; // try to stay under 4k to not trigger the guard page + EnumProcessModules(process, modules, sizeof(modules), &needed); + if (needed == 0) return false; + int count = needed / sizeof(HMODULE); + int ucrtbase_index = 0; + int mimalloc_index = 0; + // iterate through the loaded modules + for (int i = 0; i < count; i++) { + HMODULE mod = modules[i]; + char filename[MAX_PATH] = { 0 }; + DWORD slen = GetModuleFileName(mod, filename, MAX_PATH); + if (slen > 0 && slen < MAX_PATH) { + // filter out potential crt modules only + filename[slen] = 0; + const char* lastsep = strrchr(filename, '\\'); + const char* basename = (lastsep==NULL ? 
filename : lastsep+1); + _mi_trace_message(" %i: dynamic module %s\n", i, filename); + + // remember indices so we can check load order (in debug mode) + if (_stricmp(basename, MIMALLOC_NAME) == 0) mimalloc_index = i; + if (_stricmp(basename, UCRTBASE_NAME) == 0) ucrtbase_index = i; + if (_stricmp(basename, UCRTBASED_NAME) == 0) ucrtbase_index = i; + + // see if we potentially patch in this module + int priority = 0; + if (i == 0) priority = 2; // main module to allow static crt linking + else if (_strnicmp(basename, "ucrt", 4) == 0) priority = 3; // new ucrtbase.dll in windows 10 + // NOTE: don't override msvcr -- leads to crashes in setlocale (needs more testing) + // else if (_strnicmp(basename, "msvcr", 5) == 0) priority = 1; // older runtimes + + if (priority > 0) { + // probably found a crt module, try to patch it + mi_module_resolve(basename,mod,priority); + + // try to find the atexit functions for the main process (in `ucrtbase.dll`) + if (crt_atexit==NULL) crt_atexit = (atexit_fun_t*)GetProcAddress(mod, "_crt_atexit"); + if (crt_at_quick_exit == NULL) crt_at_quick_exit = (atexit_fun_t*)GetProcAddress(mod, "_crt_at_quick_exit"); + } + } + } + int diff = mimalloc_index - ucrtbase_index; + if (diff > 1) { + _mi_warning_message("warning: the \"mimalloc-override\" DLL seems not to load before or right after the C runtime (\"ucrtbase\").\n" + " Try to fix this by changing the linking order.\n"); + } + return true; +} + + +// ------------------------------------------------------ +// Dll Entry +// ------------------------------------------------------ + +extern BOOL WINAPI _DllMainCRTStartup(HINSTANCE inst, DWORD reason, LPVOID reserved); + +static DWORD mi_fls_unwind_entry; +static void NTAPI mi_fls_unwind(PVOID value) { + if (value != NULL) mi_patches_enable(); // and re-enable normal patches again for DLL's loaded after us + return; +} + +static void mi_patches_atexit(void) { + mi_execute_exit_list(&atexit_list); + mi_patches_enable_term(); // enter termination phase and patch realloc/free with a no-op +} + +static void mi_patches_at_quick_exit(void) { + mi_execute_exit_list(&at_quick_exit_list); + mi_patches_enable_term(); // enter termination phase and patch realloc/free with a no-op +} + +__declspec(dllexport) BOOL WINAPI DllEntry(HINSTANCE inst, DWORD reason, LPVOID reserved) { + if (reason == DLL_PROCESS_ATTACH) { + __security_init_cookie(); + } + else if (reason == DLL_PROCESS_DETACH) { + // enter termination phase for good now + mi_patches_enable_term(); + } + // C runtime main + BOOL ok = _DllMainCRTStartup(inst, reason, reserved); + if (reason == DLL_PROCESS_ATTACH && ok) { + // initialize at exit lists + mi_initialize_atexit(); + + // Now resolve patches + ok = mi_patches_resolve(); + if (ok) { + // check if patching is not disabled + #pragma warning(suppress:4996) + const char* s = getenv("MIMALLOC_DISABLE_OVERRIDE"); + bool enabled = (s == NULL || !(strstr("1;TRUE;YES;ON", s) != NULL)); + if (!enabled) { + _mi_verbose_message("override is disabled\n"); + } + else { + // and register our unwind entry (this must be after resolving due to possible delayed DLL initialization from GetProcAddress) + mi_fls_unwind_entry = FlsAlloc(&mi_fls_unwind); + if (mi_fls_unwind_entry != FLS_OUT_OF_INDEXES) { + FlsSetValue(mi_fls_unwind_entry, (void*)1); + } + + // register our patch disabler in the global exit list + if (crt_atexit != NULL) (*crt_atexit)(&mi_patches_atexit); + if (crt_at_quick_exit != NULL) (*crt_at_quick_exit)(&mi_patches_at_quick_exit); + + // and patch ! 
this also redirects the `atexit` handling for the global exit list + mi_patches_enable(); + _mi_verbose_message("override is enabled\n"); + + // hide internal allocation + mi_stats_reset(); + } + } + } + return ok; +} diff --git a/src/alloc-override.c b/src/alloc-override.c index 345d396c..e5eeaab2 100644 --- a/src/alloc-override.c +++ b/src/alloc-override.c @@ -9,7 +9,7 @@ terms of the MIT license. A copy of the license can be found in the file #error "this file should be included from 'alloc.c' (so aliases can work)" #endif -#if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL) && defined(_WIN64)) +#if defined(MI_MALLOC_OVERRIDE) && defined(_WIN32) && !(defined(MI_SHARED_LIB) && defined(_DLL)) #error "It is only possible to override "malloc" on Windows when building as a DLL (and linking the C runtime as a DLL)" #endif diff --git a/src/alloc-posix.c b/src/alloc-posix.c index 1f55b3a8..672b73b3 100644 --- a/src/alloc-posix.c +++ b/src/alloc-posix.c @@ -38,7 +38,9 @@ size_t mi_malloc_usable_size(const void *p) mi_attr_noexcept { } void mi_cfree(void* p) mi_attr_noexcept { - mi_free(p); + if (mi_is_in_heap_region(p)) { + mi_free(p); + } } int mi_posix_memalign(void** p, size_t alignment, size_t size) mi_attr_noexcept { diff --git a/src/alloc.c b/src/alloc.c index 6a91c0ad..bfb37d19 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -57,6 +57,7 @@ extern inline void* mi_malloc_small(size_t size) mi_attr_noexcept { return mi_heap_malloc_small(mi_get_default_heap(), size); } + // zero initialized small block void* mi_zalloc_small(size_t size) mi_attr_noexcept { void* p = mi_malloc_small(size); @@ -71,7 +72,7 @@ extern inline void* mi_heap_malloc(mi_heap_t* heap, size_t size) mi_attr_noexcep void* p; if (mi_likely(size <= MI_SMALL_SIZE_MAX)) { p = mi_heap_malloc_small(heap, size); - } + } else { p = _mi_malloc_generic(heap, size); } @@ -223,8 +224,7 @@ void mi_free(void* p) mi_attr_noexcept return; } #endif - - bool local = (_mi_thread_id() == segment->thread_id); // preload, note: putting the thread_id in the page->flags does not improve performance + mi_page_t* page = _mi_segment_page_of(segment, p); #if (MI_STAT>1) @@ -236,24 +236,18 @@ void mi_free(void* p) mi_attr_noexcept // huge page stat is accounted for in `_mi_page_retire` #endif - // adjust if it might be an un-aligned block - if (mi_likely(page->flags.value==0)) { // not full or aligned + uintptr_t tid = _mi_thread_id(); + if (mi_likely(tid == page->flags.value)) { + // local, and not full or aligned mi_block_t* block = (mi_block_t*)p; - if (mi_likely(local)) { // note: merging both tests (local | value) does not matter for performance - // owning thread can free a block directly - mi_block_set_next(page, block, page->local_free); // note: moving this write earlier does not matter for performance - page->local_free = block; - page->used--; - if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); } - } - else { - // use atomic operations for a multi-threaded free - _mi_free_block_mt(page, block); - } + mi_block_set_next(page, block, page->local_free); + page->local_free = block; + page->used--; + if (mi_unlikely(mi_page_all_free(page))) { _mi_page_retire(page); } } else { - // aligned blocks, or a full page; use the more generic path - mi_free_generic(segment, page, local, p); + // non-local, aligned blocks, or a full page; use the more generic path - mi_free_generic(segment, page, tid == mi_page_thread_id(page), p); } }
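The rewritten `mi_free` above rules out three unlikely cases (non-local free, a full page, aligned blocks) with a single comparison, because the owning thread id fills the upper bits of `page->flags.value` while `in_full` and `has_aligned` occupy the two low bits. A minimal standalone model of that check -- `page_t`, `page_init_flags`, and `free_is_fast` are our stand-in names, and we assume thread ids have their low two bits zero, which holds since mimalloc derives them from aligned addresses:

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

#define FLAG_IN_FULL      ((uintptr_t)1)   // low bit 0
#define FLAG_HAS_ALIGNED  ((uintptr_t)2)   // low bit 1

typedef struct page_s {
  uintptr_t flags;  // thread id in the upper bits, two flag bits at the bottom
} page_t;

static void page_init_flags(page_t* page, uintptr_t tid) {
  page->flags = tid;                       // both flags start out clear
}

static bool free_is_fast(const page_t* page, uintptr_t tid) {
  return (page->flags == tid);             // local AND not full AND no aligned blocks
}

int main(void) {
  page_t page;
  page_init_flags(&page, 0x1000);
  assert(free_is_fast(&page, 0x1000));     // owner thread, no flags: fast path
  assert(!free_is_fast(&page, 0x2000));    // non-local free: generic path
  page.flags |= FLAG_IN_FULL;
  assert(!free_is_fast(&page, 0x1000));    // full page: generic path
  (void)FLAG_HAS_ALIGNED;                  // the aligned flag works the same way
  return 0;
}

Setting either flag, or freeing from another thread, makes the equality fail and sends the free down `mi_free_generic`, which is exactly the behavior of the single `mi_likely` test above.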
diff --git a/src/init.c b/src/init.c index 44e3c9cb..f807d74a 100644 --- a/src/init.c +++ b/src/init.c @@ -12,15 +12,16 @@ terms of the MIT license. A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, false, false, false, {0}, 0, 0, - NULL, 0, // free, used + 0, false, false, false, 0, 0, + NULL, // free #if MI_SECURE 0, #endif + 0, {0}, // used, flags NULL, 0, 0, 0, NULL, NULL, NULL - #if (MI_INTPTR_SIZE==4) - , { NULL } + #if (MI_INTPTR_SIZE==8 && MI_SECURE==0) + , { NULL } #endif }; @@ -33,22 +34,23 @@ const mi_page_t _mi_page_empty = { #define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) } #define MI_PAGE_QUEUES_EMPTY \ { QNULL(1), \ - QNULL(1), QNULL(2), QNULL(3), QNULL(4), QNULL(5), QNULL(6), QNULL(7), QNULL(8), \ - QNULL(10), QNULL(12), QNULL(14), QNULL(16), QNULL(20), QNULL(24), QNULL(28), QNULL(32), \ - QNULL(40), QNULL(48), QNULL(56), QNULL(64), QNULL(80), QNULL(96), QNULL(112), QNULL(128), \ - QNULL(160), QNULL(192), QNULL(224), QNULL(256), QNULL(320), QNULL(384), QNULL(448), QNULL(512), \ - QNULL(640), QNULL(768), QNULL(896), QNULL(1024), QNULL(1280), QNULL(1536), QNULL(1792), QNULL(2048), \ - QNULL(2560), QNULL(3072), QNULL(3584), QNULL(4096), QNULL(5120), QNULL(6144), QNULL(7168), QNULL(8192), \ - QNULL(10240), QNULL(12288), QNULL(14336), QNULL(16384), QNULL(20480), QNULL(24576), QNULL(28672), QNULL(32768), \ - QNULL(40960), QNULL(49152), QNULL(57344), QNULL(65536), QNULL(81920), QNULL(98304), QNULL(114688), \ - QNULL(MI_LARGE_WSIZE_MAX + 1 /*131072, Huge queue */), \ + QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \ + QNULL( 10), QNULL( 12), QNULL( 14), QNULL( 16), QNULL( 20), QNULL( 24), QNULL( 28), QNULL( 32), /* 16 */ \ + QNULL( 40), QNULL( 48), QNULL( 56), QNULL( 64), QNULL( 80), QNULL( 96), QNULL( 112), QNULL( 128), /* 24 */ \ + QNULL( 160), QNULL( 192), QNULL( 224), QNULL( 256), QNULL( 320), QNULL( 384), QNULL( 448), QNULL( 512), /* 32 */ \ + QNULL( 640), QNULL( 768), QNULL( 896), QNULL( 1024), QNULL( 1280), QNULL( 1536), QNULL( 1792), QNULL( 2048), /* 40 */ \ + QNULL( 2560), QNULL( 3072), QNULL( 3584), QNULL( 4096), QNULL( 5120), QNULL( 6144), QNULL( 7168), QNULL( 8192), /* 48 */ \ + QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ + QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ + QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), /* 69 */ \ + QNULL(MI_LARGE_WSIZE_MAX + 1 /* 393216, Huge queue */), \ QNULL(MI_LARGE_WSIZE_MAX + 2) /* Full queue */ } #define MI_STAT_COUNT_NULL() {0,0,0,0} // Empty statistics #if MI_STAT>1 -#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT64(MI_STAT_COUNT_NULL) } +#define MI_STAT_COUNT_END_NULL() , { MI_STAT_COUNT_NULL(), MI_INIT32(MI_STAT_COUNT_NULL) } #else #define MI_STAT_COUNT_END_NULL() #endif @@ -61,7 +63,8 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ + MI_STAT_COUNT_NULL(), \ + { 0, 0 }, \ { 0, 0 } \ MI_STAT_COUNT_END_NULL() @@ -95,8 +98,8 @@ static mi_tld_t tld_main = { 0, &_mi_heap_main, { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments - { 0, NULL, NULL, 0, tld_main_stats }, // os - { MI_STATS_NULL } // stats +
{ 0, NULL, NULL, 0, tld_main_stats }, // os + { MI_STATS_NULL } // stats }; mi_heap_t _mi_heap_main = { @@ -385,7 +388,7 @@ bool _mi_preloading() { } // Communicate with the redirection module on Windows -#if defined(_WIN32) && defined(MI_SHARED_LIB) +#if 0 #ifdef __cplusplus extern "C" { #endif diff --git a/src/memory.c b/src/memory.c index e7d1887e..7f8cfb14 100644 --- a/src/memory.c +++ b/src/memory.c @@ -106,6 +106,7 @@ static size_t mi_good_commit_size(size_t size) { // Return if a pointer points into a region reserved by us. bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + if (p==NULL) return false; size_t count = mi_atomic_read(&regions_count); for (size_t i = 0; i < count; i++) { uint8_t* start = (uint8_t*)mi_atomic_read_ptr(&regions[i].start); diff --git a/src/page-queue.c b/src/page-queue.c index fd388113..a386f8a1 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -97,7 +97,7 @@ uint8_t _mi_bsr(uintptr_t x) { // Returns MI_BIN_HUGE if the size is too large. // We use `wsize` for the size in "machine word sizes", // i.e. byte size == `wsize*sizeof(void*)`. -inline uint8_t _mi_bin(size_t size) { +extern inline uint8_t _mi_bin(size_t size) { size_t wsize = _mi_wsize_from_size(size); uint8_t bin; if (wsize <= 1) { @@ -120,13 +120,13 @@ inline uint8_t _mi_bin(size_t size) { bin = MI_BIN_HUGE; } else { - #if defined(MI_ALIGN4W) + #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif wsize--; // find the highest bit uint8_t b = mi_bsr32((uint32_t)wsize); - // and use the top 3 bits to determine the bin (~16% worst internal fragmentation). + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we do not round the first 8 sizes // which each get an exact bin bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
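Since the size-class table in `init.c` above is generated by this computation, a standalone sketch may help; `bin_of` and `bsr32` are our stand-ins for `_mi_bin` and `mi_bsr32`, assuming 64-bit words and ignoring the `MI_ALIGN4W` case:

#include <stdint.h>
#include <stdio.h>

static uint8_t bsr32(uint32_t x) {        // index of the highest set bit
  uint8_t b = 0;
  while (x > 1) { x >>= 1; b++; }
  return b;
}

static uint8_t bin_of(size_t size) {
  size_t wsize = (size + 7) / 8;          // bytes -> machine words (64-bit)
  if (wsize <= 1) return 1;
  if (wsize <= 8) return (uint8_t)wsize;  // the first 8 sizes get an exact bin
  wsize--;
  uint8_t b = bsr32((uint32_t)wsize);
  // leading bit plus the next two select one of 4 sub-bins per power of two
  return ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
}

int main(void) {
  size_t sizes[] = { 8, 64, 72, 80, 96, 1024, 1040 };
  for (size_t i = 0; i < sizeof(sizes)/sizeof(*sizes); i++) {
    printf("size %5zu -> bin %u\n", sizes[i], bin_of(sizes[i]));
  }
  return 0;
}

For example, 72 and 80 bytes (9 and 10 words) both land in bin 9, the 10-word class from the queue table, while 1040 bytes rounds up to the 160-word class: each power-of-two range is split into 4 linear sub-bins, which is where the 12.5% spacing comes from.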
diff --git a/src/page.c b/src/page.c index 69d32bfe..e6be8df6 100644 --- a/src/page.c +++ b/src/page.c @@ -71,10 +71,11 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->block_size > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - + mi_segment_t* segment = _mi_page_segment(page); uint8_t* start = _mi_page_start(segment,page,NULL); mi_assert_internal(start == _mi_segment_page_start(segment,page,page->block_size,NULL)); + mi_assert_internal(segment->thread_id==0 || segment->thread_id == mi_page_thread_id(page)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -216,7 +217,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_page_t* page = _mi_segment_page_alloc(block_size, &heap->tld->segments, &heap->tld->os); if (page == NULL) return NULL; mi_page_init(heap, page, block_size, &heap->tld->stats); - mi_heap_stat_increase( heap, pages, 1); + _mi_stat_increase( &heap->tld->stats.pages, 1); mi_page_queue_push(heap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); return page; @@ -352,7 +353,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { // account for huge pages here if (page->block_size > MI_LARGE_SIZE_MAX) { - mi_heap_stat_decrease(page->heap, huge, page->block_size); + _mi_stat_decrease(&page->heap->tld->stats.huge, page->block_size); } // remove from the page list @@ -384,8 +385,9 @@ void _mi_page_retire(mi_page_t* page) { // is the only page left with free blocks. It is not clear // how to check this efficiently though... for now we just check // if its neighbours are almost fully used. - if (mi_likely(page->block_size <= MI_SMALL_SIZE_MAX)) { + if (mi_likely(page->block_size <= MI_MEDIUM_SIZE_MAX)) { if (mi_page_mostly_used(page->prev) && mi_page_mostly_used(page->next)) { + _mi_stat_counter_increase(&_mi_stats_main.page_no_retire,1); return; // don't retire after all } } @@ -404,7 +406,60 @@ void _mi_page_retire(mi_page_t* page) { #define MI_MAX_SLICES (1UL << MI_MAX_SLICE_SHIFT) #define MI_MIN_SLICES (2) -static void mi_page_free_list_extend( mi_heap_t* heap, mi_page_t* page, size_t extend, mi_stats_t* stats) +static void mi_page_free_list_extend_secure(mi_heap_t* heap, mi_page_t* page, size_t extend, mi_stats_t* stats) { + UNUSED(stats); + mi_assert_internal(page->free == NULL); + mi_assert_internal(page->local_free == NULL); + mi_assert_internal(page->capacity + extend <= page->reserved); + void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL); + size_t bsize = page->block_size; + + // initialize a randomized free list + // set up `slice_count` slices to alternate between + size_t shift = MI_MAX_SLICE_SHIFT; + while ((extend >> shift) == 0) { + shift--; + } + size_t slice_count = (size_t)1U << shift; + size_t slice_extend = extend / slice_count; + mi_assert_internal(slice_extend >= 1); + mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice + size_t counts[MI_MAX_SLICES]; // available objects in the slice + for (size_t i = 0; i < slice_count; i++) { + blocks[i] = mi_page_block_at(page, page_area, page->capacity + i*slice_extend); + counts[i] = slice_extend; + } + counts[slice_count-1] += (extend % slice_count); // final slice holds the modulus too (todo: distribute evenly?) + + // and initialize the free list by randomly threading through them + // set up first element + size_t current = _mi_heap_random(heap) % slice_count; + counts[current]--; + page->free = blocks[current]; + // and iterate through the rest + uintptr_t rnd = heap->random; + for (size_t i = 1; i < extend; i++) { + // call random_shuffle only every INTPTR_SIZE rounds + size_t round = i%MI_INTPTR_SIZE; + if (round == 0) rnd = _mi_random_shuffle(rnd); + // select a random next slice index + size_t next = ((rnd >> 8*round) & (slice_count-1)); + while (counts[next]==0) { // ensure it still has space + next++; + if (next==slice_count) next = 0; + } + // and link the current block to it + counts[next]--; + mi_block_t* block = blocks[current]; + blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block + mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next` + current = next; + } + mi_block_set_next(page, blocks[current], NULL); // end of the list + heap->random = _mi_random_shuffle(rnd); +}
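The loop above draws fresh random bits only once every `MI_INTPTR_SIZE` iterations, consuming 8 bits per round. A tiny standalone illustration of that byte-reuse trick, with xorshift standing in for `_mi_random_shuffle` and a power-of-two slice count so the mask works:

#include <stdint.h>
#include <stdio.h>

static uintptr_t xorshift(uintptr_t x) {   // stand-in RNG step
  x ^= x << 13; x ^= x >> 7; x ^= x << 17;
  return x;
}

int main(void) {
  uintptr_t rnd = 0x9e3779b97f4a7c15u;     // arbitrary seed
  size_t slice_count = 8;                  // must be a power of two
  for (size_t i = 1; i < 32; i++) {
    size_t round = i % sizeof(uintptr_t);  // 8 rounds per shuffle on 64-bit
    if (round == 0) rnd = xorshift(rnd);   // advance the RNG once per word
    size_t next = (rnd >> (8*round)) & (slice_count - 1);
    printf("%zu ", next);                  // the randomly chosen slice index
  }
  printf("\n");
  return 0;
}

Amortizing one RNG advance over eight selections keeps the randomized (secure) free-list initialization cheap relative to the sequential one.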
+ +static void mi_page_free_list_extend( mi_page_t* page, size_t extend, mi_stats_t* stats) { UNUSED(stats); mi_assert_internal(page->free == NULL); @@ -413,66 +468,17 @@ static void mi_page_free_list_extend( mi_heap_t* heap, mi_page_t* page, size_t e void* page_area = _mi_page_start(_mi_page_segment(page), page, NULL ); size_t bsize = page->block_size; mi_block_t* start = mi_page_block_at(page, page_area, page->capacity); - if (extend < MI_MIN_SLICES || !mi_option_is_enabled(mi_option_secure)) { - // initialize a sequential free list - mi_block_t* end = mi_page_block_at(page, page_area, page->capacity + extend - 1); - mi_block_t* block = start; - for (size_t i = 0; i < extend; i++) { - mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize); - mi_block_set_next(page,block,next); - block = next; - } - mi_block_set_next(page, end, NULL); - page->free = start; - } - else { - // initialize a randomized free list - // set up `slice_count` slices to alternate between - size_t shift = MI_MAX_SLICE_SHIFT; - while ((extend >> shift) == 0) { - shift--; - } - size_t slice_count = (size_t)1U << shift; - size_t slice_extend = extend / slice_count; - mi_assert_internal(slice_extend >= 1); - mi_block_t* blocks[MI_MAX_SLICES]; // current start of the slice - size_t counts[MI_MAX_SLICES]; // available objects in the slice - for (size_t i = 0; i < slice_count; i++) { - blocks[i] = mi_page_block_at(page, page_area, page->capacity + i*slice_extend); - counts[i] = slice_extend; - } - counts[slice_count-1] += (extend % slice_count); // final slice holds the modulus too (todo: distribute evenly?) - // and initialize the free list by randomly threading through them - // set up first element - size_t current = _mi_heap_random(heap) % slice_count; - counts[current]--; - page->free = blocks[current]; - // and iterate through the rest - uintptr_t rnd = heap->random; - for (size_t i = 1; i < extend; i++) { - // call random_shuffle only every INTPTR_SIZE rounds - size_t round = i%MI_INTPTR_SIZE; - if (round == 0) rnd = _mi_random_shuffle(rnd); - // select a random next slice index - size_t next = ((rnd >> 8*round) & (slice_count-1)); - while (counts[next]==0) { // ensure it still has space - next++; - if (next==slice_count) next = 0; - } - // and link the current block to it - counts[next]--; - mi_block_t* block = blocks[current]; - blocks[current] = (mi_block_t*)((uint8_t*)block + bsize); // bump to the following block - mi_block_set_next(page, block, blocks[next]); // and set next; note: we may have `current == next` - current = next; - } - mi_block_set_next( page, blocks[current], NULL); // end of the list - heap->random = _mi_random_shuffle(rnd); + // initialize a sequential free list + mi_block_t* last = mi_page_block_at(page, page_area, page->capacity + extend - 1); + mi_block_t* block = start; + while (block <= last) { + mi_block_t* next = (mi_block_t*)((uint8_t*)block + bsize); + mi_block_set_next(page,block,next); + block = next; } - // enable the new free list - page->capacity += (uint16_t)extend; - _mi_stat_increase(&stats->page_committed, extend * page->block_size); + mi_block_set_next(page, last, NULL); + page->free = start; } /* ----------------------------------------------------------- @@ -518,7 +524,15 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_stats_t* st mi_assert_internal(extend < (1UL<<16)); // and append the extend to the free list - mi_page_free_list_extend(heap, page, extend, stats ); + if (extend < MI_MIN_SLICES || !mi_option_is_enabled(mi_option_secure)) { + mi_page_free_list_extend(page, extend, stats ); + } + else { + mi_page_free_list_extend_secure(heap, page, extend, stats); + } + // enable the new free list + page->capacity += (uint16_t)extend; + _mi_stat_increase(&stats->page_committed, extend * page->block_size); mi_assert_expensive(mi_page_is_valid_init(page)); } @@ -688,7 +702,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) { if (page != NULL) { mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(page->block_size == block_size); - mi_heap_stat_increase( heap, huge, block_size); + _mi_stat_increase( &heap->tld->stats.huge, block_size); } return page; } @@ -708,10 +722,10 @@ void*
_mi_malloc_generic(mi_heap_t* heap, size_t size) mi_attr_noexcept // call potential deferred free routines _mi_deferred_free(heap, false); - + // free delayed frees from other threads _mi_heap_delayed_free(heap); - + // huge allocation? mi_page_t* page; if (mi_unlikely(size > MI_LARGE_SIZE_MAX)) { diff --git a/src/segment.c b/src/segment.c index 8f254a26..736345bf 100644 --- a/src/segment.c +++ b/src/segment.c @@ -226,6 +226,7 @@ static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) { static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) { + segment->thread_id = 0; mi_segments_track_size(-((long)segment_size),tld); if (mi_option_is_enabled(mi_option_secure)) { _mi_mem_unprotect(segment, segment->segment_size); // ensure no more guard pages are set @@ -235,8 +236,8 @@ static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_se // The thread local segment cache is limited to be at most 1/8 of the peak size of segments in use, -// and no more than 2. -#define MI_SEGMENT_CACHE_MAX (2) +// and no more than 4. +#define MI_SEGMENT_CACHE_MAX (4) #define MI_SEGMENT_CACHE_FRACTION (8) // note: returned segment may be partially reset @@ -248,17 +249,19 @@ static mi_segment_t* mi_segment_cache_pop(size_t segment_size, mi_segments_tld_t tld->cache = segment->next; segment->next = NULL; mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); + _mi_stat_decrease(&tld->stats->segments_cache, 1); return segment; } static bool mi_segment_cache_full(mi_segments_tld_t* tld) { - if (tld->cache_count < MI_SEGMENT_CACHE_MAX && - tld->cache_count < (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { // always allow 1 element cache + if (tld->cache_count < MI_SEGMENT_CACHE_MAX + && tld->cache_count < (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION)) + ) { // always allow 1 element cache return false; } // take the opportunity to reduce the segment cache if it is too large (now) // TODO: this never happens as we check against peak usage, should we use current usage instead? 
- while (tld->cache_count > (1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { + while (tld->cache_count > MI_SEGMENT_CACHE_MAX ) { //(1 + (tld->peak_count / MI_SEGMENT_CACHE_FRACTION))) { mi_segment_t* segment = mi_segment_cache_pop(0,tld); mi_assert_internal(segment != NULL); if (segment != NULL) mi_segment_os_free(segment, segment->segment_size, tld); @@ -269,7 +272,9 @@ static bool mi_segment_cache_full(mi_segments_tld_t* tld) { static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) { mi_assert_internal(!mi_segment_is_in_free_queue(segment, tld)); mi_assert_internal(segment->next == NULL); - if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) return false; + if (segment->segment_size != MI_SEGMENT_SIZE || mi_segment_cache_full(tld)) { + return false; + } mi_assert_internal(segment->segment_size == MI_SEGMENT_SIZE); if (mi_option_is_enabled(mi_option_cache_reset)) { _mi_mem_reset((uint8_t*)segment + segment->segment_info_size, segment->segment_size - segment->segment_info_size, tld->stats); @@ -277,6 +282,7 @@ static bool mi_segment_cache_push(mi_segment_t* segment, mi_segments_tld_t* tld) segment->next = tld->cache; tld->cache = segment; tld->cache_count++; + _mi_stat_increase(&tld->stats->segments_cache,1); return true; } @@ -407,8 +413,7 @@ static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t mi_assert_expensive(!mi_segment_queue_contains(&tld->medium_free, segment)); mi_assert(segment->next == NULL); mi_assert(segment->prev == NULL); - _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); - segment->thread_id = 0; + _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); // update reset memory statistics /* @@ -613,6 +618,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen } else { // otherwise reclaim it + mi_page_init_flags(page,segment->thread_id); _mi_page_reclaim(heap,page); } } @@ -643,6 +649,7 @@ static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tl mi_assert_internal(mi_segment_has_free(segment)); mi_page_t* page = mi_segment_find_free(segment, tld->stats); page->segment_in_use = true; + mi_page_init_flags(page,segment->thread_id); segment->used++; mi_assert_internal(segment->used <= segment->capacity); if (segment->used == segment->capacity) { @@ -682,6 +689,7 @@ static mi_page_t* mi_segment_large_page_alloc(mi_segments_tld_t* tld, mi_os_tld_ segment->used = 1; mi_page_t* page = &segment->pages[0]; page->segment_in_use = true; + mi_page_init_flags(page,segment->thread_id); return page; } @@ -693,22 +701,27 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld segment->used = 1; mi_page_t* page = &segment->pages[0]; page->segment_in_use = true; + mi_page_init_flags(page,segment->thread_id); return page; } /* ----------------------------------------------------------- Page allocation and free ----------------------------------------------------------- */ +static bool mi_is_good_fit(size_t bsize, size_t size) { + // good fit if no more than 25% wasted + return (bsize > 0 && size > 0 && bsize < size && (size % bsize) <= (size/4)); +} mi_page_t* _mi_segment_page_alloc(size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { mi_page_t* page; - if (block_size <= (MI_SMALL_PAGE_SIZE/4)) { + if (block_size <= MI_SMALL_SIZE_MAX || mi_is_good_fit(block_size,MI_SMALL_PAGE_SIZE)) { page = mi_segment_small_page_alloc(tld,os_tld); } - else if (block_size <=
(MI_MEDIUM_PAGE_SIZE/4)) { + else if (block_size <= MI_MEDIUM_SIZE_MAX || mi_is_good_fit(block_size, MI_MEDIUM_PAGE_SIZE)) { page = mi_segment_medium_page_alloc(tld, os_tld); } - else if (block_size < (MI_LARGE_SIZE_MAX - sizeof(mi_segment_t))) { + else if (block_size < MI_LARGE_SIZE_MAX || mi_is_good_fit(block_size, MI_LARGE_PAGE_SIZE - sizeof(mi_segment_t))) { page = mi_segment_large_page_alloc(tld, os_tld); } else { diff --git a/src/stats.c b/src/stats.c index 2b15bf9e..8725e48c 100644 --- a/src/stats.c +++ b/src/stats.c @@ -99,14 +99,14 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1); mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1); mi_stat_add(&stats->mmap_calls, &src->mmap_calls, 1); - mi_stat_add(&stats->mmap_ensure_aligned, &src->mmap_ensure_aligned, 1); - mi_stat_add(&stats->mmap_right_align, &src->mmap_right_align, 1); mi_stat_add(&stats->commit_calls, &src->commit_calls, 1); mi_stat_add(&stats->threads, &src->threads, 1); mi_stat_add(&stats->pages_extended, &src->pages_extended, 1); mi_stat_add(&stats->malloc, &src->malloc, 1); + mi_stat_add(&stats->segments_cache, &src->segments_cache, 1); mi_stat_add(&stats->huge, &src->huge, 1); + mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); mi_stat_counter_add(&stats->searches, &src->searches, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { @@ -172,10 +172,15 @@ static void mi_stat_print(const mi_stat_count_t* stat, const char* msg, int64_t } static void mi_stat_counter_print(const mi_stat_counter_t* stat, const char* msg, FILE* out ) { - double avg = (stat->count == 0 ? 0.0 : (double)stat->total / (double)stat->count); - _mi_fprintf(out,"%10s: %7.1f avg\n", msg, avg); + _mi_fprintf(out, "%10s:", msg); + mi_print_amount(stat->total, -1, out); + _mi_fprintf(out, "\n"); } +static void mi_stat_counter_print_avg(const mi_stat_counter_t* stat, const char* msg, FILE* out) { + double avg = (stat->count == 0 ? 0.0 : (double)stat->total / (double)stat->count); + _mi_fprintf(out, "%10s: %7.1f avg\n", msg, avg); +} static void mi_print_header( FILE* out ) { @@ -229,15 +234,15 @@ static void _mi_stats_print(mi_stats_t* stats, double secs, FILE* out) mi_attr_n mi_stat_print(&stats->page_committed, "touched", 1, out); mi_stat_print(&stats->segments, "segments", -1, out); mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out); + mi_stat_print(&stats->segments_cache, "-cached", -1, out); mi_stat_print(&stats->pages, "pages", -1, out); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out); mi_stat_print(&stats->pages_extended, "-extended", 0, out); + mi_stat_counter_print(&stats->page_no_retire, "-noretire", out); mi_stat_print(&stats->mmap_calls, "mmaps", 0, out); - mi_stat_print(&stats->mmap_right_align, "mmap fast", 0, out); - mi_stat_print(&stats->mmap_ensure_aligned, "mmap slow", 0, out); mi_stat_print(&stats->commit_calls, "commits", 0, out); mi_stat_print(&stats->threads, "threads", 0, out); - mi_stat_counter_print(&stats->searches, "searches", out); + mi_stat_counter_print_avg(&stats->searches, "searches", out); if (secs >= 0.0) _mi_fprintf(out, "%10s: %9.3f s\n", "elapsed", secs); diff --git a/test/main-override-static.c b/test/main-override-static.c index 6ddf4f37..94891cc3 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -6,6 +6,7 @@ #include <mimalloc.h> #include <mimalloc-override.h> // redefines malloc etc. + int main() { mi_version(); void* p1 = malloc(78);
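To close, a small demonstration of the `mi_is_good_fit` rule used in `_mi_segment_page_alloc` above, assuming the 64-bit defaults of 64KiB small and 512KiB medium pages; it exercises only the good-fit fallback, not the primary `*_SIZE_MAX` checks that run first:

#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>

#define SMALL_PAGE   (64*1024)
#define MEDIUM_PAGE  (512*1024)

static bool is_good_fit(size_t bsize, size_t size) {
  // good fit if no more than 25% of the page is wasted
  return (bsize > 0 && size > 0 && bsize < size && (size % bsize) <= (size/4));
}

int main(void) {
  size_t sizes[] = { 60*1024, 150*1024, 400*1024 };
  for (size_t i = 0; i < sizeof(sizes)/sizeof(*sizes); i++) {
    size_t b = sizes[i];
    const char* kind = is_good_fit(b, SMALL_PAGE)  ? "small"
                     : is_good_fit(b, MEDIUM_PAGE) ? "medium" : "large";
    printf("block %7zu -> %s page\n", b, kind);
  }
  return 0;
}

A 60KiB block fills a 64KiB small page almost completely (4KiB wasted), so it can stay in a small page even though it exceeds the small size class; blocks that would waste more than a quarter of the page fall through to the next larger page kind.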