diff --git a/CMakeLists.txt b/CMakeLists.txt index 2017a7aa..69a0cdc1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -50,7 +50,8 @@ set(mi_sources src/alloc-posix.c src/heap.c src/options.c - src/init.c) + src/init.c + src/prim/prim.c) set(mi_cflags "") set(mi_libraries "") diff --git a/ide/vs2022/mimalloc-override.vcxproj b/ide/vs2022/mimalloc-override.vcxproj index 54964d96..f4e2ab01 100644 --- a/ide/vs2022/mimalloc-override.vcxproj +++ b/ide/vs2022/mimalloc-override.vcxproj @@ -237,6 +237,7 @@ + diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index c7e820df..14bd3e69 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -149,8 +149,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea6} + + {abb5eae7-b3e6-432e-b636-333449892ea7} diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 9811aa55..aba9cf8b 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -225,6 +225,7 @@ + true diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index ecc006c9..57564b79 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -73,7 +73,10 @@ extern mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_cache_align const mi_page_t _mi_page_empty; bool _mi_is_main_thread(void); size_t _mi_current_thread_count(void); -bool _mi_preloading(void); // true while the C runtime is not ready +bool _mi_preloading(void); // true while the C runtime is not ready +mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; +mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap +void _mi_thread_done(mi_heap_t* heap); // os.c size_t _mi_os_page_size(void); @@ -93,6 +96,9 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats); void* _mi_os_alloc_aligned_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool* large, mi_stats_t* tld_stats); void _mi_os_free_aligned(void* p, size_t size, size_t alignment, size_t align_offset, bool was_committed, mi_stats_t* tld_stats); +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); +bool _mi_os_use_large_page(size_t size, size_t alignment); +size_t _mi_os_large_page_size(void); // arena.c void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool* commit, bool* large, bool* is_pinned, bool* is_zero, mi_arena_id_t req_arena_id, size_t* memid, mi_os_tld_t* tld); @@ -175,6 +181,15 @@ bool _mi_free_delayed_block(mi_block_t* block); void _mi_free_generic(const mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); +// option.c, c primitives +char _mi_toupper(char c); +int _mi_strnicmp(const char* s, const char* t, size_t n); +void _mi_strlcpy(char* dest, const char* src, size_t dest_size); +void _mi_strlcat(char* dest, const char* src, size_t dest_size); +size_t _mi_strlen(const char* s); +size_t _mi_strnlen(const char* s, size_t max_len); + + #if MI_DEBUG>1 bool _mi_page_is_valid(mi_page_t* page); #endif @@ -341,93 +356,11 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot } -/* ---------------------------------------------------------------------------------------- -The thread local default heap: `_mi_get_default_heap` returns the thread local heap. -On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a -__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures -that the storage will always be available (allocated on the thread stacks). -On some platforms though we cannot use that when overriding `malloc` since the underlying -TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. -We try to circumvent this in an efficient way: -- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the - loader itself calls `malloc` even before the modules are initialized. -- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS). -- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323) +/*---------------------------------------------------------------------------------------- + Heap functions ------------------------------------------------------------------------------------------- */ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value of the thread local default heap -extern bool _mi_process_is_initialized; -mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap - -#if defined(MI_MALLOC_OVERRIDE) -#if defined(__APPLE__) // macOS -#define MI_TLS_SLOT 89 // seems unused? -// #define MI_TLS_RECURSE_GUARD 1 -// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) -// see -#elif defined(__OpenBSD__) -// use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) -// see -#define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24) -// #elif defined(__DragonFly__) -// #warning "mimalloc is not working correctly on DragonFly yet." -// #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) -#elif defined(__ANDROID__) -// See issue #381 -#define MI_TLS_PTHREAD -#endif -#endif - -#if defined(MI_TLS_SLOT) -static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept; // forward declaration -#elif defined(MI_TLS_PTHREAD_SLOT_OFS) -static inline mi_heap_t** mi_tls_pthread_heap_slot(void) { - pthread_t self = pthread_self(); - #if defined(__DragonFly__) - if (self==NULL) { - mi_heap_t* pheap_main = _mi_heap_main_get(); - return &pheap_main; - } - #endif - return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS); -} -#elif defined(MI_TLS_PTHREAD) -extern pthread_key_t _mi_heap_default_key; -#endif - -// Default heap to allocate from (if not using TLS- or pthread slots). -// Do not use this directly but use through `mi_heap_get_default()` (or the unchecked `mi_get_default_heap`). -// This thread local variable is only used when neither MI_TLS_SLOT, MI_TLS_PTHREAD, or MI_TLS_PTHREAD_SLOT_OFS are defined. -// However, on the Apple M1 we do use the address of this variable as the unique thread-id (issue #356). -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from - -static inline mi_heap_t* mi_get_default_heap(void) { -#if defined(MI_TLS_SLOT) - mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT); - if mi_unlikely(heap == NULL) { - #ifdef __GNUC__ - __asm(""); // prevent conditional load of the address of _mi_heap_empty - #endif - heap = (mi_heap_t*)&_mi_heap_empty; - } - return heap; -#elif defined(MI_TLS_PTHREAD_SLOT_OFS) - mi_heap_t* heap = *mi_tls_pthread_heap_slot(); - return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); -#elif defined(MI_TLS_PTHREAD) - mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); - return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); -#else - #if defined(MI_TLS_RECURSE_GUARD) - if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); - #endif - return _mi_heap_default; -#endif -} - -static inline bool mi_heap_is_default(const mi_heap_t* heap) { - return (heap == mi_get_default_heap()); -} static inline bool mi_heap_is_backing(const mi_heap_t* heap) { return (heap->tld->heap_backing == heap); @@ -455,11 +388,6 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si return heap->pages_free_direct[idx]; } -// Get the page belonging to a certain size class -static inline mi_page_t* _mi_get_free_small_page(size_t size) { - return _mi_heap_get_free_small_page(mi_get_default_heap(), size); -} - // Segment that contains the pointer // Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE), // and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it; @@ -835,107 +763,6 @@ static inline size_t _mi_os_numa_node_count(void) { } -// ------------------------------------------------------------------- -// Getting the thread id should be performant as it is called in the -// fast path of `_mi_free` and we specialize for various platforms. -// We only require _mi_threadid() to return a unique id for each thread. -// ------------------------------------------------------------------- -#if defined(_WIN32) - -#define WIN32_LEAN_AND_MEAN -#include -static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { - // Windows: works on Intel and ARM in both 32- and 64-bit - return (uintptr_t)NtCurrentTeb(); -} - -// We use assembly for a fast thread id on the main platforms. The TLS layout depends on -// both the OS and libc implementation so we use specific tests for each main platform. -// If you test on another platform and it works please send a PR :-) -// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register. -#elif defined(__GNUC__) && ( \ - (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ - || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \ - || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ - || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ - || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ - ) - -static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept { - void* res; - const size_t ofs = (slot*sizeof(void*)); - #if defined(__i386__) - __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS - #elif defined(__APPLE__) && defined(__x86_64__) - __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS - #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) - __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI - #elif defined(__x86_64__) - __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS - #elif defined(__arm__) - void** tcb; MI_UNUSED(ofs); - __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); - res = tcb[slot]; - #elif defined(__aarch64__) - void** tcb; MI_UNUSED(ofs); - #if defined(__APPLE__) // M1, issue #343 - __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); - #else - __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); - #endif - res = tcb[slot]; - #endif - return res; -} - -// setting a tls slot is only used on macOS for now -static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { - const size_t ofs = (slot*sizeof(void*)); - #if defined(__i386__) - __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS - #elif defined(__APPLE__) && defined(__x86_64__) - __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS - #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) - __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI - #elif defined(__x86_64__) - __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS - #elif defined(__arm__) - void** tcb; MI_UNUSED(ofs); - __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); - tcb[slot] = value; - #elif defined(__aarch64__) - void** tcb; MI_UNUSED(ofs); - #if defined(__APPLE__) // M1, issue #343 - __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); - #else - __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); - #endif - tcb[slot] = value; - #endif -} - -static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { - #if defined(__BIONIC__) - // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id - // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 - return (uintptr_t)mi_tls_slot(1); - #else - // in all our other targets, slot 0 is the thread id - // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h - // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36 - return (uintptr_t)mi_tls_slot(0); - #endif -} - -#else - -// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). -static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { - return (uintptr_t)&_mi_heap_default; -} - -#endif - // ----------------------------------------------------------------------- // Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 1e27a306..a3c502d2 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -7,8 +7,9 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc-internal.h" +#include "prim/prim.h" // mi_prim_get_default_heap -#include // memset +#include // memset // ------------------------------------------------------ // Aligned Allocation @@ -187,27 +188,27 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_calloc_aligned(mi_heap_t* heap, } mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_malloc_aligned_at(mi_get_default_heap(), size, alignment, offset); + return mi_heap_malloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset); } mi_decl_nodiscard mi_decl_restrict void* mi_malloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_malloc_aligned(mi_get_default_heap(), size, alignment); + return mi_heap_malloc_aligned(mi_prim_get_default_heap(), size, alignment); } mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned_at(size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_zalloc_aligned_at(mi_get_default_heap(), size, alignment, offset); + return mi_heap_zalloc_aligned_at(mi_prim_get_default_heap(), size, alignment, offset); } mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_aligned(size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_zalloc_aligned(mi_get_default_heap(), size, alignment); + return mi_heap_zalloc_aligned(mi_prim_get_default_heap(), size, alignment); } mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned_at(size_t count, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_calloc_aligned_at(mi_get_default_heap(), count, size, alignment, offset); + return mi_heap_calloc_aligned_at(mi_prim_get_default_heap(), count, size, alignment, offset); } mi_decl_nodiscard mi_decl_restrict void* mi_calloc_aligned(size_t count, size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_calloc_aligned(mi_get_default_heap(), count, size, alignment); + return mi_heap_calloc_aligned(mi_prim_get_default_heap(), count, size, alignment); } @@ -282,25 +283,25 @@ mi_decl_nodiscard void* mi_heap_recalloc_aligned(mi_heap_t* heap, void* p, size_ } mi_decl_nodiscard void* mi_realloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_realloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset); + return mi_heap_realloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset); } mi_decl_nodiscard void* mi_realloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { - return mi_heap_realloc_aligned(mi_get_default_heap(), p, newsize, alignment); + return mi_heap_realloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment); } mi_decl_nodiscard void* mi_rezalloc_aligned_at(void* p, size_t newsize, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_rezalloc_aligned_at(mi_get_default_heap(), p, newsize, alignment, offset); + return mi_heap_rezalloc_aligned_at(mi_prim_get_default_heap(), p, newsize, alignment, offset); } mi_decl_nodiscard void* mi_rezalloc_aligned(void* p, size_t newsize, size_t alignment) mi_attr_noexcept { - return mi_heap_rezalloc_aligned(mi_get_default_heap(), p, newsize, alignment); + return mi_heap_rezalloc_aligned(mi_prim_get_default_heap(), p, newsize, alignment); } mi_decl_nodiscard void* mi_recalloc_aligned_at(void* p, size_t newcount, size_t size, size_t alignment, size_t offset) mi_attr_noexcept { - return mi_heap_recalloc_aligned_at(mi_get_default_heap(), p, newcount, size, alignment, offset); + return mi_heap_recalloc_aligned_at(mi_prim_get_default_heap(), p, newcount, size, alignment, offset); } mi_decl_nodiscard void* mi_recalloc_aligned(void* p, size_t newcount, size_t size, size_t alignment) mi_attr_noexcept { - return mi_heap_recalloc_aligned(mi_get_default_heap(), p, newcount, size, alignment); + return mi_heap_recalloc_aligned(mi_prim_get_default_heap(), p, newcount, size, alignment); } diff --git a/src/alloc-posix.c b/src/alloc-posix.c index e6505f29..f0cfe629 100644 --- a/src/alloc-posix.c +++ b/src/alloc-posix.c @@ -149,7 +149,7 @@ int mi_dupenv_s(char** buf, size_t* size, const char* name) mi_attr_noexcept { else { *buf = mi_strdup(p); if (*buf==NULL) return ENOMEM; - if (size != NULL) *size = strlen(p); + if (size != NULL) *size = _mi_strlen(p); } return 0; } diff --git a/src/alloc.c b/src/alloc.c index 21170d75..a200caed 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -11,10 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc-internal.h" #include "mimalloc-atomic.h" +#include "prim/prim.h" // _mi_prim_thread_id() - -#include // memset, strlen -#include // malloc, exit +#include // memset, strlen (for mi_strdup) +#include // malloc, abort #define MI_IN_ALLOC_C #include "alloc-override.c" @@ -106,7 +106,7 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, mi_track_malloc(p,size,zero); #if MI_STAT>1 if (p != NULL) { - if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); } + if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); } #endif @@ -119,7 +119,7 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc_small(mi_h } mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc_small(size_t size) mi_attr_noexcept { - return mi_heap_malloc_small(mi_get_default_heap(), size); + return mi_heap_malloc_small(mi_prim_get_default_heap(), size); } // The main allocation function @@ -135,7 +135,7 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z mi_track_malloc(p,size,zero); #if MI_STAT>1 if (p != NULL) { - if (!mi_heap_is_initialized(heap)) { heap = mi_get_default_heap(); } + if (!mi_heap_is_initialized(heap)) { heap = mi_prim_get_default_heap(); } mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); } #endif @@ -152,12 +152,12 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_malloc(mi_heap_t* } mi_decl_nodiscard extern inline mi_decl_restrict void* mi_malloc(size_t size) mi_attr_noexcept { - return mi_heap_malloc(mi_get_default_heap(), size); + return mi_heap_malloc(mi_prim_get_default_heap(), size); } // zero initialized small block mi_decl_nodiscard mi_decl_restrict void* mi_zalloc_small(size_t size) mi_attr_noexcept { - return mi_heap_malloc_small_zero(mi_get_default_heap(), size, true); + return mi_heap_malloc_small_zero(mi_prim_get_default_heap(), size, true); } mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* heap, size_t size) mi_attr_noexcept { @@ -165,7 +165,7 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_zalloc(mi_heap_t* } mi_decl_nodiscard mi_decl_restrict void* mi_zalloc(size_t size) mi_attr_noexcept { - return mi_heap_zalloc(mi_get_default_heap(),size); + return mi_heap_zalloc(mi_prim_get_default_heap(),size); } @@ -541,7 +541,7 @@ void mi_free(void* p) mi_attr_noexcept { if mi_unlikely(p == NULL) return; mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - const bool is_local= (_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_likely(is_local) { // thread-local free? @@ -654,7 +654,7 @@ mi_decl_nodiscard extern inline mi_decl_restrict void* mi_heap_calloc(mi_heap_t* } mi_decl_nodiscard mi_decl_restrict void* mi_calloc(size_t count, size_t size) mi_attr_noexcept { - return mi_heap_calloc(mi_get_default_heap(),count,size); + return mi_heap_calloc(mi_prim_get_default_heap(),count,size); } // Uninitialized `calloc` @@ -665,7 +665,7 @@ mi_decl_nodiscard extern mi_decl_restrict void* mi_heap_mallocn(mi_heap_t* heap, } mi_decl_nodiscard mi_decl_restrict void* mi_mallocn(size_t count, size_t size) mi_attr_noexcept { - return mi_heap_mallocn(mi_get_default_heap(),count,size); + return mi_heap_mallocn(mi_prim_get_default_heap(),count,size); } // Expand (or shrink) in place (or fail) @@ -742,24 +742,24 @@ mi_decl_nodiscard void* mi_heap_recalloc(mi_heap_t* heap, void* p, size_t count, mi_decl_nodiscard void* mi_realloc(void* p, size_t newsize) mi_attr_noexcept { - return mi_heap_realloc(mi_get_default_heap(),p,newsize); + return mi_heap_realloc(mi_prim_get_default_heap(),p,newsize); } mi_decl_nodiscard void* mi_reallocn(void* p, size_t count, size_t size) mi_attr_noexcept { - return mi_heap_reallocn(mi_get_default_heap(),p,count,size); + return mi_heap_reallocn(mi_prim_get_default_heap(),p,count,size); } // Reallocate but free `p` on errors mi_decl_nodiscard void* mi_reallocf(void* p, size_t newsize) mi_attr_noexcept { - return mi_heap_reallocf(mi_get_default_heap(),p,newsize); + return mi_heap_reallocf(mi_prim_get_default_heap(),p,newsize); } mi_decl_nodiscard void* mi_rezalloc(void* p, size_t newsize) mi_attr_noexcept { - return mi_heap_rezalloc(mi_get_default_heap(), p, newsize); + return mi_heap_rezalloc(mi_prim_get_default_heap(), p, newsize); } mi_decl_nodiscard void* mi_recalloc(void* p, size_t count, size_t size) mi_attr_noexcept { - return mi_heap_recalloc(mi_get_default_heap(), p, count, size); + return mi_heap_recalloc(mi_prim_get_default_heap(), p, count, size); } @@ -780,7 +780,7 @@ mi_decl_nodiscard mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const c } mi_decl_nodiscard mi_decl_restrict char* mi_strdup(const char* s) mi_attr_noexcept { - return mi_heap_strdup(mi_get_default_heap(), s); + return mi_heap_strdup(mi_prim_get_default_heap(), s); } // `strndup` using mi_malloc @@ -797,7 +797,7 @@ mi_decl_nodiscard mi_decl_restrict char* mi_heap_strndup(mi_heap_t* heap, const } mi_decl_nodiscard mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { - return mi_heap_strndup(mi_get_default_heap(),s,n); + return mi_heap_strndup(mi_prim_get_default_heap(),s,n); } #ifndef __wasi__ @@ -866,7 +866,7 @@ char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) #endif mi_decl_nodiscard mi_decl_restrict char* mi_realpath(const char* fname, char* resolved_name) mi_attr_noexcept { - return mi_heap_realpath(mi_get_default_heap(),fname,resolved_name); + return mi_heap_realpath(mi_prim_get_default_heap(),fname,resolved_name); } #endif @@ -942,7 +942,7 @@ mi_decl_export mi_decl_noinline void* mi_heap_try_new(mi_heap_t* heap, size_t si } static mi_decl_noinline void* mi_try_new(size_t size, bool nothrow) { - return mi_heap_try_new(mi_get_default_heap(), size, nothrow); + return mi_heap_try_new(mi_prim_get_default_heap(), size, nothrow); } @@ -953,7 +953,7 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new(mi_heap_t* heap, size } mi_decl_nodiscard mi_decl_restrict void* mi_new(size_t size) { - return mi_heap_alloc_new(mi_get_default_heap(), size); + return mi_heap_alloc_new(mi_prim_get_default_heap(), size); } @@ -969,7 +969,7 @@ mi_decl_nodiscard mi_decl_restrict void* mi_heap_alloc_new_n(mi_heap_t* heap, si } mi_decl_nodiscard mi_decl_restrict void* mi_new_n(size_t count, size_t size) { - return mi_heap_alloc_new_n(mi_get_default_heap(), size, count); + return mi_heap_alloc_new_n(mi_prim_get_default_heap(), size, count); } diff --git a/src/heap.c b/src/heap.c index 1913ab74..a7b28f0e 100644 --- a/src/heap.c +++ b/src/heap.c @@ -9,6 +9,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc-internal.h" #include "mimalloc-atomic.h" #include "mimalloc-track.h" +#include "prim/prim.h" // mi_prim_get_default_heap #include // memset, memcpy @@ -179,7 +180,7 @@ void mi_heap_collect(mi_heap_t* heap, bool force) mi_attr_noexcept { } void mi_collect(bool force) mi_attr_noexcept { - mi_heap_collect(mi_get_default_heap(), force); + mi_heap_collect(mi_prim_get_default_heap(), force); } @@ -189,9 +190,14 @@ void mi_collect(bool force) mi_attr_noexcept { mi_heap_t* mi_heap_get_default(void) { mi_thread_init(); - return mi_get_default_heap(); + return mi_prim_get_default_heap(); } +static bool mi_heap_is_default(const mi_heap_t* heap) { + return (heap == mi_prim_get_default_heap()); +} + + mi_heap_t* mi_heap_get_backing(void) { mi_heap_t* heap = mi_heap_get_default(); mi_assert_internal(heap!=NULL); @@ -438,7 +444,7 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { mi_assert(mi_heap_is_initialized(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return NULL; mi_assert_expensive(mi_heap_is_valid(heap)); - mi_heap_t* old = mi_get_default_heap(); + mi_heap_t* old = mi_prim_get_default_heap(); _mi_heap_set_default_direct(heap); return old; } @@ -488,7 +494,7 @@ bool mi_heap_check_owned(mi_heap_t* heap, const void* p) { } bool mi_check_owned(const void* p) { - return mi_heap_check_owned(mi_get_default_heap(), p); + return mi_heap_check_owned(mi_prim_get_default_heap(), p); } /* ----------------------------------------------------------- diff --git a/src/init.c b/src/init.c index 5a82f2f7..2bab5e7a 100644 --- a/src/init.c +++ b/src/init.c @@ -6,6 +6,7 @@ terms of the MIT license. A copy of the license can be found in the file -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc-internal.h" +#include "prim/prim.h" #include // memcpy, memset #include // atexit @@ -130,6 +131,10 @@ mi_decl_cache_align static const mi_tld_t tld_empty = { { MI_STATS_NULL } // stats }; +mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { + return _mi_prim_thread_id(); +} + // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; @@ -259,13 +264,13 @@ static void mi_thread_data_collect(void) { // Initialize the thread local default heap, called from `mi_thread_init` static bool _mi_heap_init(void) { - if (mi_heap_is_initialized(mi_get_default_heap())) return true; + if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; if (_mi_is_main_thread()) { // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization // the main heap is statically allocated mi_heap_main_init(); _mi_heap_set_default_direct(&_mi_heap_main); - //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_get_default_heap()); + //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { // use `_mi_os_alloc` to allocate directly from the OS @@ -363,54 +368,12 @@ static bool _mi_heap_done(mi_heap_t* heap) { // to set up the thread local keys. // -------------------------------------------------------- -static void _mi_thread_done(mi_heap_t* default_heap); - -#if defined(_WIN32) && defined(MI_SHARED_LIB) - // nothing to do as it is done in DllMain -#elif defined(_WIN32) && !defined(MI_SHARED_LIB) - // use thread local storage keys to detect thread ending - #include - #include - #if (_WIN32_WINNT < 0x600) // before Windows Vista - WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); - WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex ); - WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData ); - WINBASEAPI BOOL WINAPI FlsFree(_In_ DWORD dwFlsIndex); - #endif - static DWORD mi_fls_key = (DWORD)(-1); - static void NTAPI mi_fls_done(PVOID value) { - mi_heap_t* heap = (mi_heap_t*)value; - if (heap != NULL) { - _mi_thread_done(heap); - FlsSetValue(mi_fls_key, NULL); // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672 - } - } -#elif defined(MI_USE_PTHREADS) - // use pthread local storage keys to detect thread ending - // (and used with MI_TLS_PTHREADS for the default heap) - pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); - static void mi_pthread_done(void* value) { - if (value!=NULL) _mi_thread_done((mi_heap_t*)value); - } -#elif defined(__wasi__) -// no pthreads in the WebAssembly Standard Interface -#else - #pragma message("define a way to call mi_thread_done when a thread is done") -#endif - // Set up handlers so `mi_thread_done` is called automatically static void mi_process_setup_auto_thread_done(void) { static bool tls_initialized = false; // fine if it races if (tls_initialized) return; tls_initialized = true; - #if defined(_WIN32) && defined(MI_SHARED_LIB) - // nothing to do as it is done in DllMain - #elif defined(_WIN32) && !defined(MI_SHARED_LIB) - mi_fls_key = FlsAlloc(&mi_fls_done); - #elif defined(MI_USE_PTHREADS) - mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); - pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); - #endif + _mi_prim_thread_init_auto_done(); _mi_heap_set_default_direct(&_mi_heap_main); } @@ -442,13 +405,19 @@ void mi_thread_init(void) mi_attr_noexcept } void mi_thread_done(void) mi_attr_noexcept { - _mi_thread_done(mi_get_default_heap()); + _mi_thread_done(NULL); } -static void _mi_thread_done(mi_heap_t* heap) { +void _mi_thread_done(mi_heap_t* heap) +{ mi_atomic_decrement_relaxed(&thread_count); _mi_stat_decrease(&_mi_stats_main.threads, 1); + if (heap == NULL) { + heap = mi_prim_get_default_heap(); + if (heap == NULL) return; + } + // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... if (heap->thread_id != _mi_thread_id()) return; @@ -470,16 +439,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. - #if defined(_WIN32) && defined(MI_SHARED_LIB) - // nothing to do as it is done in DllMain - #elif defined(_WIN32) && !defined(MI_SHARED_LIB) - mi_assert_internal(mi_fls_key != 0); - FlsSetValue(mi_fls_key, heap); - #elif defined(MI_USE_PTHREADS) - if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD - pthread_setspecific(_mi_heap_default_key, heap); - } - #endif + _mi_prim_thread_associate_default_heap(heap); } @@ -594,11 +554,11 @@ void mi_process_init(void) mi_attr_noexcept { _mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL); mi_thread_init(); - #if defined(_WIN32) && !defined(MI_SHARED_LIB) - // When building as a static lib the FLS cleanup happens to early for the main thread. + #if defined(_WIN32) + // On windows, when building as a static lib the FLS cleanup happens to early for the main thread. // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup // will not call _mi_thread_done on the (still executing) main thread. See issue #508. - FlsSetValue(mi_fls_key, NULL); + _mi_prim_thread_associate_default_heap(NULL); #endif mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) @@ -629,10 +589,9 @@ static void mi_cdecl mi_process_done(void) { if (process_done) return; process_done = true; - #if defined(_WIN32) && !defined(MI_SHARED_LIB) - FlsFree(mi_fls_key); // call thread-done on all threads (except the main thread) to prevent dangling callback pointer if statically linked with a DLL; Issue #208 - #endif - + // release any thread specific resources and ensure _mi_thread_done is called on all but the main thread + _mi_prim_thread_done_auto_done(); + #ifndef MI_SKIP_COLLECT_ON_EXIT #if (MI_DEBUG != 0) || !defined(MI_SHARED_LIB) // free all memory if possible on process exit. This is not needed for a stand-alone process diff --git a/src/options.c b/src/options.c index 0a82ca65..8f15d647 100644 --- a/src/options.c +++ b/src/options.c @@ -7,17 +7,12 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc-internal.h" #include "mimalloc-atomic.h" +#include "prim/prim.h" // mi_prim_out_stderr -#include -#include // strtol -#include // strncpy, strncat, strlen, strstr -#include // toupper +#include // FILE +#include // abort #include -#ifdef _MSC_VER -#pragma warning(disable:4996) // strncpy, strncat -#endif - static long mi_max_error_count = 16; // stop outputting errors after this (use < 0 for no limit) static long mi_max_warning_count = 16; // stop outputting warnings after this (use < 0 for no limit) @@ -28,9 +23,6 @@ int mi_version(void) mi_attr_noexcept { return MI_MALLOC_VERSION; } -#ifdef _WIN32 -#include -#endif // -------------------------------------------------------- // Options @@ -171,41 +163,11 @@ void mi_option_disable(mi_option_t option) { mi_option_set_enabled(option,false); } - static void mi_cdecl mi_out_stderr(const char* msg, void* arg) { MI_UNUSED(arg); - if (msg == NULL) return; - #ifdef _WIN32 - // on windows with redirection, the C runtime cannot handle locale dependent output - // after the main thread closes so we use direct console output. - if (!_mi_preloading()) { - // _cputs(msg); // _cputs cannot be used at is aborts if it fails to lock the console - static HANDLE hcon = INVALID_HANDLE_VALUE; - static bool hconIsConsole; - if (hcon == INVALID_HANDLE_VALUE) { - CONSOLE_SCREEN_BUFFER_INFO sbi; - hcon = GetStdHandle(STD_ERROR_HANDLE); - hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi)); - } - const size_t len = strlen(msg); - if (len > 0 && len < UINT32_MAX) { - DWORD written = 0; - if (hconIsConsole) { - WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL); - } - else if (hcon != INVALID_HANDLE_VALUE) { - // use direct write if stderr was redirected - WriteFile(hcon, msg, (DWORD)len, &written, NULL); - } - else { - // finally fall back to fputs after all - fputs(msg, stderr); - } - } + if (msg != NULL && msg[0] != 0) { + _mi_prim_out_stderr(msg); } - #else - fputs(msg, stderr); - #endif } // Since an output function can be registered earliest in the `main` @@ -222,7 +184,7 @@ static void mi_cdecl mi_out_buf(const char* msg, void* arg) { MI_UNUSED(arg); if (msg==NULL) return; if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return; - size_t n = strlen(msg); + size_t n = _mi_strlen(msg); if (n==0) return; // claim space size_t start = mi_atomic_add_acq_rel(&out_len, n); @@ -359,7 +321,7 @@ void _mi_fprintf( mi_output_fun* out, void* arg, const char* fmt, ... ) { } static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args) { - if (prefix != NULL && strlen(prefix) <= 32 && !_mi_is_main_thread()) { + if (prefix != NULL && _mi_strnlen(prefix,33) <= 32 && !_mi_is_main_thread()) { char tprefix[64]; snprintf(tprefix, sizeof(tprefix), "%sthread 0x%llx: ", prefix, (unsigned long long)_mi_thread_id()); mi_vfprintf(out, arg, tprefix, fmt, args); @@ -464,8 +426,20 @@ void _mi_error_message(int err, const char* fmt, ...) { // -------------------------------------------------------- // Initialize options by checking the environment // -------------------------------------------------------- +char _mi_toupper(char c) { + if (c >= 'a' && c <= 'z') return (c - 'a' + 'A'); + else return c; +} -static void mi_strlcpy(char* dest, const char* src, size_t dest_size) { +int _mi_strnicmp(const char* s, const char* t, size_t n) { + if (n == 0) return 0; + for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) { + if (_mi_toupper(*s) != _mi_toupper(*t)) break; + } + return (n == 0 ? 0 : *s - *t); +} + +void _mi_strlcpy(char* dest, const char* src, size_t dest_size) { if (dest==NULL || src==NULL || dest_size == 0) return; // copy until end of src, or when dest is (almost) full while (*src != 0 && dest_size > 1) { @@ -476,7 +450,7 @@ static void mi_strlcpy(char* dest, const char* src, size_t dest_size) { *dest = 0; } -static void mi_strlcat(char* dest, const char* src, size_t dest_size) { +void _mi_strlcat(char* dest, const char* src, size_t dest_size) { if (dest==NULL || src==NULL || dest_size == 0) return; // find end of string in the dest buffer while (*dest != 0 && dest_size > 1) { @@ -484,7 +458,21 @@ static void mi_strlcat(char* dest, const char* src, size_t dest_size) { dest_size--; } // and catenate - mi_strlcpy(dest, src, dest_size); + _mi_strlcpy(dest, src, dest_size); +} + +size_t _mi_strlen(const char* s) { + if (s==NULL) return 0; + size_t len = 0; + while(s[len] != 0) { len++; } + return len; +} + +size_t _mi_strnlen(const char* s, size_t max_len) { + if (s==NULL) return 0; + size_t len = 0; + while(s[len] != 0 && len < max_len) { len++; } + return len; } #ifdef MI_NO_GETENV @@ -495,93 +483,27 @@ static bool mi_getenv(const char* name, char* result, size_t result_size) { return false; } #else -#if defined _WIN32 -// On Windows use GetEnvironmentVariable instead of getenv to work -// reliably even when this is invoked before the C runtime is initialized. -// i.e. when `_mi_preloading() == true`. -// Note: on windows, environment names are not case sensitive. -#include static bool mi_getenv(const char* name, char* result, size_t result_size) { - result[0] = 0; - size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size); - return (len > 0 && len < result_size); -} -#elif !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0) -// On Posix systemsr use `environ` to acces environment variables -// even before the C runtime is initialized. -#if defined(__APPLE__) && defined(__has_include) && __has_include() -#include -static char** mi_get_environ(void) { - return (*_NSGetEnviron()); -} -#else -extern char** environ; -static char** mi_get_environ(void) { - return environ; + if (name==NULL || result == NULL || result_size < 64) return false; + return _mi_prim_getenv(name,result,result_size); } #endif -static int mi_strnicmp(const char* s, const char* t, size_t n) { - if (n == 0) return 0; - for (; *s != 0 && *t != 0 && n > 0; s++, t++, n--) { - if (toupper(*s) != toupper(*t)) break; - } - return (n == 0 ? 0 : *s - *t); -} -static bool mi_getenv(const char* name, char* result, size_t result_size) { - if (name==NULL) return false; - const size_t len = strlen(name); - if (len == 0) return false; - char** env = mi_get_environ(); - if (env == NULL) return false; - // compare up to 256 entries - for (int i = 0; i < 256 && env[i] != NULL; i++) { - const char* s = env[i]; - if (mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive - // found it - mi_strlcpy(result, s + len + 1, result_size); - return true; - } - } - return false; -} -#else -// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime -static bool mi_getenv(const char* name, char* result, size_t result_size) { - // cannot call getenv() when still initializing the C runtime. - if (_mi_preloading()) return false; - const char* s = getenv(name); - if (s == NULL) { - // we check the upper case name too. - char buf[64+1]; - size_t len = strlen(name); - if (len >= sizeof(buf)) len = sizeof(buf) - 1; - for (size_t i = 0; i < len; i++) { - buf[i] = toupper(name[i]); - } - buf[len] = 0; - s = getenv(buf); - } - if (s != NULL && strlen(s) < result_size) { - mi_strlcpy(result, s, result_size); - return true; - } - else { - return false; - } -} -#endif // !MI_USE_ENVIRON -#endif // !MI_NO_GETENV + +// TODO: implement ourselves to reduce dependencies on the C runtime +#include // strtol +#include // strstr + static void mi_option_init(mi_option_desc_t* desc) { // Read option value from the environment char s[64+1]; char buf[64+1]; - mi_strlcpy(buf, "mimalloc_", sizeof(buf)); - mi_strlcat(buf, desc->name, sizeof(buf)); + _mi_strlcpy(buf, "mimalloc_", sizeof(buf)); + _mi_strlcat(buf, desc->name, sizeof(buf)); bool found = mi_getenv(buf,s,sizeof(s)); if (!found && desc->legacy_name != NULL) { - mi_strlcpy(buf, "mimalloc_", sizeof(buf)); - mi_strlcat(buf, desc->legacy_name, sizeof(buf)); + _mi_strlcpy(buf, "mimalloc_", sizeof(buf)); + _mi_strlcat(buf, desc->legacy_name, sizeof(buf)); found = mi_getenv(buf,s,sizeof(s)); if (found) { _mi_warning_message("environment option \"mimalloc_%s\" is deprecated -- use \"mimalloc_%s\" instead.\n", desc->legacy_name, desc->name ); @@ -589,10 +511,9 @@ static void mi_option_init(mi_option_desc_t* desc) { } if (found) { - size_t len = strlen(s); - if (len >= sizeof(buf)) len = sizeof(buf) - 1; + size_t len = _mi_strnlen(s,sizeof(buf)-1); for (size_t i = 0; i < len; i++) { - buf[i] = (char)toupper(s[i]); + buf[i] = _mi_toupper(s[i]); } buf[len] = 0; if (buf[0]==0 || strstr("1;TRUE;YES;ON", buf) != NULL) { diff --git a/src/os.c b/src/os.c index 0f984741..98116272 100644 --- a/src/os.c +++ b/src/os.c @@ -1,118 +1,48 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2021, Microsoft Research, Daan Leijen +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ -#ifndef _DEFAULT_SOURCE -#define _DEFAULT_SOURCE // ensure mmap flags are defined -#endif - -#if defined(__sun) -// illumos provides new mman.h api when any of these are defined -// otherwise the old api based on caddr_t which predates the void pointers one. -// stock solaris provides only the former, chose to atomically to discard those -// flags only here rather than project wide tough. -#undef _XOPEN_SOURCE -#undef _POSIX_C_SOURCE -#endif #include "mimalloc.h" #include "mimalloc-internal.h" #include "mimalloc-atomic.h" +#include "prim/prim.h" -#include // strerror - -#ifdef _MSC_VER -#pragma warning(disable:4996) // strerror -#endif - -#if defined(__wasi__) -#define MI_USE_SBRK -#endif - -#if defined(_WIN32) -#include -#elif defined(__wasi__) -#include // sbrk -#else -#include // mmap -#include // sysconf -#if defined(__linux__) -#include -#include -#if defined(__GLIBC__) -#include // linux mmap flags -#else -#include -#endif -#endif -#if defined(__APPLE__) -#include -#if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR -#include -#endif -#endif -#if defined(__FreeBSD__) || defined(__DragonFly__) -#include -#if __FreeBSD_version >= 1200000 -#include -#include -#endif -#include -#endif -#endif /* ----------------------------------------------------------- Initialization. On windows initializes support for aligned allocation and large OS pages (if MIMALLOC_LARGE_OS_PAGES is true). ----------------------------------------------------------- */ -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); -static void* mi_align_up_ptr(void* p, size_t alignment) { - return (void*)_mi_align_up((uintptr_t)p, alignment); -} - -static void* mi_align_down_ptr(void* p, size_t alignment) { - return (void*)_mi_align_down((uintptr_t)p, alignment); -} - - -// page size (initialized properly in `os_init`) -static size_t os_page_size = 4096; - -// minimal allocation granularity -static size_t os_alloc_granularity = 4096; - -// if non-zero, use large page allocation -static size_t large_os_page_size = 0; - -// is memory overcommit allowed? -// set dynamically in _mi_os_init (and if true we use MAP_NORESERVE) -static bool os_overcommit = true; +static mi_os_mem_config_t mi_os_mem_config = { + 4096, // page size + 0, // large page size (usually 2MiB) + 4096, // allocation granularity + true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) + false // must free whole? +}; bool _mi_os_has_overcommit(void) { - return os_overcommit; + return mi_os_mem_config.has_overcommit; } // OS (small) page size size_t _mi_os_page_size(void) { - return os_page_size; + return mi_os_mem_config.page_size; } // if large OS pages are supported (2 or 4MiB), then return the size, otherwise return the small page size (4KiB) size_t _mi_os_large_page_size(void) { - return (large_os_page_size != 0 ? large_os_page_size : _mi_os_page_size()); + return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size()); } -#if !defined(MI_USE_SBRK) && !defined(__wasi__) -static bool use_large_os_page(size_t size, size_t alignment) { +bool _mi_os_use_large_page(size_t size, size_t alignment) { // if we have access, check the size and alignment requirements - if (large_os_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false; - return ((size % large_os_page_size) == 0 && (alignment % large_os_page_size) == 0); + if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_large_os_pages)) return false; + return ((size % mi_os_mem_config.large_page_size) == 0 && (alignment % mi_os_mem_config.large_page_size) == 0); } -#endif // round to a good OS allocation size (bounded by max 12.5% waste) size_t _mi_os_good_alloc_size(size_t size) { @@ -126,177 +56,24 @@ size_t _mi_os_good_alloc_size(size_t size) { return _mi_align_up(size, align_size); } -#if defined(_WIN32) -// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. -// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) -// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) -// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's. -typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E { - MiMemExtendedParameterInvalidType = 0, - MiMemExtendedParameterAddressRequirements, - MiMemExtendedParameterNumaNode, - MiMemExtendedParameterPartitionHandle, - MiMemExtendedParameterUserPhysicalHandle, - MiMemExtendedParameterAttributeFlags, - MiMemExtendedParameterMax -} MI_MEM_EXTENDED_PARAMETER_TYPE; - -typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S { - struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type; - union { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg; -} MI_MEM_EXTENDED_PARAMETER; - -typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S { - PVOID LowestStartingAddress; - PVOID HighestEndingAddress; - SIZE_T Alignment; -} MI_MEM_ADDRESS_REQUIREMENTS; - -#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE 0x00000010 - -#include -typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); -typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); -static PVirtualAlloc2 pVirtualAlloc2 = NULL; -static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; - -// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7 -typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER; - -typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber); -typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber); -typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask); -typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber); -static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL; -static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL; -static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL; -static PGetNumaProcessorNode pGetNumaProcessorNode = NULL; - -static bool mi_win_enable_large_os_pages(void) -{ - if (large_os_page_size > 0) return true; - - // Try to see if large OS pages are supported - // To use large pages on Windows, we first need access permission - // Set "Lock pages in memory" permission in the group policy editor - // - unsigned long err = 0; - HANDLE token = NULL; - BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); - if (ok) { - TOKEN_PRIVILEGES tp; - ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid); - if (ok) { - tp.PrivilegeCount = 1; - tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); - if (ok) { - err = GetLastError(); - ok = (err == ERROR_SUCCESS); - if (ok) { - large_os_page_size = GetLargePageMinimum(); - } - } - } - CloseHandle(token); - } - if (!ok) { - if (err == 0) err = GetLastError(); - _mi_warning_message("cannot enable large OS page support, error %lu\n", err); - } - return (ok!=0); -} - -void _mi_os_init(void) -{ - os_overcommit = false; - // get the page size - SYSTEM_INFO si; - GetSystemInfo(&si); - if (si.dwPageSize > 0) os_page_size = si.dwPageSize; - if (si.dwAllocationGranularity > 0) os_alloc_granularity = si.dwAllocationGranularity; - // get the VirtualAlloc2 function - HINSTANCE hDll; - hDll = LoadLibrary(TEXT("kernelbase.dll")); - if (hDll != NULL) { - // use VirtualAlloc2FromApp if possible as it is available to Windows store apps - pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp"); - if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2"); - FreeLibrary(hDll); - } - // NtAllocateVirtualMemoryEx is used for huge page allocation - hDll = LoadLibrary(TEXT("ntdll.dll")); - if (hDll != NULL) { - pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx"); - FreeLibrary(hDll); - } - // Try to use Win7+ numa API - hDll = LoadLibrary(TEXT("kernel32.dll")); - if (hDll != NULL) { - pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx"); - pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx"); - pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx"); - pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode"); - FreeLibrary(hDll); - } - if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { - mi_win_enable_large_os_pages(); - } -} -#elif defined(__wasi__) void _mi_os_init(void) { - os_overcommit = false; - os_page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB - os_alloc_granularity = 16; + _mi_prim_mem_init(&mi_os_mem_config); } -#else // generic unix -static void os_detect_overcommit(void) { -#if defined(__linux__) - int fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY); - if (fd < 0) return; - char buf[32]; - ssize_t nread = read(fd, &buf, sizeof(buf)); - close(fd); - // - // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE) - if (nread >= 1) { - os_overcommit = (buf[0] == '0' || buf[0] == '1'); - } -#elif defined(__FreeBSD__) - int val = 0; - size_t olen = sizeof(val); - if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) { - os_overcommit = (val != 0); - } -#else - // default: overcommit is true -#endif +/* ----------------------------------------------------------- + Util +-------------------------------------------------------------- */ +bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); +bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); + +static void* mi_align_up_ptr(void* p, size_t alignment) { + return (void*)_mi_align_up((uintptr_t)p, alignment); } -void _mi_os_init(void) { - // get the page size - long result = sysconf(_SC_PAGESIZE); - if (result > 0) { - os_page_size = (size_t)result; - os_alloc_granularity = os_page_size; - } - large_os_page_size = 2*MI_MiB; // TODO: can we query the OS for this? - os_detect_overcommit(); +static void* mi_align_down_ptr(void* p, size_t alignment) { + return (void*)_mi_align_down((uintptr_t)p, alignment); } -#endif - - -#if defined(MADV_NORMAL) -static int mi_madvise(void* addr, size_t length, int advice) { - #if defined(__sun) - return madvise((caddr_t)addr, length, advice); // Solaris needs cast (issue #520) - #else - return madvise(addr, length, advice); - #endif -} -#endif /* ----------------------------------------------------------- @@ -319,7 +96,7 @@ static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; #define MI_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes) #define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages) -static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL; size = _mi_align_up(size, MI_SEGMENT_SIZE); @@ -332,7 +109,7 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize uintptr_t init = MI_HINT_BASE; #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode - uintptr_t r = _mi_heap_random_next(mi_get_default_heap()); + uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB #endif uintptr_t expected = hint + size; @@ -343,362 +120,32 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) return (void*)hint; } #else -static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { MI_UNUSED(try_alignment); MI_UNUSED(size); return NULL; } #endif + /* ----------------------------------------------------------- Free memory -------------------------------------------------------------- */ -static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats_t* stats) +void _mi_os_free_ex(void* addr, size_t size, bool was_committed, mi_stats_t* tld_stats) { - if (addr == NULL || size == 0) return true; // || _mi_os_is_huge_reserved(addr) - bool err = false; -#if defined(_WIN32) - DWORD errcode = 0; - err = (VirtualFree(addr, 0, MEM_RELEASE) == 0); - if (err) { errcode = GetLastError(); } - if (errcode == ERROR_INVALID_ADDRESS) { - // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside - // the memory region returned by VirtualAlloc; in that case we need to free using - // the start of the region. - MEMORY_BASIC_INFORMATION info = { 0 }; - VirtualQuery(addr, &info, sizeof(info)); - if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) { - errcode = 0; - err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0); - if (err) { errcode = GetLastError(); } - } - } - if (errcode != 0) { - _mi_warning_message("unable to release OS memory: error code 0x%x, addr: %p, size: %zu\n", errcode, addr, size); - } -#elif defined(MI_USE_SBRK) || defined(__wasi__) - err = false; // sbrk heap cannot be shrunk -#else - err = (munmap(addr, size) == -1); - if (err) { - _mi_warning_message("unable to release OS memory: %s, addr: %p, size: %zu\n", strerror(errno), addr, size); - } -#endif + MI_UNUSED(tld_stats); + mi_stats_t* stats = &_mi_stats_main; + if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) + const size_t csize = _mi_os_good_alloc_size(size); + _mi_prim_free(addr, csize); if (was_committed) { _mi_stat_decrease(&stats->committed, size); } _mi_stat_decrease(&stats->reserved, size); - return !err; } - -/* ----------------------------------------------------------- - Raw allocation on Windows (VirtualAlloc) --------------------------------------------------------------- */ - -#ifdef _WIN32 - -#define MEM_COMMIT_RESERVE (MEM_COMMIT|MEM_RESERVE) - -static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) { -#if (MI_INTPTR_SIZE >= 8) - // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations - if (addr == NULL) { - void* hint = mi_os_get_aligned_hint(try_alignment,size); - if (hint != NULL) { - void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE); - if (p != NULL) return p; - _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags); - // fall through on error - } - } -#endif - // on modern Windows try use VirtualAlloc2 for aligned allocation - if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { - MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; - reqs.Alignment = try_alignment; - MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; - param.Type.Type = MiMemExtendedParameterAddressRequirements; - param.Arg.Pointer = &reqs; - void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, ¶m, 1); - if (p != NULL) return p; - _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags); - // fall through on error - } - // last resort - return VirtualAlloc(addr, size, flags, PAGE_READWRITE); +void _mi_os_free(void* p, size_t size, mi_stats_t* tld_stats) { + _mi_os_free_ex(p, size, true, tld_stats); } -static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) { - mi_assert_internal(!(large_only && !allow_large)); - static _Atomic(size_t) large_page_try_ok; // = 0; - void* p = NULL; - // Try to allocate large OS pages (2MiB) if allowed or required. - if ((large_only || use_large_os_page(size, try_alignment)) - && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { - size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); - if (!large_only && try_ok > 0) { - // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. - // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times. - mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); - } - else { - // large OS pages must always reserve and commit. - *is_large = true; - p = mi_win_virtual_allocx(addr, size, try_alignment, flags | MEM_LARGE_PAGES); - if (large_only) return p; - // fall back to non-large page allocation on error (`p == NULL`). - if (p == NULL) { - mi_atomic_store_release(&large_page_try_ok,10UL); // on error, don't try again for the next N allocations - } - } - } - // Fall back to regular page allocation - if (p == NULL) { - *is_large = ((flags&MEM_LARGE_PAGES) != 0); - p = mi_win_virtual_allocx(addr, size, try_alignment, flags); - } - if (p == NULL) { - _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large); - } - return p; -} - -/* ----------------------------------------------------------- - Raw allocation using `sbrk` or `wasm_memory_grow` --------------------------------------------------------------- */ - -#elif defined(MI_USE_SBRK) || defined(__wasi__) -#if defined(MI_USE_SBRK) - static void* mi_memory_grow( size_t size ) { - void* p = sbrk(size); - if (p == (void*)(-1)) return NULL; - #if !defined(__wasi__) // on wasi this is always zero initialized already (?) - memset(p,0,size); - #endif - return p; - } -#elif defined(__wasi__) - static void* mi_memory_grow( size_t size ) { - size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size())) - : __builtin_wasm_memory_size(0)); - if (base == SIZE_MAX) return NULL; - return (void*)(base * _mi_os_page_size()); - } -#endif - -#if defined(MI_USE_PTHREADS) -static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER; -#endif - -static void* mi_heap_grow(size_t size, size_t try_alignment) { - void* p = NULL; - if (try_alignment <= 1) { - // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now) - #if defined(MI_USE_PTHREADS) - pthread_mutex_lock(&mi_heap_grow_mutex); - #endif - p = mi_memory_grow(size); - #if defined(MI_USE_PTHREADS) - pthread_mutex_unlock(&mi_heap_grow_mutex); - #endif - } - else { - void* base = NULL; - size_t alloc_size = 0; - // to allocate aligned use a lock to try to avoid thread interaction - // between getting the current size and actual allocation - // (also, `sbrk` is not thread safe in general) - #if defined(MI_USE_PTHREADS) - pthread_mutex_lock(&mi_heap_grow_mutex); - #endif - { - void* current = mi_memory_grow(0); // get current size - if (current != NULL) { - void* aligned_current = mi_align_up_ptr(current, try_alignment); // and align from there to minimize wasted space - alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size()); - base = mi_memory_grow(alloc_size); - } - } - #if defined(MI_USE_PTHREADS) - pthread_mutex_unlock(&mi_heap_grow_mutex); - #endif - if (base != NULL) { - p = mi_align_up_ptr(base, try_alignment); - if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) { - // another thread used wasm_memory_grow/sbrk in-between and we do not have enough - // space after alignment. Give up (and waste the space as we cannot shrink :-( ) - // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align) - p = NULL; - } - } - } - if (p == NULL) { - _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment); - errno = ENOMEM; - return NULL; - } - mi_assert_internal( try_alignment == 0 || (uintptr_t)p % try_alignment == 0 ); - return p; -} - -/* ----------------------------------------------------------- - Raw allocation on Unix's (mmap) --------------------------------------------------------------- */ -#else -#define MI_OS_USE_MMAP -static void* mi_unix_mmapx(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { - MI_UNUSED(try_alignment); - #if defined(MAP_ALIGNED) // BSD - if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { - size_t n = mi_bsr(try_alignment); - if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB - flags |= MAP_ALIGNED(n); - void* p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0); - if (p!=MAP_FAILED) return p; - // fall back to regular mmap - } - } - #elif defined(MAP_ALIGN) // Solaris - if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { - void* p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment - if (p!=MAP_FAILED) return p; - // fall back to regular mmap - } - #endif - #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) - // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations - if (addr == NULL) { - void* hint = mi_os_get_aligned_hint(try_alignment, size); - if (hint != NULL) { - void* p = mmap(hint, size, protect_flags, flags, fd, 0); - if (p!=MAP_FAILED) return p; - // fall back to regular mmap - } - } - #endif - // regular mmap - void* p = mmap(addr, size, protect_flags, flags, fd, 0); - if (p!=MAP_FAILED) return p; - // failed to allocate - return NULL; -} - -static int mi_unix_mmap_fd(void) { -#if defined(VM_MAKE_TAG) - // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99) - int os_tag = (int)mi_option_get(mi_option_os_tag); - if (os_tag < 100 || os_tag > 255) os_tag = 100; - return VM_MAKE_TAG(os_tag); -#else - return -1; -#endif -} - -static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) { - void* p = NULL; - #if !defined(MAP_ANONYMOUS) - #define MAP_ANONYMOUS MAP_ANON - #endif - #if !defined(MAP_NORESERVE) - #define MAP_NORESERVE 0 - #endif - const int fd = mi_unix_mmap_fd(); - int flags = MAP_PRIVATE | MAP_ANONYMOUS; - if (_mi_os_has_overcommit()) { - flags |= MAP_NORESERVE; - } - #if defined(PROT_MAX) - protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD - #endif - // huge page allocation - if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) { - static _Atomic(size_t) large_page_try_ok; // = 0; - size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); - if (!large_only && try_ok > 0) { - // If the OS is not configured for large OS pages, or the user does not have - // enough permission, the `mmap` will always fail (but it might also fail for other reasons). - // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times - // to avoid too many failing calls to mmap. - mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); - } - else { - int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux - int lfd = fd; - #ifdef MAP_ALIGNED_SUPER - lflags |= MAP_ALIGNED_SUPER; - #endif - #ifdef MAP_HUGETLB - lflags |= MAP_HUGETLB; - #endif - #ifdef MAP_HUGE_1GB - static bool mi_huge_pages_available = true; - if ((size % MI_GiB) == 0 && mi_huge_pages_available) { - lflags |= MAP_HUGE_1GB; - } - else - #endif - { - #ifdef MAP_HUGE_2MB - lflags |= MAP_HUGE_2MB; - #endif - } - #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB - lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; - #endif - if (large_only || lflags != flags) { - // try large OS page allocation - *is_large = true; - p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); - #ifdef MAP_HUGE_1GB - if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { - mi_huge_pages_available = false; // don't try huge 1GiB pages again - _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno); - lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); - p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, lflags, lfd); - } - #endif - if (large_only) return p; - if (p == NULL) { - mi_atomic_store_release(&large_page_try_ok, (size_t)8); // on error, don't try again for the next N allocations - } - } - } - } - // regular allocation - if (p == NULL) { - *is_large = false; - p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); - if (p != NULL) { - #if defined(MADV_HUGEPAGE) - // Many Linux systems don't allow MAP_HUGETLB but they support instead - // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE - // though since properly aligned allocations will already use large pages if available - // in that case -- in particular for our large regions (in `memory.c`). - // However, some systems only allow THP if called with explicit `madvise`, so - // when large OS pages are enabled for mimalloc, we call `madvise` anyways. - if (allow_large && use_large_os_page(size, try_alignment)) { - if (mi_madvise(p, size, MADV_HUGEPAGE) == 0) { - *is_large = true; // possibly - }; - } - #elif defined(__sun) - if (allow_large && use_large_os_page(size, try_alignment)) { - struct memcntl_mha cmd = {0}; - cmd.mha_pagesize = large_os_page_size; - cmd.mha_cmd = MHA_MAPSIZE_VA; - if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { - *is_large = true; - } - } - #endif - } - } - if (p == NULL) { - _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large); - } - return p; -} -#endif - /* ----------------------------------------------------------- Primitive allocation from the OS. @@ -711,7 +158,7 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo if (!commit) allow_large = false; if (try_alignment == 0) try_alignment = 1; // avoid 0 to ensure there will be no divide by zero when aligning - void* p = NULL; + void* p = _mi_prim_alloc(size, try_alignment, commit, allow_large, is_large); /* if (commit && allow_large) { p = _mi_os_try_alloc_from_huge_reserved(size, try_alignment); @@ -722,18 +169,6 @@ static void* mi_os_mem_alloc(size_t size, size_t try_alignment, bool commit, boo } */ - #if defined(_WIN32) - int flags = MEM_RESERVE; - if (commit) { flags |= MEM_COMMIT; } - p = mi_win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); - #elif defined(MI_USE_SBRK) || defined(__wasi__) - MI_UNUSED(allow_large); - *is_large = false; - p = mi_heap_grow(size, try_alignment); - #else - int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); - p = mi_unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); - #endif mi_stat_counter_increase(stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); @@ -759,40 +194,41 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, // if not aligned, free it, overallocate, and unmap around it if (((uintptr_t)p % alignment != 0)) { - mi_os_mem_free(p, size, commit, stats); + _mi_os_free_ex(p, size, commit, stats); _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (%zu bytes, address: %p, alignment: %zu, commit: %d)\n", size, p, alignment, commit); if (size >= (SIZE_MAX - alignment)) return NULL; // overflow const size_t over_size = size + alignment; -#if _WIN32 - // over-allocate uncommitted (virtual) memory - p = mi_os_mem_alloc(over_size, 0 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, stats); - if (p == NULL) return NULL; + if (mi_os_mem_config.must_free_whole) { // win32 virtualAlloc cannot free parts of an allocate block + // over-allocate uncommitted (virtual) memory + p = mi_os_mem_alloc(over_size, 0 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, stats); + if (p == NULL) return NULL; - // set p to the aligned part in the full region - // note: this is dangerous on Windows as VirtualFree needs the actual region pointer - // but in mi_os_mem_free we handle this (hopefully exceptional) situation. - p = mi_align_up_ptr(p, alignment); + // set p to the aligned part in the full region + // note: this is dangerous on Windows as VirtualFree needs the actual region pointer + // but in mi_os_mem_free we handle this (hopefully exceptional) situation. + p = mi_align_up_ptr(p, alignment); - // explicitly commit only the aligned part - if (commit) { - _mi_os_commit(p, size, NULL, stats); + // explicitly commit only the aligned part + if (commit) { + _mi_os_commit(p, size, NULL, stats); + } + } + else { // mmap can free inside an allocation + // overallocate... + p = mi_os_mem_alloc(over_size, 1, commit, false, is_large, stats); + if (p == NULL) return NULL; + // and selectively unmap parts around the over-allocated area. (noop on sbrk) + void* aligned_p = mi_align_up_ptr(p, alignment); + size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; + size_t mid_size = _mi_align_up(size, _mi_os_page_size()); + size_t post_size = over_size - pre_size - mid_size; + mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size); + if (pre_size > 0) _mi_os_free_ex(p, pre_size, commit, stats); + if (post_size > 0) _mi_os_free_ex((uint8_t*)aligned_p + mid_size, post_size, commit, stats); + // we can return the aligned pointer on `mmap` (and sbrk) systems + p = aligned_p; } -#else - // overallocate... - p = mi_os_mem_alloc(over_size, 1, commit, false, is_large, stats); - if (p == NULL) return NULL; - // and selectively unmap parts around the over-allocated area. (noop on sbrk) - void* aligned_p = mi_align_up_ptr(p, alignment); - size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; - size_t mid_size = _mi_align_up(size, _mi_os_page_size()); - size_t post_size = over_size - pre_size - mid_size; - mi_assert_internal(pre_size < over_size && post_size < over_size && mid_size >= size); - if (pre_size > 0) mi_os_mem_free(p, pre_size, commit, stats); - if (post_size > 0) mi_os_mem_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); - // we can return the aligned pointer on `mmap` (and sbrk) systems - p = aligned_p; -#endif } mi_assert_internal(p == NULL || (p != NULL && ((uintptr_t)p % alignment) == 0)); @@ -801,7 +237,7 @@ static void* mi_os_mem_alloc_aligned(size_t size, size_t alignment, bool commit, /* ----------------------------------------------------------- - OS API: alloc, free, alloc_aligned + OS API: alloc and alloc_aligned ----------------------------------------------------------- */ void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) { @@ -813,21 +249,9 @@ void* _mi_os_alloc(size_t size, mi_stats_t* tld_stats) { return mi_os_mem_alloc(size, 0, true, false, &is_large, stats); } -void _mi_os_free_ex(void* p, size_t size, bool was_committed, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - if (size == 0 || p == NULL) return; - size = _mi_os_good_alloc_size(size); - mi_os_mem_free(p, size, was_committed, stats); -} - -void _mi_os_free(void* p, size_t size, mi_stats_t* stats) { - _mi_os_free_ex(p, size, true, stats); -} - void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool* large, mi_stats_t* tld_stats) { - MI_UNUSED(&mi_os_get_aligned_hint); // suppress unused warnings + MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings MI_UNUSED(tld_stats); if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); @@ -880,11 +304,11 @@ void _mi_os_free_aligned(void* p, size_t size, size_t alignment, size_t align_of _mi_os_free_ex(start, size + extra, was_committed, tld_stats); } + /* ----------------------------------------------------------- OS memory API: reset, commit, decommit, protect, unprotect. ----------------------------------------------------------- */ - // OS page align within a given area, either conservative (pages inside the area only), // or not (straddling pages outside the area is possible) static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, size_t* newsize) { @@ -909,18 +333,6 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* return mi_os_page_align_areax(true, addr, size, newsize); } -static void mi_mprotect_hint(int err) { -#if defined(MI_OS_USE_MMAP) && (MI_SECURE>=2) // guard page around every mimalloc page - if (err == ENOMEM) { - _mi_warning_message("the previous warning may have been caused by a low memory map limit.\n" - " On Linux this is controlled by the vm.max_map_count. For example:\n" - " > sudo sysctl -w vm.max_map_count=262144\n"); - } -#else - MI_UNUSED(err); -#endif -} - // Commit/Decommit memory. // Usually commit is aligned liberal, while decommit is aligned conservative. // (but not for the reset version where we want commit to be conservative as well) @@ -930,7 +342,6 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ size_t csize; void* start = mi_os_page_align_areax(conservative, addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr)) - int err = 0; if (commit) { _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit _mi_stat_counter_increase(&stats->commit_calls, 1); @@ -939,56 +350,9 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ _mi_stat_decrease(&stats->committed, size); } - #if defined(_WIN32) - if (commit) { - // *is_zero = true; // note: if the memory was already committed, the call succeeds but the memory is not zero'd - void* p = VirtualAlloc(start, csize, MEM_COMMIT, PAGE_READWRITE); - err = (p == start ? 0 : GetLastError()); - } - else { - BOOL ok = VirtualFree(start, csize, MEM_DECOMMIT); - err = (ok ? 0 : GetLastError()); - } - #elif defined(__wasi__) - // WebAssembly guests can't control memory protection - #elif 0 && defined(MAP_FIXED) && !defined(__APPLE__) - // Linux: disabled for now as mmap fixed seems much more expensive than MADV_DONTNEED (and splits VMA's?) - if (commit) { - // commit: just change the protection - err = mprotect(start, csize, (PROT_READ | PROT_WRITE)); - if (err != 0) { err = errno; } - } - else { - // decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss) - const int fd = mi_unix_mmap_fd(); - void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0); - if (p != start) { err = errno; } - } - #else - // Linux, macOSX and others. - if (commit) { - // commit: ensure we can access the area - err = mprotect(start, csize, (PROT_READ | PROT_WRITE)); - if (err != 0) { err = errno; } - } - else { - #if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0 - // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) - // (on the other hand, MADV_FREE would be good enough.. it is just not reflected in the stats :-( ) - err = madvise(start, csize, MADV_DONTNEED); - #else - // decommit: just disable access (also used in debug and secure mode to trap on illegal access) - err = mprotect(start, csize, PROT_NONE); - if (err != 0) { err = errno; } - #endif - //#if defined(MADV_FREE_REUSE) - // while ((err = mi_madvise(start, csize, MADV_FREE_REUSE)) != 0 && errno == EAGAIN) { errno = 0; } - //#endif - } - #endif + int err = _mi_prim_commit(start, csize, commit); if (err != 0) { _mi_warning_message("%s error: start: %p, csize: 0x%zx, err: %i\n", commit ? "commit" : "decommit", start, csize, err); - mi_mprotect_hint(err); } mi_assert_internal(err == 0); return (err == 0); @@ -1033,39 +397,11 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) } #endif -#if defined(_WIN32) - // Testing shows that for us (on `malloc-large`) MEM_RESET is 2x faster than DiscardVirtualMemory - void* p = VirtualAlloc(start, csize, MEM_RESET, PAGE_READWRITE); - mi_assert_internal(p == start); - #if 1 - if (p == start && start != NULL) { - VirtualUnlock(start,csize); // VirtualUnlock after MEM_RESET removes the memory from the working set - } - #endif - if (p != start) return false; -#else -#if defined(MADV_FREE) - static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); - int oadvice = (int)mi_atomic_load_relaxed(&advice); - int err; - while ((err = mi_madvise(start, csize, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; - if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { - // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on - mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED); - err = mi_madvise(start, csize, MADV_DONTNEED); - } -#elif defined(__wasi__) - int err = 0; -#else - int err = mi_madvise(start, csize, MADV_DONTNEED); -#endif + int err = _mi_prim_reset(start, csize); if (err != 0) { - _mi_warning_message("madvise reset error: start: %p, csize: 0x%zx, errno: %i\n", start, csize, errno); + _mi_warning_message("madvise reset error: start: %p, csize: 0x%zx, errno: %i\n", start, csize, err); } - //mi_assert(err == 0); - if (err != 0) return false; -#endif - return true; + return (err == 0); } // Signal to the OS that the address range is no longer in use @@ -1098,20 +434,9 @@ static bool mi_os_protectx(void* addr, size_t size, bool protect) { _mi_warning_message("cannot mprotect memory allocated in huge OS pages\n"); } */ - int err = 0; -#ifdef _WIN32 - DWORD oldprotect = 0; - BOOL ok = VirtualProtect(start, csize, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect); - err = (ok ? 0 : GetLastError()); -#elif defined(__wasi__) - err = 0; -#else - err = mprotect(start, csize, protect ? PROT_NONE : (PROT_READ | PROT_WRITE)); - if (err != 0) { err = errno; } -#endif + int err = _mi_prim_protect(start,csize,protect); if (err != 0) { _mi_warning_message("mprotect error: start: %p, csize: 0x%zx, err: %i\n", start, csize, err); - mi_mprotect_hint(err); } return (err == 0); } @@ -1126,115 +451,12 @@ bool _mi_os_unprotect(void* addr, size_t size) { -bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) { - // page align conservatively within the range - mi_assert_internal(oldsize > newsize && p != NULL); - if (oldsize < newsize || p == NULL) return false; - if (oldsize == newsize) return true; - - // oldsize and newsize should be page aligned or we cannot shrink precisely - void* addr = (uint8_t*)p + newsize; - size_t size = 0; - void* start = mi_os_page_align_area_conservative(addr, oldsize - newsize, &size); - if (size == 0 || start != addr) return false; - -#ifdef _WIN32 - // we cannot shrink on windows, but we can decommit - return _mi_os_decommit(start, size, stats); -#else - return mi_os_mem_free(start, size, true, stats); -#endif -} - - /* ---------------------------------------------------------------------------- Support for allocating huge OS pages (1Gib) that are reserved up-front and possibly associated with a specific NUMA node. (use `numa_node>=0`) -----------------------------------------------------------------------------*/ #define MI_HUGE_OS_PAGE_SIZE (MI_GiB) -#if defined(_WIN32) && (MI_INTPTR_SIZE >= 8) -static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) -{ - mi_assert_internal(size%MI_GiB == 0); - mi_assert_internal(addr != NULL); - const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; - - mi_win_enable_large_os_pages(); - - MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} }; - // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages - static bool mi_huge_pages_available = true; - if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { - params[0].Type.Type = MiMemExtendedParameterAttributeFlags; - params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; - ULONG param_count = 1; - if (numa_node >= 0) { - param_count++; - params[1].Type.Type = MiMemExtendedParameterNumaNode; - params[1].Arg.ULong = (unsigned)numa_node; - } - SIZE_T psize = size; - void* base = addr; - NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); - if (err == 0 && base != NULL) { - return base; - } - else { - // fall back to regular large pages - mi_huge_pages_available = false; // don't try further huge pages - _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err); - } - } - // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation - if (pVirtualAlloc2 != NULL && numa_node >= 0) { - params[0].Type.Type = MiMemExtendedParameterNumaNode; - params[0].Arg.ULong = (unsigned)numa_node; - return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); - } - - // otherwise use regular virtual alloc on older windows - return VirtualAlloc(addr, size, flags, PAGE_READWRITE); -} - -#elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__) -#include -#ifndef MPOL_PREFERRED -#define MPOL_PREFERRED 1 -#endif -#if defined(SYS_mbind) -static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { - return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags); -} -#else -static long mi_os_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { - MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags); - return 0; -} -#endif -static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { - mi_assert_internal(size%MI_GiB == 0); - bool is_large = true; - void* p = mi_unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); - if (p == NULL) return NULL; - if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes - unsigned long numa_mask = (1UL << numa_node); - // TODO: does `mbind` work correctly for huge OS pages? should we - // use `set_mempolicy` before calling mmap instead? - // see: - long err = mi_os_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); - if (err != 0) { - _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d: %s\n", numa_node, strerror(errno)); - } - } - return p; -} -#else -static void* mi_os_alloc_huge_os_pagesx(void* addr, size_t size, int numa_node) { - MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(numa_node); - return NULL; -} -#endif #if (MI_INTPTR_SIZE >= 8) // To ensure proper alignment, use our own area for huge OS pages @@ -1253,10 +475,10 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { if (start == 0) { // Initialize the start address after the 32TiB area start = ((uintptr_t)32 << 40); // 32TiB virtual start address -#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode - uintptr_t r = _mi_heap_random_next(mi_get_default_heap()); + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode + uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB -#endif + #endif } end = start + size; mi_assert_internal(end % MI_SEGMENT_SIZE == 0); @@ -1289,7 +511,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse for (page = 0; page < pages; page++) { // allocate a page void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE); - void* p = mi_os_alloc_huge_os_pagesx(addr, MI_HUGE_OS_PAGE_SIZE, numa_node); + void* p = _mi_prim_alloc_huge_os_pages(addr, MI_HUGE_OS_PAGE_SIZE, numa_node); // Did we succeed at a contiguous address? if (p != addr) { @@ -1341,113 +563,6 @@ void _mi_os_free_huge_pages(void* p, size_t size, mi_stats_t* stats) { /* ---------------------------------------------------------------------------- Support NUMA aware allocation -----------------------------------------------------------------------------*/ -#ifdef _WIN32 -static size_t mi_os_numa_nodex(void) { - USHORT numa_node = 0; - if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) { - // Extended API is supported - MI_PROCESSOR_NUMBER pnum; - (*pGetCurrentProcessorNumberEx)(&pnum); - USHORT nnode = 0; - BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode); - if (ok) { numa_node = nnode; } - } - else if (pGetNumaProcessorNode != NULL) { - // Vista or earlier, use older API that is limited to 64 processors. Issue #277 - DWORD pnum = GetCurrentProcessorNumber(); - UCHAR nnode = 0; - BOOL ok = pGetNumaProcessorNode((UCHAR)pnum, &nnode); - if (ok) { numa_node = nnode; } - } - return numa_node; -} - -static size_t mi_os_numa_node_countx(void) { - ULONG numa_max = 0; - GetNumaHighestNodeNumber(&numa_max); - // find the highest node number that has actual processors assigned to it. Issue #282 - while(numa_max > 0) { - if (pGetNumaNodeProcessorMaskEx != NULL) { - // Extended API is supported - GROUP_AFFINITY affinity; - if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) { - if (affinity.Mask != 0) break; // found the maximum non-empty node - } - } - else { - // Vista or earlier, use older API that is limited to 64 processors. - ULONGLONG mask; - if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) { - if (mask != 0) break; // found the maximum non-empty node - }; - } - // max node was invalid or had no processor assigned, try again - numa_max--; - } - return ((size_t)numa_max + 1); -} -#elif defined(__linux__) -#include // getcpu -#include // access - -static size_t mi_os_numa_nodex(void) { -#ifdef SYS_getcpu - unsigned long node = 0; - unsigned long ncpu = 0; - long err = syscall(SYS_getcpu, &ncpu, &node, NULL); - if (err != 0) return 0; - return node; -#else - return 0; -#endif -} -static size_t mi_os_numa_node_countx(void) { - char buf[128]; - unsigned node = 0; - for(node = 0; node < 256; node++) { - // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation) - snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); - if (access(buf,R_OK) != 0) break; - } - return (node+1); -} -#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000 -static size_t mi_os_numa_nodex(void) { - domainset_t dom; - size_t node; - int policy; - if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul; - for (node = 0; node < MAXMEMDOM; node++) { - if (DOMAINSET_ISSET(node, &dom)) return node; - } - return 0ul; -} -static size_t mi_os_numa_node_countx(void) { - size_t ndomains = 0; - size_t len = sizeof(ndomains); - if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul; - return ndomains; -} -#elif defined(__DragonFly__) -static size_t mi_os_numa_nodex(void) { - // TODO: DragonFly does not seem to provide any userland means to get this information. - return 0ul; -} -static size_t mi_os_numa_node_countx(void) { - size_t ncpus = 0, nvirtcoresperphys = 0; - size_t len = sizeof(size_t); - if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul; - if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul; - return nvirtcoresperphys * ncpus; -} -#else -static size_t mi_os_numa_nodex(void) { - return 0; -} -static size_t mi_os_numa_node_countx(void) { - return 1; -} -#endif _Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count @@ -1459,7 +574,7 @@ size_t _mi_os_numa_node_count_get(void) { count = (size_t)ncount; } else { - count = mi_os_numa_node_countx(); // or detect dynamically + count = _mi_prim_numa_node_count(); // or detect dynamically if (count == 0) count = 1; } mi_atomic_store_release(&_mi_numa_node_count, count); // save it @@ -1473,7 +588,7 @@ int _mi_os_numa_node_get(mi_os_tld_t* tld) { size_t numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 - size_t numa_node = mi_os_numa_nodex(); + size_t numa_node = _mi_prim_numa_node(); if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } return (int)numa_node; } diff --git a/src/page.c b/src/page.c index 5df14bab..90fc359e 100644 --- a/src/page.c +++ b/src/page.c @@ -103,6 +103,8 @@ static bool mi_page_is_valid_init(mi_page_t* page) { return true; } +extern bool _mi_process_is_initialized; // has mi_process_init been called? + bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(mi_page_is_valid_init(page)); #if MI_SECURE @@ -886,8 +888,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // initialize if necessary if mi_unlikely(!mi_heap_is_initialized(heap)) { - mi_thread_init(); // calls `_mi_heap_init` in turn - heap = mi_get_default_heap(); + heap = mi_heap_get_default(); // calls mi_thread_init if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); diff --git a/src/prim/prim-unix.c b/src/prim/prim-unix.c new file mode 100644 index 00000000..9270e088 --- /dev/null +++ b/src/prim/prim-unix.c @@ -0,0 +1,799 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#ifndef _DEFAULT_SOURCE +#define _DEFAULT_SOURCE // ensure mmap flags and syscall are defined +#endif + +#if defined(__sun) +// illumos provides new mman.h api when any of these are defined +// otherwise the old api based on caddr_t which predates the void pointers one. +// stock solaris provides only the former, chose to atomically to discard those +// flags only here rather than project wide tough. +#undef _XOPEN_SOURCE +#undef _POSIX_C_SOURCE +#endif + +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" +#include "prim.h" + +#include // mmap +#include // sysconf + +#if defined(__linux__) + #include + #include + #if defined(__GLIBC__) + #include // linux mmap flags + #else + #include + #endif +#elif defined(__APPLE__) + #include + #if !TARGET_IOS_IPHONE && !TARGET_IOS_SIMULATOR + #include + #endif +#elif defined(__FreeBSD__) || defined(__DragonFly__) + #include + #if __FreeBSD_version >= 1200000 + #include + #include + #endif + #include +#endif + +//--------------------------------------------- +// init +//--------------------------------------------- + +static bool unix_detect_overcommit(void) { + bool os_overcommit = true; +#if defined(__linux__) + int fd = open("/proc/sys/vm/overcommit_memory", O_RDONLY); + if (fd >= 0) { + char buf[32]; + ssize_t nread = read(fd, &buf, sizeof(buf)); + close(fd); + // + // 0: heuristic overcommit, 1: always overcommit, 2: never overcommit (ignore NORESERVE) + if (nread >= 1) { + os_overcommit = (buf[0] == '0' || buf[0] == '1'); + } + } +#elif defined(__FreeBSD__) + int val = 0; + size_t olen = sizeof(val); + if (sysctlbyname("vm.overcommit", &val, &olen, NULL, 0) == 0) { + os_overcommit = (val != 0); + } +#else + // default: overcommit is true +#endif + return os_overcommit; +} + +void _mi_prim_mem_init( mi_os_mem_config_t* config ) { + long psize = sysconf(_SC_PAGESIZE); + if (psize > 0) { + config->page_size = (size_t)psize; + config->alloc_granularity = (size_t)psize; + } + config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? + config->has_overcommit = unix_detect_overcommit(); + config->must_free_whole = false; // mmap can free in parts +} + + +//--------------------------------------------- +// free +//--------------------------------------------- + +void _mi_prim_free(void* addr, size_t size ) { + bool err = (munmap(addr, size) == -1); + if (err) { + _mi_warning_message("unable to release OS memory: %s, addr: %p, size: %zu\n", strerror(errno), addr, size); + } +} + + +//--------------------------------------------- +// mmap +//--------------------------------------------- + +static int unix_madvise(void* addr, size_t size, int advice) { + #if defined(__sun) + return madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520) + #else + return madvise(addr, size, advice); + #endif +} + +static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int protect_flags, int flags, int fd) { + MI_UNUSED(try_alignment); + #if defined(MAP_ALIGNED) // BSD + if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { + size_t n = mi_bsr(try_alignment); + if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB + flags |= MAP_ALIGNED(n); + void* p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0); + if (p!=MAP_FAILED) return p; + // fall back to regular mmap + } + } + #elif defined(MAP_ALIGN) // Solaris + if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) { + void* p = mmap((void*)try_alignment, size, protect_flags, flags | MAP_ALIGN, fd, 0); // addr parameter is the required alignment + if (p!=MAP_FAILED) return p; + // fall back to regular mmap + } + #endif + #if (MI_INTPTR_SIZE >= 8) && !defined(MAP_ALIGNED) + // on 64-bit systems, use the virtual address area after 2TiB for 4MiB aligned allocations + if (addr == NULL) { + void* hint = _mi_os_get_aligned_hint(try_alignment, size); + if (hint != NULL) { + void* p = mmap(hint, size, protect_flags, flags, fd, 0); + if (p!=MAP_FAILED) return p; + // fall back to regular mmap + } + } + #endif + // regular mmap + void* p = mmap(addr, size, protect_flags, flags, fd, 0); + if (p!=MAP_FAILED) return p; + // failed to allocate + return NULL; +} + +static void* unix_mmap(void* addr, size_t size, size_t try_alignment, int protect_flags, bool large_only, bool allow_large, bool* is_large) { + void* p = NULL; + #if !defined(MAP_ANONYMOUS) + #define MAP_ANONYMOUS MAP_ANON + #endif + #if !defined(MAP_NORESERVE) + #define MAP_NORESERVE 0 + #endif + int flags = MAP_PRIVATE | MAP_ANONYMOUS; + int fd = -1; + if (_mi_os_has_overcommit()) { + flags |= MAP_NORESERVE; + } + #if defined(PROT_MAX) + protect_flags |= PROT_MAX(PROT_READ | PROT_WRITE); // BSD + #endif + #if defined(VM_MAKE_TAG) + // macOS: tracking anonymous page with a specific ID. (All up to 98 are taken officially but LLVM sanitizers had taken 99) + int os_tag = (int)mi_option_get(mi_option_os_tag); + if (os_tag < 100 || os_tag > 255) { os_tag = 100; } + fd = VM_MAKE_TAG(os_tag); + #endif + // huge page allocation + if ((large_only || _mi_os_use_large_page(size, try_alignment)) && allow_large) { + static _Atomic(size_t) large_page_try_ok; // = 0; + size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); + if (!large_only && try_ok > 0) { + // If the OS is not configured for large OS pages, or the user does not have + // enough permission, the `mmap` will always fail (but it might also fail for other reasons). + // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times + // to avoid too many failing calls to mmap. + mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); + } + else { + int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux + int lfd = fd; + #ifdef MAP_ALIGNED_SUPER + lflags |= MAP_ALIGNED_SUPER; + #endif + #ifdef MAP_HUGETLB + lflags |= MAP_HUGETLB; + #endif + #ifdef MAP_HUGE_1GB + static bool mi_huge_pages_available = true; + if ((size % MI_GiB) == 0 && mi_huge_pages_available) { + lflags |= MAP_HUGE_1GB; + } + else + #endif + { + #ifdef MAP_HUGE_2MB + lflags |= MAP_HUGE_2MB; + #endif + } + #ifdef VM_FLAGS_SUPERPAGE_SIZE_2MB + lfd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + #endif + if (large_only || lflags != flags) { + // try large OS page allocation + *is_large = true; + p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); + #ifdef MAP_HUGE_1GB + if (p == NULL && (lflags & MAP_HUGE_1GB) != 0) { + mi_huge_pages_available = false; // don't try huge 1GiB pages again + _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %i)\n", errno); + lflags = ((lflags & ~MAP_HUGE_1GB) | MAP_HUGE_2MB); + p = unix_mmap_prim(addr, size, try_alignment, protect_flags, lflags, lfd); + } + #endif + if (large_only) return p; + if (p == NULL) { + mi_atomic_store_release(&large_page_try_ok, (size_t)8); // on error, don't try again for the next N allocations + } + } + } + } + // regular allocation + if (p == NULL) { + *is_large = false; + p = unix_mmap_prim(addr, size, try_alignment, protect_flags, flags, fd); + if (p != NULL) { + #if defined(MADV_HUGEPAGE) + // Many Linux systems don't allow MAP_HUGETLB but they support instead + // transparent huge pages (THP). Generally, it is not required to call `madvise` with MADV_HUGE + // though since properly aligned allocations will already use large pages if available + // in that case -- in particular for our large regions (in `memory.c`). + // However, some systems only allow THP if called with explicit `madvise`, so + // when large OS pages are enabled for mimalloc, we call `madvise` anyways. + if (allow_large && _mi_os_use_large_page(size, try_alignment)) { + if (unix_madvise(p, size, MADV_HUGEPAGE) == 0) { + *is_large = true; // possibly + }; + } + #elif defined(__sun) + if (allow_large && _mi_os_use_large_page(size, try_alignment)) { + struct memcntl_mha cmd = {0}; + cmd.mha_pagesize = large_os_page_size; + cmd.mha_cmd = MHA_MAPSIZE_VA; + if (memcntl((caddr_t)p, size, MC_HAT_ADVISE, (caddr_t)&cmd, 0, 0) == 0) { + *is_large = true; + } + } + #endif + } + } + if (p == NULL) { + _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: %i, address: %p, large only: %d, allow large: %d)\n", size, errno, addr, large_only, allow_large); + } + return p; +} + +// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. +void* _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large) { + mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); + mi_assert_internal(commit || !allow_large); + mi_assert_internal(try_alignment > 0); + + int protect_flags = (commit ? (PROT_WRITE | PROT_READ) : PROT_NONE); + return unix_mmap(NULL, size, try_alignment, protect_flags, false, allow_large, is_large); +} + + +//--------------------------------------------- +// Commit/Reset +//--------------------------------------------- + +static void unix_mprotect_hint(int err) { + #if defined(__linux__) && (MI_SECURE>=2) // guard page around every mimalloc page + if (err == ENOMEM) { + _mi_warning_message("The next warning may be caused by a low memory map limit.\n" + " On Linux this is controlled by the vm.max_map_count -- maybe increase it?\n" + " For example: sudo sysctl -w vm.max_map_count=262144\n"); + } + #else + MI_UNUSED(err); + #endif +} + + +int _mi_prim_commit(void* start, size_t size, bool commit) { + /* + #if 0 && defined(MAP_FIXED) && !defined(__APPLE__) + // Linux: disabled for now as mmap fixed seems much more expensive than MADV_DONTNEED (and splits VMA's?) + if (commit) { + // commit: just change the protection + err = mprotect(start, csize, (PROT_READ | PROT_WRITE)); + if (err != 0) { err = errno; } + } + else { + // decommit: use mmap with MAP_FIXED to discard the existing memory (and reduce rss) + const int fd = mi_unix_mmap_fd(); + void* p = mmap(start, csize, PROT_NONE, (MAP_FIXED | MAP_PRIVATE | MAP_ANONYMOUS | MAP_NORESERVE), fd, 0); + if (p != start) { err = errno; } + } + #else + */ + int err = 0; + if (commit) { + // commit: ensure we can access the area + err = mprotect(start, size, (PROT_READ | PROT_WRITE)); + if (err != 0) { err = errno; } + } + else { + #if defined(MADV_DONTNEED) && MI_DEBUG == 0 && MI_SECURE == 0 + // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) + // (on the other hand, MADV_FREE would be good enough.. it is just not reflected in the stats :-( ) + err = unix_madvise(start, size, MADV_DONTNEED); + #else + // decommit: just disable access (also used in debug and secure mode to trap on illegal access) + err = mprotect(start, size, PROT_NONE); + if (err != 0) { err = errno; } + #endif + } + unix_mprotect_hint(err); + return err; +} + +int _mi_prim_reset(void* start, size_t size) { + #if defined(MADV_FREE) + static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE); + int oadvice = (int)mi_atomic_load_relaxed(&advice); + int err; + while ((err = unix_madvise(start, size, oadvice)) != 0 && errno == EAGAIN) { errno = 0; }; + if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) { + // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on + mi_atomic_store_release(&advice, (size_t)MADV_DONTNEED); + err = unix_madvise(start, size, MADV_DONTNEED); + } + #else + int err = unix_madvise(start, csize, MADV_DONTNEED); + #endif + return err; +} + +int _mi_prim_protect(void* start, size_t size, bool protect) { + int err = mprotect(start, size, protect ? PROT_NONE : (PROT_READ | PROT_WRITE)); + if (err != 0) { err = errno; } + unix_mprotect_hint(err); + return err; +} + + + +//--------------------------------------------- +// Huge page allocation +//--------------------------------------------- + +#if (MI_INTPTR_SIZE >= 8) && !defined(__HAIKU__) + +#include + +#ifndef MPOL_PREFERRED +#define MPOL_PREFERRED 1 +#endif + +#if defined(SYS_mbind) +static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { + return syscall(SYS_mbind, start, len, mode, nmask, maxnode, flags); +} +#else +static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, const unsigned long* nmask, unsigned long maxnode, unsigned flags) { + MI_UNUSED(start); MI_UNUSED(len); MI_UNUSED(mode); MI_UNUSED(nmask); MI_UNUSED(maxnode); MI_UNUSED(flags); + return 0; +} +#endif + +void* _mi_prim_alloc_huge_os_pages(void* addr, size_t size, int numa_node) { + bool is_large = true; + void* p = unix_mmap(addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); + if (p == NULL) return NULL; + if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes + unsigned long numa_mask = (1UL << numa_node); + // TODO: does `mbind` work correctly for huge OS pages? should we + // use `set_mempolicy` before calling mmap instead? + // see: + long err = mi_prim_mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0); + if (err != 0) { + _mi_warning_message("failed to bind huge (1GiB) pages to numa node %d: %s\n", numa_node, strerror(errno)); + } + } + return p; +} + +#else + +void* _mi_prim_alloc_huge_os_pages(void* addr, size_t size, int numa_node) { + MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(numa_node); + return NULL; +} + +#endif + +//--------------------------------------------- +// NUMA nodes +//--------------------------------------------- + +#if defined(__linux__) + +#include // getcpu +#include // access + +size_t _mi_prim_numa_node(void) { + #ifdef SYS_getcpu + unsigned long node = 0; + unsigned long ncpu = 0; + long err = syscall(SYS_getcpu, &ncpu, &node, NULL); + if (err != 0) return 0; + return node; + #else + return 0; + #endif +} + +size_t _mi_prim_numa_node_count(void) { + char buf[128]; + unsigned node = 0; + for(node = 0; node < 256; node++) { + // enumerate node entries -- todo: it there a more efficient way to do this? (but ensure there is no allocation) + snprintf(buf, 127, "/sys/devices/system/node/node%u", node + 1); + if (access(buf,R_OK) != 0) break; + } + return (node+1); +} + +#elif defined(__FreeBSD__) && __FreeBSD_version >= 1200000 + +size_t mi_prim_numa_node(void) { + domainset_t dom; + size_t node; + int policy; + if (cpuset_getdomain(CPU_LEVEL_CPUSET, CPU_WHICH_PID, -1, sizeof(dom), &dom, &policy) == -1) return 0ul; + for (node = 0; node < MAXMEMDOM; node++) { + if (DOMAINSET_ISSET(node, &dom)) return node; + } + return 0ul; +} + +size_t _mi_prim_numa_node_count(void) { + size_t ndomains = 0; + size_t len = sizeof(ndomains); + if (sysctlbyname("vm.ndomains", &ndomains, &len, NULL, 0) == -1) return 0ul; + return ndomains; +} + +#elif defined(__DragonFly__) + +size_t _mi_prim_numa_node(void) { + // TODO: DragonFly does not seem to provide any userland means to get this information. + return 0ul; +} + +size_t _mi_prim_numa_node_count(void) { + size_t ncpus = 0, nvirtcoresperphys = 0; + size_t len = sizeof(size_t); + if (sysctlbyname("hw.ncpu", &ncpus, &len, NULL, 0) == -1) return 0ul; + if (sysctlbyname("hw.cpu_topology_ht_ids", &nvirtcoresperphys, &len, NULL, 0) == -1) return 0ul; + return nvirtcoresperphys * ncpus; +} + +#else + +size_t _mi_prim_numa_node(void) { + return 0; +} + +size_t _mi_prim_numa_node_count(void) { + return 1; +} + +#endif + +// ---------------------------------------------------------------- +// Clock +// ---------------------------------------------------------------- + +#include + +#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC) + +mi_msecs_t _mi_prim_clock_now(void) { + struct timespec t; + #ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &t); + #else + clock_gettime(CLOCK_REALTIME, &t); + #endif + return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); +} + +#else + +// low resolution timer +mi_msecs_t _mi_prim_clock_now(void) { + #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) + return (mi_msecs_t)clock(); + #elif (CLOCKS_PER_SEC < 1000) + return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); + #else + return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); + #endif +} + +#endif + + + + +//---------------------------------------------------------------- +// Process info +//---------------------------------------------------------------- + +#if defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__) +#include +#include +#include + +#if defined(__APPLE__) +#include +#endif + +#if defined(__HAIKU__) +#include +#endif + +static mi_msecs_t timeval_secs(const struct timeval* tv) { + return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L); +} + +void _mi_prim_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) +{ + struct rusage rusage; + getrusage(RUSAGE_SELF, &rusage); + *utime = timeval_secs(&rusage.ru_utime); + *stime = timeval_secs(&rusage.ru_stime); +#if !defined(__HAIKU__) + *page_faults = rusage.ru_majflt; +#endif + // estimate commit using our stats + *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); + *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); + *current_rss = *current_commit; // estimate +#if defined(__HAIKU__) + // Haiku does not have (yet?) a way to + // get these stats per process + thread_info tid; + area_info mem; + ssize_t c; + get_thread_info(find_thread(0), &tid); + while (get_next_area_info(tid.team, &c, &mem) == B_OK) { + *peak_rss += mem.ram_size; + } + *page_faults = 0; +#elif defined(__APPLE__) + *peak_rss = rusage.ru_maxrss; // BSD reports in bytes + struct mach_task_basic_info info; + mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; + if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) { + *current_rss = (size_t)info.resident_size; + } +#else + *peak_rss = rusage.ru_maxrss * 1024; // Linux reports in KiB +#endif +} + +#else + +#ifndef __wasi__ +// WebAssembly instances are not processes +#pragma message("define a way to get process info") +#endif + +void _mi_prim_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) +{ + *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); + *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); + *peak_rss = *peak_commit; + *current_rss = *current_commit; + *page_faults = 0; + *utime = 0; + *stime = 0; +} + +#endif + + +//---------------------------------------------------------------- +// Output +//---------------------------------------------------------------- + +void _mi_prim_out_stderr( const char* msg ) { + fputs(msg,stderr); +} + + +//---------------------------------------------------------------- +// Environment +//---------------------------------------------------------------- + +#if !defined(MI_USE_ENVIRON) || (MI_USE_ENVIRON!=0) +// On Posix systemsr use `environ` to acces environment variables +// even before the C runtime is initialized. +#if defined(__APPLE__) && defined(__has_include) && __has_include() +#include +static char** mi_get_environ(void) { + return (*_NSGetEnviron()); +} +#else +extern char** environ; +static char** mi_get_environ(void) { + return environ; +} +#endif +bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { + if (name==NULL) return false; + const size_t len = _mi_strlen(name); + if (len == 0) return false; + char** env = mi_get_environ(); + if (env == NULL) return false; + // compare up to 256 entries + for (int i = 0; i < 256 && env[i] != NULL; i++) { + const char* s = env[i]; + if (_mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive + // found it + _mi_strlcpy(result, s + len + 1, result_size); + return true; + } + } + return false; +} +#else +// fallback: use standard C `getenv` but this cannot be used while initializing the C runtime +bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { + // cannot call getenv() when still initializing the C runtime. + if (_mi_preloading()) return false; + const char* s = getenv(name); + if (s == NULL) { + // we check the upper case name too. + char buf[64+1]; + size_t len = _mi_strnlen(name,sizeof(buf)-1); + for (size_t i = 0; i < len; i++) { + buf[i] = _mi_toupper(name[i]); + } + buf[len] = 0; + s = getenv(buf); + } + if (s == NULL || _mi_strnlen(s,result_size) >= result_size) return false; + _mi_strlcpy(result, s, result_size); + return true; +} +#endif // !MI_USE_ENVIRON + + +//---------------------------------------------------------------- +// Random +//---------------------------------------------------------------- + +#if defined(__APPLE__) + +#include +#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10 +#include +#include +#endif +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 + // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf + // may fail silently on macOS. See PR #390, and + return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); + #else + // fall back on older macOS + arc4random_buf(buf, buf_len); + return true; + #endif +} + +#elif defined(__ANDROID__) || defined(__DragonFly__) || \ + defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ + defined(__sun) + +#include +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + arc4random_buf(buf, buf_len); + return true; +} + +#elif defined(__linux__) || defined(__HAIKU__) + +#if defined(__linux__) +#include +#endif +#include +#include +#include +#include +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h` + // and for the latter the actual `getrandom` call is not always defined. + // (see ) + // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed. + #ifdef SYS_getrandom + #ifndef GRND_NONBLOCK + #define GRND_NONBLOCK (1) + #endif + static _Atomic(uintptr_t) no_getrandom; // = 0 + if (mi_atomic_load_acquire(&no_getrandom)==0) { + ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK); + if (ret >= 0) return (buf_len == (size_t)ret); + if (errno != ENOSYS) return false; + mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom + } + #endif + int flags = O_RDONLY; + #if defined(O_CLOEXEC) + flags |= O_CLOEXEC; + #endif + int fd = open("/dev/urandom", flags, 0); + if (fd < 0) return false; + size_t count = 0; + while(count < buf_len) { + ssize_t ret = read(fd, (char*)buf + count, buf_len - count); + if (ret<=0) { + if (errno!=EAGAIN && errno!=EINTR) break; + } + else { + count += ret; + } + } + close(fd); + return (count==buf_len); +} + +#else + +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + return false; +} + +#endif + + +//---------------------------------------------------------------- +// Thread init/done +//---------------------------------------------------------------- + +#if defined(MI_USE_PTHREADS) + +// use pthread local storage keys to detect thread ending +// (and used with MI_TLS_PTHREADS for the default heap) +pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); + +static void mi_pthread_done(void* value) { + if (value!=NULL) { + _mi_thread_done((mi_heap_t*)value); + } +} + +void _mi_prim_thread_init_auto_done(void) { + mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); + pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); +} + +void _mi_prim_thread_done_auto_done(void) { + // nothing to do +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + if (_mi_heap_default_key != (pthread_key_t)(-1)) { // can happen during recursive invocation on freeBSD + pthread_setspecific(_mi_heap_default_key, heap); + } +} + +#else + +void _mi_prim_thread_init_auto_done(void) { + // nothing +} + +void _mi_prim_thread_done_auto_done(void) { + // nothing +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + MI_UNUSED(heap); +} + +#endif \ No newline at end of file diff --git a/src/prim/prim-wasi.c b/src/prim/prim-wasi.c new file mode 100644 index 00000000..f4d3be58 --- /dev/null +++ b/src/prim/prim-wasi.c @@ -0,0 +1,262 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" +#include "prim.h" + +//--------------------------------------------- +// Initialize +//--------------------------------------------- + +void _mi_prim_mem_init( mi_os_mem_config_t* config ) { + config->page_size = 64*MI_KiB; // WebAssembly has a fixed page size: 64KiB + config->alloc_granularity = 16; + config->has_overcommit = false; + config->must_free_whole = true; +} + +//--------------------------------------------- +// Free +//--------------------------------------------- + +void _mi_prim_free(void* addr, size_t size ) { + MI_UNUSED(addr); MI_UNUSED(size); + // wasi heap cannot be shrunk +} + + +//--------------------------------------------- +// Allocation: sbrk or memory_grow +//--------------------------------------------- + +#if defined(MI_USE_SBRK) + static void* mi_memory_grow( size_t size ) { + void* p = sbrk(size); + if (p == (void*)(-1)) return NULL; + #if !defined(__wasi__) // on wasi this is always zero initialized already (?) + memset(p,0,size); + #endif + return p; + } +#elif defined(__wasi__) + static void* mi_memory_grow( size_t size ) { + size_t base = (size > 0 ? __builtin_wasm_memory_grow(0,_mi_divide_up(size, _mi_os_page_size())) + : __builtin_wasm_memory_size(0)); + if (base == SIZE_MAX) return NULL; + return (void*)(base * _mi_os_page_size()); + } +#endif + +#if defined(MI_USE_PTHREADS) +static pthread_mutex_t mi_heap_grow_mutex = PTHREAD_MUTEX_INITIALIZER; +#endif + +static void* mi_prim_mem_grow(size_t size, size_t try_alignment) { + void* p = NULL; + if (try_alignment <= 1) { + // `sbrk` is not thread safe in general so try to protect it (we could skip this on WASM but leave it in for now) + #if defined(MI_USE_PTHREADS) + pthread_mutex_lock(&mi_heap_grow_mutex); + #endif + p = mi_memory_grow(size); + #if defined(MI_USE_PTHREADS) + pthread_mutex_unlock(&mi_heap_grow_mutex); + #endif + } + else { + void* base = NULL; + size_t alloc_size = 0; + // to allocate aligned use a lock to try to avoid thread interaction + // between getting the current size and actual allocation + // (also, `sbrk` is not thread safe in general) + #if defined(MI_USE_PTHREADS) + pthread_mutex_lock(&mi_heap_grow_mutex); + #endif + { + void* current = mi_memory_grow(0); // get current size + if (current != NULL) { + void* aligned_current = mi_align_up_ptr(current, try_alignment); // and align from there to minimize wasted space + alloc_size = _mi_align_up( ((uint8_t*)aligned_current - (uint8_t*)current) + size, _mi_os_page_size()); + base = mi_memory_grow(alloc_size); + } + } + #if defined(MI_USE_PTHREADS) + pthread_mutex_unlock(&mi_heap_grow_mutex); + #endif + if (base != NULL) { + p = mi_align_up_ptr(base, try_alignment); + if ((uint8_t*)p + size > (uint8_t*)base + alloc_size) { + // another thread used wasm_memory_grow/sbrk in-between and we do not have enough + // space after alignment. Give up (and waste the space as we cannot shrink :-( ) + // (in `mi_os_mem_alloc_aligned` this will fall back to overallocation to align) + p = NULL; + } + } + } + if (p == NULL) { + _mi_warning_message("unable to allocate sbrk/wasm_memory_grow OS memory (%zu bytes, %zu alignment)\n", size, try_alignment); + errno = ENOMEM; + return NULL; + } + mi_assert_internal( try_alignment == 0 || (uintptr_t)p % try_alignment == 0 ); + return p; +} + +// Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. +void* _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large) { + MI_UNUSED(allow_large); MI_UNUSED(commit); + *is_large = false; + return mi_prim_mem_grow(size, try_alignment); +} + + +//--------------------------------------------- +// Commit/Reset/Protect +//--------------------------------------------- + +int _mi_prim_commit(void* addr, size_t size, bool commit) { + MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(commit); + return 0; +} + +int _mi_prim_reset(void* addr, size_t size) { + MI_UNUSED(addr); MI_UNUSED(size); + return 0; +} + +int _mi_prim_protect(void* addr, size_t size, bool protect) { + MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(protect); + return 0; +} + + +//--------------------------------------------- +// Huge pages and NUMA nodes +//--------------------------------------------- + +void* _mi_prim_alloc_huge_os_pages(void* addr, size_t size, int numa_node) { + MI_UNUSED(addr); MI_UNUSED(size); MI_UNUSED(numa_node); + return NULL; +} + +size_t _mi_prim_numa_node(void) { + return 0; +} + +size_t _mi_prim_numa_node_count(void) { + return 1; +} + + +//---------------------------------------------------------------- +// Clock +//---------------------------------------------------------------- + +#include + +#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC) + +mi_msecs_t _mi_prim_clock_now(void) { + struct timespec t; + #ifdef CLOCK_MONOTONIC + clock_gettime(CLOCK_MONOTONIC, &t); + #else + clock_gettime(CLOCK_REALTIME, &t); + #endif + return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); +} + +#else + +// low resolution timer +mi_msecs_t _mi_prim_clock_now(void) { + #if !defined(CLOCKS_PER_SEC) || (CLOCKS_PER_SEC == 1000) || (CLOCKS_PER_SEC == 0) + return (mi_msecs_t)clock(); + #elif (CLOCKS_PER_SEC < 1000) + return (mi_msecs_t)clock() * (1000 / (mi_msecs_t)CLOCKS_PER_SEC); + #else + return (mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000); + #endif +} + +#endif + + +//---------------------------------------------------------------- +// Process info +//---------------------------------------------------------------- + +void _mi_prim_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) +{ + *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); + *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); + *peak_rss = *peak_commit; + *current_rss = *current_commit; + *page_faults = 0; + *utime = 0; + *stime = 0; +} + +//---------------------------------------------------------------- +// Output +//---------------------------------------------------------------- + +void _mi_prim_out_stderr( const char* msg ) { + fputs(msg,stderr); +} + + +//---------------------------------------------------------------- +// Environment +//---------------------------------------------------------------- + +bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { + // cannot call getenv() when still initializing the C runtime. + if (_mi_preloading()) return false; + const char* s = getenv(name); + if (s == NULL) { + // we check the upper case name too. + char buf[64+1]; + size_t len = _mi_strnlen(name,sizeof(buf)-1); + for (size_t i = 0; i < len; i++) { + buf[i] = _mi_toupper(name[i]); + } + buf[len] = 0; + s = getenv(buf); + } + if (s == NULL || _mi_strnlen(s,result_size) >= result_size) return false; + _mi_strlcpy(result, s, result_size); + return true; +} + + +//---------------------------------------------------------------- +// Random +//---------------------------------------------------------------- + +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + return false; +} + + +//---------------------------------------------------------------- +// Thread init/done +//---------------------------------------------------------------- + +void _mi_prim_thread_init_auto_done(void) { + // nothing +} + +void _mi_prim_thread_done_auto_done(void) { + // nothing +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + MI_UNUSED(heap); +} diff --git a/src/prim/prim-windows.c b/src/prim/prim-windows.c new file mode 100644 index 00000000..008b9fa4 --- /dev/null +++ b/src/prim/prim-windows.c @@ -0,0 +1,606 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc-internal.h" +#include "mimalloc-atomic.h" +#include "prim.h" +#include // strerror +#include // fputs, stderr + +#ifdef _MSC_VER +#pragma warning(disable:4996) // strerror +#endif + +//--------------------------------------------- +// Dynamically bind Windows API points for portability +//--------------------------------------------- + +// We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. +// So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) +// NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) +// We define a minimal MEM_EXTENDED_PARAMETER ourselves in order to be able to compile with older SDK's. +typedef enum MI_MEM_EXTENDED_PARAMETER_TYPE_E { + MiMemExtendedParameterInvalidType = 0, + MiMemExtendedParameterAddressRequirements, + MiMemExtendedParameterNumaNode, + MiMemExtendedParameterPartitionHandle, + MiMemExtendedParameterUserPhysicalHandle, + MiMemExtendedParameterAttributeFlags, + MiMemExtendedParameterMax +} MI_MEM_EXTENDED_PARAMETER_TYPE; + +typedef struct DECLSPEC_ALIGN(8) MI_MEM_EXTENDED_PARAMETER_S { + struct { DWORD64 Type : 8; DWORD64 Reserved : 56; } Type; + union { DWORD64 ULong64; PVOID Pointer; SIZE_T Size; HANDLE Handle; DWORD ULong; } Arg; +} MI_MEM_EXTENDED_PARAMETER; + +typedef struct MI_MEM_ADDRESS_REQUIREMENTS_S { + PVOID LowestStartingAddress; + PVOID HighestEndingAddress; + SIZE_T Alignment; +} MI_MEM_ADDRESS_REQUIREMENTS; + +#define MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE 0x00000010 + +#include +typedef PVOID (__stdcall *PVirtualAlloc2)(HANDLE, PVOID, SIZE_T, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); +typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*, ULONG, ULONG, MI_MEM_EXTENDED_PARAMETER*, ULONG); +static PVirtualAlloc2 pVirtualAlloc2 = NULL; +static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL; + +// Similarly, GetNumaProcesorNodeEx is only supported since Windows 7 +typedef struct MI_PROCESSOR_NUMBER_S { WORD Group; BYTE Number; BYTE Reserved; } MI_PROCESSOR_NUMBER; + +typedef VOID (__stdcall *PGetCurrentProcessorNumberEx)(MI_PROCESSOR_NUMBER* ProcNumber); +typedef BOOL (__stdcall *PGetNumaProcessorNodeEx)(MI_PROCESSOR_NUMBER* Processor, PUSHORT NodeNumber); +typedef BOOL (__stdcall* PGetNumaNodeProcessorMaskEx)(USHORT Node, PGROUP_AFFINITY ProcessorMask); +typedef BOOL (__stdcall *PGetNumaProcessorNode)(UCHAR Processor, PUCHAR NodeNumber); +static PGetCurrentProcessorNumberEx pGetCurrentProcessorNumberEx = NULL; +static PGetNumaProcessorNodeEx pGetNumaProcessorNodeEx = NULL; +static PGetNumaNodeProcessorMaskEx pGetNumaNodeProcessorMaskEx = NULL; +static PGetNumaProcessorNode pGetNumaProcessorNode = NULL; + +//--------------------------------------------- +// Enable large page support dynamically (if possible) +//--------------------------------------------- + +static bool win_enable_large_os_pages(size_t* large_page_size) +{ + static bool large_initialized = false; + if (large_initialized) return (_mi_os_large_page_size() > 0); + large_initialized = true; + + // Try to see if large OS pages are supported + // To use large pages on Windows, we first need access permission + // Set "Lock pages in memory" permission in the group policy editor + // + unsigned long err = 0; + HANDLE token = NULL; + BOOL ok = OpenProcessToken(GetCurrentProcess(), TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (ok) { + TOKEN_PRIVILEGES tp; + ok = LookupPrivilegeValue(NULL, TEXT("SeLockMemoryPrivilege"), &tp.Privileges[0].Luid); + if (ok) { + tp.PrivilegeCount = 1; + tp.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + ok = AdjustTokenPrivileges(token, FALSE, &tp, 0, (PTOKEN_PRIVILEGES)NULL, 0); + if (ok) { + err = GetLastError(); + ok = (err == ERROR_SUCCESS); + if (ok && large_page_size != NULL) { + *large_page_size = GetLargePageMinimum(); + } + } + } + CloseHandle(token); + } + if (!ok) { + if (err == 0) err = GetLastError(); + _mi_warning_message("cannot enable large OS page support, error %lu\n", err); + } + return (ok!=0); +} + + +//--------------------------------------------- +// Initialize +//--------------------------------------------- + +void _mi_prim_mem_init( mi_os_mem_config_t* config ) +{ + config->has_overcommit = false; + config->must_free_whole = true; + // get the page size + SYSTEM_INFO si; + GetSystemInfo(&si); + if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; } + if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; } + // get the VirtualAlloc2 function + HINSTANCE hDll; + hDll = LoadLibrary(TEXT("kernelbase.dll")); + if (hDll != NULL) { + // use VirtualAlloc2FromApp if possible as it is available to Windows store apps + pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2FromApp"); + if (pVirtualAlloc2==NULL) pVirtualAlloc2 = (PVirtualAlloc2)(void (*)(void))GetProcAddress(hDll, "VirtualAlloc2"); + FreeLibrary(hDll); + } + // NtAllocateVirtualMemoryEx is used for huge page allocation + hDll = LoadLibrary(TEXT("ntdll.dll")); + if (hDll != NULL) { + pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx"); + FreeLibrary(hDll); + } + // Try to use Win7+ numa API + hDll = LoadLibrary(TEXT("kernel32.dll")); + if (hDll != NULL) { + pGetCurrentProcessorNumberEx = (PGetCurrentProcessorNumberEx)(void (*)(void))GetProcAddress(hDll, "GetCurrentProcessorNumberEx"); + pGetNumaProcessorNodeEx = (PGetNumaProcessorNodeEx)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNodeEx"); + pGetNumaNodeProcessorMaskEx = (PGetNumaNodeProcessorMaskEx)(void (*)(void))GetProcAddress(hDll, "GetNumaNodeProcessorMaskEx"); + pGetNumaProcessorNode = (PGetNumaProcessorNode)(void (*)(void))GetProcAddress(hDll, "GetNumaProcessorNode"); + FreeLibrary(hDll); + } + if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { + win_enable_large_os_pages(&config->large_page_size); + } +} + + +//--------------------------------------------- +// Free +//--------------------------------------------- + +void _mi_prim_free(void* addr, size_t size ) { + DWORD errcode = 0; + bool err = (VirtualFree(addr, 0, MEM_RELEASE) == 0); + if (err) { errcode = GetLastError(); } + if (errcode == ERROR_INVALID_ADDRESS) { + // In mi_os_mem_alloc_aligned the fallback path may have returned a pointer inside + // the memory region returned by VirtualAlloc; in that case we need to free using + // the start of the region. + MEMORY_BASIC_INFORMATION info = { 0 }; + VirtualQuery(addr, &info, sizeof(info)); + if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) { + errcode = 0; + err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0); + if (err) { errcode = GetLastError(); } + } + } + if (errcode != 0) { + _mi_warning_message("unable to release OS memory: error code 0x%x, addr: %p, size: %zu\n", errcode, addr, size); + } +} + + +//--------------------------------------------- +// VirtualAlloc +//--------------------------------------------- + +static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignment, DWORD flags) { + #if (MI_INTPTR_SIZE >= 8) + // on 64-bit systems, try to use the virtual address area after 2TiB for 4MiB aligned allocations + if (addr == NULL) { + void* hint = _mi_os_get_aligned_hint(try_alignment,size); + if (hint != NULL) { + void* p = VirtualAlloc(hint, size, flags, PAGE_READWRITE); + if (p != NULL) return p; + _mi_verbose_message("warning: unable to allocate hinted aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), hint, try_alignment, flags); + // fall through on error + } + } + #endif + // on modern Windows try use VirtualAlloc2 for aligned allocation + if (try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { + MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; + reqs.Alignment = try_alignment; + MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; + param.Type.Type = MiMemExtendedParameterAddressRequirements; + param.Arg.Pointer = &reqs; + void* p = (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, ¶m, 1); + if (p != NULL) return p; + _mi_warning_message("unable to allocate aligned OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x)\n", size, GetLastError(), addr, try_alignment, flags); + // fall through on error + } + // last resort + return VirtualAlloc(addr, size, flags, PAGE_READWRITE); +} + +static void* win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) { + mi_assert_internal(!(large_only && !allow_large)); + static _Atomic(size_t) large_page_try_ok; // = 0; + void* p = NULL; + // Try to allocate large OS pages (2MiB) if allowed or required. + if ((large_only || _mi_os_use_large_page(size, try_alignment)) + && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { + size_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); + if (!large_only && try_ok > 0) { + // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. + // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times. + mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); + } + else { + // large OS pages must always reserve and commit. + *is_large = true; + p = win_virtual_alloc_prim(addr, size, try_alignment, flags | MEM_LARGE_PAGES); + if (large_only) return p; + // fall back to non-large page allocation on error (`p == NULL`). + if (p == NULL) { + mi_atomic_store_release(&large_page_try_ok,10UL); // on error, don't try again for the next N allocations + } + } + } + // Fall back to regular page allocation + if (p == NULL) { + *is_large = ((flags&MEM_LARGE_PAGES) != 0); + p = win_virtual_alloc_prim(addr, size, try_alignment, flags); + } + if (p == NULL) { + _mi_warning_message("unable to allocate OS memory (%zu bytes, error code: 0x%x, address: %p, alignment: %zu, flags: 0x%x, large only: %d, allow large: %d)\n", size, GetLastError(), addr, try_alignment, flags, large_only, allow_large); + } + return p; +} + +void* _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large) { + mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); + mi_assert_internal(commit || !allow_large); + mi_assert_internal(try_alignment > 0); + int flags = MEM_RESERVE; + if (commit) { flags |= MEM_COMMIT; } + return win_virtual_alloc(NULL, size, try_alignment, flags, false, allow_large, is_large); +} + + +//--------------------------------------------- +// Commit/Reset/Protect +//--------------------------------------------- +#ifdef _MSC_VER +#pragma warning(disable:6250) // suppress warning calling VirtualFree without MEM_RELEASE (for decommit) +#endif + +int _mi_prim_commit(void* addr, size_t size, bool commit) { + if (commit) { + void* p = VirtualAlloc(addr, size, MEM_COMMIT, PAGE_READWRITE); + return (p == addr ? 0 : (int)GetLastError()); + } + else { + BOOL ok = VirtualFree(addr, size, MEM_DECOMMIT); + return (ok ? 0 : (int)GetLastError()); + } +} + +int _mi_prim_reset(void* addr, size_t size) { + void* p = VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE); + mi_assert_internal(p == addr); + #if 1 + if (p == addr && addr != NULL) { + VirtualUnlock(addr,size); // VirtualUnlock after MEM_RESET removes the memory from the working set + } + #endif + return (p == addr ? 0 : (int)GetLastError()); +} + +int _mi_prim_protect(void* addr, size_t size, bool protect) { + DWORD oldprotect = 0; + BOOL ok = VirtualProtect(addr, size, protect ? PAGE_NOACCESS : PAGE_READWRITE, &oldprotect); + return (ok ? 0 : (int)GetLastError()); +} + + +//--------------------------------------------- +// Huge page allocation +//--------------------------------------------- + +void* _mi_prim_alloc_huge_os_pages(void* addr, size_t size, int numa_node) +{ + const DWORD flags = MEM_LARGE_PAGES | MEM_COMMIT | MEM_RESERVE; + + win_enable_large_os_pages(NULL); + + MI_MEM_EXTENDED_PARAMETER params[3] = { {{0,0},{0}},{{0,0},{0}},{{0,0},{0}} }; + // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages + static bool mi_huge_pages_available = true; + if (pNtAllocateVirtualMemoryEx != NULL && mi_huge_pages_available) { + params[0].Type.Type = MiMemExtendedParameterAttributeFlags; + params[0].Arg.ULong64 = MI_MEM_EXTENDED_PARAMETER_NONPAGED_HUGE; + ULONG param_count = 1; + if (numa_node >= 0) { + param_count++; + params[1].Type.Type = MiMemExtendedParameterNumaNode; + params[1].Arg.ULong = (unsigned)numa_node; + } + SIZE_T psize = size; + void* base = addr; + NTSTATUS err = (*pNtAllocateVirtualMemoryEx)(GetCurrentProcess(), &base, &psize, flags, PAGE_READWRITE, params, param_count); + if (err == 0 && base != NULL) { + return base; + } + else { + // fall back to regular large pages + mi_huge_pages_available = false; // don't try further huge pages + _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (status 0x%lx)\n", err); + } + } + // on modern Windows try use VirtualAlloc2 for numa aware large OS page allocation + if (pVirtualAlloc2 != NULL && numa_node >= 0) { + params[0].Type.Type = MiMemExtendedParameterNumaNode; + params[0].Arg.ULong = (unsigned)numa_node; + return (*pVirtualAlloc2)(GetCurrentProcess(), addr, size, flags, PAGE_READWRITE, params, 1); + } + + // otherwise use regular virtual alloc on older windows + return VirtualAlloc(addr, size, flags, PAGE_READWRITE); +} + + +//--------------------------------------------- +// Numa nodes +//--------------------------------------------- + +size_t _mi_prim_numa_node(void) { + USHORT numa_node = 0; + if (pGetCurrentProcessorNumberEx != NULL && pGetNumaProcessorNodeEx != NULL) { + // Extended API is supported + MI_PROCESSOR_NUMBER pnum; + (*pGetCurrentProcessorNumberEx)(&pnum); + USHORT nnode = 0; + BOOL ok = (*pGetNumaProcessorNodeEx)(&pnum, &nnode); + if (ok) { numa_node = nnode; } + } + else if (pGetNumaProcessorNode != NULL) { + // Vista or earlier, use older API that is limited to 64 processors. Issue #277 + DWORD pnum = GetCurrentProcessorNumber(); + UCHAR nnode = 0; + BOOL ok = pGetNumaProcessorNode((UCHAR)pnum, &nnode); + if (ok) { numa_node = nnode; } + } + return numa_node; +} + +size_t _mi_prim_numa_node_count(void) { + ULONG numa_max = 0; + GetNumaHighestNodeNumber(&numa_max); + // find the highest node number that has actual processors assigned to it. Issue #282 + while(numa_max > 0) { + if (pGetNumaNodeProcessorMaskEx != NULL) { + // Extended API is supported + GROUP_AFFINITY affinity; + if ((*pGetNumaNodeProcessorMaskEx)((USHORT)numa_max, &affinity)) { + if (affinity.Mask != 0) break; // found the maximum non-empty node + } + } + else { + // Vista or earlier, use older API that is limited to 64 processors. + ULONGLONG mask; + if (GetNumaNodeProcessorMask((UCHAR)numa_max, &mask)) { + if (mask != 0) break; // found the maximum non-empty node + }; + } + // max node was invalid or had no processor assigned, try again + numa_max--; + } + return ((size_t)numa_max + 1); +} + + +//---------------------------------------------------------------- +// Clock +//---------------------------------------------------------------- + +static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) { + static LARGE_INTEGER mfreq; // = 0 + if (mfreq.QuadPart == 0LL) { + LARGE_INTEGER f; + QueryPerformanceFrequency(&f); + mfreq.QuadPart = f.QuadPart/1000LL; + if (mfreq.QuadPart == 0) mfreq.QuadPart = 1; + } + return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart); +} + +mi_msecs_t _mi_prim_clock_now(void) { + LARGE_INTEGER t; + QueryPerformanceCounter(&t); + return mi_to_msecs(t); +} + + +//---------------------------------------------------------------- +// Process Info +//---------------------------------------------------------------- + +#include +#include + +static mi_msecs_t filetime_msecs(const FILETIME* ftime) { + ULARGE_INTEGER i; + i.LowPart = ftime->dwLowDateTime; + i.HighPart = ftime->dwHighDateTime; + mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds + return msecs; +} + +typedef BOOL (WINAPI *PGetProcessMemoryInfo)(HANDLE, PPROCESS_MEMORY_COUNTERS, DWORD); +static PGetProcessMemoryInfo pGetProcessMemoryInfo = NULL; + +void _mi_prim_process_info(mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) +{ + FILETIME ct; + FILETIME ut; + FILETIME st; + FILETIME et; + GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); + *utime = filetime_msecs(&ut); + *stime = filetime_msecs(&st); + + // load psapi on demand + if (pGetProcessMemoryInfo == NULL) { + HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll")); + if (hDll != NULL) { + pGetProcessMemoryInfo = (PGetProcessMemoryInfo)(void (*)(void))GetProcAddress(hDll, "GetProcessMemoryInfo"); + } + } + + // get process info + PROCESS_MEMORY_COUNTERS info; + memset(&info, 0, sizeof(info)); + if (pGetProcessMemoryInfo != NULL) { + pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); + } + *current_rss = (size_t)info.WorkingSetSize; + *peak_rss = (size_t)info.PeakWorkingSetSize; + *current_commit = (size_t)info.PagefileUsage; + *peak_commit = (size_t)info.PeakPagefileUsage; + *page_faults = (size_t)info.PageFaultCount; +} + +//---------------------------------------------------------------- +// Output +//---------------------------------------------------------------- + +void _mi_prim_out_stderr( const char* msg ) +{ + // on windows with redirection, the C runtime cannot handle locale dependent output + // after the main thread closes so we use direct console output. + if (!_mi_preloading()) { + // _cputs(msg); // _cputs cannot be used at is aborts if it fails to lock the console + static HANDLE hcon = INVALID_HANDLE_VALUE; + static bool hconIsConsole; + if (hcon == INVALID_HANDLE_VALUE) { + CONSOLE_SCREEN_BUFFER_INFO sbi; + hcon = GetStdHandle(STD_ERROR_HANDLE); + hconIsConsole = ((hcon != INVALID_HANDLE_VALUE) && GetConsoleScreenBufferInfo(hcon, &sbi)); + } + const size_t len = _mi_strlen(msg); + if (len > 0 && len < UINT32_MAX) { + DWORD written = 0; + if (hconIsConsole) { + WriteConsoleA(hcon, msg, (DWORD)len, &written, NULL); + } + else if (hcon != INVALID_HANDLE_VALUE) { + // use direct write if stderr was redirected + WriteFile(hcon, msg, (DWORD)len, &written, NULL); + } + else { + // finally fall back to fputs after all + fputs(msg, stderr); + } + } + } +} + + +//---------------------------------------------------------------- +// Environment +//---------------------------------------------------------------- + +// On Windows use GetEnvironmentVariable instead of getenv to work +// reliably even when this is invoked before the C runtime is initialized. +// i.e. when `_mi_preloading() == true`. +// Note: on windows, environment names are not case sensitive. +bool _mi_prim_getenv(const char* name, char* result, size_t result_size) { + result[0] = 0; + size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size); + return (len > 0 && len < result_size); +} + + + +//---------------------------------------------------------------- +// Random +//---------------------------------------------------------------- + +#if defined(MI_USE_RTLGENRANDOM) // || defined(__cplusplus) +// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using +// dynamic overriding, we observed it can raise an exception when compiled with C++, and +// sometimes deadlocks when also running under the VS debugger. +// In contrast, issue #623 implies that on Windows Server 2019 we need to use BCryptGenRandom. +// To be continued.. +#pragma comment (lib,"advapi32.lib") +#define RtlGenRandom SystemFunction036 +mi_decl_externc BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength); + +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + return (RtlGenRandom(buf, (ULONG)buf_len) != 0); +} + +#else + +#ifndef BCRYPT_USE_SYSTEM_PREFERRED_RNG +#define BCRYPT_USE_SYSTEM_PREFERRED_RNG 0x00000002 +#endif + +typedef LONG (NTAPI *PBCryptGenRandom)(HANDLE, PUCHAR, ULONG, ULONG); +static PBCryptGenRandom pBCryptGenRandom = NULL; + +bool _mi_prim_random_buf(void* buf, size_t buf_len) { + if (pBCryptGenRandom == NULL) { + HINSTANCE hDll = LoadLibrary(TEXT("bcrypt.dll")); + if (hDll != NULL) { + pBCryptGenRandom = (PBCryptGenRandom)(void (*)(void))GetProcAddress(hDll, "BCryptGenRandom"); + } + if (pBCryptGenRandom == NULL) return false; + } + return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); +} + +#endif // MI_USE_RTLGENRANDOM + +//---------------------------------------------------------------- +// Thread init/done +//---------------------------------------------------------------- + +#if !defined(MI_SHARED_LIB) + +// use thread local storage keys to detect thread ending +#include +#if (_WIN32_WINNT < 0x600) // before Windows Vista +WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); +WINBASEAPI PVOID WINAPI FlsGetValue( _In_ DWORD dwFlsIndex ); +WINBASEAPI BOOL WINAPI FlsSetValue( _In_ DWORD dwFlsIndex, _In_opt_ PVOID lpFlsData ); +WINBASEAPI BOOL WINAPI FlsFree(_In_ DWORD dwFlsIndex); +#endif + +static DWORD mi_fls_key = (DWORD)(-1); + +static void NTAPI mi_fls_done(PVOID value) { + mi_heap_t* heap = (mi_heap_t*)value; + if (heap != NULL) { + _mi_thread_done(heap); + FlsSetValue(mi_fls_key, NULL); // prevent recursion as _mi_thread_done may set it back to the main heap, issue #672 + } +} + +void _mi_prim_thread_init_auto_done(void) { + mi_fls_key = FlsAlloc(&mi_fls_done); +} + +void _mi_prim_thread_done_auto_done(void) { + // call thread-done on all threads (except the main thread) to prevent + // dangling callback pointer if statically linked with a DLL; Issue #208 + FlsFree(mi_fls_key); +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + mi_assert_internal(mi_fls_key != (DWORD)(-1)); + FlsSetValue(mi_fls_key, heap); +} + +#else + +// Dll; nothing to do as in that case thread_done is handled through the DLL_THREAD_DETACH event. + +void _mi_prim_thread_init_auto_done(void) { +} + +void _mi_prim_thread_done_auto_done(void) { +} + +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { + MI_UNUSED(heap); +} + +#endif diff --git a/src/prim/prim.c b/src/prim/prim.c new file mode 100644 index 00000000..83b7abc1 --- /dev/null +++ b/src/prim/prim.c @@ -0,0 +1,18 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +// Select the implementation of the primitives +// depending on the OS. + +#if defined(_WIN32) +#include "prim-windows.c" // VirtualAlloc (Windows) +#elif defined(__wasi__) +#define MI_USE_SBRK +#include "prim-wasi.h" // memory-grow or sbrk (Wasm) +#else +#include "prim-unix.c" // mmap() (Linux, macOSX, BSD, Illumnos, Haiku, DragonFly, etc.) +#endif diff --git a/src/prim/prim.h b/src/prim/prim.h new file mode 100644 index 00000000..5ed0f2e0 --- /dev/null +++ b/src/prim/prim.h @@ -0,0 +1,288 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ +#pragma once +#ifndef MIMALLOC_PRIM_H +#define MIMALLOC_PRIM_H + +// note: on all primitive functions, we always get: +// addr != NULL and page aligned +// size > 0 and page aligned + +// OS memory configuration +typedef struct mi_os_mem_config_s { + size_t page_size; // 4KiB + size_t large_page_size; // 2MiB + size_t alloc_granularity; // smallest allocation size (on Windows 64KiB) + bool has_overcommit; // can we reserve more memory than can be actually committed? + bool must_free_whole; // must allocated blocks free as a whole (false for mmap, true for VirtualAlloc) +} mi_os_mem_config_t; + +// Initialize +void _mi_prim_mem_init( mi_os_mem_config_t* config ); + +// Free OS memory +void _mi_prim_free(void* addr, size_t size ); + +// Allocate OS memory. Return NULL on error. +// The `try_alignment` is just a hint and the returned pointer does not have to be aligned. +// pre: !commit => !allow_large +// try_alignment >= _mi_os_page_size() and a power of 2 +void* _mi_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large); + +// Commit memory. Returns error code or 0 on success. +int _mi_prim_commit(void* addr, size_t size, bool commit); + +// Reset memory. The range keeps being accessible but the content might be reset. +// Returns error code or 0 on success. +int _mi_prim_reset(void* addr, size_t size); + +// Protect memory. Returns error code or 0 on success. +int _mi_prim_protect(void* addr, size_t size, bool protect); + +// Allocate huge (1GiB) pages possibly associated with a NUMA node. +// pre: size > 0 and a multiple of 1GiB. +// addr is either NULL or an address hint. +// numa_node is either negative (don't care), or a numa node number. +void* _mi_prim_alloc_huge_os_pages(void* addr, size_t size, int numa_node); + +// Return the current NUMA node +size_t _mi_prim_numa_node(void); + +// Return the number of logical NUMA nodes +size_t _mi_prim_numa_node_count(void); + +// Clock ticks +mi_msecs_t _mi_prim_clock_now(void); + +// Return process information (only for statistics) +void _mi_prim_process_info(mi_msecs_t* utime, mi_msecs_t* stime, + size_t* current_rss, size_t* peak_rss, + size_t* current_commit, size_t* peak_commit, size_t* page_faults); + +// Default stderr output. (only for warnings etc. with verbose enabled) +// msg != NULL && _mi_strlen(msg) > 0 +void _mi_prim_out_stderr( const char* msg ); + +// Get an environment variable. (only for options) +// name != NULL, result != NULL, result_size >= 64 +bool _mi_prim_getenv(const char* name, char* result, size_t result_size); + + +// Fill a buffer with strong randomness; return `false` on error or if +// there is no strong randomization available. +bool _mi_prim_random_buf(void* buf, size_t buf_len); + +// Called on the first thread start, and should ensure `_mi_thread_done` is called on thread termination. +void _mi_prim_thread_init_auto_done(void); + +// Called on process exit and may take action to clean up resources associated with the thread auto done. +void _mi_prim_thread_done_auto_done(void); + +// Called when the default heap for a thread changes +void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); + + +//------------------------------------------------------------------- +// Thread id +// +// Getting the thread id should be performant as it is called in the +// fast path of `_mi_free` and we specialize for various platforms as +// inlined definitions. Regular code should call `init.c:_mi_thread_id()`. +// We only require _mi_prim_thread_id() to return a unique id for each thread. +//------------------------------------------------------------------- + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; + +#if defined(_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Windows: works on Intel and ARM in both 32- and 64-bit + return (uintptr_t)NtCurrentTeb(); +} + +// We use assembly for a fast thread id on the main platforms. The TLS layout depends on +// both the OS and libc implementation so we use specific tests for each main platform. +// If you test on another platform and it works please send a PR :-) +// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register. +#elif defined(__GNUC__) && ( \ + (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \ + || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ + || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ + ) + +static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { + void* res; + const size_t ofs = (slot*sizeof(void*)); + #if defined(__i386__) + __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS + #elif defined(__APPLE__) && defined(__x86_64__) + __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS + #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) + __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI + #elif defined(__x86_64__) + __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + void** tcb; MI_UNUSED(ofs); + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + res = tcb[slot]; + #elif defined(__aarch64__) + void** tcb; MI_UNUSED(ofs); + #if defined(__APPLE__) // M1, issue #343 + __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); + #else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + #endif + res = tcb[slot]; + #endif + return res; +} + +// setting a tls slot is only used on macOS for now +static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { + const size_t ofs = (slot*sizeof(void*)); + #if defined(__i386__) + __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS + #elif defined(__APPLE__) && defined(__x86_64__) + __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS + #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) + __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI + #elif defined(__x86_64__) + __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + void** tcb; MI_UNUSED(ofs); + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + tcb[slot] = value; + #elif defined(__aarch64__) + void** tcb; MI_UNUSED(ofs); + #if defined(__APPLE__) // M1, issue #343 + __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); + #else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + #endif + tcb[slot] = value; + #endif +} + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + #if defined(__BIONIC__) + // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id + // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 + return (uintptr_t)mi_prim_tls_slot(1); + #else + // in all our other targets, slot 0 is the thread id + // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h + // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36 + return (uintptr_t)mi_prim_tls_slot(0); + #endif +} + +#else + +// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + return (uintptr_t)&_mi_heap_default; +} + +#endif + + + +/* ---------------------------------------------------------------------------------------- +The thread local default heap: `_mi_prim_get_default_heap()` +This is inlined here as it is on the fast path for allocation functions. + +On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a +__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures +that the storage will always be available (allocated on the thread stacks). +On some platforms though we cannot use that when overriding `malloc` since the underlying +TLS implementation (or the loader) will call itself `malloc` on a first access and recurse. +We try to circumvent this in an efficient way: +- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the + loader itself calls `malloc` even before the modules are initialized. +- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS). +- DragonFly: defaults are working but seem slow compared to freeBSD (see PR #323) +------------------------------------------------------------------------------------------- */ + +// defined in `init.c`; do not use these directly +extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern bool _mi_process_is_initialized; // has mi_process_init been called? + +static inline mi_heap_t* mi_prim_get_default_heap(void); + +#if defined(MI_MALLOC_OVERRIDE) +#if defined(__APPLE__) // macOS + #define MI_TLS_SLOT 89 // seems unused? + // #define MI_TLS_RECURSE_GUARD 1 + // other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89) + // see +#elif defined(__OpenBSD__) + // use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) + // see + #define MI_TLS_PTHREAD_SLOT_OFS (6*sizeof(int) + 4*sizeof(void*) + 24) + // #elif defined(__DragonFly__) + // #warning "mimalloc is not working correctly on DragonFly yet." + // #define MI_TLS_PTHREAD_SLOT_OFS (4 + 1*sizeof(void*)) // offset `uniqueid` (also used by gdb?) +#elif defined(__ANDROID__) + // See issue #381 + #define MI_TLS_PTHREAD +#endif +#endif + + +#if defined(MI_TLS_SLOT) + +static inline mi_heap_t* mi_prim_get_default_heap(void) { + mi_heap_t* heap = (mi_heap_t*)mi_prim_tls_slot(MI_TLS_SLOT); + if mi_unlikely(heap == NULL) { + #ifdef __GNUC__ + __asm(""); // prevent conditional load of the address of _mi_heap_empty + #endif + heap = (mi_heap_t*)&_mi_heap_empty; + } + return heap; +} + +#elif defined(MI_TLS_PTHREAD_SLOT_OFS) + +static inline mi_heap_t* mi_prim_get_default_heap(void) { + mi_heap_t* heap; + pthread_t self = pthread_self(); + #if defined(__DragonFly__) + if (self==NULL) { heap = _mi_heap_main_get(); } else + #endif + { + heap = *((mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS)); + } + return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); +} + +#elif defined(MI_TLS_PTHREAD) + +extern pthread_key_t _mi_heap_default_key; +static inline mi_heap_t* mi_prim_get_default_heap(void) { + mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); + return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); +} + +#else // default using a thread local variable; used on most platforms. + +static inline mi_heap_t* mi_prim_get_default_heap(void) { + #if defined(MI_TLS_RECURSE_GUARD) + if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get(); + #endif + return _mi_heap_default; +} + +#endif // mi_prim_get_default_heap() + + + +#endif // MIMALLOC_PRIM_H diff --git a/src/prim/readme.md b/src/prim/readme.md new file mode 100644 index 00000000..14248496 --- /dev/null +++ b/src/prim/readme.md @@ -0,0 +1,6 @@ +This is the portability layer where all primitives needed from the OS are defined. + +- `prim.h`: API definition +- `prim.c`: Selects one of `prim-unix.c`, `prim-wasi.c`, or `prim-windows.c` depending on the host platform. + +Note: still work in progress, there may be other places in the sources that still depend on OS ifdef's. \ No newline at end of file diff --git a/src/random.c b/src/random.c index 06d4ba4a..3c8372c8 100644 --- a/src/random.c +++ b/src/random.c @@ -4,14 +4,10 @@ This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ -#ifndef _DEFAULT_SOURCE -#define _DEFAULT_SOURCE // for syscall() on Linux -#endif - #include "mimalloc.h" #include "mimalloc-internal.h" - -#include // memset +#include "prim/prim.h" // _mi_prim_random_buf +#include // memset /* ---------------------------------------------------------------------------- We use our own PRNG to keep predictable performance of random number generation @@ -158,159 +154,13 @@ uintptr_t _mi_random_next(mi_random_ctx_t* ctx) { /* ---------------------------------------------------------------------------- -To initialize a fresh random context we rely on the OS: -- Windows : BCryptGenRandom (or RtlGenRandom) -- macOS : CCRandomGenerateBytes, arc4random_buf -- bsd,wasi : arc4random_buf -- Linux : getrandom,/dev/urandom +To initialize a fresh random context. If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR. -----------------------------------------------------------------------------*/ -#if defined(_WIN32) - -#if defined(MI_USE_RTLGENRANDOM) // || defined(__cplusplus) -// We prefer to use BCryptGenRandom instead of (the unofficial) RtlGenRandom but when using -// dynamic overriding, we observed it can raise an exception when compiled with C++, and -// sometimes deadlocks when also running under the VS debugger. -// In contrast, issue #623 implies that on Windows Server 2019 we need to use BCryptGenRandom. -// To be continued.. -#pragma comment (lib,"advapi32.lib") -#define RtlGenRandom SystemFunction036 -#ifdef __cplusplus -extern "C" { -#endif -BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength); -#ifdef __cplusplus -} -#endif -static bool os_random_buf(void* buf, size_t buf_len) { - return (RtlGenRandom(buf, (ULONG)buf_len) != 0); -} -#else - -#ifndef BCRYPT_USE_SYSTEM_PREFERRED_RNG -#define BCRYPT_USE_SYSTEM_PREFERRED_RNG 0x00000002 -#endif - -typedef LONG (NTAPI *PBCryptGenRandom)(HANDLE, PUCHAR, ULONG, ULONG); -static PBCryptGenRandom pBCryptGenRandom = NULL; - -static bool os_random_buf(void* buf, size_t buf_len) { - if (pBCryptGenRandom == NULL) { - HINSTANCE hDll = LoadLibrary(TEXT("bcrypt.dll")); - if (hDll != NULL) { - pBCryptGenRandom = (PBCryptGenRandom)(void (*)(void))GetProcAddress(hDll, "BCryptGenRandom"); - } - } - if (pBCryptGenRandom == NULL) { - return false; - } - else { - return (pBCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0); - } -} -#endif - -#elif defined(__APPLE__) -#include -#if defined(MAC_OS_X_VERSION_10_10) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_10 -#include -#include -#endif -static bool os_random_buf(void* buf, size_t buf_len) { - #if defined(MAC_OS_X_VERSION_10_15) && MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_15 - // We prefere CCRandomGenerateBytes as it returns an error code while arc4random_buf - // may fail silently on macOS. See PR #390, and - return (CCRandomGenerateBytes(buf, buf_len) == kCCSuccess); - #else - // fall back on older macOS - arc4random_buf(buf, buf_len); - return true; - #endif -} - -#elif defined(__ANDROID__) || defined(__DragonFly__) || \ - defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \ - defined(__sun) // todo: what to use with __wasi__? -#include -static bool os_random_buf(void* buf, size_t buf_len) { - arc4random_buf(buf, buf_len); - return true; -} -#elif defined(__linux__) || defined(__HAIKU__) -#if defined(__linux__) -#include -#endif -#include -#include -#include -#include -#include -static bool os_random_buf(void* buf, size_t buf_len) { - // Modern Linux provides `getrandom` but different distributions either use `sys/random.h` or `linux/random.h` - // and for the latter the actual `getrandom` call is not always defined. - // (see ) - // We therefore use a syscall directly and fall back dynamically to /dev/urandom when needed. -#ifdef SYS_getrandom - #ifndef GRND_NONBLOCK - #define GRND_NONBLOCK (1) - #endif - static _Atomic(uintptr_t) no_getrandom; // = 0 - if (mi_atomic_load_acquire(&no_getrandom)==0) { - ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK); - if (ret >= 0) return (buf_len == (size_t)ret); - if (errno != ENOSYS) return false; - mi_atomic_store_release(&no_getrandom, 1UL); // don't call again, and fall back to /dev/urandom - } -#endif - int flags = O_RDONLY; - #if defined(O_CLOEXEC) - flags |= O_CLOEXEC; - #endif - int fd = open("/dev/urandom", flags, 0); - if (fd < 0) return false; - size_t count = 0; - while(count < buf_len) { - ssize_t ret = read(fd, (char*)buf + count, buf_len - count); - if (ret<=0) { - if (errno!=EAGAIN && errno!=EINTR) break; - } - else { - count += ret; - } - } - close(fd); - return (count==buf_len); -} -#else -static bool os_random_buf(void* buf, size_t buf_len) { - return false; -} -#endif - -#if defined(_WIN32) -#include -#elif defined(__APPLE__) -#include -#else -#include -#endif - uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random - - #if defined(_WIN32) - LARGE_INTEGER pcount; - QueryPerformanceCounter(&pcount); - x ^= (uintptr_t)(pcount.QuadPart); - #elif defined(__APPLE__) - x ^= (uintptr_t)mach_absolute_time(); - #else - struct timespec time; - clock_gettime(CLOCK_MONOTONIC, &time); - x ^= (uintptr_t)time.tv_sec; - x ^= (uintptr_t)time.tv_nsec; - #endif + x ^= _mi_prim_clock_now(); // and do a few randomization steps uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1; for (uintptr_t i = 0; i < max; i++) { @@ -322,7 +172,7 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { static void mi_random_init_ex(mi_random_ctx_t* ctx, bool use_weak) { uint8_t key[32]; - if (use_weak || !os_random_buf(key, sizeof(key))) { + if (use_weak || !_mi_prim_random_buf(key, sizeof(key))) { // if we fail to get random data from the OS, we fall back to a // weak random source based on the current time #if !defined(__wasi__) diff --git a/src/stats.c b/src/stats.c index 4345e9bd..357bebce 100644 --- a/src/stats.c +++ b/src/stats.c @@ -7,8 +7,9 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc-internal.h" #include "mimalloc-atomic.h" +#include "prim/prim.h" -#include // fputs, stderr +#include // snprintf #include // memset #if defined(_MSC_VER) && (_MSC_VER < 1920) @@ -291,8 +292,6 @@ static void mi_cdecl mi_buffered_out(const char* msg, void* arg) { // Print statistics //------------------------------------------------------------ -static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults); - static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept { // wrap the output function to be line buffered char buf[256]; @@ -337,15 +336,15 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_counter_print_avg(&stats->searches, "searches", out, arg); _mi_fprintf(out, arg, "%10s: %7zu\n", "numa nodes", _mi_os_numa_node_count()); - mi_msecs_t elapsed; - mi_msecs_t user_time; - mi_msecs_t sys_time; + size_t elapsed; + size_t user_time; + size_t sys_time; size_t current_rss; size_t peak_rss; size_t current_commit; size_t peak_commit; size_t page_faults; - mi_stat_process_info(&elapsed, &user_time, &sys_time, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); + mi_process_info(&elapsed, &user_time, &sys_time, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); _mi_fprintf(out, arg, "%10s: %7ld.%03ld s\n", "elapsed", elapsed/1000, elapsed%1000); _mi_fprintf(out, arg, "%10s: user: %ld.%03ld s, system: %ld.%03ld s, faults: %lu, rss: ", "process", user_time/1000, user_time%1000, sys_time/1000, sys_time%1000, (unsigned long)page_faults ); @@ -404,47 +403,13 @@ void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { // ---------------------------------------------------------------- // Basic timer for convenience; use milli-seconds to avoid doubles // ---------------------------------------------------------------- -#ifdef _WIN32 -#include -static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) { - static LARGE_INTEGER mfreq; // = 0 - if (mfreq.QuadPart == 0LL) { - LARGE_INTEGER f; - QueryPerformanceFrequency(&f); - mfreq.QuadPart = f.QuadPart/1000LL; - if (mfreq.QuadPart == 0) mfreq.QuadPart = 1; - } - return (mi_msecs_t)(t.QuadPart / mfreq.QuadPart); -} - -mi_msecs_t _mi_clock_now(void) { - LARGE_INTEGER t; - QueryPerformanceCounter(&t); - return mi_to_msecs(t); -} -#else -#include -#if defined(CLOCK_REALTIME) || defined(CLOCK_MONOTONIC) -mi_msecs_t _mi_clock_now(void) { - struct timespec t; - #ifdef CLOCK_MONOTONIC - clock_gettime(CLOCK_MONOTONIC, &t); - #else - clock_gettime(CLOCK_REALTIME, &t); - #endif - return ((mi_msecs_t)t.tv_sec * 1000) + ((mi_msecs_t)t.tv_nsec / 1000000); -} -#else -// low resolution timer -mi_msecs_t _mi_clock_now(void) { - return ((mi_msecs_t)clock() / ((mi_msecs_t)CLOCKS_PER_SEC / 1000)); -} -#endif -#endif - static mi_msecs_t mi_clock_diff; +mi_msecs_t _mi_clock_now(void) { + return _mi_prim_clock_now(); +} + mi_msecs_t _mi_clock_start(void) { if (mi_clock_diff == 0.0) { mi_msecs_t t0 = _mi_clock_now(); @@ -463,130 +428,9 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) { // Basic process statistics // -------------------------------------------------------- -#if defined(_WIN32) -#include -#include - -static mi_msecs_t filetime_msecs(const FILETIME* ftime) { - ULARGE_INTEGER i; - i.LowPart = ftime->dwLowDateTime; - i.HighPart = ftime->dwHighDateTime; - mi_msecs_t msecs = (i.QuadPart / 10000); // FILETIME is in 100 nano seconds - return msecs; -} - -typedef BOOL (WINAPI *PGetProcessMemoryInfo)(HANDLE, PPROCESS_MEMORY_COUNTERS, DWORD); -static PGetProcessMemoryInfo pGetProcessMemoryInfo = NULL; - -static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) -{ - *elapsed = _mi_clock_end(mi_process_start); - FILETIME ct; - FILETIME ut; - FILETIME st; - FILETIME et; - GetProcessTimes(GetCurrentProcess(), &ct, &et, &st, &ut); - *utime = filetime_msecs(&ut); - *stime = filetime_msecs(&st); - - // load psapi on demand - if (pGetProcessMemoryInfo == NULL) { - HINSTANCE hDll = LoadLibrary(TEXT("psapi.dll")); - if (hDll != NULL) { - pGetProcessMemoryInfo = (PGetProcessMemoryInfo)(void (*)(void))GetProcAddress(hDll, "GetProcessMemoryInfo"); - } - } - - // get process info - PROCESS_MEMORY_COUNTERS info; - memset(&info, 0, sizeof(info)); - if (pGetProcessMemoryInfo != NULL) { - pGetProcessMemoryInfo(GetCurrentProcess(), &info, sizeof(info)); - } - *current_rss = (size_t)info.WorkingSetSize; - *peak_rss = (size_t)info.PeakWorkingSetSize; - *current_commit = (size_t)info.PagefileUsage; - *peak_commit = (size_t)info.PeakPagefileUsage; - *page_faults = (size_t)info.PageFaultCount; -} - -#elif !defined(__wasi__) && (defined(__unix__) || defined(__unix) || defined(unix) || defined(__APPLE__) || defined(__HAIKU__)) -#include -#include -#include - -#if defined(__APPLE__) -#include -#endif - -#if defined(__HAIKU__) -#include -#endif - -static mi_msecs_t timeval_secs(const struct timeval* tv) { - return ((mi_msecs_t)tv->tv_sec * 1000L) + ((mi_msecs_t)tv->tv_usec / 1000L); -} - -static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) -{ - *elapsed = _mi_clock_end(mi_process_start); - struct rusage rusage; - getrusage(RUSAGE_SELF, &rusage); - *utime = timeval_secs(&rusage.ru_utime); - *stime = timeval_secs(&rusage.ru_stime); -#if !defined(__HAIKU__) - *page_faults = rusage.ru_majflt; -#endif - // estimate commit using our stats - *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); - *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); - *current_rss = *current_commit; // estimate -#if defined(__HAIKU__) - // Haiku does not have (yet?) a way to - // get these stats per process - thread_info tid; - area_info mem; - ssize_t c; - get_thread_info(find_thread(0), &tid); - while (get_next_area_info(tid.team, &c, &mem) == B_OK) { - *peak_rss += mem.ram_size; - } - *page_faults = 0; -#elif defined(__APPLE__) - *peak_rss = rusage.ru_maxrss; // BSD reports in bytes - struct mach_task_basic_info info; - mach_msg_type_number_t infoCount = MACH_TASK_BASIC_INFO_COUNT; - if (task_info(mach_task_self(), MACH_TASK_BASIC_INFO, (task_info_t)&info, &infoCount) == KERN_SUCCESS) { - *current_rss = (size_t)info.resident_size; - } -#else - *peak_rss = rusage.ru_maxrss * 1024; // Linux reports in KiB -#endif -} - -#else -#ifndef __wasi__ -// WebAssembly instances are not processes -#pragma message("define a way to get process info") -#endif - -static void mi_stat_process_info(mi_msecs_t* elapsed, mi_msecs_t* utime, mi_msecs_t* stime, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) -{ - *elapsed = _mi_clock_end(mi_process_start); - *peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); - *current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); - *peak_rss = *peak_commit; - *current_rss = *current_commit; - *page_faults = 0; - *utime = 0; - *stime = 0; -} -#endif - - mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept { - mi_msecs_t elapsed = 0; + mi_msecs_t elapsed = _mi_clock_end(mi_process_start); mi_msecs_t utime = 0; mi_msecs_t stime = 0; size_t current_rss0 = 0; @@ -594,8 +438,8 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s size_t current_commit0 = 0; size_t peak_commit0 = 0; size_t page_faults0 = 0; - mi_stat_process_info(&elapsed,&utime, &stime, ¤t_rss0, &peak_rss0, ¤t_commit0, &peak_commit0, &page_faults0); - if (elapsed_msecs!=NULL) *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX)); + _mi_prim_process_info(&utime, &stime, ¤t_rss0, &peak_rss0, ¤t_commit0, &peak_commit0, &page_faults0); + if (elapsed_msecs!=NULL) *elapsed_msecs = (elapsed < 0 ? 0 : (elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)elapsed : PTRDIFF_MAX)); if (user_msecs!=NULL) *user_msecs = (utime < 0 ? 0 : (utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)utime : PTRDIFF_MAX)); if (system_msecs!=NULL) *system_msecs = (stime < 0 ? 0 : (stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)stime : PTRDIFF_MAX)); if (current_rss!=NULL) *current_rss = current_rss0;