diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index ce6e6a6b..6e6da9f3 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -74,6 +74,7 @@ extern mi_decl_cache_align const mi_page_t _mi_page_empty; bool _mi_is_main_thread(void); size_t _mi_current_thread_count(void); bool _mi_preloading(void); // true while the C runtime is not ready +mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; // os.c size_t _mi_os_page_size(void); @@ -754,107 +755,6 @@ static inline size_t _mi_os_numa_node_count(void) { } -// ------------------------------------------------------------------- -// Getting the thread id should be performant as it is called in the -// fast path of `_mi_free` and we specialize for various platforms. -// We only require _mi_threadid() to return a unique id for each thread. -// ------------------------------------------------------------------- -#if defined(_WIN32) - -#define WIN32_LEAN_AND_MEAN -#include -static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { - // Windows: works on Intel and ARM in both 32- and 64-bit - return (uintptr_t)NtCurrentTeb(); -} - -// We use assembly for a fast thread id on the main platforms. The TLS layout depends on -// both the OS and libc implementation so we use specific tests for each main platform. -// If you test on another platform and it works please send a PR :-) -// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register. -#elif defined(__GNUC__) && ( \ - (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ - || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \ - || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ - || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ - || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ - ) - -static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept { - void* res; - const size_t ofs = (slot*sizeof(void*)); - #if defined(__i386__) - __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS - #elif defined(__APPLE__) && defined(__x86_64__) - __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS - #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) - __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI - #elif defined(__x86_64__) - __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS - #elif defined(__arm__) - void** tcb; MI_UNUSED(ofs); - __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); - res = tcb[slot]; - #elif defined(__aarch64__) - void** tcb; MI_UNUSED(ofs); - #if defined(__APPLE__) // M1, issue #343 - __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); - #else - __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); - #endif - res = tcb[slot]; - #endif - return res; -} - -// setting a tls slot is only used on macOS for now -static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { - const size_t ofs = (slot*sizeof(void*)); - #if defined(__i386__) - __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS - #elif defined(__APPLE__) && defined(__x86_64__) - __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS - #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) - __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI - #elif defined(__x86_64__) - __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS - #elif defined(__arm__) - void** tcb; MI_UNUSED(ofs); - __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); - tcb[slot] = value; - #elif defined(__aarch64__) - void** tcb; MI_UNUSED(ofs); - #if defined(__APPLE__) // M1, issue #343 - __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); - #else - __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); - #endif - tcb[slot] = value; - #endif -} - -static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { - #if defined(__BIONIC__) - // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id - // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 - return (uintptr_t)mi_tls_slot(1); - #else - // in all our other targets, slot 0 is the thread id - // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h - // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36 - return (uintptr_t)mi_tls_slot(0); - #endif -} - -#else - -// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). -static inline mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { - return (uintptr_t)&_mi_heap_default; -} - -#endif - // ----------------------------------------------------------------------- // Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero) diff --git a/src/alloc.c b/src/alloc.c index 04c4c48c..d8276df9 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -11,10 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc-internal.h" #include "mimalloc-atomic.h" +#include "prim/prim.h" // _mi_prim_thread_id() - -#include // memset, strlen -#include // malloc, exit +#include // memset, strlen (for mi_strdup) +#include // malloc, abort #define MI_IN_ALLOC_C #include "alloc-override.c" @@ -536,7 +536,7 @@ void mi_free(void* p) mi_attr_noexcept { if mi_unlikely(p == NULL) return; mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - const bool is_local= (_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); + const bool is_local= (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); mi_page_t* const page = _mi_segment_page_of(segment, p); if mi_likely(is_local) { // thread-local free? diff --git a/src/init.c b/src/init.c index a2a6be75..dd555030 100644 --- a/src/init.c +++ b/src/init.c @@ -6,6 +6,7 @@ terms of the MIT license. A copy of the license can be found in the file -----------------------------------------------------------------------------*/ #include "mimalloc.h" #include "mimalloc-internal.h" +#include "prim/prim.h" #include // memcpy, memset #include // atexit @@ -103,6 +104,10 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { }; +mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { + return _mi_prim_thread_id(); +} + // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; diff --git a/src/prim/prim.h b/src/prim/prim.h index 2cbbc85c..2d951930 100644 --- a/src/prim/prim.h +++ b/src/prim/prim.h @@ -12,7 +12,6 @@ terms of the MIT license. A copy of the license can be found in the file // addr != NULL and page aligned // size > 0 and page aligned - // OS memory configuration typedef struct mi_os_mem_config_s { size_t page_size; // 4KiB @@ -72,4 +71,113 @@ void _mi_prim_out_stderr( const char* msg ); // name != NULL, result != NULL, result_size >= 64 bool _mi_prim_getenv(const char* name, char* result, size_t result_size); + +//------------------------------------------------------------------- +// Thread id +// +// Getting the thread id should be performant as it is called in the +// fast path of `_mi_free` and we specialize for various platforms as +// inlined definitions. Regular code should call `init.c:_mi_thread_id()`. +// We only require _mi_prim_thread_id() to return a unique id for each thread. +//------------------------------------------------------------------- + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; + +#if defined(_WIN32) + +#define WIN32_LEAN_AND_MEAN +#include +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + // Windows: works on Intel and ARM in both 32- and 64-bit + return (uintptr_t)NtCurrentTeb(); +} + +// We use assembly for a fast thread id on the main platforms. The TLS layout depends on +// both the OS and libc implementation so we use specific tests for each main platform. +// If you test on another platform and it works please send a PR :-) +// see also https://akkadia.org/drepper/tls.pdf for more info on the TLS register. +#elif defined(__GNUC__) && ( \ + (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__))) \ + || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ + || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ + ) + +static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept { + void* res; + const size_t ofs = (slot*sizeof(void*)); + #if defined(__i386__) + __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86 32-bit always uses GS + #elif defined(__APPLE__) && defined(__x86_64__) + __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 macOSX uses GS + #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) + __asm__("movl %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x32 ABI + #elif defined(__x86_64__) + __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + void** tcb; MI_UNUSED(ofs); + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + res = tcb[slot]; + #elif defined(__aarch64__) + void** tcb; MI_UNUSED(ofs); + #if defined(__APPLE__) // M1, issue #343 + __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); + #else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + #endif + res = tcb[slot]; + #endif + return res; +} + +// setting a tls slot is only used on macOS for now +static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { + const size_t ofs = (slot*sizeof(void*)); + #if defined(__i386__) + __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // 32-bit always uses GS + #elif defined(__APPLE__) && defined(__x86_64__) + __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 macOS uses GS + #elif defined(__x86_64__) && (MI_INTPTR_SIZE==4) + __asm__("movl %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x32 ABI + #elif defined(__x86_64__) + __asm__("movq %1,%%fs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : ); // x86_64 Linux, BSD uses FS + #elif defined(__arm__) + void** tcb; MI_UNUSED(ofs); + __asm__ volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb)); + tcb[slot] = value; + #elif defined(__aarch64__) + void** tcb; MI_UNUSED(ofs); + #if defined(__APPLE__) // M1, issue #343 + __asm__ volatile ("mrs %0, tpidrro_el0\nbic %0, %0, #7" : "=r" (tcb)); + #else + __asm__ volatile ("mrs %0, tpidr_el0" : "=r" (tcb)); + #endif + tcb[slot] = value; + #endif +} + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + #if defined(__BIONIC__) + // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id + // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 + return (uintptr_t)mi_tls_slot(1); + #else + // in all our other targets, slot 0 is the thread id + // glibc: https://sourceware.org/git/?p=glibc.git;a=blob_plain;f=sysdeps/x86_64/nptl/tls.h + // apple: https://github.com/apple/darwin-xnu/blob/main/libsyscall/os/tsd.h#L36 + return (uintptr_t)mi_tls_slot(0); + #endif +} + +#else + +// otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + return (uintptr_t)&_mi_heap_default; +} + +#endif + + #endif // MIMALLOC_PRIM_H