diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index 7173a189..0e3ebed8 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -11,7 +11,10 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc-types.h" #if defined(MI_MALLOC_OVERRIDE) -#if defined(__APPLE__) +#if defined(__APPLE__) && (defined(__i386__) || defined(__x86_64__)) +#define MI_TLS_OSX_FAST +#define MI_TLS_OSX_SLOT 94 // seems unused, except in Webkit? See: +#elif defined(__APPLE__) #include #define MI_TLS_PTHREADS #elif (defined(__OpenBSD__) || defined(__DragonFly__)) @@ -284,14 +287,31 @@ extern const mi_heap_t _mi_heap_empty; // read-only empty heap, initial value o extern mi_heap_t _mi_heap_main; // statically allocated main backing heap extern bool _mi_process_is_initialized; -#if defined(MI_TLS_PTHREADS) +#if defined(MI_TLS_OSX_FAST) +#define MI_TLS_OSX_OFFSET (MI_TLS_OSX_SLOT*sizeof(void*)) +static inline void* mi_tls_osx_fast_get(void) { + void* ret; + __asm__("mov %%gs:%1, %0" : "=r" (ret) : "m" (*(void**)(MI_TLS_OSX_OFFSET))); + return ret; +} +static inline void mi_tls_osx_fast_set(void* value) { + __asm__("movq %1,%%gs:%0" : "=m" (*(void**)(MI_TLS_OSX_OFFSET)) : "rn" (value)); +} +#elif defined(MI_TLS_PTHREADS) extern pthread_key_t _mi_heap_default_key; #else extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from #endif static inline mi_heap_t* mi_get_default_heap(void) { -#if defined(MI_TLS_PTHREADS) +#if defined(MI_TLS_OSX_FAST) + // Use a fixed slot in the TSD on MacOSX to avoid recursion (since the loader calls malloc). + // We use slot 94 (__PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4) + // which seems unused except for the more recent Webkit + // Use with care. + mi_heap_t* heap = (mi_heap_t*)mi_tls_osx_fast_get(); + return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); +#elif defined(MI_TLS_PTHREADS) // Use pthreads for TLS; this is used on macOSX with interpose as the loader calls `malloc` // to allocate TLS storage leading to recursive calls if __thread declared variables are accessed. // Using pthreads allows us to initialize without recursive calls. (performance seems still quite good). @@ -300,9 +320,9 @@ static inline mi_heap_t* mi_get_default_heap(void) { #else #if defined(MI_TLS_RECURSE_GUARD) // On some BSD platforms, like openBSD, the dynamic loader calls `malloc` - // to initialize thread local data. To avoid recursion, we need to avoid - // accessing the thread local `_mi_default_heap` until our module is loaded - // and use the statically allocated main heap until that time. + // to initialize thread local data (before our module is loaded). + // To avoid recursion, we need to avoid accessing the thread local `_mi_default_heap` + // until our module is loaded and use the statically allocated main heap until that time. // TODO: patch ourselves dynamically to avoid this check every time? if (mi_unlikely(!_mi_process_is_initialized)) return &_mi_heap_main; #endif diff --git a/src/init.c b/src/init.c index 431b7fee..960cccf1 100644 --- a/src/init.c +++ b/src/init.c @@ -260,14 +260,15 @@ static void _mi_thread_done(mi_heap_t* default_heap); // use thread local storage keys to detect thread ending #include #include - static DWORD mi_fls_key; + static DWORD mi_fls_key = (DWORD)(-1); static void NTAPI mi_fls_done(PVOID value) { if (value!=NULL) _mi_thread_done((mi_heap_t*)value); } #elif defined(MI_USE_PTHREADS) - // use pthread locol storage keys to detect thread ending + // use pthread local storage keys to detect thread ending + // (and used with MI_TLS_PTHREADS for the default heap) #include - pthread_key_t _mi_heap_default_key; + pthread_key_t _mi_heap_default_key = (pthread_key_t)(-1); static void mi_pthread_done(void* value) { if (value!=NULL) _mi_thread_done((mi_heap_t*)value); } @@ -287,6 +288,7 @@ static void mi_process_setup_auto_thread_done(void) { #elif defined(_WIN32) && !defined(MI_SHARED_LIB) mi_fls_key = FlsAlloc(&mi_fls_done); #elif defined(MI_USE_PTHREADS) + mi_assert_internal(_mi_heap_default_key == (pthread_key_t)(-1)); pthread_key_create(&_mi_heap_default_key, &mi_pthread_done); #endif _mi_heap_set_default_direct(&_mi_heap_main); @@ -331,9 +333,14 @@ static void _mi_thread_done(mi_heap_t* heap) { void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(heap != NULL); - #if !defined(MI_TLS_PTHREADS) + #if defined(MI_TLS_OSX_FAST) + mi_tls_osx_fast_set(heap); + #elif defined(MI_TLS_PTHREADS) + // we use _mi_heap_default_key + #else _mi_heap_default = heap; - #endif + #endif + // ensure the default heap is passed to `_mi_thread_done` // setting to a non-NULL value also ensures `mi_thread_done` is called. #if defined(_WIN32) && defined(MI_SHARED_LIB) @@ -342,7 +349,7 @@ void _mi_heap_set_default_direct(mi_heap_t* heap) { mi_assert_internal(mi_fls_key != 0); FlsSetValue(mi_fls_key, heap); #elif defined(MI_USE_PTHREADS) - // mi_assert_internal(_mi_heap_default_key != 0); // often 0 is also the allocated key + mi_assert_internal(_mi_heap_default_key != (pthread_key_t)(-1)); pthread_setspecific(_mi_heap_default_key, heap); #endif } diff --git a/test/test-stress.c b/test/test-stress.c index 1bfc5012..7d8993a0 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -27,7 +27,7 @@ terms of the MIT license. // argument defaults static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 10; // scaling factor -static int ITER = 5; // N full iterations destructing and re-creating all threads +static int ITER = 50; // N full iterations destructing and re-creating all threads // static int THREADS = 8; // more repeatable if THREADS <= #processors // static int SCALE = 100; // scaling factor @@ -250,7 +250,7 @@ int main(int argc, char** argv) { #endif // mi_collect(true); - // mi_stats_print(NULL); + mi_stats_print(NULL); //bench_end_program(); return 0; }