From 9194362e4858bdd2eaf1b1cb9075abaa6ace2460 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 4 May 2025 09:04:57 -0700 Subject: [PATCH 1/2] improve TLS access on Windows with msvc (by Frank Richter, issue #1078) --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 ++-- include/mimalloc/prim.h | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index d6af71ce..128a4ff6 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -282,8 +282,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea6} + + {abb5eae7-b3e6-432e-b636-333449892ea7} diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index b0ddc2d0..a722d721 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -208,7 +208,7 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #elif _WIN32 && MI_WIN_USE_FIXED_TLS && !defined(MI_WIN_USE_FLS) // On windows we can store the thread-local heap at a fixed TLS slot to avoid -// thread-local initialization checks in the fast path. +// thread-local initialization checks in the fast path. // We always use the second user TLS slot (the first one is always allocated already), // and at initialization (`windows/prim.c`) we call TlsAlloc and verify // we indeed get the second slot (and fail otherwise). @@ -270,6 +270,9 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce // defined in `init.c`; do not use these directly +#ifdef _MSC_VER +__declspec(selectany) // make it part of the comdat section to have faster TLS access (issue #1078) +#endif extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from extern bool _mi_process_is_initialized; // has mi_process_init been called? From 9c24c428cb06c735ccc3dcca996c2d09bb139d08 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 4 May 2025 09:10:38 -0700 Subject: [PATCH 2/2] add more decl_hidden specifiers on extern variables to improve access on arm64 --- include/mimalloc/internal.h | 6 +++--- include/mimalloc/prim.h | 6 +++--- src/page.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 6283f1d1..b11bd357 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -96,7 +96,7 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c -extern mi_decl_cache_align mi_stats_t _mi_stats_main; +extern mi_decl_hidden mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_hidden mi_decl_cache_align const mi_page_t _mi_page_empty; void _mi_process_load(void); void mi_cdecl _mi_process_done(void); @@ -958,8 +958,8 @@ static inline size_t mi_popcount(size_t x) { #if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #include -extern bool _mi_cpu_has_fsrm; -extern bool _mi_cpu_has_erms; +extern mi_decl_hidden bool _mi_cpu_has_fsrm; +extern mi_decl_hidden bool _mi_cpu_has_erms; static inline void _mi_memcpy(void* dst, const void* src, size_t n) { if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __movsb((unsigned char*)dst, (const unsigned char*)src, n); diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index a722d721..527bb97a 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -273,8 +273,8 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #ifdef _MSC_VER __declspec(selectany) // make it part of the comdat section to have faster TLS access (issue #1078) #endif -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -extern bool _mi_process_is_initialized; // has mi_process_init been called? +extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; @@ -402,7 +402,7 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { #elif defined(MI_TLS_PTHREAD) -extern pthread_key_t _mi_heap_default_key; +extern mi_decl_hidden pthread_key_t _mi_heap_default_key; static inline mi_heap_t* mi_prim_get_default_heap(void) { mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); diff --git a/src/page.c b/src/page.c index 6a693e89..55150f33 100644 --- a/src/page.c +++ b/src/page.c @@ -114,7 +114,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { return true; } -extern bool _mi_process_is_initialized; // has mi_process_init been called? +extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(mi_page_is_valid_init(page)); @@ -979,9 +979,9 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // free delayed frees from other threads (but skip contended ones) _mi_heap_delayed_free_partial(heap); - + // collect every once in a while (10000 by default) - const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L); + const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L); if (heap->generic_collect_count >= generic_collect) { heap->generic_collect_count = 0; mi_heap_collect(heap, false /* force? */);