From e4c5d09d65ff7743fe5e5dfadd6f082e839ff791 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 4 May 2025 09:04:57 -0700 Subject: [PATCH 1/6] improve TLS access on Windows with msvc (by Frank Richter, issue #1078) --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 ++-- include/mimalloc/prim.h | 5 ++++- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index d6af71ce..128a4ff6 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -282,8 +282,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea6} + + {abb5eae7-b3e6-432e-b636-333449892ea7} diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index b0ddc2d0..a722d721 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -208,7 +208,7 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #elif _WIN32 && MI_WIN_USE_FIXED_TLS && !defined(MI_WIN_USE_FLS) // On windows we can store the thread-local heap at a fixed TLS slot to avoid -// thread-local initialization checks in the fast path. +// thread-local initialization checks in the fast path. // We always use the second user TLS slot (the first one is always allocated already), // and at initialization (`windows/prim.c`) we call TlsAlloc and verify // we indeed get the second slot (and fail otherwise). @@ -270,6 +270,9 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce // defined in `init.c`; do not use these directly +#ifdef _MSC_VER +__declspec(selectany) // make it part of the comdat section to have faster TLS access (issue #1078) +#endif extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from extern bool _mi_process_is_initialized; // has mi_process_init been called? From f989a1cbb9b63043f1e56d248efe1ede9a6651d7 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 4 May 2025 09:10:38 -0700 Subject: [PATCH 2/6] add more decl_hidden specifiers on extern variables to improve access on arm64 --- include/mimalloc/internal.h | 6 +++--- include/mimalloc/prim.h | 6 +++--- src/page.c | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 51fad09c..c9362aa0 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -96,7 +96,7 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c -extern mi_decl_cache_align mi_stats_t _mi_stats_main; +extern mi_decl_hidden mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_hidden mi_decl_cache_align const mi_page_t _mi_page_empty; void _mi_process_load(void); void mi_cdecl _mi_process_done(void); @@ -958,8 +958,8 @@ static inline size_t mi_popcount(size_t x) { #if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #include -extern bool _mi_cpu_has_fsrm; -extern bool _mi_cpu_has_erms; +extern mi_decl_hidden bool _mi_cpu_has_fsrm; +extern mi_decl_hidden bool _mi_cpu_has_erms; static inline void _mi_memcpy(void* dst, const void* src, size_t n) { if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __movsb((unsigned char*)dst, (const unsigned char*)src, n); diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index a722d721..527bb97a 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -273,8 +273,8 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #ifdef _MSC_VER __declspec(selectany) // make it part of the comdat section to have faster TLS access (issue #1078) #endif -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -extern bool _mi_process_is_initialized; // has mi_process_init been called? +extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; @@ -402,7 +402,7 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { #elif defined(MI_TLS_PTHREAD) -extern pthread_key_t _mi_heap_default_key; +extern mi_decl_hidden pthread_key_t _mi_heap_default_key; static inline mi_heap_t* mi_prim_get_default_heap(void) { mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key)); return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap); diff --git a/src/page.c b/src/page.c index 6a693e89..55150f33 100644 --- a/src/page.c +++ b/src/page.c @@ -114,7 +114,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { return true; } -extern bool _mi_process_is_initialized; // has mi_process_init been called? +extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(mi_page_is_valid_init(page)); @@ -979,9 +979,9 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // free delayed frees from other threads (but skip contended ones) _mi_heap_delayed_free_partial(heap); - + // collect every once in a while (10000 by default) - const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L); + const long generic_collect = mi_option_get_clamp(mi_option_generic_collect, 1, 1000000L); if (heap->generic_collect_count >= generic_collect) { heap->generic_collect_count = 0; mi_heap_collect(heap, false /* force? */); From 63b0989df57a9dd2b867920307b0d038df695a54 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 4 May 2025 21:41:26 -0700 Subject: [PATCH 3/6] revert use of selectany for msvc (issue #1078) --- include/mimalloc/prim.h | 3 --- 1 file changed, 3 deletions(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 527bb97a..2d508148 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -270,9 +270,6 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce // defined in `init.c`; do not use these directly -#ifdef _MSC_VER -__declspec(selectany) // make it part of the comdat section to have faster TLS access (issue #1078) -#endif extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? From 52b75693c48308e8b19b94ffa7fbc0580021ba87 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 4 May 2025 22:03:10 -0700 Subject: [PATCH 4/6] use TlsAlloc with a dynamic offset for MI_WIN_USE_FIXED_TLS by default (issue #1078) --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 ++-- include/mimalloc/prim.h | 13 ++++++------- src/prim/windows/prim.c | 15 ++++++++++----- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index 128a4ff6..d6af71ce 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -282,8 +282,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea7} + + {abb5eae7-b3e6-432e-b636-333449892ea6} diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 2d508148..60af4d59 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -209,19 +209,18 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce // On windows we can store the thread-local heap at a fixed TLS slot to avoid // thread-local initialization checks in the fast path. -// We always use the second user TLS slot (the first one is always allocated already), -// and at initialization (`windows/prim.c`) we call TlsAlloc and verify -// we indeed get the second slot (and fail otherwise). -// Todo: we could make the Tls slot completely dynamic but that would require -// an extra read of the static Tls slot instead of using a constant offset. +// We allocate a user TLS slot at process initialization (see `windows/prim.c`) +// and store the offset `_mi_win_tls_offset`. #define MI_HAS_TLS_SLOT 2 // 2 = we can reliably initialize the slot (saving a test on each malloc) +extern mi_decl_hidden size_t _mi_win_tls_offset; + #if MI_WIN_USE_FIXED_TLS > 1 #define MI_TLS_SLOT (MI_WIN_USE_FIXED_TLS) #elif MI_SIZE_SIZE == 4 -#define MI_TLS_SLOT (0x0E18) // Second User TLS slot +#define MI_TLS_SLOT (0x0E10 + _mi_win_tls_offset) // User TLS slots #else -#define MI_TLS_SLOT (0x1488) // Second User TLS slot +#define MI_TLS_SLOT (0x1480 + _mi_win_tls_offset) // User TLS slots #endif static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 7daa09ef..c91102a2 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -627,22 +627,27 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { // Process & Thread Init/Done //---------------------------------------------------------------- +#if MI_HAS_TLS_SLOT +mi_decl_cache_align size_t _mi_win_tls_offset = sizeof(void*); // use 2nd slot by default +#endif + static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { MI_UNUSED(reserved); MI_UNUSED(module); - #if MI_TLS_SLOT >= 2 + #if MI_HAS_TLS_SLOT >= 2 // we must initialize the TLS slot before any allocation if (reason==DLL_PROCESS_ATTACH) { const DWORD tls_slot = TlsAlloc(); - if (tls_slot != 1) { - _mi_error_message(EFAULT, "unable to allocate the second TLS slot (rebuild without MI_WIN_USE_FIXED_TLS?)\n"); + if (tls_slot == TLS_OUT_OF_INDEXES) { + _mi_error_message(EFAULT, "unable to allocate the a TLS slot (rebuild without MI_WIN_USE_FIXED_TLS?)\n"); } + _mi_win_tls_offset = (size_t)tls_slot * sizeof(void*); } if (reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) { if (mi_prim_get_default_heap() == NULL) { _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty); } #if MI_DEBUG - void* const p = TlsGetValue(1); + void* const p = TlsGetValue((DWORD)(_mi_win_tls_offset / sizeof(void*))); mi_assert_internal(p == (void*)&_mi_heap_empty); #endif } @@ -808,7 +813,7 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { #endif mi_decl_export void _mi_redirect_entry(DWORD reason) { // called on redirection; careful as this may be called before DllMain - #if MI_TLS_SLOT >= 2 + #if MI_HAS_TLS_SLOT >= 2 // we must initialize the TLS slot before any allocation if ((reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) && mi_prim_get_default_heap() == NULL) { _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty); } From 303b196d403876f324e7456854a148e85682c2d9 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 4 May 2025 22:09:56 -0700 Subject: [PATCH 5/6] fix MI_WIN_USE_FIXED_TLS conditions --- src/prim/windows/prim.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index c91102a2..d0fee4c2 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -627,7 +627,7 @@ bool _mi_prim_random_buf(void* buf, size_t buf_len) { // Process & Thread Init/Done //---------------------------------------------------------------- -#if MI_HAS_TLS_SLOT +#if MI_WIN_USE_FIXED_TLS==1 mi_decl_cache_align size_t _mi_win_tls_offset = sizeof(void*); // use 2nd slot by default #endif @@ -635,6 +635,7 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { MI_UNUSED(reserved); MI_UNUSED(module); #if MI_HAS_TLS_SLOT >= 2 // we must initialize the TLS slot before any allocation + #if MI_WIN_USE_FIXED_TLS==1 if (reason==DLL_PROCESS_ATTACH) { const DWORD tls_slot = TlsAlloc(); if (tls_slot == TLS_OUT_OF_INDEXES) { @@ -642,11 +643,12 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { } _mi_win_tls_offset = (size_t)tls_slot * sizeof(void*); } + #endif if (reason==DLL_PROCESS_ATTACH || reason==DLL_THREAD_ATTACH) { if (mi_prim_get_default_heap() == NULL) { _mi_heap_set_default_direct((mi_heap_t*)&_mi_heap_empty); } - #if MI_DEBUG + #if MI_DEBUG && MI_WIN_USE_FIXED_TLS==1 void* const p = TlsGetValue((DWORD)(_mi_win_tls_offset / sizeof(void*))); mi_assert_internal(p == (void*)&_mi_heap_empty); #endif From e2d7c24c7362a19429f7338f0e5ed493f7c1d7b0 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 4 May 2025 22:17:59 -0700 Subject: [PATCH 6/6] add fixed TLS slot test to pipeline on Windows --- azure-pipelines.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 25d4a6e0..b7fc59d4 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -42,6 +42,14 @@ jobs: BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -A Win32 MSBuildConfiguration: Release + Debug Fixed TLS: + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON -DMI_WIN_USE_FIXED_TLS=ON + MSBuildConfiguration: Debug + Release Fixed TLS: + BuildType: release + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_WIN_USE_FIXED_TLS=ON + MSBuildConfiguration: Release steps: - task: CMake@1 inputs: