diff --git a/CMakeLists.txt b/CMakeLists.txt index f624f311..b76647f2 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -125,6 +125,7 @@ endif() if(MI_DEBUG_TSAN MATCHES "ON") if(CMAKE_C_COMPILER_ID MATCHES "Clang") message(STATUS "Build with thread sanitizer (MI_DEBUG_TSAN=ON)") + list(APPEND mi_defines MI_TSAN=1) list(APPEND mi_cflags -fsanitize=thread -g -O1) list(APPEND CMAKE_EXE_LINKER_FLAGS -fsanitize=thread) else() @@ -229,6 +230,11 @@ endif() message(STATUS "") message(STATUS "Library base name: ${mi_basename}") message(STATUS "Build type : ${CMAKE_BUILD_TYPE_LC}") +if(MI_USE_CXX MATCHES "ON") + message(STATUS "Compiler : ${CMAKE_CXX_COMPILER}") +else() + message(STATUS "Compiler : ${CMAKE_C_COMPILER}") +endif() message(STATUS "Install directory: ${mi_install_dir}") message(STATUS "Build targets : ${mi_build_targets}") message(STATUS "") diff --git a/include/mimalloc-atomic.h b/include/mimalloc-atomic.h index b6b06cbc..2c08680b 100644 --- a/include/mimalloc-atomic.h +++ b/include/mimalloc-atomic.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018, Microsoft Research, Daan Leijen +Copyright (c) 2018,2020 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -8,127 +8,99 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_ATOMIC_H #define MIMALLOC_ATOMIC_H -// ------------------------------------------------------ +// -------------------------------------------------------------------------------------------- // Atomics // We need to be portable between C, C++, and MSVC. -// ------------------------------------------------------ +// We base the primitives on the C/C++ atomics and create a mimimal wrapper for MSVC in C compilation mode. +// This is why we try to use only `uintptr_t` and `*` as atomic types. +// To gain better insight in the range of used atomics, we use explicitly named memory order operations +// instead of passing the memory order as a parameter. +// ----------------------------------------------------------------------------------------------- -#if defined(_MSC_VER) -#define _Atomic(tp) tp -#define ATOMIC_VAR_INIT(x) x -#elif defined(__cplusplus) +#if defined(__cplusplus) +// Use C++ atomics #include -#define _Atomic(tp) std::atomic +#define _Atomic(tp) std::atomic +#define mi_atomic(name) std::atomic_##name +#define mi_memory_order(name) std::memory_order_##name +#elif defined(_MSC_VER) +// Use MSVC C wrapper for C11 atomics +#define _Atomic(tp) tp +#define ATOMIC_VAR_INIT(x) x +#define mi_atomic(name) mi_atomic_##name +#define mi_memory_order(name) mi_memory_order_##name #else +// Use C11 atomics #include +#define mi_atomic(name) atomic_##name +#define mi_memory_order(name) memory_order_##name #endif -// ------------------------------------------------------ -// Atomic operations specialized for mimalloc -// ------------------------------------------------------ +// Various defines for all used memory orders in mimalloc +#define mi_atomic_cas_weak(p,expected,desired,mem_success,mem_fail) \ + mi_atomic(compare_exchange_weak_explicit)(p,expected,desired,mem_success,mem_fail) -// Atomically add a value; returns the previous value. Memory ordering is relaxed. 
-static inline uintptr_t mi_atomic_add(volatile _Atomic(uintptr_t)* p, uintptr_t add); +#define mi_atomic_cas_strong(p,expected,desired,mem_success,mem_fail) \ + mi_atomic(compare_exchange_strong_explicit)(p,expected,desired,mem_success,mem_fail) -// Atomically "and" a value; returns the previous value. Memory ordering is relaxed. -static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x); +#define mi_atomic_load_acquire(p) mi_atomic(load_explicit)(p,mi_memory_order(acquire)) +#define mi_atomic_load_relaxed(p) mi_atomic(load_explicit)(p,mi_memory_order(relaxed)) +#define mi_atomic_store_release(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(release)) +#define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release)) +#define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) +#define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) +#define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) +#define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) -// Atomically "or" a value; returns the previous value. Memory ordering is relaxed. -static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x); +#define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed)) +#define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel)) -// Atomically compare and exchange a value; returns `true` if successful. -// May fail spuriously. Memory ordering as release on success, and relaxed on failure. -// (Note: expected and desired are in opposite order from atomic_compare_exchange) -static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); +#define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,1) +#define mi_atomic_decrement_relaxed(p) mi_atomic_sub_relaxed(p,1) +#define mi_atomic_increment_acq_rel(p) mi_atomic_add_acq_rel(p,1) +#define mi_atomic_decrement_acq_rel(p) mi_atomic_sub_acq_rel(p,1) -// Atomically compare and exchange a value; returns `true` if successful. -// Memory ordering is acquire-release -// (Note: expected and desired are in opposite order from atomic_compare_exchange) -static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected); - -// Atomically exchange a value. Memory ordering is acquire-release. -static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange); - -// Atomically read a value. Memory ordering is relaxed. -static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p); - -// Atomically read a value. Memory ordering is acquire. 
-static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p);
-
-// Atomically write a value. Memory ordering is release.
-static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x);
-
-// Yield
 static inline void mi_atomic_yield(void);
+static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)* p, intptr_t add);
+static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)* p, intptr_t sub);
 
-// Atomically add a 64-bit value; returns the previous value.
-// Note: not using _Atomic(int64_t) as it is only used for statistics.
-static inline void mi_atomic_addi64(volatile int64_t* p, int64_t add);
-// Atomically update `*p` with the maximum of `*p` and `x` as a 64-bit value.
-// Returns the previous value. Note: not using _Atomic(int64_t) as it is only used for statistics.
-static inline void mi_atomic_maxi64(volatile int64_t* p, int64_t x);
+#if defined(__cplusplus) || !defined(_MSC_VER)
 
-// Atomically read a 64-bit value
-// Note: not using _Atomic(int64_t) as it is only used for statistics.
-static inline int64_t mi_atomic_readi64(volatile int64_t* p);
+// In C++/C11 atomics we have polymorphic atomics so we can use the typed `ptr` variants
+// (where `tp` is the type of the atomic value)
+// We use these macros so we can provide a typed wrapper in MSVC in C compilation mode as well
+#define mi_atomic_load_ptr_acquire(tp,p)                mi_atomic_load_acquire(p)
+#define mi_atomic_load_ptr_relaxed(tp,p)                mi_atomic_load_relaxed(p)
+#define mi_atomic_store_ptr_release(tp,p,x)             mi_atomic_store_release(p,x)
+#define mi_atomic_store_ptr_relaxed(tp,p,x)             mi_atomic_store_relaxed(p,x)
+#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des)    mi_atomic_cas_weak_release(p,exp,des)
+#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des)    mi_atomic_cas_weak_acq_rel(p,exp,des)
+#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des)  mi_atomic_cas_strong_release(p,exp,des)
+#define mi_atomic_exchange_ptr_release(tp,p,x)          mi_atomic_exchange_release(p,x)
+#define mi_atomic_exchange_ptr_acq_rel(tp,p,x)          mi_atomic_exchange_acq_rel(p,x)
 
-// Atomically subtract a value; returns the previous value.
-static inline uintptr_t mi_atomic_sub(volatile _Atomic(uintptr_t)* p, uintptr_t sub) {
-  return mi_atomic_add(p, (uintptr_t)(-((intptr_t)sub)));
+// These are used by the statistics
+static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
+  return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
+}
+static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
+  int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
+  while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };
 }
-// Atomically increment a value; returns the incremented result.
-static inline uintptr_t mi_atomic_increment(volatile _Atomic(uintptr_t)* p) {
-  return mi_atomic_add(p, 1);
-}
-// Atomically decrement a value; returns the decremented result.
-static inline uintptr_t mi_atomic_decrement(volatile _Atomic(uintptr_t)* p) {
-  return mi_atomic_sub(p, 1);
-}
+#elif defined(_MSC_VER)
 
-// Atomically add a signed value; returns the previous value.
-static inline intptr_t mi_atomic_addi(volatile _Atomic(intptr_t)* p, intptr_t add) {
-  return (intptr_t)mi_atomic_add((volatile _Atomic(uintptr_t)*)p, (uintptr_t)add);
-}
-
-// Atomically subtract a signed value; returns the previous value.
-static inline intptr_t mi_atomic_subi(volatile _Atomic(intptr_t)* p, intptr_t sub) { - return (intptr_t)mi_atomic_addi(p,-sub); -} - -// Atomically read a pointer; Memory order is relaxed (i.e. no fence, only atomic). -#define mi_atomic_read_ptr_relaxed(T,p) \ - (T*)(mi_atomic_read_relaxed((const volatile _Atomic(uintptr_t)*)(p))) - -// Atomically read a pointer; Memory order is acquire. -#define mi_atomic_read_ptr(T,p) \ - (T*)(mi_atomic_read((const volatile _Atomic(uintptr_t)*)(p))) - -// Atomically write a pointer; Memory order is acquire. -#define mi_atomic_write_ptr(T,p,x) \ - mi_atomic_write((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)x)) - -// Atomically compare and exchange a pointer; returns `true` if successful. May fail spuriously. -// Memory order is release. (like a write) -// (Note: expected and desired are in opposite order from atomic_compare_exchange) -#define mi_atomic_cas_ptr_weak(T,p,desired,expected) \ - mi_atomic_cas_weak((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)(desired)), (uintptr_t)((T*)(expected))) - -// Atomically compare and exchange a pointer; returns `true` if successful. Memory order is acquire_release. -// (Note: expected and desired are in opposite order from atomic_compare_exchange) -#define mi_atomic_cas_ptr_strong(T,p,desired,expected) \ - mi_atomic_cas_strong((volatile _Atomic(uintptr_t)*)(p),(uintptr_t)((T*)(desired)), (uintptr_t)((T*)(expected))) - -// Atomically exchange a pointer value. -#define mi_atomic_exchange_ptr(T,p,exchange) \ - (T*)mi_atomic_exchange((volatile _Atomic(uintptr_t)*)(p), (uintptr_t)((T*)exchange)) - - -#ifdef _MSC_VER +// MSVC C compilation wrapper that uses Interlocked operations to model C11 atomics. #define WIN32_LEAN_AND_MEAN -#include +#include #include #ifdef _WIN64 typedef LONG64 msc_intptr_t; @@ -137,43 +109,71 @@ typedef LONG64 msc_intptr_t; typedef LONG msc_intptr_t; #define MI_64(f) f #endif -static inline uintptr_t mi_atomic_add(volatile _Atomic(uintptr_t)* p, uintptr_t add) { + +typedef enum mi_memory_order_e { + mi_memory_order_relaxed, + mi_memory_order_consume, + mi_memory_order_acquire, + mi_memory_order_release, + mi_memory_order_acq_rel, + mi_memory_order_seq_cst +} mi_memory_order; + +static inline uintptr_t mi_atomic_fetch_add_explicit(_Atomic(uintptr_t)* p, uintptr_t add, mi_memory_order mo) { return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, (msc_intptr_t)add); } -static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) { +static inline uintptr_t mi_atomic_fetch_sub_explicit(_Atomic(uintptr_t)*p, uintptr_t sub, mi_memory_order mo) { + return (uintptr_t)MI_64(_InterlockedExchangeAdd)((volatile msc_intptr_t*)p, -((msc_intptr_t)sub)); +} +static inline uintptr_t mi_atomic_fetch_and_explicit(_Atomic(uintptr_t)* p, uintptr_t x, mi_memory_order mo) { return (uintptr_t)MI_64(_InterlockedAnd)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } -static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) { +static inline uintptr_t mi_atomic_fetch_or_explicit(_Atomic(uintptr_t)* p, uintptr_t x, mi_memory_order mo) { return (uintptr_t)MI_64(_InterlockedOr)((volatile msc_intptr_t*)p, (msc_intptr_t)x); } -static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { - return (expected == (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)expected)); +static inline bool 
mi_atomic_compare_exchange_strong_explicit(_Atomic(uintptr_t)* p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) { + uintptr_t read = (uintptr_t)MI_64(_InterlockedCompareExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)desired, (msc_intptr_t)(*expected)); + if (read == *expected) { + return true; + } + else { + *expected = read; + return false; + } } -static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { - return mi_atomic_cas_strong(p,desired,expected); +static inline bool mi_atomic_compare_exchange_weak_explicit(_Atomic(uintptr_t)*p, uintptr_t* expected, uintptr_t desired, mi_memory_order mo1, mi_memory_order mo2) { + return mi_atomic_compare_exchange_strong_explicit(p, expected, desired, mo1, mo2); } -static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { +static inline uintptr_t mi_atomic_exchange_explicit(_Atomic(uintptr_t)* p, uintptr_t exchange, mi_memory_order mo) { return (uintptr_t)MI_64(_InterlockedExchange)((volatile msc_intptr_t*)p, (msc_intptr_t)exchange); } -static inline uintptr_t mi_atomic_read(volatile _Atomic(uintptr_t) const* p) { - return *p; +static inline mi_atomic_thread_fence(mi_memory_order mo) { + _Atomic(uintptr_t)x = 0; + mi_atomic_exchange_explicit(&x, 1, mo); } -static inline uintptr_t mi_atomic_read_relaxed(volatile _Atomic(uintptr_t) const* p) { +static inline uintptr_t mi_atomic_load_explicit(_Atomic(uintptr_t) const* p, mi_memory_order mo) { + #if defined(_M_IX86) || defined(_M_X64) return *p; + #else + uintptr_t x = *p; + if (mo > mi_memory_order_relaxed) { + while (!mi_atomic_compare_exchange_weak_explicit(p, &x, x, mo, mi_memory_order_relaxed)) { /* nothing */ }; + } + return x; + #endif } -static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { +static inline void mi_atomic_store_explicit(_Atomic(uintptr_t)* p, uintptr_t x, mi_memory_order mo) { #if defined(_M_IX86) || defined(_M_X64) *p = x; #else - mi_atomic_exchange(p,x); + mi_atomic_exchange_explicit(p,x,mo); #endif } -static inline void mi_atomic_yield(void) { - YieldProcessor(); -} -static inline void mi_atomic_addi64(volatile _Atomic(int64_t)* p, int64_t add) { + +// These are used by the statistics +static inline int64_t mi_atomic_addi64_relaxed(volatile _Atomic(int64_t)* p, int64_t add) { #ifdef _WIN64 - mi_atomic_addi(p,add); + return (int64_t)mi_atomic_addi((int64_t*)p,add); #else int64_t current; int64_t sum; @@ -181,91 +181,52 @@ static inline void mi_atomic_addi64(volatile _Atomic(int64_t)* p, int64_t add) { current = *p; sum = current + add; } while (_InterlockedCompareExchange64(p, sum, current) != current); + return current; #endif } - -static inline void mi_atomic_maxi64(volatile _Atomic(int64_t)*p, int64_t x) { +static inline void mi_atomic_maxi64_relaxed(volatile _Atomic(int64_t)*p, int64_t x) { int64_t current; do { current = *p; } while (current < x && _InterlockedCompareExchange64(p, x, current) != current); } -static inline int64_t mi_atomic_readi64(volatile _Atomic(int64_t)*p) { - #ifdef _WIN64 - return *p; - #else - int64_t current; - do { - current = *p; - } while (_InterlockedCompareExchange64(p, current, current) != current); - return current; - #endif -} +// The pointer macros cast to `uintptr_t`. 
+#define mi_atomic_load_ptr_acquire(tp,p) (tp*)mi_atomic_load_acquire((_Atomic(uintptr_t)*)(p)) +#define mi_atomic_load_ptr_relaxed(tp,p) (tp*)mi_atomic_load_relaxed((_Atomic(uintptr_t)*)(p)) +#define mi_atomic_store_ptr_release(tp,p,x) mi_atomic_store_release((_Atomic(uintptr_t)*)(p),(uintptr_t)(x)) +#define mi_atomic_store_ptr_relaxed(tp,p,x) mi_atomic_store_relaxed((_Atomic(uintptr_t)*)(p),(uintptr_t)(x)) +#define mi_atomic_cas_ptr_weak_release(tp,p,exp,des) mi_atomic_cas_weak_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_cas_ptr_weak_acq_rel(tp,p,exp,des) mi_atomic_cas_weak_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_cas_ptr_strong_release(tp,p,exp,des) mi_atomic_cas_strong_release((_Atomic(uintptr_t)*)(p),(uintptr_t*)exp,(uintptr_t)des) +#define mi_atomic_exchange_ptr_release(tp,p,x) (tp*)mi_atomic_exchange_release((_Atomic(uintptr_t)*)(p),(uintptr_t)x) +#define mi_atomic_exchange_ptr_acq_rel(tp,p,x) (tp*)mi_atomic_exchange_acq_rel((_Atomic(uintptr_t)*)(p),(uintptr_t)x) -#else -#ifdef __cplusplus -#define MI_USING_STD using namespace std; -#else -#define MI_USING_STD #endif -static inline uintptr_t mi_atomic_add(volatile _Atomic(uintptr_t)* p, uintptr_t add) { - MI_USING_STD - return atomic_fetch_add_explicit(p, add, memory_order_relaxed); -} -static inline uintptr_t mi_atomic_and(volatile _Atomic(uintptr_t)* p, uintptr_t x) { - MI_USING_STD - return atomic_fetch_and_explicit(p, x, memory_order_acq_rel); -} -static inline uintptr_t mi_atomic_or(volatile _Atomic(uintptr_t)* p, uintptr_t x) { - MI_USING_STD - return atomic_fetch_or_explicit(p, x, memory_order_acq_rel); -} -static inline bool mi_atomic_cas_weak(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { - MI_USING_STD - return atomic_compare_exchange_weak_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_acquire); -} -static inline bool mi_atomic_cas_strong(volatile _Atomic(uintptr_t)* p, uintptr_t desired, uintptr_t expected) { - MI_USING_STD - return atomic_compare_exchange_strong_explicit(p, &expected, desired, memory_order_acq_rel, memory_order_acquire); -} -static inline uintptr_t mi_atomic_exchange(volatile _Atomic(uintptr_t)* p, uintptr_t exchange) { - MI_USING_STD - return atomic_exchange_explicit(p, exchange, memory_order_acq_rel); -} -static inline uintptr_t mi_atomic_read_relaxed(const volatile _Atomic(uintptr_t)* p) { - MI_USING_STD - return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_relaxed); -} -static inline uintptr_t mi_atomic_read(const volatile _Atomic(uintptr_t)* p) { - MI_USING_STD - return atomic_load_explicit((volatile _Atomic(uintptr_t)*) p, memory_order_acquire); -} -static inline void mi_atomic_write(volatile _Atomic(uintptr_t)* p, uintptr_t x) { - MI_USING_STD - return atomic_store_explicit(p, x, memory_order_release); -} -static inline void mi_atomic_addi64(volatile int64_t* p, int64_t add) { - MI_USING_STD - atomic_fetch_add_explicit((volatile _Atomic(int64_t)*)p, add, memory_order_relaxed); -} -static inline int64_t mi_atomic_readi64(volatile int64_t* p) { - MI_USING_STD - return atomic_load_explicit((volatile _Atomic(int64_t)*) p, memory_order_relaxed); -} -static inline void mi_atomic_maxi64(volatile int64_t* p, int64_t x) { - MI_USING_STD - int64_t current; - do { - current = mi_atomic_readi64(p); - } while (current < x && !atomic_compare_exchange_weak_explicit((volatile _Atomic(int64_t)*)p, ¤t, x, memory_order_acq_rel, memory_order_relaxed)); + + +// 
Atomically add a signed value; returns the previous value. +static inline intptr_t mi_atomic_addi(_Atomic(intptr_t)*p, intptr_t add) { + return (intptr_t)mi_atomic_add_acq_rel((_Atomic(uintptr_t)*)p, (uintptr_t)add); } +// Atomically subtract a signed value; returns the previous value. +static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) { + return (intptr_t)mi_atomic_addi(p, -sub); +} + +// Yield #if defined(__cplusplus) - #include - static inline void mi_atomic_yield(void) { - std::this_thread::yield(); - } +#include +static inline void mi_atomic_yield(void) { + std::this_thread::yield(); +} +#elif defined(_WIN32) +#define WIN32_LEAN_AND_MEAN +#include +static inline void mi_atomic_yield(void) { + YieldProcessor(); +} #elif (defined(__GNUC__) || defined(__clang__)) && \ (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__)) #if defined(__x86_64__) || defined(__i386__) @@ -278,17 +239,16 @@ static inline void mi_atomic_maxi64(volatile int64_t* p, int64_t x) { } #endif #elif defined(__wasi__) - #include - static inline void mi_atomic_yield(void) { - sched_yield(); - } +#include +static inline void mi_atomic_yield(void) { + sched_yield(); +} #else - #include - static inline void mi_atomic_yield(void) { - sleep(0); - } +#include +static inline void mi_atomic_yield(void) { + sleep(0); +} #endif -#endif #endif // __MIMALLOC_ATOMIC_H diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h index d6618d28..a113b121 100644 --- a/include/mimalloc-internal.h +++ b/include/mimalloc-internal.h @@ -467,21 +467,21 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) { // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { - return (mi_block_t*)(mi_atomic_read_relaxed(&page->xthread_free) & ~3); + return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3); } static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) { - return (mi_delayed_t)(mi_atomic_read_relaxed(&page->xthread_free) & 3); + return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3); } // Heap access static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { - return (mi_heap_t*)(mi_atomic_read_relaxed(&page->xheap)); + return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap)); } static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); - mi_atomic_write(&page->xheap,(uintptr_t)heap); + mi_atomic_store_release(&page->xheap,(uintptr_t)heap); } // Thread free flag helpers diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h index 01e087b9..931d3270 100644 --- a/include/mimalloc-types.h +++ b/include/mimalloc-types.h @@ -14,7 +14,9 @@ terms of the MIT license. A copy of the license can be found in the file // Minimal alignment necessary. On most platforms 16 bytes are needed // due to SSE registers for example. This must be at least `MI_INTPTR_SIZE` +#ifndef MI_MAX_ALIGN_SIZE #define MI_MAX_ALIGN_SIZE 16 // sizeof(max_align_t) +#endif // ------------------------------------------------------ // Variants @@ -160,6 +162,7 @@ typedef enum mi_delayed_e { // The `in_full` and `has_aligned` page flags are put in a union to efficiently // test if both are false (`full_aligned == 0`) in the `mi_free` routine. 
+#if !MI_TSAN typedef union mi_page_flags_s { uint8_t full_aligned; struct { @@ -167,6 +170,16 @@ typedef union mi_page_flags_s { uint8_t has_aligned : 1; } x; } mi_page_flags_t; +#else +// under thread sanitizer, use a byte for each flag to suppress warning, issue #130 +typedef union mi_page_flags_s { + uint16_t full_aligned; + struct { + uint8_t in_full; + uint8_t has_aligned; + } x; +} mi_page_flags_t; +#endif // Thread free list. // We use the bottom 2 bits of the pointer for mi_delayed_t flags @@ -226,12 +239,12 @@ typedef struct mi_page_s { uint32_t used; // number of blocks in use (including blocks in `local_free` and `thread_free`) uint32_t xblock_size; // size available in each block (always `>0`) - mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - volatile _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - volatile _Atomic(uintptr_t) xheap; + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) + _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads + _Atomic(uintptr_t) xheap; - struct mi_page_s* next; // next page owned by this thread with the same `block_size` - struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` + struct mi_page_s* next; // next page owned by this thread with the same `block_size` + struct mi_page_s* prev; // previous page owned by this thread with the same `block_size` // 64-bit 9 words, 32-bit 12 words, (+2 for secure) #if MI_INTPTR_SIZE==8 @@ -277,10 +290,11 @@ typedef struct mi_segment_s { uintptr_t decommit_mask; uintptr_t commit_mask; - // from here is zero initialized - struct mi_segment_s* next; // the list of freed segments in the cache - struct mi_segment_s* abandoned_next; + _Atomic(struct mi_segment_s*) abandoned_next; + // from here is zero initialized + struct mi_segment_s* next; // the list of freed segments in the cache (must be first field, see `segment.c:mi_segment_init`) + size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) size_t abandoned_visits; // count how often this segment is visited in the abandoned list (to force reclaim it it is too long) size_t used; // count of pages in use @@ -291,7 +305,7 @@ typedef struct mi_segment_s { // layout like this to optimize access in `mi_free` mi_segment_kind_t kind; - volatile _Atomic(uintptr_t) thread_id; // unique id of the thread owning this segment + _Atomic(uintptr_t) thread_id; // unique id of the thread owning this segment size_t slice_entries; // entries in the `slices` array, at most `MI_SLICES_PER_SEGMENT` mi_slice_t slices[MI_SLICES_PER_SEGMENT]; } mi_segment_t; @@ -351,7 +365,7 @@ struct mi_heap_s { mi_tld_t* tld; mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. 
mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") - volatile _Atomic(mi_block_t*) thread_delayed_free; + _Atomic(mi_block_t*) thread_delayed_free; uintptr_t thread_id; // thread this heap belongs too uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list diff --git a/include/mimalloc.h b/include/mimalloc.h index 5196ba9f..b4f24137 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 164 // major + 2 digits minor +#define MI_MALLOC_VERSION 166 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes @@ -192,7 +192,7 @@ mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_mallocn(mi_heap_ mi_decl_nodiscard mi_decl_export mi_decl_restrict void* mi_heap_malloc_small(mi_heap_t* heap, size_t size) mi_attr_noexcept mi_attr_malloc mi_attr_alloc_size(2); mi_decl_nodiscard mi_decl_export void* mi_heap_realloc(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); -mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4);; +mi_decl_nodiscard mi_decl_export void* mi_heap_reallocn(mi_heap_t* heap, void* p, size_t count, size_t size) mi_attr_noexcept mi_attr_alloc_size2(3,4); mi_decl_nodiscard mi_decl_export void* mi_heap_reallocf(mi_heap_t* heap, void* p, size_t newsize) mi_attr_noexcept mi_attr_alloc_size(3); mi_decl_nodiscard mi_decl_export mi_decl_restrict char* mi_heap_strdup(mi_heap_t* heap, const char* s) mi_attr_noexcept mi_attr_malloc; @@ -256,7 +256,7 @@ mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_all_b // Experimental mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept; -mi_decl_nodiscard mi_decl_export bool mi_is_redirected() mi_attr_noexcept; +mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept; mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept; mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; diff --git a/src/alloc.c b/src/alloc.c index 6fbda857..590952d4 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -305,11 +305,10 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc } // Try to put the block on either the page-local thread free list, or the heap delayed free list. 
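// A simplified sketch of the common (non-delayed) path of the rewritten loop
// below, using only names that appear in this patch: read `xthread_free` once,
// link the block in front of the list, and CAS it back.  On failure the new
// `mi_atomic_cas_weak_release` stores the freshly observed value back into
// `tfree`, so the loop retries without an explicit re-read at the top:
//
//   mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
//   mi_thread_free_t tfreex;
//   do {
//     mi_block_set_next(page, block, mi_tf_block(tfree));
//     tfreex = mi_tf_set_block(tfree, block);
//   } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));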
- mi_thread_free_t tfree; mi_thread_free_t tfreex; bool use_delayed; + mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); do { - tfree = mi_atomic_read_relaxed(&page->xthread_free); use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); if (mi_unlikely(use_delayed)) { // unlikely: this only happens on the first concurrent free in a page that is in the full list @@ -320,27 +319,27 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc mi_block_set_next(page, block, mi_tf_block(tfree)); tfreex = mi_tf_set_block(tfree,block); } - } while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree)); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); if (mi_unlikely(use_delayed)) { // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see `mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = mi_page_heap(page); + mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); mi_assert_internal(heap != NULL); if (heap != NULL) { // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree; + mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); do { - dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free); mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree)); + } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); } // and reset the MI_DELAYED_FREEING flag + tfree = mi_atomic_load_relaxed(&page->xthread_free); do { - tfreex = tfree = mi_atomic_read_relaxed(&page->xthread_free); + tfreex = tfree; mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree)); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); } } @@ -684,7 +683,7 @@ mi_decl_restrict char* mi_strndup(const char* s, size_t n) mi_attr_noexcept { #ifndef PATH_MAX #define PATH_MAX MAX_PATH #endif -#include +#include mi_decl_restrict char* mi_heap_realpath(mi_heap_t* heap, const char* fname, char* resolved_name) mi_attr_noexcept { // todo: use GetFullPathNameW to allow longer file names char buf[PATH_MAX]; @@ -765,12 +764,12 @@ typedef void (*std_new_handler_t)(); std_new_handler_t __attribute((weak)) _ZSt15get_new_handlerv() { return NULL; } -std_new_handler_t mi_get_new_handler() { +static std_new_handler_t mi_get_new_handler() { return _ZSt15get_new_handlerv(); } #else // note: on windows we could dynamically link to `?get_new_handler@std@@YAP6AXXZXZ`. -std_new_handler_t mi_get_new_handler() { +static std_new_handler_t mi_get_new_handler() { return NULL; } #endif diff --git a/src/arena.c b/src/arena.c index 87474bcd..8f4e1783 100644 --- a/src/arena.c +++ b/src/arena.c @@ -69,7 +69,7 @@ typedef struct mi_arena_s { bool is_zero_init; // is the arena zero initialized? bool is_committed; // is the memory committed bool is_large; // large OS page allocated - volatile _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks + _Atomic(uintptr_t) search_idx; // optimization to start the search for free blocks mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? mi_bitmap_field_t* blocks_committed; // if `!is_committed`, are the blocks committed? 
mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) @@ -111,12 +111,12 @@ static size_t mi_block_count_of_size(size_t size) { static bool mi_arena_alloc(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx) { const size_t fcount = arena->field_count; - size_t idx = mi_atomic_read(&arena->search_idx); // start from last search + size_t idx = mi_atomic_load_acquire(&arena->search_idx); // start from last search for (size_t visited = 0; visited < fcount; visited++, idx++) { if (idx >= fcount) idx = 0; // wrap around // try to atomically claim a range of bits if (mi_bitmap_try_find_claim_field(arena->blocks_inuse, idx, blocks, bitmap_idx)) { - mi_atomic_write(&arena->search_idx, idx); // start search from here next time + mi_atomic_store_release(&arena->search_idx, idx); // start search from here next time return true; } } @@ -332,7 +332,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE); // try numa affine allocation for (size_t i = 0; i < MI_MAX_ARENAS; i++) { - mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena==NULL) break; // end reached if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local? (*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages @@ -344,7 +344,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, } // try from another numa node instead.. for (size_t i = 0; i < MI_MAX_ARENAS; i++) { - mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena==NULL) break; // end reached if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local! 
(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages @@ -394,7 +394,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool is_committed, bool size_t bitmap_idx; mi_arena_id_indices(memid, &arena_idx, &bitmap_idx); mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]); + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]); mi_assert_internal(arena != NULL); if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); @@ -420,15 +420,15 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool is_committed, bool static bool mi_arena_add(mi_arena_t* arena) { mi_assert_internal(arena != NULL); - mi_assert_internal((uintptr_t)mi_atomic_read_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); + mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); mi_assert_internal(arena->block_count > 0); - uintptr_t i = mi_atomic_increment(&mi_arena_count); + uintptr_t i = mi_atomic_increment_acq_rel(&mi_arena_count); if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement(&mi_arena_count); + mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } - mi_atomic_write_ptr(mi_arena_t,&mi_arenas[i], arena); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); return true; } diff --git a/src/bitmap.inc.c b/src/bitmap.inc.c index a107545f..18372b61 100644 --- a/src/bitmap.inc.c +++ b/src/bitmap.inc.c @@ -30,7 +30,7 @@ and that the sequence must be smaller or equal to the bits in a field. #define MI_BITMAP_FIELD_FULL (~((uintptr_t)0)) // all bits set // An atomic bitmap of `uintptr_t` fields -typedef volatile _Atomic(uintptr_t) mi_bitmap_field_t; +typedef _Atomic(uintptr_t) mi_bitmap_field_t; typedef mi_bitmap_field_t* mi_bitmap_t; // A bitmap index is the index of the bit in a bitmap. @@ -77,6 +77,14 @@ static inline uintptr_t mi_bitmap_mask_(size_t count, size_t bitidx) { #if defined(_MSC_VER) #define MI_HAVE_BITSCAN #include +#ifndef MI_64 +#if MI_INTPTR_SIZE==8 +#define MI_64(f) f##64 +#else +#define MI_64(f) f +#endif +#endif + static inline size_t mi_bsf(uintptr_t x) { if (x==0) return 8*MI_INTPTR_SIZE; DWORD idx; @@ -118,9 +126,9 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_f mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); mi_assert_internal(bitidx + count <= MI_BITMAP_FIELD_BITS); - uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]); + uintptr_t field = mi_atomic_load_relaxed(&bitmap[idx]); if ((field & mask) == 0) { // free? - if (mi_atomic_cas_strong(&bitmap[idx], (field|mask), field)) { + if (mi_atomic_cas_strong_acq_rel(&bitmap[idx], &field, (field|mask))) { // claimed! 
return true; } @@ -134,8 +142,8 @@ static inline bool mi_bitmap_try_claim_field(mi_bitmap_t bitmap, size_t bitmap_f static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { mi_assert_internal(bitmap_idx != NULL); - volatile _Atomic(uintptr_t)* field = &bitmap[idx]; - uintptr_t map = mi_atomic_read(field); + _Atomic(uintptr_t)* field = &bitmap[idx]; + uintptr_t map = mi_atomic_load_relaxed(field); if (map==MI_BITMAP_FIELD_FULL) return false; // short cut // search for 0-bit sequence of length count @@ -155,9 +163,8 @@ static inline bool mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx mi_assert_internal((m >> bitidx) == mask); // no overflow? const uintptr_t newmap = map | m; mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_weak(field, newmap, map)) { // TODO: use strong cas here? - // no success, another thread claimed concurrently.. keep going - map = mi_atomic_read(field); + if (!mi_atomic_cas_weak_acq_rel(field, &map, newmap)) { // TODO: use strong cas here? + // no success, another thread claimed concurrently.. keep going (with updated `map`) continue; } else { @@ -211,7 +218,7 @@ static inline bool mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, s const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); // mi_assert_internal((bitmap[idx] & mask) == mask); - uintptr_t prev = mi_atomic_and(&bitmap[idx], ~mask); + uintptr_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); return ((prev & mask) == mask); } @@ -224,7 +231,7 @@ static inline bool mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, siz const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); - uintptr_t prev = mi_atomic_or(&bitmap[idx], mask); + uintptr_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); if (any_zero != NULL) *any_zero = ((prev & mask) != mask); return ((prev & mask) == 0); } @@ -235,7 +242,7 @@ static inline bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_field const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); const uintptr_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); UNUSED(bitmap_fields); - uintptr_t field = mi_atomic_read_relaxed(&bitmap[idx]); + uintptr_t field = mi_atomic_load_relaxed(&bitmap[idx]); if (any_ones != NULL) *any_ones = ((field & mask) != 0); return ((field & mask) == mask); } diff --git a/src/heap.c b/src/heap.c index cc78f3e6..f6e25851 100644 --- a/src/heap.c +++ b/src/heap.c @@ -142,7 +142,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - mi_assert_internal( collect != MI_ABANDON || mi_atomic_read_ptr(mi_block_t,&heap->thread_delayed_free) == NULL ); + mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); // collect segment caches if (collect >= MI_FORCE) { diff --git a/src/init.c b/src/init.c index 63dd8387..8d6c8504 100644 --- a/src/init.c +++ b/src/init.c @@ -310,7 +310,7 @@ static void _mi_thread_done(mi_heap_t* default_heap); // nothing to do as it is done in DllMain #elif defined(_WIN32) && !defined(MI_SHARED_LIB) // use thread local storage keys to detect thread ending - #include + #include 
#include #if (_WIN32_WINNT < 0x600) // before Windows Vista WINBASEAPI DWORD WINAPI FlsAlloc( _In_opt_ PFLS_CALLBACK_FUNCTION lpCallback ); @@ -430,11 +430,11 @@ static bool os_preloading = true; // true until this module is initialized static bool mi_redirected = false; // true if malloc redirects to mi_malloc // Returns true if this module has not been initialized; Don't use C runtime routines until it returns false. -bool _mi_preloading() { +bool _mi_preloading(void) { return os_preloading; } -bool mi_is_redirected() mi_attr_noexcept { +bool mi_is_redirected(void) mi_attr_noexcept { return mi_redirected; } @@ -456,7 +456,7 @@ mi_decl_export void _mi_redirect_entry(DWORD reason) { } } __declspec(dllimport) bool mi_allocator_init(const char** message); -__declspec(dllimport) void mi_allocator_done(); +__declspec(dllimport) void mi_allocator_done(void); #ifdef __cplusplus } #endif @@ -465,7 +465,7 @@ static bool mi_allocator_init(const char** message) { if (message != NULL) *message = NULL; return true; } -static void mi_allocator_done() { +static void mi_allocator_done(void) { // nothing to do } #endif diff --git a/src/options.c b/src/options.c index dffae1f9..4c45d6e5 100644 --- a/src/options.c +++ b/src/options.c @@ -60,7 +60,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(verbose) }, // the following options are experimental and not all combinations make sense. - { 1, UNINIT, MI_OPTION(eager_commit) }, // commit on demand? + { 1, UNINIT, MI_OPTION(eager_commit) }, // commit per segment directly (4MiB) (but see also `eager_commit_delay`) #if defined(_WIN32) || (MI_INTPTR_SIZE <= 4) // and other OS's without overcommit? { 0, UNINIT, MI_OPTION(eager_region_commit) }, { 0, UNINIT, MI_OPTION(reset_decommits) }, // reset decommits memory @@ -77,7 +77,7 @@ static mi_option_desc_t options[_mi_option_last] = #if defined(__NetBSD__) { 0, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed #else - { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed + { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif { 1, UNINIT, MI_OPTION(allow_decommit) }, // decommit pages when not eager committed { 250, UNINIT, MI_OPTION(reset_delay) }, // reset delay in milli-seconds @@ -175,11 +175,11 @@ static _Atomic(uintptr_t) out_len; static void mi_out_buf(const char* msg, void* arg) { UNUSED(arg); if (msg==NULL) return; - if (mi_atomic_read_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return; + if (mi_atomic_load_relaxed(&out_len)>=MI_MAX_DELAY_OUTPUT) return; size_t n = strlen(msg); if (n==0) return; // claim space - uintptr_t start = mi_atomic_add(&out_len, n); + uintptr_t start = mi_atomic_add_acq_rel(&out_len, n); if (start >= MI_MAX_DELAY_OUTPUT) return; // check bound if (start+n >= MI_MAX_DELAY_OUTPUT) { @@ -191,7 +191,7 @@ static void mi_out_buf(const char* msg, void* arg) { static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) { if (out==NULL) return; // claim (if `no_more_buf == true`, no more output will be added after this point) - size_t count = mi_atomic_add(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1)); + size_t count = mi_atomic_add_acq_rel(&out_len, (no_more_buf ? 
MI_MAX_DELAY_OUTPUT : 1)); // and output the current contents if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT; out_buf[count] = 0; @@ -219,17 +219,17 @@ static void mi_out_buf_stderr(const char* msg, void* arg) { // For now, don't register output from multiple threads. #pragma warning(suppress:4180) static mi_output_fun* volatile mi_out_default; // = NULL -static volatile _Atomic(void*) mi_out_arg; // = NULL +static _Atomic(void*) mi_out_arg; // = NULL static mi_output_fun* mi_out_get_default(void** parg) { - if (parg != NULL) { *parg = mi_atomic_read_ptr(void,&mi_out_arg); } + if (parg != NULL) { *parg = mi_atomic_load_ptr_acquire(void,&mi_out_arg); } mi_output_fun* out = mi_out_default; return (out == NULL ? &mi_out_buf : out); } void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept { mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer - mi_atomic_write_ptr(void,&mi_out_arg, arg); + mi_atomic_store_ptr_release(void,&mi_out_arg, arg); if (out!=NULL) mi_out_buf_flush(out,true,arg); // output all the delayed output now } @@ -243,7 +243,7 @@ static void mi_add_stderr_output() { // -------------------------------------------------------- // Messages, all end up calling `_mi_fputs`. // -------------------------------------------------------- -static volatile _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings +static _Atomic(uintptr_t) error_count; // = 0; // when MAX_ERROR_COUNT stop emitting errors and warnings // When overriding malloc, we may recurse into mi_vfprintf if an allocation // inside the C runtime causes another message. @@ -315,13 +315,13 @@ void _mi_verbose_message(const char* fmt, ...) { static void mi_show_error_message(const char* fmt, va_list args) { if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; - if (mi_atomic_increment(&error_count) > mi_max_error_count) return; + if (mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return; mi_vfprintf(NULL, NULL, "mimalloc: error: ", fmt, args); } void _mi_warning_message(const char* fmt, ...) { if (!mi_option_is_enabled(mi_option_show_errors) && !mi_option_is_enabled(mi_option_verbose)) return; - if (mi_atomic_increment(&error_count) > mi_max_error_count) return; + if (mi_atomic_increment_acq_rel(&error_count) > mi_max_error_count) return; va_list args; va_start(args,fmt); mi_vfprintf(NULL, NULL, "mimalloc: warning: ", fmt, args); @@ -341,7 +341,7 @@ void _mi_assert_fail(const char* assertion, const char* fname, unsigned line, co // -------------------------------------------------------- static mi_error_fun* volatile mi_error_handler; // = NULL -static volatile _Atomic(void*) mi_error_arg; // = NULL +static _Atomic(void*) mi_error_arg; // = NULL static void mi_error_default(int err) { UNUSED(err); @@ -367,7 +367,7 @@ static void mi_error_default(int err) { void mi_register_error(mi_error_fun* fun, void* arg) { mi_error_handler = fun; // can be NULL - mi_atomic_write_ptr(void,&mi_error_arg, arg); + mi_atomic_store_ptr_release(void,&mi_error_arg, arg); } void _mi_error_message(int err, const char* fmt, ...) { @@ -378,7 +378,7 @@ void _mi_error_message(int err, const char* fmt, ...) 
{ va_end(args); // and call the error handler which may abort (or return normally) if (mi_error_handler != NULL) { - mi_error_handler(err, mi_atomic_read_ptr(void,&mi_error_arg)); + mi_error_handler(err, mi_atomic_load_ptr_acquire(void,&mi_error_arg)); } else { mi_error_default(err); @@ -415,7 +415,7 @@ static inline int mi_strnicmp(const char* s, const char* t, size_t n) { // reliably even when this is invoked before the C runtime is initialized. // i.e. when `_mi_preloading() == true`. // Note: on windows, environment names are not case sensitive. -#include +#include static bool mi_getenv(const char* name, char* result, size_t result_size) { result[0] = 0; size_t len = GetEnvironmentVariableA(name, result, (DWORD)result_size); diff --git a/src/os.c b/src/os.c index 20974968..46739cd9 100644 --- a/src/os.c +++ b/src/os.c @@ -24,7 +24,7 @@ terms of the MIT license. A copy of the license can be found in the file #if defined(_WIN32) -#include +#include #elif defined(__wasi__) // stdlib.h is all we need, and has already been included in mimalloc.h #else @@ -276,15 +276,15 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, DWORD flags, bool large_only, bool allow_large, bool* is_large) { mi_assert_internal(!(large_only && !allow_large)); - static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; + static _Atomic(uintptr_t) large_page_try_ok; // = 0; void* p = NULL; if ((large_only || use_large_os_page(size, try_alignment)) && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) { - uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); + uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); if (!large_only && try_ok > 0) { // if a large page allocation fails, it seems the calls to VirtualAlloc get very expensive. // therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times. - mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok); + mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); } else { // large OS pages must always reserve and commit. @@ -293,7 +293,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment, if (large_only) return p; // fall back to non-large page allocation on error (`p == NULL`). if (p == NULL) { - mi_atomic_write(&large_page_try_ok,10); // on error, don't try again for the next N allocations + mi_atomic_store_release(&large_page_try_ok,10); // on error, don't try again for the next N allocations } } } @@ -370,14 +370,14 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro fd = VM_MAKE_TAG(os_tag); #endif if ((large_only || use_large_os_page(size, try_alignment)) && allow_large) { - static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0; - uintptr_t try_ok = mi_atomic_read(&large_page_try_ok); + static _Atomic(uintptr_t) large_page_try_ok; // = 0; + uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok); if (!large_only && try_ok > 0) { // If the OS is not configured for large OS pages, or the user does not have // enough permission, the `mmap` will always fail (but it might also fail for other reasons). // Therefore, once a large page allocation failed, we don't try again for `large_page_try_ok` times // to avoid too many failing calls to mmap. 
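// The `large_page_try_ok` back-off works as follows: after a failed large-page
// allocation the counter is set to 10, and each later allocation decrements it
// once and skips large pages while it is positive.  With the new API the
// expected value is passed by address, and a lost CAS race simply means another
// thread already decremented, so no retry loop is needed.  A minimal sketch
// (same names as in the surrounding hunk):
//
//   uintptr_t try_ok = mi_atomic_load_acquire(&large_page_try_ok);
//   if (try_ok > 0) {   // still backing off? decrement once and use regular pages
//     mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1);
//   }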
- mi_atomic_cas_weak(&large_page_try_ok, try_ok - 1, try_ok); + mi_atomic_cas_strong_acq_rel(&large_page_try_ok, &try_ok, try_ok - 1); } else { int lflags = flags & ~MAP_NORESERVE; // using NORESERVE on huge pages seems to fail on Linux @@ -417,7 +417,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro #endif if (large_only) return p; if (p == NULL) { - mi_atomic_write(&large_page_try_ok, 10); // on error, don't try again for the next N allocations + mi_atomic_store_release(&large_page_try_ok, 10); // on error, don't try again for the next N allocations } } } @@ -459,21 +459,22 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro // On 64-bit systems, we can do efficient aligned allocation by using // the 4TiB to 30TiB area to allocate them. #if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED))) -static volatile mi_decl_cache_align _Atomic(uintptr_t) aligned_base; +static mi_decl_cache_align _Atomic(uintptr_t) aligned_base; // Return a 4MiB aligned address that is probably available static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) { if (try_alignment == 0 || try_alignment > MI_SEGMENT_SIZE) return NULL; if ((size%MI_SEGMENT_SIZE) != 0) return NULL; - uintptr_t hint = mi_atomic_add(&aligned_base, size); + uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size); if (hint == 0 || hint > ((intptr_t)30<<40)) { // try to wrap around after 30TiB (area after 32TiB is used for huge OS pages) uintptr_t init = ((uintptr_t)4 << 40); // start at 4TiB area #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode uintptr_t r = _mi_heap_random_next(mi_get_default_heap()); init = init + (MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)); // (randomly 20 bits)*4MiB == 0 to 4TiB #endif - mi_atomic_cas_strong(&aligned_base, init, hint + size); - hint = mi_atomic_add(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all + uintptr_t expected = hint + size; + mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init); + hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all } if (hint%try_alignment != 0) return NULL; return (void*)hint; @@ -768,12 +769,12 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats) if (p != start) return false; #else #if defined(MADV_FREE) - static int advice = MADV_FREE; - int err = madvise(start, csize, advice); + static _Atomic(uintptr_t) advice = ATOMIC_VAR_INIT(MADV_FREE); + int err = madvise(start, csize, (int)mi_atomic_load_relaxed(&advice)); if (err != 0 && errno == EINVAL && advice == MADV_FREE) { // if MADV_FREE is not supported, fall back to MADV_DONTNEED from now on - advice = MADV_DONTNEED; - err = madvise(start, csize, advice); + mi_atomic_store_release(&advice, MADV_DONTNEED); + err = madvise(start, csize, MADV_DONTNEED); } #elif defined(__wasi__) int err = 0; @@ -979,9 +980,9 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { uintptr_t start = 0; uintptr_t end = 0; - uintptr_t expected; + uintptr_t huge_start = mi_atomic_load_relaxed(&mi_huge_start); do { - start = expected = mi_atomic_read_relaxed(&mi_huge_start); + start = huge_start; if (start == 0) { // Initialize the start address after the 32TiB area start = ((uintptr_t)32 << 40); // 32TiB virtual start address @@ -992,7 +993,7 @@ static uint8_t* 
mi_os_claim_huge_pages(size_t pages, size_t* total_size) { } end = start + size; mi_assert_internal(end % MI_SEGMENT_SIZE == 0); - } while (!mi_atomic_cas_strong(&mi_huge_start, end, expected)); + } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end)); if (total_size != NULL) *total_size = size; return (uint8_t*)start; diff --git a/src/page-queue.c b/src/page-queue.c index 6097a0bb..ebf98097 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -261,7 +261,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { heap->page_count--; page->next = NULL; page->prev = NULL; - // mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL); + // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL); mi_page_set_in_full(page,false); } @@ -276,7 +276,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); - // mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap); + // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap); page->next = queue->first; page->prev = NULL; if (queue->first != NULL) { @@ -344,7 +344,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue for (mi_page_t* page = append->first; page != NULL; page = page->next) { // inline `mi_page_set_heap` to avoid wrong assertion during absorption; // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive. - mi_atomic_write(&page->xheap, (uintptr_t)heap); + mi_atomic_store_release(&page->xheap, (uintptr_t)heap); // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a // side effect that it spins until any DELAYED_FREEING is finished. This ensures // that after appending only the new heap will be used for delayed free operations. 
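The src/page.c hunks that follow apply the same conversion as the alloc.c hunks above: the old `mi_atomic_cas_weak(p, desired, expected)` took `expected` by value, so every loop had to re-read the location at the top of each iteration, while the new `mi_atomic_cas_*` macros follow the C11 `atomic_compare_exchange` convention where `expected` is passed by address and is updated with the observed value on failure. A minimal standalone C11 sketch of the idiom (plain <stdatomic.h>, not mimalloc's wrappers):

    #include <stdatomic.h>
    #include <stdint.h>

    // Set a flag bit in *p; compare_exchange refreshes `expected` on failure,
    // so the loop needs no explicit re-load at the top of each iteration.
    static void set_flag(_Atomic(uintptr_t)* p, uintptr_t flag) {
      uintptr_t expected = atomic_load_explicit(p, memory_order_relaxed);
      uintptr_t desired;
      do {
        desired = expected | flag;   // recompute from the latest observed value
      } while (!atomic_compare_exchange_weak_explicit(p, &expected, desired,
                                                      memory_order_release,
                                                      memory_order_relaxed));
    }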
diff --git a/src/page-queue.c b/src/page-queue.c
index 6097a0bb..ebf98097 100644
--- a/src/page-queue.c
+++ b/src/page-queue.c
@@ -261,7 +261,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) {
  heap->page_count--;
  page->next = NULL;
  page->prev = NULL;
- // mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), NULL);
+ // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL);
  mi_page_set_in_full(page,false);
  }
@@ -276,7 +276,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_
  (mi_page_is_in_full(page) && mi_page_queue_is_full(queue)));
  mi_page_set_in_full(page, mi_page_queue_is_full(queue));
- // mi_atomic_write_ptr(mi_atomic_cast(void*, &page->heap), heap);
+ // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap);
  page->next = queue->first;
  page->prev = NULL;
  if (queue->first != NULL) {
@@ -344,7 +344,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue
  for (mi_page_t* page = append->first; page != NULL; page = page->next) {
  // inline `mi_page_set_heap` to avoid wrong assertion during absorption;
  // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive.
- mi_atomic_write(&page->xheap, (uintptr_t)heap);
+ mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
  // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a
  // side effect that it spins until any DELAYED_FREEING is finished. This ensures
  // that after appending only the new heap will be used for delayed free operations.
diff --git a/src/page.c b/src/page.c
index 97438d75..9d919cfa 100644
--- a/src/page.c
+++ b/src/page.c
@@ -123,11 +123,11 @@ bool _mi_page_is_valid(mi_page_t* page) {
  #endif
  void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) {
- mi_thread_free_t tfree;
  mi_thread_free_t tfreex;
  mi_delayed_t old_delay;
+ mi_thread_free_t tfree;
  do {
- tfree = mi_atomic_read(&page->xthread_free); // note: must acquire as we can break this loop and not do a CAS
+ tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS;
  tfreex = mi_tf_set_delayed(tfree, delay);
  old_delay = mi_tf_delayed(tfree);
  if (mi_unlikely(old_delay == MI_DELAYED_FREEING)) {
@@ -141,7 +141,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid
  break; // leave never-delayed flag set
  }
  } while ((old_delay == MI_DELAYED_FREEING) ||
- !mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
+ !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex));
  }
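The xthread_free word updated above carries both a block pointer and a small delayed-free state, which is why every update goes through a CAS. The sketch below shows the general tag-in-low-bits encoding such a word typically uses; the layout and names are illustrative assumptions, the real accessors are mi_tf_block / mi_tf_set_delayed and their exact encoding is not reproduced here:

#include <stdatomic.h>
#include <stdint.h>

typedef uintptr_t demo_tf_t;   // block pointer plus a 2-bit state in the low bits
enum { DEMO_USE_DELAYED = 0, DEMO_DELAYING = 1, DEMO_NO_DELAYED = 2, DEMO_NEVER = 3 };

// Requires blocks to be at least 4-byte aligned so the low bits are free.
static inline void*     demo_tf_block(demo_tf_t tf) { return (void*)(tf & ~(uintptr_t)3); }
static inline demo_tf_t demo_tf_make(void* b, unsigned s) { return (uintptr_t)b | (s & 3); }

// Change only the state bits, keeping whatever block pointer is currently stored.
static void demo_set_state(_Atomic(demo_tf_t)* w, unsigned s) {
  demo_tf_t old = atomic_load_explicit(w, memory_order_acquire);
  while (!atomic_compare_exchange_weak_explicit(w, &old, demo_tf_make(demo_tf_block(old), s),
                                                memory_order_acq_rel, memory_order_acquire)) {
    // `old` was refreshed by the failed CAS; retry with the latest pointer
  }
}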
  /* -----------------------------------------------------------
@@ -155,13 +155,12 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid
  static void _mi_page_thread_free_collect(mi_page_t* page) {
  mi_block_t* head;
- mi_thread_free_t tfree;
  mi_thread_free_t tfreex;
+ mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free);
  do {
- tfree = mi_atomic_read_relaxed(&page->xthread_free);
  head = mi_tf_block(tfree);
  tfreex = mi_tf_set_block(tfree,NULL);
- } while (!mi_atomic_cas_weak(&page->xthread_free, tfreex, tfree));
+ } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex));
  // return if the list is empty
  if (head == NULL) return;
@@ -273,11 +272,9 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
  (put there by other threads if they deallocated in a full page)
  ----------------------------------------------------------- */
  void _mi_heap_delayed_free(mi_heap_t* heap) {
- // take over the list (note: no atomic exchange is it is often NULL)
- mi_block_t* block;
- do {
- block = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
- } while (block != NULL && !mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, NULL, block));
+ // take over the list (note: no atomic exchange since it is often NULL)
+ mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
+ while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { /* nothing */ };
  // and free them all
  while(block != NULL) {
@@ -286,11 +283,10 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
  if (!_mi_free_delayed_block(block)) {
  // we might already start delayed freeing while another thread has not yet
  // reset the delayed_freeing flag; in that case delay it further by reinserting.
- mi_block_t* dfree;
+ mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free);
  do {
- dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
  mi_block_set_nextx(heap, block, dfree, heap->keys);
- } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree));
+ } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block));
  }
  block = next;
  }
@@ -750,20 +746,20 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) {
  ----------------------------------------------------------- */
  static mi_deferred_free_fun* volatile deferred_free = NULL;
-static volatile _Atomic(void*) deferred_arg; // = NULL
+static _Atomic(void*) deferred_arg; // = NULL
  void _mi_deferred_free(mi_heap_t* heap, bool force) {
  heap->tld->heartbeat++;
  if (deferred_free != NULL && !heap->tld->recurse) {
  heap->tld->recurse = true;
- deferred_free(force, heap->tld->heartbeat, mi_atomic_read_ptr_relaxed(void,&deferred_arg));
+ deferred_free(force, heap->tld->heartbeat, mi_atomic_load_ptr_relaxed(void,&deferred_arg));
  heap->tld->recurse = false;
  }
  }
  void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept {
  deferred_free = fn;
- mi_atomic_write_ptr(void,&deferred_arg, arg);
+ mi_atomic_store_ptr_release(void,&deferred_arg, arg);
  }
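_mi_heap_delayed_free above takes over the whole delayed-free list by CAS-ing the head to NULL (cheaper than an unconditional exchange when the list is usually empty) and pushes blocks back onto the same head when they cannot be freed yet. A compact sketch of both operations with C11 atomics; the node type and names are hypothetical, not mimalloc's:

#include <stdatomic.h>
#include <stddef.h>

typedef struct demo_node_s { struct demo_node_s* next; } demo_node_t;
static _Atomic(demo_node_t*) demo_list;

// Take over the entire list; returns NULL if it was already empty.
static demo_node_t* demo_take_all(void) {
  demo_node_t* head = atomic_load_explicit(&demo_list, memory_order_relaxed);
  while (head != NULL &&
         !atomic_compare_exchange_weak_explicit(&demo_list, &head, NULL,
                                                memory_order_acq_rel, memory_order_acquire)) { }
  return head;
}

// Push one node back; release so the node contents are visible to the taker.
static void demo_push(demo_node_t* n) {
  demo_node_t* head = atomic_load_explicit(&demo_list, memory_order_relaxed);
  do {
    n->next = head;
  } while (!atomic_compare_exchange_weak_explicit(&demo_list, &head, n,
                                                  memory_order_release, memory_order_relaxed));
}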
diff --git a/src/random.c b/src/random.c
index 2a96ccf6..4736a0ba 100644
--- a/src/random.c
+++ b/src/random.c
@@ -155,27 +155,36 @@ uintptr_t _mi_random_next(mi_random_ctx_t* ctx) {
  /* ----------------------------------------------------------------------------
  To initialize a fresh random context we rely on the OS:
-- Windows : BCryptGenRandom
+- Windows : RtlGenRandom
  - osX,bsd,wasi: arc4random_buf
  - Linux : getrandom,/dev/urandom
  If we cannot get good randomness, we fall back to weak randomness based on a timer and ASLR.
  -----------------------------------------------------------------------------*/
  #if defined(_WIN32)
+/*
+// We prefer BCryptGenRandom over RtlGenRandom but it leads to a crash when using dynamic override combined with the C++ runtime :-(
  #pragma comment (lib,"bcrypt.lib")
  #include
  static bool os_random_buf(void* buf, size_t buf_len) {
  return (BCryptGenRandom(NULL, (PUCHAR)buf, (ULONG)buf_len, BCRYPT_USE_SYSTEM_PREFERRED_RNG) >= 0);
  }
-/*
-#define SystemFunction036 NTAPI SystemFunction036
-#include
-#undef SystemFunction036
-static bool os_random_buf(void* buf, size_t buf_len) {
- RtlGenRandom(buf, (ULONG)buf_len);
- return true;
-}
  */
+#define RtlGenRandom SystemFunction036
+#ifdef __cplusplus
+extern "C" {
+#endif
+BOOLEAN NTAPI RtlGenRandom(PVOID RandomBuffer, ULONG RandomBufferLength);
+#ifdef __cplusplus
+}
+#endif
+static bool os_random_buf(void* buf, size_t buf_len) {
+ mi_assert_internal(buf_len >= sizeof(uintptr_t));
+ memset(buf, 0, buf_len);
+ RtlGenRandom(buf, (ULONG)buf_len);
+ return (((uintptr_t*)buf)[0] != 0); // sanity check (but RtlGenRandom should never fail)
+}
+
  #elif defined(ANDROID) || defined(XP_DARWIN) || defined(__APPLE__) || defined(__DragonFly__) || \
  defined(__FreeBSD__) || defined(__NetBSD__) || defined(__OpenBSD__) || \
  defined(__sun) || defined(__wasi__)
@@ -200,12 +209,12 @@ static bool os_random_buf(void* buf, size_t buf_len) {
  #ifndef GRND_NONBLOCK
  #define GRND_NONBLOCK (1)
  #endif
- static volatile _Atomic(uintptr_t) no_getrandom; // = 0
- if (mi_atomic_read(&no_getrandom)==0) {
+ static _Atomic(uintptr_t) no_getrandom; // = 0
+ if (mi_atomic_load_acquire(&no_getrandom)==0) {
  ssize_t ret = syscall(SYS_getrandom, buf, buf_len, GRND_NONBLOCK);
  if (ret >= 0) return (buf_len == (size_t)ret);
  if (ret != ENOSYS) return false;
- mi_atomic_write(&no_getrandom,1); // don't call again, and fall back to /dev/urandom
+ mi_atomic_store_release(&no_getrandom,1); // don't call again, and fall back to /dev/urandom
  }
  #endif
  int flags = O_RDONLY;
@@ -234,7 +243,7 @@ static bool os_random_buf(void* buf, size_t buf_len) {
  #endif
  #if defined(_WIN32)
-#include
+#include
  #elif defined(__APPLE__)
  #include
  #else
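The getrandom path above caches a negative probe result in an atomic flag so only the first failing call pays for the syscall; later calls skip straight to /dev/urandom. A self-contained sketch of that probe-once pattern on Linux (hypothetical wrapper name, simplified flags; not the patch's code):

#include <errno.h>
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <unistd.h>
#include <sys/syscall.h>

// Probe SYS_getrandom once; if the kernel lacks it, remember that so the
// caller can fall back to /dev/urandom on every subsequent call.
static bool demo_getrandom(void* buf, size_t len) {
#if defined(SYS_getrandom)
  static _Atomic(int) no_getrandom;   // = 0
  if (atomic_load_explicit(&no_getrandom, memory_order_acquire) == 0) {
    long ret = syscall(SYS_getrandom, buf, len, 0);
    if (ret >= 0) return ((size_t)ret == len);
    if (errno != ENOSYS) return false;
    atomic_store_explicit(&no_getrandom, 1, memory_order_release);   // don't probe again
  }
#endif
  (void)buf; (void)len;
  return false;   // caller falls back to /dev/urandom
}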
diff --git a/src/region.c b/src/region.c
index ae3a799a..e916e452 100644
--- a/src/region.c
+++ b/src/region.c
@@ -86,13 +86,13 @@ typedef union mi_region_info_u {
  // A region owns a chunk of REGION_SIZE (256MiB) (virtual) memory with
  // a bit map with one bit per MI_SEGMENT_SIZE (4MiB) block.
  typedef struct mem_region_s {
- volatile _Atomic(uintptr_t) info; // mi_region_info_t.value
- volatile _Atomic(void*) start; // start of the memory area
+ _Atomic(uintptr_t) info; // mi_region_info_t.value
+ _Atomic(void*) start; // start of the memory area
  mi_bitmap_field_t in_use; // bit per in-use block
  mi_bitmap_field_t dirty; // track if non-zero per block
  mi_bitmap_field_t commit; // track if committed per block
  mi_bitmap_field_t reset; // track if reset per block
- volatile _Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena
+ _Atomic(uintptr_t) arena_memid; // if allocated from a (huge page) arena
  uintptr_t padding; // round to 8 fields
  } mem_region_t;
@@ -100,7 +100,7 @@ typedef struct mem_region_s {
  static mem_region_t regions[MI_REGION_MAX]; // Allocated regions
-static volatile _Atomic(uintptr_t) regions_count; // = 0;
+static _Atomic(uintptr_t) regions_count; // = 0;
  /* ----------------------------------------------------------------------------
@@ -123,9 +123,9 @@ static size_t mi_good_commit_size(size_t size) {
  // Return if a pointer points into a region reserved by us.
  bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
  if (p==NULL) return false;
- size_t count = mi_atomic_read_relaxed(&regions_count);
+ size_t count = mi_atomic_load_relaxed(&regions_count);
  for (size_t i = 0; i < count; i++) {
- uint8_t* start = mi_atomic_read_ptr_relaxed(uint8_t,&regions[i].start);
+ uint8_t* start = (uint8_t*)mi_atomic_load_ptr_relaxed(uint8_t, &regions[i].start);
  if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
  }
  return false;
@@ -133,7 +133,7 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
  static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) {
- uint8_t* start = mi_atomic_read_ptr(uint8_t,&region->start);
+ uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t, &((mem_region_t*)region)->start);
  mi_assert_internal(start != NULL);
  return (start + (bit_idx * MI_SEGMENT_SIZE));
  }
@@ -171,7 +171,7 @@ static bool mi_memid_is_arena(size_t id, mem_region_t** region, mi_bitmap_index_
  static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) {
  // not out of regions yet?
- if (mi_atomic_read_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;
+ if (mi_atomic_load_relaxed(&regions_count) >= MI_REGION_MAX - 1) return false;
  // try to allocate a fresh region from the OS
  bool region_commit = (commit && mi_option_is_enabled(mi_option_eager_region_commit));
@@ -184,9 +184,9 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large,
  mi_assert_internal(!region_large || region_commit);
  // claim a fresh slot
- const uintptr_t idx = mi_atomic_increment(&regions_count);
+ const uintptr_t idx = mi_atomic_increment_acq_rel(&regions_count);
  if (idx >= MI_REGION_MAX) {
- mi_atomic_decrement(&regions_count);
+ mi_atomic_decrement_acq_rel(&regions_count);
  _mi_arena_free(start, MI_REGION_SIZE, arena_memid, region_commit, tld->stats);
  _mi_warning_message("maximum regions used: %zu GiB (perhaps recompile with a larger setting for MI_HEAP_REGION_MAX_SIZE)", _mi_divide_up(MI_HEAP_REGION_MAX_SIZE, GiB));
  return false;
@@ -195,13 +195,13 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large,
  // allocated, initialize and claim the initial blocks
  mem_region_t* r = &regions[idx];
  r->arena_memid = arena_memid;
- mi_atomic_write(&r->in_use, 0);
- mi_atomic_write(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
- mi_atomic_write(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
- mi_atomic_write(&r->reset, 0);
+ mi_atomic_store_release(&r->in_use, 0);
+ mi_atomic_store_release(&r->dirty, (is_zero ? 0 : MI_BITMAP_FIELD_FULL));
+ mi_atomic_store_release(&r->commit, (region_commit ? MI_BITMAP_FIELD_FULL : 0));
+ mi_atomic_store_release(&r->reset, 0);
  *bit_idx = 0;
  mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
- mi_atomic_write_ptr(uint8_t*,&r->start, start);
+ mi_atomic_store_ptr_release(uint8_t*,&r->start, start);
  // and share it
  mi_region_info_t info;
@@ -209,7 +209,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large,
  info.x.valid = true;
  info.x.is_large = region_large;
  info.x.numa_node = (short)_mi_os_numa_node(tld);
- mi_atomic_write(&r->info, info.value); // now make it available to others
+ mi_atomic_store_release(&r->info, info.value); // now make it available to others
  *region = r;
  return true;
  }
@@ -221,7 +221,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large,
  static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, bool allow_large ) {
  // initialized at all?
  mi_region_info_t info;
- info.value = mi_atomic_read_relaxed(&region->info);
+ info.value = mi_atomic_load_relaxed(&((mem_region_t*)region)->info);
  if (info.value==0) return false;
  // numa correct
@@ -240,7 +240,7 @@ static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, boo
  static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld) {
  // try all regions for a free slot
- const size_t count = mi_atomic_read(&regions_count);
+ const size_t count = mi_atomic_load_acquire(&regions_count);
  size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? Starting at 0 seems to increase latency though
  for (size_t visited = 0; visited < count; visited++, idx++) {
  if (idx >= count) idx = 0; // wrap around
@@ -280,8 +280,8 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo
  mi_assert_internal(mi_bitmap_is_claimed(&region->in_use, 1, blocks, bit_idx));
  mi_region_info_t info;
- info.value = mi_atomic_read(&region->info);
- uint8_t* start = mi_atomic_read_ptr(uint8_t,&region->start);
+ info.value = mi_atomic_load_acquire(&region->info);
+ uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&region->start);
  mi_assert_internal(!(info.x.is_large && !*is_large));
  mi_assert_internal(start != NULL);
@@ -400,7 +400,7 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re
  const size_t blocks = mi_region_block_count(size);
  mi_assert_internal(blocks + bit_idx <= MI_BITMAP_FIELD_BITS);
  mi_region_info_t info;
- info.value = mi_atomic_read(&region->info);
+ info.value = mi_atomic_load_acquire(&region->info);
  mi_assert_internal(info.value != 0);
  void* blocks_start = mi_region_blocks_start(region, bit_idx);
  mi_assert_internal(blocks_start == p); // not a pointer in our area?
@@ -442,23 +442,21 @@ void _mi_mem_free(void* p, size_t size, size_t id, bool full_commit, bool any_re
  -----------------------------------------------------------------------------*/
  void _mi_mem_collect(mi_os_tld_t* tld) {
  // free every region that has no segments in use.
- uintptr_t rcount = mi_atomic_read_relaxed(&regions_count);
+ uintptr_t rcount = mi_atomic_load_relaxed(&regions_count);
  for (size_t i = 0; i < rcount; i++) {
  mem_region_t* region = &regions[i];
- if (mi_atomic_read_relaxed(&region->info) != 0) {
+ if (mi_atomic_load_relaxed(&region->info) != 0) {
  // if no segments used, try to claim the whole region
- uintptr_t m;
- do {
- m = mi_atomic_read_relaxed(&region->in_use);
- } while(m == 0 && !mi_atomic_cas_weak(&region->in_use, MI_BITMAP_FIELD_FULL, 0 ));
+ uintptr_t m = mi_atomic_load_relaxed(&region->in_use);
+ while (m == 0 && !mi_atomic_cas_weak_release(&region->in_use, &m, MI_BITMAP_FIELD_FULL)) { /* nothing */ };
  if (m == 0) {
  // on success, free the whole region
- uint8_t* start = mi_atomic_read_ptr(uint8_t,&regions[i].start);
- size_t arena_memid = mi_atomic_read_relaxed(&regions[i].arena_memid);
- uintptr_t commit = mi_atomic_read_relaxed(&regions[i].commit);
+ uint8_t* start = (uint8_t*)mi_atomic_load_ptr_acquire(uint8_t,&regions[i].start);
+ size_t arena_memid = mi_atomic_load_relaxed(&regions[i].arena_memid);
+ uintptr_t commit = mi_atomic_load_relaxed(&regions[i].commit);
  memset(&regions[i], 0, sizeof(mem_region_t));
  // and release the whole region
- mi_atomic_write(&region->info, 0);
+ mi_atomic_store_release(&region->info, 0);
  if (start != NULL) { // && !_mi_os_is_huge_reserved(start)) {
  _mi_abandoned_await_readers(); // ensure no pending reads
  _mi_arena_free(start, MI_REGION_SIZE, arena_memid, (~commit == 0), tld->stats);
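The region code above follows a publish/consume discipline: a fresh region's fields are written first, `info` is store-released last to make the region visible, and readers acquire-load `info` (or `start`) before touching anything else. A minimal sketch of that ordering with C11 atomics, using a simplified field set and hypothetical names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct demo_region_s {
  _Atomic(uintptr_t) info;    // 0 means "not yet published"
  _Atomic(void*)     start;
  uintptr_t          in_use;  // ordinary field, ordered by the publication stores
} demo_region_t;

static void demo_publish(demo_region_t* r, void* start, uintptr_t info) {
  r->in_use = 0;
  atomic_store_explicit(&r->start, start, memory_order_release);
  atomic_store_explicit(&r->info, info, memory_order_release);   // last: makes r visible
}

static bool demo_lookup(const demo_region_t* r, void** out_start) {
  // cast away const for the load, mirroring the patch's own workaround
  uintptr_t info = atomic_load_explicit((_Atomic(uintptr_t)*)&r->info, memory_order_acquire);
  if (info == 0) return false;   // not published yet
  *out_start = atomic_load_explicit((_Atomic(void*)*)&r->start, memory_order_acquire);
  return true;
}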
diff --git a/src/segment.c b/src/segment.c
index 42919851..2c559f04 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -685,6 +685,7 @@ static mi_segment_t* mi_segment_init(mi_segment_t* segment, size_t required, mi_
  }
  // zero the segment info? -- not always needed as it is zero initialized from the OS
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL); // tsan
  if (!is_zero) {
  ptrdiff_t ofs = offsetof(mi_segment_t, next);
  size_t prefix = offsetof(mi_segment_t, slices) - ofs;
@@ -891,77 +892,75 @@ static mi_tagged_segment_t mi_tagged_segment(mi_segment_t* segment, mi_tagged_se
  // This is a list of visited abandoned pages that were full at the time.
  // this list migrates to `abandoned` when that becomes NULL. The use of
  // this list reduces contention and the rate at which segments are visited.
-static mi_decl_cache_align volatile _Atomic(mi_segment_t*) abandoned_visited; // = NULL
+static mi_decl_cache_align _Atomic(mi_segment_t*) abandoned_visited; // = NULL
  // The abandoned page list (tagged as it supports pop)
-static mi_decl_cache_align volatile _Atomic(mi_tagged_segment_t) abandoned; // = NULL
+static mi_decl_cache_align _Atomic(mi_tagged_segment_t) abandoned; // = NULL
  // We also maintain a count of current readers of the abandoned list
  // in order to prevent resetting/decommitting segment memory if it might
  // still be read.
-static mi_decl_cache_align volatile _Atomic(uintptr_t) abandoned_readers; // = 0
+static mi_decl_cache_align _Atomic(uintptr_t) abandoned_readers; // = 0
  // Push on the visited list
  static void mi_abandoned_visited_push(mi_segment_t* segment) {
  mi_assert_internal(segment->thread_id == 0);
- mi_assert_internal(segment->abandoned_next == NULL);
+ mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t,&segment->abandoned_next) == NULL);
  mi_assert_internal(segment->next == NULL);
  mi_assert_internal(segment->used > 0);
- mi_segment_t* anext;
+ mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited);
  do {
- anext = mi_atomic_read_ptr_relaxed(mi_segment_t, &abandoned_visited);
- segment->abandoned_next = anext;
- } while (!mi_atomic_cas_ptr_weak(mi_segment_t, &abandoned_visited, segment, anext));
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, anext);
+ } while (!mi_atomic_cas_ptr_weak_release(mi_segment_t, &abandoned_visited, &anext, segment));
  }
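The `abandoned` head declared above is a mi_tagged_segment_t rather than a raw pointer: because only pop can suffer from A-B-A, the head packs a small modification counter next to the pointer so a CAS against a stale head cannot succeed. A generic sketch of such a tagged head, assuming the pointed-to objects are at least 16-byte aligned; the exact mimalloc encoding is not reproduced here:

#include <stdatomic.h>
#include <stdint.h>

typedef uintptr_t demo_tagged_t;            // pointer | 4-bit counter in the low bits
#define DEMO_TAG_MASK ((uintptr_t)0x0F)

static inline void* demo_tagged_ptr(demo_tagged_t t) { return (void*)(t & ~DEMO_TAG_MASK); }
static inline demo_tagged_t demo_tagged_make(void* p, demo_tagged_t prev) {
  return (uintptr_t)p | ((prev + 1) & DEMO_TAG_MASK);   // bump the counter on every update
}

static _Atomic(demo_tagged_t) demo_head;
// Even if the same pointer value reappears at the head, the bumped tag makes a
// CAS fail for threads holding an older snapshot, preventing A-B-A corruption.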
  // Move the visited list to the abandoned list.
  static bool mi_abandoned_visited_revisit(void) {
  // quick check if the visited list is empty
- if (mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned_visited)==NULL) return false;
+ if (mi_atomic_load_ptr_relaxed(mi_segment_t, &abandoned_visited) == NULL) return false;
  // grab the whole visited list
- mi_segment_t* first = mi_atomic_exchange_ptr(mi_segment_t, &abandoned_visited, NULL);
+ mi_segment_t* first = mi_atomic_exchange_ptr_acq_rel(mi_segment_t, &abandoned_visited, NULL);
  if (first == NULL) return false;
  // first try to swap directly if the abandoned list happens to be NULL
- const mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned);
  mi_tagged_segment_t afirst;
+ mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
  if (mi_tagged_segment_ptr(ts)==NULL) {
  afirst = mi_tagged_segment(first, ts);
- if (mi_atomic_cas_strong(&abandoned, afirst, ts)) return true;
+ if (mi_atomic_cas_strong_acq_rel(&abandoned, &ts, afirst)) return true;
  }
  // find the last element of the visited list: O(n)
  mi_segment_t* last = first;
- while (last->abandoned_next != NULL) {
- last = last->abandoned_next;
+ mi_segment_t* next;
+ while ((next = mi_atomic_load_ptr_relaxed(mi_segment_t, &last->abandoned_next)) != NULL) {
+ last = next;
  }
  // and atomically prepend to the abandoned list
  // (no need to increase the readers as we don't access the abandoned segments)
- mi_tagged_segment_t anext;
+ mi_tagged_segment_t anext = mi_atomic_load_relaxed(&abandoned);
  do {
- anext = mi_atomic_read_relaxed(&abandoned);
- last->abandoned_next = mi_tagged_segment_ptr(anext);
+ mi_atomic_store_ptr_release(mi_segment_t, &last->abandoned_next, mi_tagged_segment_ptr(anext));
  afirst = mi_tagged_segment(first, anext);
- } while (!mi_atomic_cas_weak(&abandoned, afirst, anext));
+ } while (!mi_atomic_cas_weak_release(&abandoned, &anext, afirst));
  return true;
  }
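mi_abandoned_visited_revisit above grabs the entire visited list with one exchange, walks to its tail, and prepends the batch onto the main list with a CAS loop. The same splice in plain C11 atomics, with an untagged head to keep the sketch short (hypothetical node type, not the mimalloc code):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

typedef struct demo_seg_s { _Atomic(struct demo_seg_s*) next; } demo_seg_t;
static _Atomic(demo_seg_t*) demo_visited;
static _Atomic(demo_seg_t*) demo_abandoned;

static bool demo_revisit(void) {
  // grab the whole visited list at once
  demo_seg_t* first = atomic_exchange_explicit(&demo_visited, NULL, memory_order_acq_rel);
  if (first == NULL) return false;
  // find the tail: O(n), but each batch is walked by only one thread
  demo_seg_t* last = first;
  demo_seg_t* next;
  while ((next = atomic_load_explicit(&last->next, memory_order_relaxed)) != NULL) last = next;
  // prepend the batch: link the old head behind our tail, then publish our first node
  demo_seg_t* head = atomic_load_explicit(&demo_abandoned, memory_order_relaxed);
  do {
    atomic_store_explicit(&last->next, head, memory_order_release);
  } while (!atomic_compare_exchange_weak_explicit(&demo_abandoned, &head, first,
                                                  memory_order_release, memory_order_relaxed));
  return true;
}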
  // Push on the abandoned list.
  static void mi_abandoned_push(mi_segment_t* segment) {
  mi_assert_internal(segment->thread_id == 0);
- mi_assert_internal(segment->abandoned_next == NULL);
+ mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
  mi_assert_internal(segment->next == NULL);
  mi_assert_internal(segment->used > 0);
- mi_tagged_segment_t ts;
  mi_tagged_segment_t next;
+ mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
  do {
- ts = mi_atomic_read_relaxed(&abandoned);
- segment->abandoned_next = mi_tagged_segment_ptr(ts);
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, mi_tagged_segment_ptr(ts));
  next = mi_tagged_segment(segment, ts);
- } while (!mi_atomic_cas_weak(&abandoned, next, ts));
+ } while (!mi_atomic_cas_weak_release(&abandoned, &ts, next));
  }
  // Wait until there are no more pending reads on segments that used to be in the abandoned list
@@ -969,7 +968,7 @@ static void mi_abandoned_push(mi_segment_t* segment) {
  void _mi_abandoned_await_readers(void) {
  uintptr_t n;
  do {
- n = mi_atomic_read(&abandoned_readers);
+ n = mi_atomic_load_acquire(&abandoned_readers);
  if (n != 0) mi_atomic_yield();
  } while (n != 0);
  }
@@ -978,7 +977,7 @@ void _mi_abandoned_await_readers(void) {
  static mi_segment_t* mi_abandoned_pop(void) {
  mi_segment_t* segment;
  // Check efficiently if it is empty (or if the visited list needs to be moved)
- mi_tagged_segment_t ts = mi_atomic_read_relaxed(&abandoned);
+ mi_tagged_segment_t ts = mi_atomic_load_relaxed(&abandoned);
  segment = mi_tagged_segment_ptr(ts);
  if (mi_likely(segment == NULL)) {
  if (mi_likely(!mi_abandoned_visited_revisit())) { // try to swap in the visited list on NULL
@@ -988,19 +987,21 @@ static mi_segment_t* mi_abandoned_pop(void) {
  // Do a pop. We use a reader count to prevent
  // a segment to be decommitted while a read is still pending,
- // and a tagged pointer to prevent A-B-A link corruption.
- mi_atomic_increment(&abandoned_readers); // ensure no segment gets decommitted
+ // and a tagged pointer to prevent A-B-A link corruption.
+ // (this is called from `region.c:_mi_mem_free` for example)
+ mi_atomic_increment_relaxed(&abandoned_readers); // ensure no segment gets decommitted
  mi_tagged_segment_t next = 0;
+ ts = mi_atomic_load_acquire(&abandoned);
  do {
- ts = mi_atomic_read(&abandoned);
  segment = mi_tagged_segment_ptr(ts);
  if (segment != NULL) {
- next = mi_tagged_segment(segment->abandoned_next, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted
+ mi_segment_t* anext = mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next);
+ next = mi_tagged_segment(anext, ts); // note: reads the segment's `abandoned_next` field so should not be decommitted
  }
- } while (segment != NULL && !mi_atomic_cas_weak(&abandoned, next, ts));
- mi_atomic_decrement(&abandoned_readers); // release reader lock
+ } while (segment != NULL && !mi_atomic_cas_weak_acq_rel(&abandoned, &ts, next));
+ mi_atomic_decrement_relaxed(&abandoned_readers); // release reader lock
  if (segment != NULL) {
- segment->abandoned_next = NULL;
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
  }
  return segment;
  }
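The pop above briefly dereferences a segment that another thread may be about to decommit, so it is bracketed by a reader count: memory is only released once _mi_abandoned_await_readers observes the count at zero. A reduced sketch of that protocol (hypothetical names; the real code additionally carries the A-B-A tag shown earlier):

#include <stdatomic.h>
#include <stdint.h>
#include <sched.h>

static _Atomic(uintptr_t) demo_readers;   // = 0

static void demo_read_section(void) {
  atomic_fetch_add_explicit(&demo_readers, 1, memory_order_relaxed);
  // ... safe to read fields of nodes that were on the list here ...
  atomic_fetch_sub_explicit(&demo_readers, 1, memory_order_relaxed);
}

// Called before decommitting or freeing memory that old list nodes may live in.
static void demo_await_readers(void) {
  while (atomic_load_explicit(&demo_readers, memory_order_acquire) != 0) {
    sched_yield();   // stand-in for mi_atomic_yield()
  }
}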
@@ -1012,7 +1013,7 @@ static mi_segment_t* mi_abandoned_pop(void) {
  static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
  mi_assert_internal(segment->used == segment->abandoned);
  mi_assert_internal(segment->used > 0);
- mi_assert_internal(segment->abandoned_next == NULL);
+ mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
  mi_assert_internal(segment->abandoned_visits == 0);
  mi_assert_expensive(mi_segment_is_valid(segment,tld));
@@ -1036,7 +1037,7 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
  _mi_stat_increase(&tld->stats->segments_abandoned, 1);
  mi_segments_track_size(-((long)mi_segment_size(segment)), tld);
  segment->thread_id = 0;
- segment->abandoned_next = NULL;
+ mi_atomic_store_ptr_release(mi_segment_t, &segment->abandoned_next, NULL);
  segment->abandoned_visits = 1; // from 0 to 1 to signify it is abandoned
  mi_abandoned_push(segment);
  }
@@ -1118,7 +1119,7 @@ static bool mi_segment_check_free(mi_segment_t* segment, size_t slices_needed, s
  // Reclaim an abandoned segment; returns NULL if the segment was freed
  // set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full.
  static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) {
- mi_assert_internal(segment->abandoned_next == NULL);
+ mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_segment_t, &segment->abandoned_next) == NULL);
  mi_assert_expensive(mi_segment_is_valid(segment, tld));
  if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; }
@@ -1306,12 +1307,13 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block
  // huge page segments are always abandoned and can be freed immediately by any thread
  mi_assert_internal(segment->kind==MI_SEGMENT_HUGE);
  mi_assert_internal(segment == _mi_page_segment(page));
- mi_assert_internal(mi_atomic_read_relaxed(&segment->thread_id)==0);
+ mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0);
  // claim it and free
  mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized.
  // paranoia: if this it the last reference, the cas should always succeed
- if (mi_atomic_cas_strong(&segment->thread_id, heap->thread_id, 0)) {
+ uintptr_t expected_tid = 0;
+ if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) {
  mi_block_set_next(page, block, page->free);
  page->free = block;
  page->used--;
@@ -1328,6 +1330,11 @@ void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block
  // mi_segments_track_size((long)segment->segment_size, tld);
  _mi_segment_page_free(page, true, &tld->segments);
  }
+#if (MI_DEBUG!=0)
+ else {
+ mi_assert_internal(false);
+ }
+#endif
  }
  /* -----------------------------------------------------------
@@ -1371,7 +1378,7 @@ mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_segment
  #define MI_SEGMENT_MAP_SIZE (MI_SEGMENT_MAP_BITS / 8)
  #define MI_SEGMENT_MAP_WSIZE (MI_SEGMENT_MAP_SIZE / MI_INTPTR_SIZE)
-static volatile _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE]; // 2KiB per TB with 64MiB segments
+static _Atomic(uintptr_t) mi_segment_map[MI_SEGMENT_MAP_WSIZE]; // 2KiB per TB with 64MiB segments
  static size_t mi_segment_map_index_of(const mi_segment_t* segment, size_t* bitidx) {
  mi_assert_internal(_mi_ptr_segment(segment) == segment); // is it aligned on MI_SEGMENT_SIZE?
@@ -1385,12 +1392,11 @@ static void mi_segment_map_allocated_at(const mi_segment_t* segment) {
  size_t index = mi_segment_map_index_of(segment, &bitidx);
  mi_assert_internal(index < MI_SEGMENT_MAP_WSIZE);
  if (index==0) return;
- uintptr_t mask;
+ uintptr_t mask = mi_segment_map[index];
  uintptr_t newmask;
  do {
- mask = mi_segment_map[index];
  newmask = (mask | ((uintptr_t)1 << bitidx));
- } while (!mi_atomic_cas_weak(&mi_segment_map[index], newmask, mask));
+ } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
  }
  static void mi_segment_map_freed_at(const mi_segment_t* segment) {
@@ -1398,12 +1404,11 @@ static void mi_segment_map_freed_at(const mi_segment_t* segment) {
  size_t index = mi_segment_map_index_of(segment, &bitidx);
  mi_assert_internal(index < MI_SEGMENT_MAP_WSIZE);
  if (index == 0) return;
- uintptr_t mask;
+ uintptr_t mask = mi_segment_map[index];
  uintptr_t newmask;
- do {
- mask = mi_segment_map[index];
+ do {
  newmask = (mask & ~((uintptr_t)1 << bitidx));
- } while (!mi_atomic_cas_weak(&mi_segment_map[index], newmask, mask));
+ } while (!mi_atomic_cas_weak_release(&mi_segment_map[index], &mask, newmask));
  }
  // Determine the segment belonging to a pointer or NULL if it is not in a valid segment.
diff --git a/src/stats.c b/src/stats.c
index 98bc3b04..aa002068 100644
--- a/src/stats.c
+++ b/src/stats.c
@@ -26,13 +26,13 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
  if (mi_is_in_main(stat)) {
  // add atomically (for abandoned pages)
- mi_atomic_addi64(&stat->current,amount);
- mi_atomic_maxi64(&stat->peak, mi_atomic_readi64(&stat->current));
+ int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount);
+ mi_atomic_maxi64_relaxed(&stat->peak, current + amount);
  if (amount > 0) {
- mi_atomic_addi64(&stat->allocated,amount);
+ mi_atomic_addi64_relaxed(&stat->allocated,amount);
  }
  else {
- mi_atomic_addi64(&stat->freed, -amount);
+ mi_atomic_addi64_relaxed(&stat->freed, -amount);
  }
  }
  else {
@@ -50,8 +50,8 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
  void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
  if (mi_is_in_main(stat)) {
- mi_atomic_addi64( &stat->count, 1 );
- mi_atomic_addi64( &stat->total, (int64_t)amount );
+ mi_atomic_addi64_relaxed( &stat->count, 1 );
+ mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount );
  }
  else {
  stat->count++;
@@ -71,17 +71,17 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
  static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
  if (stat==src) return;
  if (src->allocated==0 && src->freed==0) return;
- mi_atomic_addi64( &stat->allocated, src->allocated * unit);
- mi_atomic_addi64( &stat->current, src->current * unit);
- mi_atomic_addi64( &stat->freed, src->freed * unit);
+ mi_atomic_addi64_relaxed( &stat->allocated, src->allocated * unit);
+ mi_atomic_addi64_relaxed( &stat->current, src->current * unit);
+ mi_atomic_addi64_relaxed( &stat->freed, src->freed * unit);
  // peak scores do not work across threads..
- mi_atomic_addi64( &stat->peak, src->peak * unit);
+ mi_atomic_addi64_relaxed( &stat->peak, src->peak * unit);
  }
  static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) {
  if (stat==src) return;
- mi_atomic_addi64( &stat->total, src->total * unit);
- mi_atomic_addi64( &stat->count, src->count * unit);
+ mi_atomic_addi64_relaxed( &stat->total, src->total * unit);
+ mi_atomic_addi64_relaxed( &stat->count, src->count * unit);
  }
  // must be thread safe as it is called from stats_merge
@@ -384,7 +384,7 @@ void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept {
  // Basic timer for convenience; use milli-seconds to avoid doubles
  // ----------------------------------------------------------------
  #ifdef _WIN32
-#include
+#include
  static mi_msecs_t mi_to_msecs(LARGE_INTEGER t) {
  static LARGE_INTEGER mfreq; // = 0
  if (mfreq.QuadPart == 0LL) {
@@ -439,7 +439,7 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) {
  // --------------------------------------------------------
  #if defined(_WIN32)
-#include
+#include
  #include
  #pragma comment(lib,"psapi.lib")
diff --git a/test/main-override.cpp b/test/main-override.cpp
index 16c40281..fe5403d1 100644
--- a/test/main-override.cpp
+++ b/test/main-override.cpp
@@ -19,7 +19,7 @@
  #endif
  #ifdef _WIN32
-#include
+#include
  static void msleep(unsigned long msecs) { Sleep(msecs); }
  #else
  #include
diff --git a/test/test-stress.c b/test/test-stress.c
index b75f9e97..0e8d3ef3 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -189,7 +189,7 @@ static void test_stress(void) {
  }
  }
  // mi_collect(false);
-#ifndef NDEBUG
+#if !defined(NDEBUG) || defined(MI_TSAN)
  if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); }
  #endif
  }
@@ -260,7 +260,7 @@ static void (*thread_entry_fun)(intptr_t) = &stress;
  #ifdef _WIN32
-#include
+#include
  static DWORD WINAPI thread_entry(LPVOID param) {
  thread_entry_fun((intptr_t)param);
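The statistics above switch to relaxed add-and-return updates, with the peak maintained by an atomic max. A typical maxi64 helper is a small CAS loop like the following sketch (illustrative only; not necessarily mimalloc's exact implementation):

#include <stdatomic.h>
#include <stdint.h>

// Raise *p to at least `value`; concurrent updaters can interleave freely
// because only a strictly larger value ever replaces the current one.
static void demo_maxi64_relaxed(_Atomic(int64_t)* p, int64_t value) {
  int64_t current = atomic_load_explicit(p, memory_order_relaxed);
  while (current < value &&
         !atomic_compare_exchange_weak_explicit(p, &current, value,
                                                memory_order_relaxed, memory_order_relaxed)) {
    // `current` now holds the latest observed value; retry if still smaller
  }
}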