merge from dev3

daanx 2025-02-08 13:14:19 -08:00
commit 8d9b6b2b9e
8 changed files with 110 additions and 78 deletions

View file

@@ -431,9 +431,18 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel")
   endif()
 endif()
 # Compiler and architecture specific flags
 if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku")
   if(MI_OPT_ARCH)
-    if(MI_ARCH STREQUAL "x64")
+    if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES)  # to support multi-arch binaries (#999)
+      set(MI_OPT_ARCH_FLAGS "")
+      if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
+        list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a;-mtune=native")
+      endif()
+      if("x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES)
+        list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_x86_64;-march=haswell;-Xarch_x86_64;-mavx2")
+      endif()
+    elseif(MI_ARCH STREQUAL "x64")
       set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2;-mtune=native")   # fast bit scan (since 2013)
     elseif(MI_ARCH STREQUAL "arm64")
       set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a;-mtune=native")        # fast atomics (since 2016)

View file

@@ -199,6 +199,8 @@ static inline size_t mi_ctz(size_t x) {
   size_t r;
   __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
   return r;
+#elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__)
+  return _tzcnt_u64(x);
 #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
   unsigned long idx;
   return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS);
@@ -221,6 +223,8 @@ static inline size_t mi_clz(size_t x) {
   size_t r;
   __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
   return r;
+#elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__)
+  return _lzcnt_u64(x);
 #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
   unsigned long idx;
   return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS);
@@ -254,7 +258,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) {
   bool is_zero;
   __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" );
   return !is_zero;
-#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+#elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
   unsigned long i;
   return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false);
 #else
@@ -271,7 +275,7 @@ static inline bool mi_bsr(size_t x, size_t* idx) {
   bool is_zero;
   __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc");
   return !is_zero;
-#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+#elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
   unsigned long i;
   return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false);
 #else
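
For reference, a small standalone C program (not part of the commit) that illustrates why the new BMI1 branch can return the intrinsic result directly: unlike the older BSF/BSR instructions, TZCNT and LZCNT are defined for a zero input and return the operand width (64 here), which is exactly the MI_SIZE_BITS result the surrounding fallbacks produce for x == 0. A sketch, assuming a BMI1/LZCNT-capable target (e.g. compile with -march=haswell on GCC/Clang):

#include <stdint.h>
#include <stdio.h>
#if defined(_MSC_VER)
#include <intrin.h>
#else
#include <immintrin.h>
#endif

static size_t ctz64(uint64_t x) { return (size_t)_tzcnt_u64(x); }   // trailing zero count
static size_t clz64(uint64_t x) { return (size_t)_lzcnt_u64(x); }   // leading zero count

int main(void) {
  printf("%zu %zu\n", ctz64(0x10), clz64(0x10));  // prints: 4 59
  printf("%zu %zu\n", ctz64(0),    clz64(0));     // prints: 64 64  (no zero check needed)
  return 0;
}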

View file

@@ -99,7 +99,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #define MI_ENCODE_FREELIST  1
 #endif

-// Enable large pages for objects between 64KiB and 256KiB.
+// Enable large pages for objects between 64KiB and 512KiB.
 // Disabled by default as for many workloads the block sizes above 64 KiB are quite random which can lead to too many partially used large pages.
 #ifndef MI_ENABLE_LARGE_PAGES
 #define MI_ENABLE_LARGE_PAGES  0
@@ -342,7 +342,7 @@ typedef struct mi_page_s {
 #define MI_SMALL_MAX_OBJ_SIZE    ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4)   // < 16 KiB
 #if MI_ENABLE_LARGE_PAGES
 #define MI_MEDIUM_MAX_OBJ_SIZE   ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4)  // < 128 KiB
-#define MI_LARGE_MAX_OBJ_SIZE    (MI_LARGE_PAGE_SIZE/8)   // <= 256 KiB  // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
+#define MI_LARGE_MAX_OBJ_SIZE    (MI_LARGE_PAGE_SIZE/8)   // <= 512 KiB  // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
 #else
 #define MI_MEDIUM_MAX_OBJ_SIZE   (MI_MEDIUM_PAGE_SIZE/4)  // <= 128 KiB
 #define MI_LARGE_MAX_OBJ_SIZE    MI_MEDIUM_MAX_OBJ_SIZE   // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`

View file

@@ -165,25 +165,30 @@ static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already
 // Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0
 // and false otherwise (leaving the bit field as is).
 // `all_clear` is set to `true` if the new bfield became zero.
-static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) {
+static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) {
   mi_assert_internal(mask != 0);
-  mi_bfield_t old = mi_atomic_load_relaxed(b);
+  // try to atomically clear the mask bits
   do {
-    if ((old&mask) != mask) {
-      // the mask bits are no longer set
-      if (all_clear != NULL) { *all_clear = (old==0); }
+    if ((expect & mask) != mask) {  // are all bits still set?
+      if (all_clear != NULL) { *all_clear = (expect == 0); }
       return false;
     }
-  } while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask));  // try to atomically clear the mask bits
-  if (all_clear != NULL) { *all_clear = ((old&~mask) == 0); }
+  } while (!mi_atomic_cas_weak_acq_rel(b, &expect, expect & ~mask));
+  if (all_clear != NULL) { *all_clear = ((expect & ~mask) == 0); }
   return true;
 }

+static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask, bool* all_clear) {
+  mi_assert_internal(mask != 0);
+  const mi_bfield_t expect = mi_atomic_load_relaxed(b);
+  return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear);
+}
+
+/*
 // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0
 // and `false` otherwise leaving the bfield `b` as-is.
 // `all_clear` is set to true if the new bfield became zero (and false otherwise)
-static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) {
+static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) {
   mi_assert_internal(idx < MI_BFIELD_BITS);
   const mi_bfield_t mask = mi_bfield_one()<<idx;
   return mi_bfield_atomic_try_clear_mask(b, mask, all_clear);
@@ -198,6 +203,7 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t id
   const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<idx;
   return mi_bfield_atomic_try_clear_mask(b, mask, all_clear);
 }
+*/

 // Try to clear a full field of bits atomically, and return true all bits transitioned from all 1's to 0's.
 // and false otherwise leaving the bit field as-is.
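
The new `_of` variant factors the initial load out of the retry loop so callers that already hold a freshly loaded value of the field (as the find-and-clear paths further down do) can pass it in and skip an extra atomic load. A minimal standalone sketch of the same compare-and-swap pattern in plain C11 atomics, with assumed names rather than mimalloc's types:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t bfield_t;

// Clear `mask` only if all mask bits are currently set; `expect` is the value
// the caller already loaded. The weak CAS refreshes `expect` on failure, so the
// loop either clears the bits itself or observes that someone else cleared one.
static bool try_clear_mask_of(_Atomic(bfield_t)* b, bfield_t mask, bfield_t expect) {
  do {
    if ((expect & mask) != mask) return false;   // some mask bit is already 0
  } while (!atomic_compare_exchange_weak_explicit(
               b, &expect, expect & ~mask,
               memory_order_acq_rel, memory_order_acquire));
  return true;                                   // we transitioned mask -> 0
}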
@@ -534,15 +540,14 @@ static inline bool mi_mm256_is_zero( __m256i vec) {
 }
 #endif

-static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) {
+static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) {
   mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
   // note: this must be acquire (and not relaxed), or otherwise the AVX code below can loop forever
   // as the compiler won't reload the registers vec1 and vec2 from memory again.
   const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]);
   size_t idx;
-  if (!allow_allset && (~b == 0)) return false;
   if (mi_bfield_find_least_bit(b, &idx)) {  // find the least bit
-    if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], idx, NULL)) {  // clear it atomically
+    if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], mi_bfield_mask(1,idx), b, NULL)) {  // clear it atomically
       *pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
       mi_assert_internal(*pidx < MI_BCHUNK_BITS);
       return true;
@@ -565,7 +570,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
     if (mask==0) return false;
     mi_assert_internal((_tzcnt_u32(mask)%8) == 0);  // tzcnt == 0, 8, 16, or 24
     const size_t chunk_idx = _tzcnt_u32(mask) / 8;
-    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true;
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
     // try again
     // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
   }
@@ -600,7 +605,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
     mi_assert_internal((_tzcnt_u64(mask)%8) == 0);  // tzcnt == 0, 8, 16, 24, ..
     chunk_idx = mi_ctz(mask) / 8;
     #endif
-    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true;
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
     // try again
     // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
   }
@@ -621,17 +626,13 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx
     if (mask==0) return false;
     mi_assert_internal((mi_ctz(mask)%8) == 0);  // tzcnt == 0, 8, 16, 24, ..
     const size_t chunk_idx = mi_ctz(mask) / 8;
-    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true;
+    if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true;
     // try again
     // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
   }
 #else
-  // try first to find a field that is not all set (to reduce fragmentation) (not needed for binned bitmaps)
-  // for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-  //   if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true;
-  // }
   for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-    if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true;
+    if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx)) return true;
   }
   return false;
 #endif
@@ -643,9 +644,8 @@ static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n,
 }

 #if !(MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512))
-static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) {
+static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) {
   const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]);
-  if (!allow_all_set && (~b == 0)) return false;
   // has_set8 has low bit in each byte set if the byte in x == 0xFF
   const mi_bfield_t has_set8 =
     ((~b - MI_BFIELD_LO_BIT8) &   // high bit set if byte in x is 0xFF or < 0x7F
@@ -655,7 +655,7 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c
   if (mi_bfield_find_least_bit(has_set8, &idx)) {  // find least 1-bit
     mi_assert_internal(idx <= (MI_BFIELD_BITS - 8));
     mi_assert_internal((idx%8)==0);
-    if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) {  // unset the byte atomically
+    if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], (mi_bfield_t)0xFF << idx, b, NULL)) {  // unset the byte atomically
       *pidx = (chunk_idx*MI_BFIELD_BITS) + idx;
       mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
       return true;
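
The `has_set8` expression used by this function is a SWAR ("SIMD within a register") trick for locating a fully set byte without looping over the bytes. A standalone sketch with assumed constant names (`LO_BIT8`/`HI_BIT8` stand in for `MI_BFIELD_LO_BIT8`/`MI_BFIELD_HI_BIT8`):

#include <stdint.h>
#include <stdio.h>

#define LO_BIT8 UINT64_C(0x0101010101010101)   // lowest bit of every byte
#define HI_BIT8 UINT64_C(0x8080808080808080)   // highest bit of every byte

// Returns a word with bit 8*i set when byte i of `b` is 0xFF. The least
// significant match is always genuine (a false positive needs a borrow from a
// lower 0xFF byte, which is itself flagged lower), and the caller above only
// takes the least set bit anyway.
static uint64_t has_set8(uint64_t b) {
  return ((~b - LO_BIT8) &   // high bit of a byte set if that byte is 0xFF or < 0x7F
          ( b &  HI_BIT8))   // high bit of a byte set if that byte is >= 0x80
          >> 7;              // move each byte's high bit down to its low bit
}

int main(void) {
  printf("%llx\n", (unsigned long long)has_set8(UINT64_C(0x00000000FF000000)));
  // prints: 1000000  (bit 24, i.e. byte 3 is the fully set byte)
  return 0;
}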
@@ -696,12 +696,8 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s
     // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded
   }
 #else
-  // first skip allset fields to reduce fragmentation (not needed for binned bitmaps)
-  // for(int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-  //   if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true;
-  // }
   for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-    if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true;
+    if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true;
   }
   return false;
 #endif
@@ -771,16 +767,18 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk,
   const mi_bfield_t mask = mi_bfield_mask(n, 0);
   // for all fields in the chunk
   for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
-    mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
+    mi_bfield_t b0 = mi_atomic_load_relaxed(&chunk->bfields[i]);
+    mi_bfield_t b  = b0;
     size_t idx;

     // is there a range inside the field?
     while (mi_bfield_find_least_bit(b, &idx)) {  // find least 1-bit
-      if (idx + n > MI_BFIELD_BITS) break;  // too short, maybe cross over, or continue with the next field
+      if (idx + n > MI_BFIELD_BITS) break;  // too short: maybe cross over, or continue with the next field
       const size_t bmask = mask<<idx;
       mi_assert_internal(bmask>>idx == mask);
       if ((b&bmask) == bmask) {  // found a match with all bits set, try clearing atomically
-        if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) {
+        if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[i], bmask, b0, NULL)) {
           *pidx = (i*MI_BFIELD_BITS) + idx;
           mi_assert_internal(*pidx < MI_BCHUNK_BITS);
           mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS);
@@ -788,23 +786,24 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk,
         }
         else {
           // if we failed to atomically commit, reload b and try again from the start
-          b = mi_atomic_load_acquire(&chunk->bfields[i]);
+          b = b0 = mi_atomic_load_acquire(&chunk->bfields[i]);
         }
       }
       else {
-        // advance
-        const size_t ones = mi_bfield_ctz(~(b>>idx));  // skip all ones (since it didn't fit the mask)
-        mi_assert_internal(ones>0);
-        b = b & ~mi_bfield_mask(ones, idx);  // clear the ones
+        // advance by clearing the least run of ones, for example, with n>=4, idx=2:
+        // b             = 1111 1101 1010 1100
+        // .. + (1<<idx) = 1111 1101 1011 0000
+        // .. & b        = 1111 1101 1010 0000
+        b = b & (b + (mi_bfield_one() << idx));
       }
     }

     // check if we can cross into the next bfield
-    if (i < MI_BCHUNK_FIELDS-1) {
+    if (b!=0 && i < MI_BCHUNK_FIELDS-1) {
       const size_t post = mi_bfield_clz(~b);
       if (post > 0) {
         const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1]));
-        if (post + pre <= n) {
+        if (post + pre >= n) {
           // it fits -- try to claim it atomically
           const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post);
           if (mi_bchunk_try_clearNX(chunk, cidx, n, NULL)) {
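
The rewritten advance step replaces the ctz-and-mask arithmetic with a single add-and-mask: adding `1 << idx` at the least set bit carries through the contiguous run of ones that starts at `idx`, so AND-ing the sum with the original word clears exactly that run (the worked example in the new comment). A tiny standalone check of the identity, in plain C:

#include <assert.h>
#include <stdint.h>

// Clear the contiguous run of 1-bits that starts at bit `idx`; here `idx` is
// the position of the least set bit, as produced by the find-least-bit loop above.
static uint64_t clear_least_run(uint64_t b, unsigned idx) {
  return b & (b + (UINT64_C(1) << idx));
}

int main(void) {
  // 0xFDAC = 1111 1101 1010 1100, least set bit at idx = 2
  assert(clear_least_run(UINT64_C(0xFDAC), 2) == UINT64_C(0xFDA0));  // 1111 1101 1010 0000
  return 0;
}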
@@ -889,15 +888,6 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk,
 }

-//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
-//  if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx);   // small pages
-//  if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx);  // medium pages
-//  // if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx);  // large pages
-//  if (n==0 || n > MI_BCHUNK_BITS) return false;  // cannot be more than a chunk
-//  if (n<=MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx);
-//  return mi_bchunk_try_find_and_clearN_(chunk, n, pidx);
-//}

 // ------- mi_bchunk_clear_once_set ---------------------------------------

View file

@@ -271,10 +271,6 @@ void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
 // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
 bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);

-// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's
-// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)!
-bool mi_bbitmap_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n);
-
 // Is a sequence of n bits already all set/cleared?
 bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n);

View file

@@ -201,7 +201,7 @@ void mi_free(void* p) mi_attr_noexcept
 // ------------------------------------------------------
 // Multi-threaded Free (`_mt`)
 // ------------------------------------------------------
+static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free);

 static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept {
   mi_assert_internal(mi_page_is_owned(page));
@@ -269,7 +269,36 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t*
   // not reclaimed or free'd, unown again
-  _mi_page_unown(page);
+  // _mi_page_unown(page);
+  mi_page_unown_from_free(page, mt_free);
+}
+
+// release ownership of a page. This may free the page if all (other) blocks were concurrently
+// freed in the meantime. Returns true if the page was freed.
+// This is a specialized version of `mi_page_unown` to (try to) avoid calling `mi_page_free_collect` again.
+static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free) {
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_is_abandoned(page));
+  mi_assert_internal(mt_free != NULL);
+  mi_assert_internal(page->used > 1);
+  mi_thread_free_t tf_expect = mi_tf_create(mt_free, true);
+  mi_thread_free_t tf_new    = mi_tf_create(mt_free, false);
+  while mi_unlikely(!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_expect, tf_new)) {
+    mi_assert_internal(mi_tf_is_owned(tf_expect));
+    while (mi_tf_block(tf_expect) != NULL) {
+      _mi_page_free_collect(page,false);  // update used
+      if (mi_page_all_free(page)) {       // it may become free just before unowning it
+        _mi_arenas_page_unabandon(page);
+        _mi_arenas_page_free(page);
+        return true;
+      }
+      tf_expect = mi_atomic_load_relaxed(&page->xthread_free);
+    }
+    mi_assert_internal(mi_tf_block(tf_expect)==NULL);
+    tf_new = mi_tf_create(NULL, false);
+  }
+  return false;
 }
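
For context on the CAS above: it relies on the thread-free list head and the page's ownership flag being packed into a single word (`mi_thread_free_t`), so dropping ownership, or noticing concurrently pushed blocks, is one atomic operation on `xthread_free`. A rough sketch of that packing, assuming the flag lives in the pointer's low bit as the `mi_tf_create`/`mi_tf_block`/`mi_tf_is_owned` helpers suggest:

#include <stdbool.h>
#include <stdint.h>

typedef uintptr_t thread_free_t;   // block pointer with the owned-flag in bit 0 (assumed layout)

static thread_free_t tf_create(void* block, bool owned) {
  return (thread_free_t)block | (owned ? 1 : 0);
}
static void* tf_block(thread_free_t tf)    { return (void*)(tf & ~(uintptr_t)1); }
static bool  tf_is_owned(thread_free_t tf) { return (tf & 1) != 0; }

With such a layout, the unown succeeds with a single CAS when the list head is still `mt_free`; otherwise the loop first collects the blocks that were pushed concurrently, and may free the page outright if it became empty.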

View file

@@ -215,11 +215,17 @@ void _mi_page_free_collect(mi_page_t* page, bool force) {
   mi_assert_internal(!force || page->local_free == NULL);
 }

-// collect elements in the thread-free list starting at `head`.
+// Collect elements in the thread-free list starting at `head`. This is an optimized
+// version of `_mi_page_free_collect` to be used from `free.c:_mi_free_collect_mt` that avoids atomic access to `xthread_free`.
+//
+// `head` must be in the `xthread_free` list. It will not collect `head` itself
+// so the `used` count is not fully updated in general. However, if the `head` is
+// the last remaining element, it will be collected and the used count will become `0` (so `mi_page_all_free` becomes true).
 void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) {
   if (head == NULL) return;
-  mi_block_t* next = mi_block_next(page,head);  // we cannot collect the head element itself as `page->thread_free` may point at it (and we want to avoid atomic ops)
+  mi_block_t* next = mi_block_next(page,head);  // we cannot collect the head element itself as `page->thread_free` may point to it (and we want to avoid atomic ops)
   if (next != NULL) {
+    mi_block_set_next(page, head, NULL);
     mi_page_thread_collect_to_local(page, next);
     if (page->local_free != NULL && page->free == NULL) {
       page->free = page->local_free;
@@ -229,6 +235,8 @@ void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) {
   }
   if (page->used == 1) {
     // all elements are free'd since we skipped the `head` element itself
+    mi_assert_internal(mi_tf_block(mi_atomic_load_relaxed(&page->xthread_free)) == head);
+    mi_assert_internal(mi_block_next(page,head) == NULL);
     _mi_page_free_collect(page, false);  // collect the final element
   }
 }
@@ -816,31 +824,25 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m
 // Find a page with free blocks of `size`.
-static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) {
+static mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) {
   // mi_page_queue_t* pq = mi_page_queue(heap, size);
   mi_assert_internal(!mi_page_queue_is_huge(pq));

   // check the first page: we even do this with candidate search or otherwise we re-search every time
   mi_page_t* page = pq->first;
-  if (page != NULL) {
+  if mi_likely(page != NULL && mi_page_immediate_available(page)) {
     #if (MI_SECURE>=3)  // in secure mode, we extend half the time to increase randomness
     if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) {
       mi_page_extend_free(heap, page);
       mi_assert_internal(mi_page_immediate_available(page));
     }
-    else
     #endif
-    {
-      _mi_page_free_collect(page,false);
-    }
-
-    if (mi_page_immediate_available(page)) {
-      page->retire_expire = 0;
-      return page;  // fast path
-    }
+    page->retire_expire = 0;
+    return page;  // fast path
+  }
+  else {
+    return mi_page_queue_find_free_ex(heap, pq, true);
   }
-  return mi_page_queue_find_free_ex(heap, pq, true);
 }

View file

@@ -127,9 +127,11 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config )
   config->has_partial_free = false;
   config->has_virtual_reserve = true;
   // windows version
-  const DWORD win_version = GetVersion();
-  win_major_version = (DWORD)(LOBYTE(LOWORD(win_version)));
-  win_minor_version = (DWORD)(HIBYTE(LOWORD(win_version)));
+  OSVERSIONINFOW version; _mi_memzero_var(version);
+  if (GetVersionExW(&version)) {
+    win_major_version = version.dwMajorVersion;
+    win_minor_version = version.dwMinorVersion;
+  }
   // get the page size
   SYSTEM_INFO si;
   GetSystemInfo(&si);
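
One note on the new version query: the Windows documentation requires `dwOSVersionInfoSize` to be set to the structure size before calling `GetVersionExW`, and the call fails when it is zero. A minimal standalone sketch of that documented pattern (not mimalloc's exact code):

#include <windows.h>

static void get_windows_version(DWORD* major, DWORD* minor) {
  OSVERSIONINFOW v;
  ZeroMemory(&v, sizeof(v));
  v.dwOSVersionInfoSize = sizeof(v);   // required by GetVersionExW
  if (GetVersionExW(&v)) {             // reports the (possibly manifest-adjusted) version
    *major = v.dwMajorVersion;
    *minor = v.dwMinorVersion;
  }
}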
@@ -668,7 +670,7 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) {
   #define MI_PRIM_HAS_PROCESS_ATTACH  1

   // Windows DLL: easy to hook into process_init and thread_done
-  __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
+  BOOL WINAPI DllMain(HINSTANCE inst, DWORD reason, LPVOID reserved) {
     mi_win_main((PVOID)inst,reason,reserved);
     return TRUE;
   }