diff --git a/CMakeLists.txt b/CMakeLists.txt
index 89dad3b5..b1f66f5c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -385,7 +385,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM
list(APPEND mi_cflags_dynamic -ftls-model=initial-exec)
message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)")
else()
- list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2 -O2)
+ list(APPEND mi_cflags -ftls-model=initial-exec)
endif()
endif()
if(MI_OVERRIDE)
diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj
index d03fd281..e9a4a339 100644
--- a/ide/vs2022/mimalloc.vcxproj
+++ b/ide/vs2022/mimalloc.vcxproj
@@ -120,7 +120,6 @@
CompileAsCpp
false
stdcpp20
- AdvancedVectorExtensions2
diff --git a/src/bitmap.c b/src/bitmap.c
index fb8468fa..8479555c 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -505,7 +505,7 @@ static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) {
mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx);
}
-// ------ find_and_try_xset --------
+// ------ try_find_and_clear --------
#if defined(__AVX2__)
static inline __m256i mi_mm256_zero(void) {
@@ -526,7 +526,7 @@ static inline bool mi_mm256_is_zero( __m256i vec) {
// set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
// This is used to find free slices and abandoned pages and should be efficient.
// todo: try neon version
-static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) {
+static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) {
#if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
while (true) {
const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
@@ -613,10 +613,10 @@ static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx
// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
// Used to find medium size pages in the free blocks.
// todo: try neon version
-static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pidx) {
+static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) {
#if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
while (true) {
- // since a cache-line is 64b, load all at once
+ // since a cache-line is 64b, load all at once
const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1);
const __m256i cmpv = mi_mm256_ones();
@@ -628,9 +628,9 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid
// mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared)
if (mask==0) return false;
const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk
- const size_t chunk_idx = bidx / 8;
+ const size_t chunk_idx = bidx / 8;
const size_t byte_idx = bidx % 8; // byte index of the byte in the bfield
- mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
+ mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically
*pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx;
mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
@@ -668,10 +668,10 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid
// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
// Used to find large size pages in the free blocks.
// todo: try neon version
-static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pidx) {
-#if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
+static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) {
+ #if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
while (true) {
- // since a cache-line is 64b, load all at once
+ // since a cache-line is 64b, load all at once
const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1);
const __m256i cmpv = mi_mm256_ones();
@@ -689,7 +689,7 @@ static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pid
*pidx = chunk_idx*MI_BFIELD_BITS;
mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS);
return true;
- }
+ }
// try again
}
#else
@@ -710,7 +710,7 @@ static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pid
// and try to clear them atomically.
// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
// (We do not cross bfield boundaries)
-static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+static mi_decl_noinline bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
if (n == 0 || n > MI_BFIELD_BITS) return false;
const mi_bfield_t mask = mi_bfield_mask(n, 0);
for(int i = 0; i < MI_BCHUNK_FIELDS; i++) {
@@ -752,10 +752,10 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t*
// and try to clear them atomically.
// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
// This can cross bfield boundaries.
-static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
-
- // we align at a bfield, and scan `field_count` fields
+
+ // we align at a bfield, and scan `field_count` fields
// n >= MI_BFIELD_BITS; find a first field that is 0
const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields
for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++)
@@ -780,7 +780,7 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t*
m -= MI_BFIELD_BITS; // note: can underflow
}
} while (++j < field_count);
-
+
// if all set, we can try to atomically clear them
if (allset) {
const size_t cidx = i*MI_BFIELD_BITS;
@@ -798,13 +798,13 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t*
}
-static inline bool mi_bchunk_find_and_try_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
- if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); // small pages
- if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); // medium pages
- if (n==MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearX(chunk, pidx); // large pages
+static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
+ if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages
+ if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages
+ if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages
if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
- if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx);
- return mi_bchunk_find_and_try_clearN_(chunk, n, pidx);
+ if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx);
+ return mi_bchunk_try_find_and_clearN_(chunk, n, pidx);
}
@@ -858,7 +858,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx)
mi_bchunk_set(&bitmap->chunkmap, chunk_idx);
return false;
}
- // record the max clear
+ // record the max clear
size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
do {
if mi_likely(chunk_idx <= oldmax) break;
@@ -1139,23 +1139,22 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all.
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
-// (Used to find fresh free slices.)
+// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS)
mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx)
{
// const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx)
{
size_t cidx;
- if mi_likely(mi_bchunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) {
+ if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) {
*pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
- mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n);
+ mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap));
return true;
}
else {
// we may find that all are cleared only on a second iteration but that is ok as
// the chunkmap is a conservative approximation.
mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx);
- // continue
}
}
mi_bitmap_forall_chunks_end();
@@ -1171,7 +1170,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t
mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx)
{
size_t cidx;
- if mi_likely(mi_bchunk_find_and_try_clear(&bitmap->chunks[chunk_idx], &cidx)) {
+ if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) {
const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx;
mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap));
bool keep_set = true;
@@ -1182,19 +1181,17 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t
return true;
}
else {
- // failed to claim it, set abandoned mapping again (unless thet page was freed)
+ // failed to claim it, set abandoned mapping again (unless the page was freed)
if (keep_set) {
const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx);
mi_assert_internal(wasclear); MI_UNUSED(wasclear);
}
- // continue
}
}
else {
// we may find that all are cleared only on a second iteration but that is ok as
// the chunkmap is a conservative approximation.
mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx);
- // continue
}
}
mi_bitmap_forall_chunks_end();