arch specific optimizations

This commit is contained in:
daanx 2024-12-07 15:02:27 -08:00
parent 0e5d5831e4
commit 6b52b19e3b
3 changed files with 28 additions and 32 deletions

View file

@ -385,7 +385,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM
list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) list(APPEND mi_cflags_dynamic -ftls-model=initial-exec)
message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)")
else() else()
list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2 -O2) list(APPEND mi_cflags -ftls-model=initial-exec)
endif() endif()
endif() endif()
if(MI_OVERRIDE) if(MI_OVERRIDE)

View file

@ -120,7 +120,6 @@
<CompileAs>CompileAsCpp</CompileAs> <CompileAs>CompileAsCpp</CompileAs>
<SupportJustMyCode>false</SupportJustMyCode> <SupportJustMyCode>false</SupportJustMyCode>
<LanguageStandard>stdcpp20</LanguageStandard> <LanguageStandard>stdcpp20</LanguageStandard>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
</ClCompile> </ClCompile>
<PostBuildEvent> <PostBuildEvent>
<Command> <Command>

View file

@ -505,7 +505,7 @@ static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) {
mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx);
} }
// ------ find_and_try_xset -------- // ------ try_find_and_clear --------
#if defined(__AVX2__) #if defined(__AVX2__)
static inline __m256i mi_mm256_zero(void) { static inline __m256i mi_mm256_zero(void) {
@ -526,7 +526,7 @@ static inline bool mi_mm256_is_zero( __m256i vec) {
// set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
// This is used to find free slices and abandoned pages and should be efficient. // This is used to find free slices and abandoned pages and should be efficient.
// todo: try neon version // todo: try neon version
static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) { static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) {
#if defined(__AVX2__) && (MI_BCHUNK_BITS==256) #if defined(__AVX2__) && (MI_BCHUNK_BITS==256)
while (true) { while (true) {
const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
@ -613,10 +613,10 @@ static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx
// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
// Used to find medium size pages in the free blocks. // Used to find medium size pages in the free blocks.
// todo: try neon version // todo: try neon version
static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pidx) { static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) {
#if defined(__AVX2__) && (MI_BCHUNK_BITS==512) #if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
while (true) { while (true) {
// since a cache-line is 64b, load all at once // since a cache-line is 64b, load all at once
const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1);
const __m256i cmpv = mi_mm256_ones(); const __m256i cmpv = mi_mm256_ones();
@ -628,9 +628,9 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid
// mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared) // mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared)
if (mask==0) return false; if (mask==0) return false;
const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk
const size_t chunk_idx = bidx / 8; const size_t chunk_idx = bidx / 8;
const size_t byte_idx = bidx % 8; // byte index of the byte in the bfield const size_t byte_idx = bidx % 8; // byte index of the byte in the bfield
mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS);
if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically
*pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx; *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx;
mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS);
@ -668,10 +668,10 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid
// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success.
// Used to find large size pages in the free blocks. // Used to find large size pages in the free blocks.
// todo: try neon version // todo: try neon version
static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pidx) { static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) {
#if defined(__AVX2__) && (MI_BCHUNK_BITS==512) #if defined(__AVX2__) && (MI_BCHUNK_BITS==512)
while (true) { while (true) {
// since a cache-line is 64b, load all at once // since a cache-line is 64b, load all at once
const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields);
const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1);
const __m256i cmpv = mi_mm256_ones(); const __m256i cmpv = mi_mm256_ones();
@ -689,7 +689,7 @@ static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pid
*pidx = chunk_idx*MI_BFIELD_BITS; *pidx = chunk_idx*MI_BFIELD_BITS;
mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS);
return true; return true;
} }
// try again // try again
} }
#else #else
@ -710,7 +710,7 @@ static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pid
// and try to clear them atomically. // and try to clear them atomically.
// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
// (We do not cross bfield boundaries) // (We do not cross bfield boundaries)
static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { static mi_decl_noinline bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
if (n == 0 || n > MI_BFIELD_BITS) return false; if (n == 0 || n > MI_BFIELD_BITS) return false;
const mi_bfield_t mask = mi_bfield_mask(n, 0); const mi_bfield_t mask = mi_bfield_mask(n, 0);
for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { for(int i = 0; i < MI_BCHUNK_FIELDS; i++) {
@ -752,10 +752,10 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t*
// and try to clear them atomically. // and try to clear them atomically.
// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success.
// This can cross bfield boundaries. // This can cross bfield boundaries.
static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
// we align at a bfield, and scan `field_count` fields // we align at a bfield, and scan `field_count` fields
// n >= MI_BFIELD_BITS; find a first field that is 0 // n >= MI_BFIELD_BITS; find a first field that is 0
const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields
for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++) for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++)
@ -780,7 +780,7 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t*
m -= MI_BFIELD_BITS; // note: can underflow m -= MI_BFIELD_BITS; // note: can underflow
} }
} while (++j < field_count); } while (++j < field_count);
// if all set, we can try to atomically clear them // if all set, we can try to atomically clear them
if (allset) { if (allset) {
const size_t cidx = i*MI_BFIELD_BITS; const size_t cidx = i*MI_BFIELD_BITS;
@ -798,13 +798,13 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t*
} }
static inline bool mi_bchunk_find_and_try_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) {
if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); // small pages if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages
if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); // medium pages if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages
if (n==MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearX(chunk, pidx); // large pages if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages
if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk
if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx);
return mi_bchunk_find_and_try_clearN_(chunk, n, pidx); return mi_bchunk_try_find_and_clearN_(chunk, n, pidx);
} }
@ -858,7 +858,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx)
mi_bchunk_set(&bitmap->chunkmap, chunk_idx); mi_bchunk_set(&bitmap->chunkmap, chunk_idx);
return false; return false;
} }
// record the max clear // record the max clear
size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
do { do {
if mi_likely(chunk_idx <= oldmax) break; if mi_likely(chunk_idx <= oldmax) break;
@ -1139,23 +1139,22 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all.
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
// (Used to find fresh free slices.) // (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS)
mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx)
{ {
// const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx)
{ {
size_t cidx; size_t cidx;
if mi_likely(mi_bchunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) {
*pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap));
return true; return true;
} }
else { else {
// we may find that all are cleared only on a second iteration but that is ok as // we may find that all are cleared only on a second iteration but that is ok as
// the chunkmap is a conservative approximation. // the chunkmap is a conservative approximation.
mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx);
// continue
} }
} }
mi_bitmap_forall_chunks_end(); mi_bitmap_forall_chunks_end();
@ -1171,7 +1170,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t
mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx)
{ {
size_t cidx; size_t cidx;
if mi_likely(mi_bchunk_find_and_try_clear(&bitmap->chunks[chunk_idx], &cidx)) { if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) {
const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx;
mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap));
bool keep_set = true; bool keep_set = true;
@ -1182,19 +1181,17 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t
return true; return true;
} }
else { else {
// failed to claim it, set abandoned mapping again (unless thet page was freed) // failed to claim it, set abandoned mapping again (unless the page was freed)
if (keep_set) { if (keep_set) {
const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx);
mi_assert_internal(wasclear); MI_UNUSED(wasclear); mi_assert_internal(wasclear); MI_UNUSED(wasclear);
} }
// continue
} }
} }
else { else {
// we may find that all are cleared only on a second iteration but that is ok as // we may find that all are cleared only on a second iteration but that is ok as
// the chunkmap is a conservative approximation. // the chunkmap is a conservative approximation.
mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx);
// continue
} }
} }
mi_bitmap_forall_chunks_end(); mi_bitmap_forall_chunks_end();