further optimize mi_bchunk_try_find_and_clearNX

daanx 2025-02-06 22:59:14 -08:00
parent 64aaf9d88f
commit 7931678899
4 changed files with 17 additions and 11 deletions


@@ -199,6 +199,8 @@ static inline size_t mi_ctz(size_t x) {
   size_t r;
   __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
   return r;
+#elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__)
+  return _tzcnt_u64(x);
 #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
   unsigned long idx;
   return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS);
@@ -221,6 +223,8 @@ static inline size_t mi_clz(size_t x) {
   size_t r;
   __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc");
   return r;
+#elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__)
+  return _lzcnt_u64(x);
 #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
   unsigned long idx;
   return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS);
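
The two new branches use the BMI1/LZCNT count intrinsics directly when MSVC targets x64 with __BMI1__ defined. A standalone sketch of their semantics (not part of the commit; assumes BMI1/LZCNT-capable hardware and a compiler flag such as MSVC /arch:AVX2 or GCC/Clang -mbmi -mlzcnt): a zero input yields 64, which lines up with the MI_SIZE_BITS result of the _BitScanForward/_BitScanReverse fallback.

// Standalone illustration (not from the commit) of the BMI1/LZCNT intrinsics used above.
#include <immintrin.h>
#include <stdio.h>

int main(void) {
  unsigned long long x = 0x48;  // bit pattern 0100 1000
  printf("tzcnt(0x48) = %llu\n", (unsigned long long)_tzcnt_u64(x));  // 3  (lowest set bit)
  printf("lzcnt(0x48) = %llu\n", (unsigned long long)_lzcnt_u64(x));  // 57 (64 - 1 - 6)
  printf("tzcnt(0)    = %llu\n", (unsigned long long)_tzcnt_u64(0));  // 64 on BMI1 hardware
  return 0;
}
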
@@ -254,7 +258,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) {
   bool is_zero;
   __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" );
   return !is_zero;
-#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+#elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
   unsigned long i;
   return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false);
 #else
@@ -271,7 +275,7 @@ static inline bool mi_bsr(size_t x, size_t* idx) {
   bool is_zero;
   __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc");
   return !is_zero;
-#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
+#elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
   unsigned long i;
   return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false);
 #else
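
The `0 &&` guard keeps the MSVC _BitScanForward/_BitScanReverse branches in the source but compiles them out, so mi_bsf/mi_bsr now take the generic path on MSVC as well. A rough standalone sketch of that shape (the generic_* helpers below are plain stand-ins for mimalloc's real mi_ctz/mi_clz, not its actual code):

// Rough sketch only: generic_ctz/generic_clz stand in for mimalloc's mi_ctz/mi_clz.
#include <stddef.h>
#include <stdbool.h>

#define SKETCH_SIZE_BITS  (sizeof(size_t) * 8)

static size_t generic_ctz(size_t x) {              // trailing zeros; caller ensures x != 0
  size_t n = 0;
  while ((x & 1) == 0) { x >>= 1; n++; }
  return n;
}

static size_t generic_clz(size_t x) {              // leading zeros; caller ensures x != 0
  size_t n = 0;
  size_t bit = (size_t)1 << (SKETCH_SIZE_BITS - 1);
  while ((x & bit) == 0) { bit >>= 1; n++; }
  return n;
}

static bool generic_bsf(size_t x, size_t* idx) {   // index of the lowest set bit
  return (x != 0 ? (*idx = generic_ctz(x), true) : false);
}

static bool generic_bsr(size_t x, size_t* idx) {   // index of the highest set bit
  return (x != 0 ? (*idx = SKETCH_SIZE_BITS - 1 - generic_clz(x), true) : false);
}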


@@ -773,9 +773,10 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk,
   for (int i = 0; i < MI_BCHUNK_FIELDS; i++) {
     mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]);
     size_t idx;
     // is there a range inside the field?
     while (mi_bfield_find_least_bit(b, &idx)) {  // find least 1-bit
-      if (idx + n > MI_BFIELD_BITS) break;  // too short, maybe cross over, or continue with the next field
+      if (idx + n > MI_BFIELD_BITS) break;  // too short: maybe cross over, or continue with the next field
       const size_t bmask = mask<<idx;
       mi_assert_internal(bmask>>idx == mask);
@@ -792,15 +793,16 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk,
         }
       }
       else {
-        // advance
-        const size_t ones = mi_bfield_ctz(~(b>>idx));  // skip all ones (since it didn't fit the mask)
-        mi_assert_internal(ones>0);
-        b = b & ~mi_bfield_mask(ones, idx);  // clear the ones
+        // advance by clearing the least run of ones, for example, with n>=4, idx=2:
+        // b             = 1111 1101 1010 1100
+        // .. + (1<<idx) = 1111 1101 1011 0000
+        // .. & b        = 1111 1101 1010 0000
+        b = b & (b + (mi_bfield_one() << idx));
       }
     }
     // check if we can cross into the next bfield
-    if (i < MI_BCHUNK_FIELDS-1) {
+    if (b!=0 && i < MI_BCHUNK_FIELDS-1) {
       const size_t post = mi_bfield_clz(~b);
       if (post > 0) {
         const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1]));
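
The rewritten advance step is a carry trick: adding 1<<idx at the lowest set bit carries through the contiguous run of ones starting there, and AND-ing with the original value clears exactly that run, replacing the previous ctz/mask sequence. The added b!=0 test then skips the cross-over computation entirely when no bits remain set in the field. A small standalone check (not part of the commit) of the worked example in the new comment:

// Standalone check of the carry trick; values match the example in the new comment.
#include <stdint.h>
#include <stdio.h>

int main(void) {
  uint64_t b   = 0xFDACu;   // 1111 1101 1010 1100
  unsigned idx = 2;         // position of the least significant 1-bit
  uint64_t advanced = b & (b + ((uint64_t)1 << idx));
  printf("0x%llX\n", (unsigned long long)advanced);  // 0xFDA0 = 1111 1101 1010 0000
  return 0;
}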


@@ -174,7 +174,7 @@ static mi_option_desc_t options[_mi_option_last] =
   { 0, UNINIT, MI_OPTION(max_vabits) },                // max virtual address space bits
   { MI_DEFAULT_PAGEMAP_COMMIT,
        UNINIT, MI_OPTION(pagemap_commit) },            // commit the full pagemap upfront?
-  { 2, UNINIT, MI_OPTION(page_commit_on_demand) },     // commit pages on-demand (2 disables this on overcommit systems (like Linux))
+  { 0, UNINIT, MI_OPTION(page_commit_on_demand) },     // commit pages on-demand (2 disables this on overcommit systems (like Linux))
 };
 static void mi_option_init(mi_option_desc_t* desc);
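
The default for page_commit_on_demand drops from 2 (per the comment, only disabled on overcommit systems like Linux) to 0, i.e. off by default. A hedged sketch of opting back in at startup; it assumes the enum value mi_option_page_commit_on_demand generated by MI_OPTION(page_commit_on_demand) and mimalloc's usual MIMALLOC_-prefixed environment-variable override:

// Sketch: re-enabling on-demand page commit; the option enum name is assumed from MI_OPTION(page_commit_on_demand).
#include <mimalloc.h>

int main(void) {
  // roughly equivalent to setting MIMALLOC_PAGE_COMMIT_ON_DEMAND=1 in the environment (assumed mapping)
  mi_option_set(mi_option_page_commit_on_demand, 1);
  void* p = mi_malloc(64);
  mi_free(p);
  return 0;
}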


@@ -137,7 +137,7 @@ bool _mi_page_is_valid(mi_page_t* page) {
   Page collect the `local_free` and `thread_free` lists
 ----------------------------------------------------------- */
-static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head)
+static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head)
 {
   if (head == NULL) return;
@@ -167,7 +167,7 @@ static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head)
 }
 // Collect the local `thread_free` list using an atomic exchange.
-static void mi_page_thread_free_collect(mi_page_t* page)
+static mi_decl_noinline void mi_page_thread_free_collect(mi_page_t* page)
 {
   // atomically capture the thread free list
   mi_block_t* head;