From d9580f3bfb4491f21fb7b40d54ca3d93465a7902 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 6 Mar 2025 18:54:04 -0800 Subject: [PATCH] update popcnt to be more efficient on x64 even without MI_OPT_ARCH=ON --- ide/vs2022/mimalloc-lib.vcxproj | 2 +- include/mimalloc/bits.h | 35 +++++++++++++++++++++++----- include/mimalloc/internal.h | 23 ++---------------- src/init.c | 41 +++++++++++++++++++++++++++------ src/libc.c | 6 ++--- 5 files changed, 68 insertions(+), 39 deletions(-) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index b4bf013e..c294ea0e 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -316,7 +316,7 @@ CompileAsCpp true stdcpp20 - AdvancedVectorExtensions2 + StreamingSIMDExtensions /Zc:__cplusplus %(AdditionalOptions) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index fc56e8ea..2debaf25 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -90,7 +90,7 @@ typedef int32_t mi_ssize_t; #endif #endif -#if MI_ARCH_X64 && defined(__AVX2__) +#if (MI_ARCH_X86 || MI_ARCH_X64) #include <immintrin.h> #elif MI_ARCH_ARM64 && MI_OPT_SIMD #include <arm_neon.h> @@ -134,6 +134,18 @@ typedef int32_t mi_ssize_t; Builtin's -------------------------------------------------------------------------------- */ +#if defined(__GNUC__) || defined(__clang__) +#define mi_unlikely(x) (__builtin_expect(!!(x),false)) +#define mi_likely(x) (__builtin_expect(!!(x),true)) +#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) +#define mi_unlikely(x) (x) [[unlikely]] +#define mi_likely(x) (x) [[likely]] +#else +#define mi_unlikely(x) (x) +#define mi_likely(x) (x) +#endif + + #ifndef __has_builtin #define __has_builtin(x) 0 #endif @@ -171,14 +183,25 @@ typedef int32_t mi_ssize_t; -------------------------------------------------------------------------------- */ size_t _mi_popcount_generic(size_t x); +extern bool _mi_cpu_has_popcnt; static inline size_t 
mi_popcount(size_t x) { - #if mi_has_builtinz(popcount) + #if defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) + #if !defined(__BMI1__) + if mi_unlikely(!_mi_cpu_has_popcnt) { return _mi_popcount_generic(x); } + #endif + size_t r; + __asm ("popcnt\t%1,%0" : "=r"(r) : "r"(x) : "cc"); + return r; + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86) + #if !defined(__BMI1__) + if mi_unlikely(!_mi_cpu_has_popcnt) { return _mi_popcount_generic(x); } + #endif + return (size_t)mi_msc_builtinz(__popcnt)(x); + #elif defined(_MSC_VER) && MI_ARCH_ARM64 + return (size_t)mi_msc_builtinz(__popcnt)(x); + #elif mi_has_builtinz(popcount) return mi_builtinz(popcount)(x); - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - return mi_msc_builtinz(__popcnt)(x); - #elif MI_ARCH_X64 && defined(__BMI1__) - return (size_t)_mm_popcnt_u64(x); #else #define MI_HAS_FAST_POPCOUNT 0 return (x<=1 ? x : _mi_popcount_generic(x)); diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e8b1c919..8a880b8d 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -256,25 +256,6 @@ bool _mi_page_is_valid(mi_page_t* page); #endif -// ------------------------------------------------------ -// Branches -// ------------------------------------------------------ - -#if defined(__GNUC__) || defined(__clang__) -#define mi_unlikely(x) (__builtin_expect(!!(x),false)) -#define mi_likely(x) (__builtin_expect(!!(x),true)) -#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L) -#define mi_unlikely(x) (x) [[unlikely]] -#define mi_likely(x) (x) [[likely]] -#else -#define mi_unlikely(x) (x) -#define mi_likely(x) (x) -#endif - -#ifndef __has_builtin -#define __has_builtin(x) 0 -#endif - /* ----------------------------------------------------------- Assertions @@ -1037,10 +1018,10 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) { // (AMD Zen3+ (~2020) or Intel Ice Lake+ 
(~2017). See also issue #201 and pr #253. // --------------------------------------------------------------------------------- -#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) -#include <intrin.h> +#if !MI_TRACK_ENABLED && defined(_WIN32) && (MI_ARCH_X64 || MI_ARCH_X86) extern bool _mi_cpu_has_fsrm; extern bool _mi_cpu_has_erms; + static inline void _mi_memcpy(void* dst, const void* src, size_t n) { if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __movsb((unsigned char*)dst, (const unsigned char*)src, n); diff --git a/src/init.c b/src/init.c index f9678cc5..54905dc8 100644 --- a/src/init.c +++ b/src/init.c @@ -652,25 +652,52 @@ void _mi_process_load(void) { _mi_random_reinit_if_weak(&heap_main.random); } -#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) -#include <intrin.h> +// CPU features mi_decl_cache_align bool _mi_cpu_has_fsrm = false; mi_decl_cache_align bool _mi_cpu_has_erms = false; +mi_decl_cache_align bool _mi_cpu_has_popcnt = false; + +#if (MI_ARCH_X64 || MI_ARCH_X86) +#if defined(__GNUC__) +#include <cpuid.h> +static bool mi_cpuid(uint32_t* regs4, uint32_t level) { + return (__get_cpuid(level, &regs4[0], &regs4[1], &regs4[2], &regs4[3]) == 1); +} + +#elif defined(_MSC_VER) +static bool mi_cpuid(uint32_t* regs4, uint32_t level) { + __cpuid((int32_t*)regs4, (int32_t)level); + return true; +} +#else +static bool mi_cpuid(uint32_t* regs4, uint32_t level) { + MI_UNUSED(regs4); MI_UNUSED(level); + return false; +} +#endif static void mi_detect_cpu_features(void) { // FSRM for fast short rep movsb/stosb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)) // EMRS for fast enhanced rep movsb/stosb support - int32_t cpu_info[4]; - __cpuid(cpu_info, 7); - _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see - _mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see + uint32_t cpu_info[4]; + if (mi_cpuid(cpu_info, 7)) { + _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see + 
_mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see + } + if (mi_cpuid(cpu_info, 1)) { + _mi_cpu_has_popcnt = ((cpu_info[2] & (1 << 23)) != 0); // bit 23 of ECX : see + } } + #else static void mi_detect_cpu_features(void) { - // nothing + #if MI_ARCH_ARM64 + _mi_cpu_has_popcnt = true; + #endif } #endif + // Initialize the process; called by thread_init or the process loader void mi_process_init(void) mi_attr_noexcept { // ensure we are called once diff --git a/src/libc.c b/src/libc.c index b18dff2c..a54eec5b 100644 --- a/src/libc.c +++ b/src/libc.c @@ -355,7 +355,6 @@ size_t _mi_clz_generic(size_t x) { #endif // bit scan -#if !MI_HAS_FAST_POPCOUNT #if MI_SIZE_SIZE == 4 #define mi_mask_even_bits32 (0x55555555) @@ -383,7 +382,7 @@ static size_t mi_popcount_generic32(uint32_t x) { return mi_byte_sum32(x); } -size_t _mi_popcount_generic(size_t x) { +mi_decl_noinline size_t _mi_popcount_generic(size_t x) { return mi_popcount_generic32(x); } @@ -407,9 +406,8 @@ static size_t mi_popcount_generic64(uint64_t x) { return mi_byte_sum64(x); } -size_t _mi_popcount_generic(size_t x) { +mi_decl_noinline size_t _mi_popcount_generic(size_t x) { return mi_popcount_generic64(x); } #endif -#endif // popcount