diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 9f745d4e..57ca79fc 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -1022,8 +1022,9 @@ static inline size_t mi_bsr(uintptr_t x) { #if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #include extern bool _mi_cpu_has_fsrm; +extern bool _mi_cpu_has_erms; static inline void _mi_memcpy(void* dst, const void* src, size_t n) { - if (_mi_cpu_has_fsrm) { + if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __movsb((unsigned char*)dst, (const unsigned char*)src, n); } else { @@ -1031,7 +1032,7 @@ static inline void _mi_memcpy(void* dst, const void* src, size_t n) { } } static inline void _mi_memzero(void* dst, size_t n) { - if (_mi_cpu_has_fsrm) { + if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { __stosb((unsigned char*)dst, 0, n); } else { diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index f8bf948e..56715df4 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -138,9 +138,9 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); // but unfortunately we can not detect support reliably (see issue #883) // We also use it on Apple OS as we use a TLS slot for the default heap there. #if defined(__GNUC__) && ( \ - (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + (defined(__GLIBC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \ || (defined(__APPLE__) && (defined(__x86_64__) || defined(__aarch64__) || defined(__POWERPC__))) \ - || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))) \ + || (defined(__BIONIC__) && (defined(__x86_64__) || defined(__i386__) || (defined(__arm__) && __ARM_ARCH >= 7) || defined(__aarch64__))) \ || (defined(__FreeBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ || (defined(__OpenBSD__) && (defined(__x86_64__) || defined(__i386__) || defined(__aarch64__))) \ ) diff --git a/src/init.c b/src/init.c index ccaf9445..3f431ee4 100644 --- a/src/init.c +++ b/src/init.c @@ -614,12 +614,15 @@ void _mi_process_load(void) { #if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) #include mi_decl_cache_align bool _mi_cpu_has_fsrm = false; +mi_decl_cache_align bool _mi_cpu_has_erms = false; static void mi_detect_cpu_features(void) { - // FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)) + // FSRM for fast short rep movsb/stosb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017)) + // EMRS for fast enhanced rep movsb/stosb support int32_t cpu_info[4]; __cpuid(cpu_info, 7); _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see + _mi_cpu_has_erms = ((cpu_info[2] & (1 << 9)) != 0); // bit 9 of ECX : see } #else static void mi_detect_cpu_features(void) {