diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj
index b4bf013e..c294ea0e 100644
--- a/ide/vs2022/mimalloc-lib.vcxproj
+++ b/ide/vs2022/mimalloc-lib.vcxproj
@@ -316,7 +316,7 @@
CompileAsCpp
true
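<LanguageStandard>stdcpp20</LanguageStandard>
- <EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
+ <EnableEnhancedInstructionSet>StreamingSIMDExtensions</EnableEnhancedInstructionSet>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>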
diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h
index fc56e8ea..2debaf25 100644
--- a/include/mimalloc/bits.h
+++ b/include/mimalloc/bits.h
@@ -90,7 +90,7 @@ typedef int32_t mi_ssize_t;
#endif
#endif
-#if MI_ARCH_X64 && defined(__AVX2__)
+#if (MI_ARCH_X86 || MI_ARCH_X64)
#include <immintrin.h>
#elif MI_ARCH_ARM64 && MI_OPT_SIMD
#include <arm_neon.h>
@@ -134,6 +134,18 @@ typedef int32_t mi_ssize_t;
Builtin's
-------------------------------------------------------------------------------- */
+// Branch-prediction hints. They are written without outer parentheses at the
+// use site, e.g. `if mi_unlikely(cond) { ... }`, because every expansion
+// supplies the parentheses around the condition itself.
+#if defined(__GNUC__) || defined(__clang__)
+#define mi_unlikely(x) (__builtin_expect(!!(x),false))
+#define mi_likely(x) (__builtin_expect(!!(x),true))
+#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
+// C++20: the attribute follows the parenthesized condition of the `if`.
+#define mi_unlikely(x) (x) [[unlikely]]
+#define mi_likely(x) (x) [[likely]]
+#else
+// No hint available: expand to the plain parenthesized condition.
+#define mi_unlikely(x) (x)
+#define mi_likely(x) (x)
+#endif
+
+
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
@@ -171,14 +183,25 @@ typedef int32_t mi_ssize_t;
-------------------------------------------------------------------------------- */
size_t _mi_popcount_generic(size_t x);
+extern bool _mi_cpu_has_popcnt;
static inline size_t mi_popcount(size_t x) {
- #if mi_has_builtinz(popcount)
+ #if defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86)
+ #if !defined(__BMI1__)
+ if mi_unlikely(!_mi_cpu_has_popcnt) { return _mi_popcount_generic(x); }
+ #endif
+ size_t r;
+ __asm ("popcnt\t%1,%0" : "=r"(r) : "r"(x) : "cc");
+ return r;
+ #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86)
+ #if !defined(__BMI1__)
+ if mi_unlikely(!_mi_cpu_has_popcnt) { return _mi_popcount_generic(x); }
+ #endif
+ return (size_t)mi_msc_builtinz(__popcnt)(x);
+ #elif defined(_MSC_VER) && MI_ARCH_ARM64
+ return (size_t)mi_msc_builtinz(__popcnt)(x);
+ #elif mi_has_builtinz(popcount)
return mi_builtinz(popcount)(x);
- #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
- return mi_msc_builtinz(__popcnt)(x);
- #elif MI_ARCH_X64 && defined(__BMI1__)
- return (size_t)_mm_popcnt_u64(x);
#else
#define MI_HAS_FAST_POPCOUNT 0
return (x<=1 ? x : _mi_popcount_generic(x));
diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index e8b1c919..8a880b8d 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -256,25 +256,6 @@ bool _mi_page_is_valid(mi_page_t* page);
#endif
-// ------------------------------------------------------
-// Branches
-// ------------------------------------------------------
-
-#if defined(__GNUC__) || defined(__clang__)
-#define mi_unlikely(x) (__builtin_expect(!!(x),false))
-#define mi_likely(x) (__builtin_expect(!!(x),true))
-#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
-#define mi_unlikely(x) (x) [[unlikely]]
-#define mi_likely(x) (x) [[likely]]
-#else
-#define mi_unlikely(x) (x)
-#define mi_likely(x) (x)
-#endif
-
-#ifndef __has_builtin
-#define __has_builtin(x) 0
-#endif
-
/* -----------------------------------------------------------
Assertions
@@ -1037,10 +1018,10 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) {
// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253.
// ---------------------------------------------------------------------------------
-#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
-#include <intrin.h>
+#if !MI_TRACK_ENABLED && defined(_WIN32) && (MI_ARCH_X64 || MI_ARCH_X86)
extern bool _mi_cpu_has_fsrm;
extern bool _mi_cpu_has_erms;
+
static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) {
__movsb((unsigned char*)dst, (const unsigned char*)src, n);
diff --git a/src/init.c b/src/init.c
index f9678cc5..54905dc8 100644
--- a/src/init.c
+++ b/src/init.c
@@ -652,25 +652,52 @@ void _mi_process_load(void) {
_mi_random_reinit_if_weak(&heap_main.random);
}
-#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
-#include <intrin.h>
+// CPU features
mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
mi_decl_cache_align bool _mi_cpu_has_erms = false;
+mi_decl_cache_align bool _mi_cpu_has_popcnt = false;
+
+#if (MI_ARCH_X64 || MI_ARCH_X86)
+#if defined(__GNUC__)
+#include <cpuid.h>
+// Query CPUID leaf `level`, sub-leaf 0, into regs4[0..3] (EAX, EBX, ECX, EDX).
+// Returns false when the leaf is not supported by the CPU.
+// `__get_cpuid_count` is used instead of `__get_cpuid` because the latter does
+// not load ECX, and leaf 7 uses ECX to select its sub-leaf.
+// NOTE(review): `__get_cpuid_count` needs a reasonably recent <cpuid.h> --
+// confirm against the oldest GCC/Clang this project supports.
+static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
+ return (__get_cpuid_count(level, 0, &regs4[0], &regs4[1], &regs4[2], &regs4[3]) == 1);
+}
+
+#elif defined(_MSC_VER)
+// MSVC variant: fills regs4[0..3] for CPUID leaf `level`.
+// The `__cpuid` call yields no status here, so this always returns true.
+static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
+ __cpuid((int32_t*)regs4, (int32_t)level);  // casts adapt the unsigned buffer/leaf to the signed arguments passed to the intrinsic
+ return true;
+}
+#else
+// Fallback for compilers that are neither GCC-compatible nor MSVC:
+// no CPUID access, so report failure and leave regs4 untouched.
+static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
+ MI_UNUSED(regs4); MI_UNUSED(level);
+ return false;
+}
+#endif
static void mi_detect_cpu_features(void) {
// FSRM for fast short rep movsb/stosb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
// EMRS for fast enhanced rep movsb/stosb support
- int32_t cpu_info[4];
- __cpuid(cpu_info, 7);
- _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see
- _mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see
+ // Each flag is only assigned when its CPUID leaf could be queried.
+ uint32_t cpu_info[4];
+ if (mi_cpuid(cpu_info, 7)) {
+ _mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX, CPUID leaf 7
+ _mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX, CPUID leaf 7
+ }
+ if (mi_cpuid(cpu_info, 1)) {
+ _mi_cpu_has_popcnt = ((cpu_info[2] & (1 << 23)) != 0); // bit 23 of ECX, CPUID leaf 1
+ }
}
+
#else
static void mi_detect_cpu_features(void) {
- // nothing
+ // Non-x86 build: there is no CPUID probing here; on ARM64 the popcount
+ // flag is simply set unconditionally.
+ #if MI_ARCH_ARM64
+ _mi_cpu_has_popcnt = true;
+ #endif
}
#endif
+
// Initialize the process; called by thread_init or the process loader
void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once
diff --git a/src/libc.c b/src/libc.c
index b18dff2c..a54eec5b 100644
--- a/src/libc.c
+++ b/src/libc.c
@@ -355,7 +355,6 @@ size_t _mi_clz_generic(size_t x) {
#endif // bit scan
-#if !MI_HAS_FAST_POPCOUNT
#if MI_SIZE_SIZE == 4
#define mi_mask_even_bits32 (0x55555555)
@@ -383,7 +382,7 @@ static size_t mi_popcount_generic32(uint32_t x) {
return mi_byte_sum32(x);
}
-size_t _mi_popcount_generic(size_t x) {
+// Portable popcount for 32-bit `size_t`; `mi_popcount` calls this at runtime
+// when `_mi_cpu_has_popcnt` is false.
+// NOTE(review): presumably marked noinline to keep this slow path out of the
+// inlined `mi_popcount` -- confirm.
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic32(x);
}
@@ -407,9 +406,8 @@ static size_t mi_popcount_generic64(uint64_t x) {
return mi_byte_sum64(x);
}
-size_t _mi_popcount_generic(size_t x) {
+// Portable popcount for 64-bit `size_t`; `mi_popcount` calls this at runtime
+// when `_mi_cpu_has_popcnt` is false.
+// NOTE(review): presumably marked noinline to keep this slow path out of the
+// inlined `mi_popcount` -- confirm.
+mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic64(x);
}
#endif
-#endif // popcount