limit memcpy as rep stosb to windows where the cpu supporst FSRM; add mi_memcpy_aligned for machine-word aligned copy. see issue #201 and pr #253

This commit is contained in:
Daan Leijen 2021-01-30 14:33:46 -08:00
parent 92ec493a5d
commit 35c1fc2be9
6 changed files with 77 additions and 24 deletions

View file

@ -87,7 +87,7 @@ const mi_page_t _mi_page_empty = {
// may lead to allocation itself on some platforms)
// --------------------------------------------------------
const mi_heap_t _mi_heap_empty = {
mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
NULL,
MI_SMALL_PAGES_EMPTY,
MI_PAGE_QUEUES_EMPTY,
@ -189,7 +189,7 @@ static bool _mi_heap_init(void) {
// OS allocated so already zero initialized
mi_tld_t* tld = &td->tld;
mi_heap_t* heap = &td->heap;
_mi_memcpy(heap, &_mi_heap_empty, sizeof(*heap));
_mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(*heap));
heap->thread_id = _mi_thread_id();
_mi_random_init(&heap->random);
heap->cookie = _mi_heap_random_next(heap) | 1;
@ -458,6 +458,22 @@ static void mi_process_load(void) {
}
}
#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
static void mi_detect_cpu_features(void) {
// FSRM for fast rep movsb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
int32_t cpu_info[4];
__cpuid(cpu_info, 7);
_mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https ://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
}
#else
static void mi_detect_cpu_features(void) {
// nothing
}
#endif
// Initialize the process; called by thread_init or the process loader
void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once
@ -466,6 +482,7 @@ void mi_process_init(void) mi_attr_noexcept {
mi_process_setup_auto_thread_done();
_mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
mi_detect_cpu_features();
_mi_os_init();
mi_heap_main_init();
#if (MI_DEBUG)