From 2c12d7f2234b25308478e22c9342a07623b6f891 Mon Sep 17 00:00:00 2001
From: daan
Date: Fri, 1 Nov 2019 22:01:52 -0700
Subject: [PATCH] optimized numa calls; better Linux support

---
 CMakeLists.txt              |  12 ++++
 include/mimalloc-internal.h |   2 +-
 include/mimalloc-types.h    |   1 +
 src/arena.c                 |   2 +-
 src/init.c                  |   3 +-
 src/memory.c                |   6 +-
 src/os.c                    | 114 ++++++++++++++++++++++++------------
 7 files changed, 97 insertions(+), 43 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9eb6feb..1e96c237 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,8 @@
 cmake_minimum_required(VERSION 3.0)
 project(libmimalloc C CXX)
 include("cmake/mimalloc-config-version.cmake")
+include("CheckIncludeFile")
+
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_CXX_STANDARD 17)
@@ -88,6 +90,16 @@ if(MI_USE_CXX MATCHES "ON")
   set_source_files_properties(src/static.c test/test-api.c PROPERTIES LANGUAGE CXX )
 endif()
 
+CHECK_INCLUDE_FILE("numaif.h" MI_HAVE_NUMA_H)
+if(MI_HAVE_NUMA_H)
+  list(APPEND mi_defines MI_HAS_NUMA)
+  list(APPEND mi_libraries numa)
+else()
+  if (NOT(WIN32))
+    message(WARNING "Compiling without using NUMA optimized allocation (on Linux, install libnuma-dev?)")
+  endif()
+endif()
+
 # Compiler flags
 if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
   list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas)
diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index dd677a02..b4d3351d 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -56,7 +56,7 @@
 void   _mi_os_init(void);                                    // called from process init
 void*  _mi_os_alloc(size_t size, mi_stats_t* stats);         // to allocate thread local data
 void   _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data
 size_t _mi_os_good_alloc_size(size_t size);
-int    _mi_os_numa_node(void);
+int    _mi_os_numa_node(mi_os_tld_t* tld);
 
 // memory.c
diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index 99b6b22b..0208d5c7 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -413,6 +413,7 @@ typedef struct mi_segments_tld_s {
 
 // OS thread local data
 typedef struct mi_os_tld_s {
   size_t region_idx;   // start point for next allocation
+  int    numa_node;    // numa node associated with this thread
   mi_stats_t* stats;   // points to tld stats
 } mi_os_tld_t;
diff --git a/src/arena.c b/src/arena.c
index 381d4486..7eb755c4 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -267,7 +267,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
 {
   size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE);
   size_t bcount = asize / MI_ARENA_BLOCK_SIZE;
-  int numa_node = _mi_os_numa_node(); // current numa node
+  int numa_node = _mi_os_numa_node(tld); // current numa node
 
   mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
   // try numa affine allocation
diff --git a/src/init.c b/src/init.c
index 0813fddd..166ca451 100644
--- a/src/init.c
+++ b/src/init.c
@@ -99,7 +99,7 @@ static mi_tld_t tld_main = {
   0, false, &_mi_heap_main,
   { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments
-  { 0, tld_main_stats },      // os
+  { 0, -1, tld_main_stats },  // os
   { MI_STATS_NULL }           // stats
 };
@@ -218,6 +218,7 @@ static bool _mi_heap_init(void) {
     memset(tld, 0, sizeof(*tld));
     tld->heap_backing = heap;
     tld->segments.stats = &tld->stats;
+    tld->os.numa_node = -1;
     tld->os.stats = &tld->stats;
     _mi_heap_default = heap;
   }
diff --git a/src/memory.c b/src/memory.c
index 02e82e4d..a425393c 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -211,7 +211,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
   if (mi_atomic_cas_strong(&region->info, info, 0)) {
     // update the region count
     region->arena_memid = arena_memid;
-    mi_atomic_write(&region->numa_node, _mi_os_numa_node() + 1);
+    mi_atomic_write(&region->numa_node, _mi_os_numa_node(tld) + 1);
     mi_atomic_increment(&regions_count);
   }
   else {
@@ -220,7 +220,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
     for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) {
       if (mi_atomic_cas_strong(&regions[idx+i].info, info, 0)) {
         regions[idx+i].arena_memid = arena_memid;
-        mi_atomic_write(&regions[idx+i].numa_node, _mi_os_numa_node() + 1);
+        mi_atomic_write(&regions[idx+i].numa_node, _mi_os_numa_node(tld) + 1);
         mi_atomic_increment(&regions_count);
         start = NULL;
         break;
@@ -430,7 +430,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l
   mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE);
 
   // find a range of free blocks
-  int numa_node = _mi_os_numa_node();
+  int numa_node = _mi_os_numa_node(tld);
   void* p = NULL;
   size_t count = mi_atomic_read(&regions_count);
   size_t idx = tld->region_idx; // start at 0 to reuse low addresses? Or, use tld->region_idx to reduce contention?
diff --git a/src/os.c b/src/os.c
index 2bb3ee3c..677d0ea2 100644
--- a/src/os.c
+++ b/src/os.c
@@ -97,7 +97,7 @@ typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*
 static PVirtualAlloc2 pVirtualAlloc2 = NULL;
 static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
 
-static bool mi_win_enable_large_os_pages() 
+static bool mi_win_enable_large_os_pages()
 {
   if (large_os_page_size > 0) return true;
@@ -148,10 +148,10 @@ void _mi_os_init(void) {
     FreeLibrary(hDll);
   }
   hDll = LoadLibrary(TEXT("ntdll.dll"));
-  if (hDll != NULL) { 
+  if (hDll != NULL) {
     pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
     FreeLibrary(hDll);
-  } 
+  }
   if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
     mi_win_enable_large_os_pages();
   }
@@ -191,7 +191,7 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats
 #else
   err = (munmap(addr, size) == -1);
 #endif
-  if (was_committed) _mi_stat_decrease(&stats->committed, size); 
+  if (was_committed) _mi_stat_decrease(&stats->committed, size);
   _mi_stat_decrease(&stats->reserved, size);
   if (err) {
 #pragma warning(suppress:4996)
@@ -207,14 +207,14 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size);
 #ifdef _WIN32
 static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
-#if (MI_INTPTR_SIZE >= 8) 
+#if (MI_INTPTR_SIZE >= 8)
   // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations
   void* hint;
   if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) {
     return VirtualAlloc(hint, size, flags, PAGE_READWRITE);
   }
 #endif
-#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) 
+#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
   // on modern Windows try use VirtualAlloc2 for aligned allocation
   if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
     MEM_ADDRESS_REQUIREMENTS reqs = { 0 };
@@ -232,7 +232,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment,
   mi_assert_internal(!(large_only && !allow_large));
   static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0;
   void* p = NULL;
-  if ((large_only || use_large_os_page(size, try_alignment)) 
+  if ((large_only || use_large_os_page(size, try_alignment))
      && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
     uintptr_t try_ok = mi_atomic_read(&large_page_try_ok);
     if (!large_only && try_ok > 0) {
@@ -372,7 +372,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
   }
   if (p == NULL) {
     *is_large = false;
-    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); 
+    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
     #if defined(MADV_HUGEPAGE)
     // Many Linux systems don't allow MAP_HUGETLB but they support instead
     // transparent huge pages (THP). It is not required to call `madvise` with MADV_HUGE
@@ -391,7 +391,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
 }
 #endif
 
-// On 64-bit systems, we can do efficient aligned allocation by using 
+// On 64-bit systems, we can do efficient aligned allocation by using
 // the 4TiB to 30TiB area to allocate them.
 #if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED)))
 static volatile _Atomic(intptr_t) aligned_base;
@@ -785,14 +785,14 @@ bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
 
 /* ----------------------------------------------------------------------------
-Support for allocating huge OS pages (1Gib) that are reserved up-front 
+Support for allocating huge OS pages (1Gib) that are reserved up-front
 and possibly associated with a specific NUMA node. (use `numa_node>=0`)
 -----------------------------------------------------------------------------*/
-#define MI_HUGE_OS_PAGE_SIZE (GiB) 
+#define MI_HUGE_OS_PAGE_SIZE (GiB)
 
 #if defined(WIN32) && (MI_INTPTR_SIZE >= 8)
-static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) 
-{ 
+static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node)
+{
   mi_assert_internal(size%GiB == 0);
 
 #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
@@ -802,8 +802,8 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node)
   reqs.HighestEndingAddress = NULL;
   reqs.LowestStartingAddress = NULL;
   reqs.Alignment = MI_SEGMENT_SIZE;
-  
-  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages 
+
+  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
   if (pNtAllocateVirtualMemoryEx != NULL) {
     #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE
     #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10)
@@ -825,10 +825,10 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node)
       return base;
     }
     else {
-      // fall back to regular huge pages 
+      // fall back to regular huge pages
      _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (error 0x%lx)\n", err);
     }
-  } 
+  }
   // on modern Windows try use VirtualAlloc2 for aligned large OS page allocation
   if (pVirtualAlloc2 != NULL) {
     params[0].Type = MemExtendedParameterAddressRequirements;
@@ -842,7 +842,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node)
     return (*pVirtualAlloc2)(GetCurrentProcess(), NULL, size, flags, PAGE_READWRITE, params, param_count);
   }
 #endif
-  return NULL; // give up on older Windows.. 
+  return NULL; // give up on older Windows..
 }
 #elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8)
 #ifdef MI_HAS_NUMA
@@ -853,7 +853,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
   bool is_large = true;
   void* p = mi_unix_mmap(NULL, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
   if (p == NULL) return NULL;
-  #ifdef MI_HAS_NUMA 
+  #ifdef MI_HAS_NUMA
   if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) {
     uintptr_t numa_mask = (1UL << numa_node);
     long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
@@ -866,7 +866,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
   #endif
   return p;
 }
-#else 
+#else
 static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
   return NULL;
 }
@@ -884,12 +884,12 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) {
 }
 
 #ifdef WIN32
-static int mi_os_numa_nodex(void) {
+static int mi_os_numa_nodex() {
   PROCESSOR_NUMBER pnum;
   USHORT numa_node = 0;
   GetCurrentProcessorNumberEx(&pnum);
   GetNumaProcessorNodeEx(&pnum,&numa_node);
-  return (int)numa_node; 
+  return (int)numa_node;
 }
 
 static int mi_os_numa_node_countx(void) {
   ULONG numa_max = 0;
   GetNumaHighestNodeNumber(&numa_max);
   return (int)(numa_max + 1);
 }
@@ -898,12 +898,42 @@ static int mi_os_numa_node_countx(void) {
 #elif MI_HAS_NUMA
-#include <numa.h>
+#include <numaif.h>
+#include <dirent.h>
+#include <stdlib.h>
 static int mi_os_numa_nodex(void) {
-  return numa_preferred();
+  #define MI_MAX_MASK (4) // support at most 256 nodes
+  unsigned long mask[MI_MAX_MASK];
+  memset(mask,0,MI_MAX_MASK*sizeof(long));
+  int mode = 0;
+  long err = get_mempolicy(&mode, mask, MI_MAX_MASK*sizeof(long)*8, NULL, 0 /* thread policy */);
+  if (err != 0) return 0;
+  // find the lowest bit that is set
+  for(int i = 0; i < MI_MAX_MASK; i++) {
+    for(int j = 0; j < (int)(sizeof(long)*8); j++) {
+      if ((mask[i] & (1UL << j)) != 0) {
+        return (i*sizeof(long)*8 + j);
+      }
+    }
+  }
+  return 0;
 }
+
 static int mi_os_numa_node_countx(void) {
-  return (numa_max_node() + 1);
+  DIR* d = opendir("/sys/devices/system/node");
+  if (d==NULL) return 1;
+
+  struct dirent* de;
+  int max_node_num = 0;
+  while ((de = readdir(d)) != NULL) {
+    int node_num;
+    if (strncmp(de->d_name, "node", 4) == 0) {
+      node_num = (int)strtol(de->d_name+4, NULL, 0);
+      if (max_node_num < node_num) max_node_num = node_num;
+    }
+  }
+  closedir(d);
+  return (max_node_num + 1);
 }
 #else
 static int mi_os_numa_nodex(void) {
@@ -915,18 +945,28 @@ static int mi_os_numa_node_countx(void) {
 #endif
 
 int _mi_os_numa_node_count(void) {
-  long ncount = mi_os_numa_node_countx();
-  // never more than max numa node and at least 1
-  long nmax = 1 + mi_option_get(mi_option_max_numa_node);
-  if (ncount > nmax) ncount = nmax;
-  if (ncount <= 0) ncount = 1;
-  return ncount;
+  static int numa_node_count = 0;
+  if (mi_unlikely(numa_node_count <= 0)) {
+    int ncount = mi_os_numa_node_countx();
+    // never more than max numa node and at least 1
+    int nmax = 1 + (int)mi_option_get(mi_option_max_numa_node);
+    if (ncount > nmax) ncount = nmax;
+    if (ncount <= 0) ncount = 1;
+    numa_node_count = ncount;
+  }
+  mi_assert_internal(numa_node_count >= 1);
+  return numa_node_count;
 }
 
-int _mi_os_numa_node(void) {
-  int nnode = mi_os_numa_nodex();
-  // never more than the node count
-  int ncount = _mi_os_numa_node_count();
-  if (nnode >= ncount) { nnode = nnode % ncount; }
-  return nnode;
+int _mi_os_numa_node(mi_os_tld_t* tld) {
+  if (mi_unlikely(tld->numa_node < 0)) {
+    int nnode = mi_os_numa_nodex();
+    // never more than the node count
+    int ncount = _mi_os_numa_node_count();
+    if (nnode >= ncount) { nnode = nnode % ncount; }
+    if (nnode < 0) nnode = 0;
+    tld->numa_node = nnode;
+  }
+  mi_assert_internal(tld->numa_node >= 0 && tld->numa_node < _mi_os_numa_node_count());
+  return tld->numa_node;
 }
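
Appendix (illustration only, not part of the patch): the Linux path above drops libnuma's numa_preferred()/numa_max_node() in favor of a direct get_mempolicy() query plus a scan of /sys/devices/system/node, and _mi_os_numa_node() now caches the answer in the per-thread mi_os_tld_t so the lookup happens at most once per thread. The standalone sketch below (hypothetical file numa-demo.c; it assumes <numaif.h> from libnuma-dev and linking with -lnuma, matching the CMake change above) demonstrates the same detection technique outside mimalloc:

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <dirent.h>
  #include <numaif.h>   // get_mempolicy (install libnuma-dev, link with -lnuma)

  #define MAX_MASK 4    // 4 * sizeof(long) * 8 bits covers at least 256 nodes

  // Current NUMA node: read the calling thread's memory-policy node mask
  // and return the lowest node that is set (0 for the default policy).
  static int current_numa_node(void) {
    unsigned long mask[MAX_MASK];
    int mode = 0;
    memset(mask, 0, sizeof(mask));
    if (get_mempolicy(&mode, mask, MAX_MASK * sizeof(long) * 8, NULL, 0) != 0)
      return 0;  // no NUMA information available; fall back to node 0
    for (int i = 0; i < MAX_MASK; i++)
      for (int j = 0; j < (int)(sizeof(long) * 8); j++)
        if (mask[i] & (1UL << j)) return (int)(i * sizeof(long) * 8 + j);
    return 0;
  }

  // Node count: the kernel exposes one "nodeN" directory per NUMA node.
  static int numa_node_count(void) {
    DIR* d = opendir("/sys/devices/system/node");
    if (d == NULL) return 1;  // no sysfs? assume a single node
    int max_node = 0;
    struct dirent* de;
    while ((de = readdir(d)) != NULL) {
      if (strncmp(de->d_name, "node", 4) == 0 &&
          de->d_name[4] >= '0' && de->d_name[4] <= '9') {
        int n = (int)strtol(de->d_name + 4, NULL, 10);
        if (n > max_node) max_node = n;
      }
    }
    closedir(d);
    return max_node + 1;
  }

  int main(void) {
    printf("current node: %d of %d\n", current_numa_node(), numa_node_count());
    return 0;
  }

Build with: gcc numa-demo.c -lnuma -o numa-demo. On a single-node machine it should print "current node: 0 of 1"; avoiding libnuma's higher-level API here removes a hard runtime dependency while keeping NUMA-affine placement when the information is available.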