From 2c12d7f2234b25308478e22c9342a07623b6f891 Mon Sep 17 00:00:00 2001
From: daan
Date: Fri, 1 Nov 2019 22:01:52 -0700
Subject: [PATCH] optimized numa calls; better Linux support

---
 CMakeLists.txt              |  12 ++++
 include/mimalloc-internal.h |   2 +-
 include/mimalloc-types.h    |   1 +
 src/arena.c                 |   2 +-
 src/init.c                  |   3 +-
 src/memory.c                |   6 +-
 src/os.c                    | 114 ++++++++++++++++++++++++------------
 7 files changed, 97 insertions(+), 43 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index e9eb6feb..1e96c237 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,6 +1,8 @@
 cmake_minimum_required(VERSION 3.0)
 project(libmimalloc C CXX)
 include("cmake/mimalloc-config-version.cmake")
+include("CheckIncludeFile")
+
 set(CMAKE_C_STANDARD 11)
 set(CMAKE_CXX_STANDARD 17)
@@ -88,6 +90,16 @@ if(MI_USE_CXX MATCHES "ON")
   set_source_files_properties(src/static.c test/test-api.c PROPERTIES LANGUAGE CXX )
 endif()
 
+CHECK_INCLUDE_FILE("numaif.h" MI_HAVE_NUMA_H)
+if(MI_HAVE_NUMA_H)
+  list(APPEND mi_defines MI_HAS_NUMA)
+  list(APPEND mi_libraries numa)
+else()
+  if (NOT(WIN32))
+    message(WARNING "Compiling without using NUMA optimized allocation (on Linux, install libnuma-dev?)")
+  endif()
+endif()
+
 # Compiler flags
 if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU")
   list(APPEND mi_cflags -Wall -Wextra -Wno-unknown-pragmas)
diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index dd677a02..b4d3351d 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -56,7 +56,7 @@
 void   _mi_os_init(void);                                    // called from process init
 void*  _mi_os_alloc(size_t size, mi_stats_t* stats);         // to allocate thread local data
 void   _mi_os_free(void* p, size_t size, mi_stats_t* stats); // to free thread local data
 size_t _mi_os_good_alloc_size(size_t size);
-int    _mi_os_numa_node(void);
+int    _mi_os_numa_node(mi_os_tld_t* tld);
 
 // memory.c
diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index 99b6b22b..0208d5c7 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -413,6 +413,7 @@ typedef struct mi_segments_tld_s {
 
 // OS thread local data
 typedef struct mi_os_tld_s {
   size_t region_idx;   // start point for next allocation
+  int    numa_node;    // numa node associated with this thread
   mi_stats_t* stats;   // points to tld stats
 } mi_os_tld_t;
diff --git a/src/arena.c b/src/arena.c
index 381d4486..7eb755c4 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -267,7 +267,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
 {
   size_t asize = _mi_align_up(size, MI_ARENA_BLOCK_SIZE);
   size_t bcount = asize / MI_ARENA_BLOCK_SIZE;
-  int numa_node = _mi_os_numa_node(); // current numa node
+  int numa_node = _mi_os_numa_node(tld); // current numa node
 
   mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
   // try numa affine allocation
diff --git a/src/init.c b/src/init.c
index 0813fddd..166ca451 100644
--- a/src/init.c
+++ b/src/init.c
@@ -99,7 +99,7 @@ static mi_tld_t tld_main = {
   0, false, &_mi_heap_main,
   { { NULL, NULL }, {NULL ,NULL}, 0, 0, 0, 0, 0, 0, NULL, tld_main_stats }, // segments
-  { 0, tld_main_stats },      // os
+  { 0, -1, tld_main_stats },  // os
   { MI_STATS_NULL }           // stats
 };
@@ -218,6 +218,7 @@ static bool _mi_heap_init(void) {
     memset(tld, 0, sizeof(*tld));
     tld->heap_backing = heap;
     tld->segments.stats = &tld->stats;
+    tld->os.numa_node = -1;
     tld->os.stats = &tld->stats;
     _mi_heap_default = heap;
   }
diff --git a/src/memory.c b/src/memory.c
index 02e82e4d..a425393c 100644
--- a/src/memory.c
+++ b/src/memory.c
@@ -211,7 +211,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
   if (mi_atomic_cas_strong(&region->info, info, 0)) {
     // update the region count
     region->arena_memid = arena_memid;
-    mi_atomic_write(&region->numa_node, _mi_os_numa_node() + 1);
+    mi_atomic_write(&region->numa_node, _mi_os_numa_node(tld) + 1);
     mi_atomic_increment(&regions_count);
   }
   else {
@@ -220,7 +220,7 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
     for(size_t i = 1; i <= 4 && idx + i < MI_REGION_MAX; i++) {
       if (mi_atomic_cas_strong(&regions[idx+i].info, info, 0)) {
         regions[idx+i].arena_memid = arena_memid;
-        mi_atomic_write(&regions[idx+i].numa_node, _mi_os_numa_node() + 1);
+        mi_atomic_write(&regions[idx+i].numa_node, _mi_os_numa_node(tld) + 1);
         mi_atomic_increment(&regions_count);
         start = NULL;
         break;
@@ -430,7 +430,7 @@ void* _mi_mem_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* l
   mi_assert_internal(blocks > 0 && blocks <= 8*MI_INTPTR_SIZE);
 
   // find a range of free blocks
-  int numa_node = _mi_os_numa_node();
+  int numa_node = _mi_os_numa_node(tld);
   void* p = NULL;
   size_t count = mi_atomic_read(&regions_count);
   size_t idx = tld->region_idx; // start at 0 to reuse low addresses? Or, use tld->region_idx to reduce contention?
diff --git a/src/os.c b/src/os.c
index 2bb3ee3c..677d0ea2 100644
--- a/src/os.c
+++ b/src/os.c
@@ -97,7 +97,7 @@ typedef NTSTATUS (__stdcall *PNtAllocateVirtualMemoryEx)(HANDLE, PVOID*, SIZE_T*
 static PVirtualAlloc2 pVirtualAlloc2 = NULL;
 static PNtAllocateVirtualMemoryEx pNtAllocateVirtualMemoryEx = NULL;
 
-static bool mi_win_enable_large_os_pages() 
+static bool mi_win_enable_large_os_pages()
 {
   if (large_os_page_size > 0) return true;
@@ -148,10 +148,10 @@ void _mi_os_init(void) {
     FreeLibrary(hDll);
   }
   hDll = LoadLibrary(TEXT("ntdll.dll"));
-  if (hDll != NULL) { 
+  if (hDll != NULL) {
     pNtAllocateVirtualMemoryEx = (PNtAllocateVirtualMemoryEx)(void (*)(void))GetProcAddress(hDll, "NtAllocateVirtualMemoryEx");
     FreeLibrary(hDll);
-  } 
+  }
   if (mi_option_is_enabled(mi_option_large_os_pages) || mi_option_is_enabled(mi_option_reserve_huge_os_pages)) {
     mi_win_enable_large_os_pages();
   }
@@ -191,7 +191,7 @@ static bool mi_os_mem_free(void* addr, size_t size, bool was_committed, mi_stats
 #else
   err = (munmap(addr, size) == -1);
 #endif
-  if (was_committed) _mi_stat_decrease(&stats->committed, size); 
+  if (was_committed) _mi_stat_decrease(&stats->committed, size);
   _mi_stat_decrease(&stats->reserved, size);
   if (err) {
 #pragma warning(suppress:4996)
@@ -207,14 +207,14 @@ static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size);
 #ifdef _WIN32
 static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment, DWORD flags) {
-#if (MI_INTPTR_SIZE >= 8) 
+#if (MI_INTPTR_SIZE >= 8)
   // on 64-bit systems, try to use the virtual address area after 4TiB for 4MiB aligned allocations
   void* hint;
   if (addr == NULL && (hint = mi_os_get_aligned_hint(try_alignment,size)) != NULL) {
     return VirtualAlloc(hint, size, flags, PAGE_READWRITE);
   }
 #endif
-#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS) 
+#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
   // on modern Windows try use VirtualAlloc2 for aligned allocation
   if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
     MEM_ADDRESS_REQUIREMENTS reqs = { 0 };
@@ -232,7 +232,7 @@ static void* mi_win_virtual_alloc(void* addr, size_t size, size_t try_alignment,
   mi_assert_internal(!(large_only && !allow_large));
   static volatile _Atomic(uintptr_t) large_page_try_ok; // = 0;
   void* p = NULL;
-  if ((large_only || use_large_os_page(size, try_alignment)) 
+  if ((large_only || use_large_os_page(size, try_alignment))
      && allow_large && (flags&MEM_COMMIT)!=0 && (flags&MEM_RESERVE)!=0) {
     uintptr_t try_ok = mi_atomic_read(&large_page_try_ok);
     if (!large_only && try_ok > 0) {
@@ -372,7 +372,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
   }
   if (p == NULL) {
     *is_large = false;
-    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd); 
+    p = mi_unix_mmapx(addr, size, try_alignment, protect_flags, flags, fd);
     #if defined(MADV_HUGEPAGE)
     // Many Linux systems don't allow MAP_HUGETLB but they support instead
     // transparent huge pages (THP). It is not required to call `madvise` with MADV_HUGE
@@ -391,7 +391,7 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
 }
 #endif
 
-// On 64-bit systems, we can do efficient aligned allocation by using 
+// On 64-bit systems, we can do efficient aligned allocation by using
 // the 4TiB to 30TiB area to allocate them.
 #if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED)))
 static volatile _Atomic(intptr_t) aligned_base;
@@ -785,14 +785,14 @@ bool _mi_os_shrink(void* p, size_t oldsize, size_t newsize, mi_stats_t* stats) {
 
 /* ----------------------------------------------------------------------------
-Support for allocating huge OS pages (1Gib) that are reserved up-front 
+Support for allocating huge OS pages (1Gib) that are reserved up-front
 and possibly associated with a specific NUMA node. (use `numa_node>=0`)
 -----------------------------------------------------------------------------*/
-#define MI_HUGE_OS_PAGE_SIZE (GiB) 
+#define MI_HUGE_OS_PAGE_SIZE (GiB)
 
 #if defined(WIN32) && (MI_INTPTR_SIZE >= 8)
-static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) 
-{ 
+static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node)
+{
   mi_assert_internal(size%GiB == 0);
 
 #if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)
@@ -802,8 +802,8 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node)
   reqs.HighestEndingAddress = NULL;
   reqs.LowestStartingAddress = NULL;
   reqs.Alignment = MI_SEGMENT_SIZE;
-  
-  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages 
+
+  // on modern Windows try use NtAllocateVirtualMemoryEx for 1GiB huge pages
   if (pNtAllocateVirtualMemoryEx != NULL) {
     #ifndef MEM_EXTENDED_PARAMETER_NONPAGED_HUGE
     #define MEM_EXTENDED_PARAMETER_NONPAGED_HUGE (0x10)
@@ -825,10 +825,10 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node)
       return base;
     }
     else {
-      // fall back to regular huge pages 
+      // fall back to regular huge pages
      _mi_warning_message("unable to allocate using huge (1GiB) pages, trying large (2MiB) pages instead (error 0x%lx)\n", err);
     }
-  } 
+  }
   // on modern Windows try use VirtualAlloc2 for aligned large OS page allocation
   if (pVirtualAlloc2 != NULL) {
     params[0].Type = MemExtendedParameterAddressRequirements;
@@ -842,7 +842,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node)
     return (*pVirtualAlloc2)(GetCurrentProcess(), NULL, size, flags, PAGE_READWRITE, params, param_count);
   }
 #endif
-  return NULL; // give up on older Windows.. 
+  return NULL; // give up on older Windows..
 }
 #elif defined(MI_OS_USE_MMAP) && (MI_INTPTR_SIZE >= 8)
 #ifdef MI_HAS_NUMA
@@ -853,7 +853,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
   bool is_large = true;
   void* p = mi_unix_mmap(NULL, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
   if (p == NULL) return NULL;
-  #ifdef MI_HAS_NUMA 
+  #ifdef MI_HAS_NUMA
   if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) {
     uintptr_t numa_mask = (1UL << numa_node);
     long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
@@ -866,7 +866,7 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
   #endif
   return p;
 }
-#else 
+#else
 static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
   return NULL;
 }
@@ -884,12 +884,12 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) {
 }
 
 #ifdef WIN32
-static int mi_os_numa_nodex(void) {
+static int mi_os_numa_nodex() {
   PROCESSOR_NUMBER pnum;
   USHORT numa_node = 0;
   GetCurrentProcessorNumberEx(&pnum);
   GetNumaProcessorNodeEx(&pnum,&numa_node);
-  return (int)numa_node; 
+  return (int)numa_node;
 }
 
 static int mi_os_numa_node_countx(void) {
   ULONG numa_max = 0;
   GetNumaHighestNodeNumber(&numa_max);
   return (int)(numa_max + 1);
 }
@@ -898,12 +898,42 @@ static int mi_os_numa_node_countx(void) {
 #elif MI_HAS_NUMA
-#include <numa.h>
+#include <numaif.h>
+#include <dirent.h>
+#include <stdlib.h>
 static int mi_os_numa_nodex(void) {
-  return numa_preferred();
+  #define MI_MAX_MASK (4) // support at most 256 nodes
+  unsigned long mask[MI_MAX_MASK];
+  memset(mask,0,MI_MAX_MASK*sizeof(long));
+  int mode = 0;
+  long err = get_mempolicy(&mode, mask, MI_MAX_MASK*sizeof(long)*8, NULL, 0 /* thread policy */);
+  if (err != 0) return 0;
+  // find the lowest bit that is set
+  for(int i = 0; i < MI_MAX_MASK; i++) {
+    for(int j = 0; j < (int)(sizeof(long)*8); j++) {
+      if ((mask[i] & (1UL << j)) != 0) {
+        return (i*sizeof(long)*8 + j);
+      }
+    }
+  }
+  return 0;
 }
+
 static int mi_os_numa_node_countx(void) {
-  return (numa_max_node() + 1);
+  DIR* d = opendir("/sys/devices/system/node");
+  if (d==NULL) return 1;
+
+  struct dirent* de;
+  int max_node_num = 0;
+  while ((de = readdir(d)) != NULL) {
+    int node_num;
+    if (strncmp(de->d_name, "node", 4) == 0) {
+      node_num = (int)strtol(de->d_name+4, NULL, 0);
+      if (max_node_num < node_num) max_node_num = node_num;
+    }
+  }
+  closedir(d);
+  return (max_node_num + 1);
 }
 #else
 static int mi_os_numa_nodex(void) {
@@ -915,18 +945,28 @@ static int mi_os_numa_node_countx(void) {
 #endif
 
 int _mi_os_numa_node_count(void) {
-  long ncount = mi_os_numa_node_countx();
-  // never more than max numa node and at least 1
-  long nmax = 1 + mi_option_get(mi_option_max_numa_node);
-  if (ncount > nmax) ncount = nmax;
-  if (ncount <= 0) ncount = 1;
-  return ncount;
+  static int numa_node_count = 0;
+  if (mi_unlikely(numa_node_count <= 0)) {
+    int ncount = mi_os_numa_node_countx();
+    // never more than max numa node and at least 1
+    int nmax = 1 + (int)mi_option_get(mi_option_max_numa_node);
+    if (ncount > nmax) ncount = nmax;
+    if (ncount <= 0) ncount = 1;
+    numa_node_count = ncount;
+  }
+  mi_assert_internal(numa_node_count >= 1);
+  return numa_node_count;
 }
 
-int _mi_os_numa_node(void) {
-  int nnode = mi_os_numa_nodex();
-  // never more than the node count
-  int ncount = _mi_os_numa_node_count();
-  if (nnode >= ncount) { nnode = nnode % ncount; }
-  return nnode;
+int _mi_os_numa_node(mi_os_tld_t* tld) {
+  if (mi_unlikely(tld->numa_node < 0)) {
+    int nnode = mi_os_numa_nodex();
+    // never more than the node count
+    int ncount = _mi_os_numa_node_count();
+    if (nnode >= ncount) { nnode = nnode % ncount; }
+    if (nnode < 0) nnode = 0;
+    tld->numa_node = nnode;
+  }
+  mi_assert_internal(tld->numa_node >= 0 && tld->numa_node < _mi_os_numa_node_count());
+  return tld->numa_node;
 }
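
Appendix (illustration only, not part of the patch): the Linux path above drops libnuma's numa_preferred()/numa_max_node() in favor of a direct get_mempolicy() query plus a scan of /sys/devices/system/node, and _mi_os_numa_node() now caches the answer in the per-thread mi_os_tld_t so the lookup happens at most once per thread. The standalone sketch below (hypothetical file numa-demo.c; it assumes <numaif.h> from libnuma-dev and linking with -lnuma, matching the CMake change above) demonstrates the same detection technique outside mimalloc:

  #include <stdio.h>
  #include <stdlib.h>
  #include <string.h>
  #include <dirent.h>
  #include <numaif.h>   // get_mempolicy (install libnuma-dev, link with -lnuma)

  #define MAX_MASK 4    // 4 * sizeof(long) * 8 bits covers at least 256 nodes

  // Current NUMA node: read the calling thread's memory-policy node mask
  // and return the lowest node that is set (0 for the default policy).
  static int current_numa_node(void) {
    unsigned long mask[MAX_MASK];
    int mode = 0;
    memset(mask, 0, sizeof(mask));
    if (get_mempolicy(&mode, mask, MAX_MASK * sizeof(long) * 8, NULL, 0) != 0)
      return 0;  // no NUMA information available; fall back to node 0
    for (int i = 0; i < MAX_MASK; i++)
      for (int j = 0; j < (int)(sizeof(long) * 8); j++)
        if (mask[i] & (1UL << j)) return (int)(i * sizeof(long) * 8 + j);
    return 0;
  }

  // Node count: the kernel exposes one "nodeN" directory per NUMA node.
  static int numa_node_count(void) {
    DIR* d = opendir("/sys/devices/system/node");
    if (d == NULL) return 1;  // no sysfs? assume a single node
    int max_node = 0;
    struct dirent* de;
    while ((de = readdir(d)) != NULL) {
      if (strncmp(de->d_name, "node", 4) == 0 &&
          de->d_name[4] >= '0' && de->d_name[4] <= '9') {
        int n = (int)strtol(de->d_name + 4, NULL, 10);
        if (n > max_node) max_node = n;
      }
    }
    closedir(d);
    return max_node + 1;
  }

  int main(void) {
    printf("current node: %d of %d\n", current_numa_node(), numa_node_count());
    return 0;
  }

Build with: gcc numa-demo.c -lnuma -o numa-demo. On a single-node machine it should print "current node: 0 of 1"; avoiding libnuma's higher-level API here removes a hard runtime dependency while keeping NUMA-affine placement when the information is available.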