diff --git a/src/os.c b/src/os.c
index 677d0ea2..fc89d642 100644
--- a/src/os.c
+++ b/src/os.c
@@ -854,8 +854,11 @@ static void* mi_os_alloc_huge_os_pagesx(size_t size, int numa_node) {
   void* p = mi_unix_mmap(NULL, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large);
   if (p == NULL) return NULL;
 #ifdef MI_HAS_NUMA
-  if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) {
+  if (numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes
     uintptr_t numa_mask = (1UL << numa_node);
+    // TODO: does `mbind` work correctly for huge OS pages? should we
+    // use `set_mempolicy` before calling mmap instead?
+    // see: <https://lkml.org/lkml/2017/2/9/875>
     long err = mbind(p, size, MPOL_PREFERRED, &numa_mask, 8*MI_INTPTR_SIZE, 0);
     if (err != 0) {
       _mi_warning_message("failed to bind huge (1GiB) pages to NUMA node %d: %s\n", numa_node, strerror(errno));
@@ -883,6 +886,9 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, size_t* psize) {
   return p;
 }
 
+/* ----------------------------------------------------------------------------
+Support NUMA aware allocation
+-----------------------------------------------------------------------------*/
 #ifdef WIN32
 static int mi_os_numa_nodex() {
   PROCESSOR_NUMBER pnum;
@@ -902,6 +908,9 @@ static int mi_os_numa_node_countx(void) {
 #include <sys/syscall.h>
 #include <unistd.h>
 static int mi_os_numa_nodex(void) {
+  #define MI_NUMA_NODE_SLOW // too slow, so cache it
+  // TODO: perhaps use RDTSCP instruction on x64?
+  // see <https://stackoverflow.com/questions/16862620/numa-get-current-node-core>
   #define MI_MAX_MASK (4)  // support at most 256 nodes
   unsigned long mask[MI_MAX_MASK];
   memset(mask,0,MI_MAX_MASK*sizeof(long));
@@ -945,7 +954,7 @@ static int mi_os_numa_node_countx(void) {
 #endif
 
 int _mi_os_numa_node_count(void) {
-  static int numa_node_count = 0;
+  static int numa_node_count = 0; // cache the node count
   if (mi_unlikely(numa_node_count <= 0)) {
     int ncount = mi_os_numa_node_countx();
     // never more than max numa node and at least 1
@@ -959,14 +968,24 @@
 }
 
 int _mi_os_numa_node(mi_os_tld_t* tld) {
+  int numa_node;
+#ifndef MI_NUMA_NODE_SLOW
+  UNUSED(tld);
+  numa_node = mi_os_numa_nodex();
+#else
   if (mi_unlikely(tld->numa_node < 0)) {
-    int nnode = mi_os_numa_nodex();
-    // never more than the node count
-    int ncount = _mi_os_numa_node_count();
-    if (nnode >= ncount) { nnode = nnode % ncount; }
-    if (nnode < 0) nnode = 0;
-    tld->numa_node = nnode;
+    // Cache the NUMA node of the thread if the call is slow.
+    // This may not be correct as threads can migrate to another cpu on
+    // another node -- however, for memory allocation this just means we keep
+    // using the same 'node id' for its allocations; new OS allocations
+    // naturally come from the actual node so in practice this may be fine.
+    tld->numa_node = mi_os_numa_nodex();
   }
-  mi_assert_internal(tld->numa_node >= 0 && tld->numa_node < _mi_os_numa_node_count());
-  return tld->numa_node;
+  numa_node = tld->numa_node;
+#endif
+  // never more than the node count and >= 0
+  int numa_count = _mi_os_numa_node_count();
+  if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
+  if (numa_node < 0) numa_node = 0;
+  return numa_node;
 }
diff --git a/test/main-override.cpp b/test/main-override.cpp
index e006ad27..f7a7f1bd 100644
--- a/test/main-override.cpp
+++ b/test/main-override.cpp
@@ -24,7 +24,7 @@ public:
 
 int main() {
-  //mi_stats_reset(); // ignore earlier allocations
+  mi_stats_reset();  // ignore earlier allocations
   atexit(free_p);
   void* p1 = malloc(78);
   void* p2 = mi_malloc_aligned(16,24);