Mirror of https://github.com/microsoft/mimalloc.git (synced 2025-05-04 22:49:32 +03:00)
Commit 72d8608333 (parent 7ea9cf8d1f)

avoid thread over-allocation on initial region allocations

3 changed files with 54 additions and 38 deletions
(Visual Studio project file)

@@ -95,6 +95,7 @@
     <AdditionalIncludeDirectories>../../include</AdditionalIncludeDirectories>
     <PreprocessorDefinitions>MI_DEBUG=3;%(PreprocessorDefinitions);</PreprocessorDefinitions>
     <CompileAs>Default</CompileAs>
+    <SupportJustMyCode>false</SupportJustMyCode>
   </ClCompile>
   <Lib>
     <AdditionalLibraryDirectories>

@@ -112,6 +113,7 @@
     <AdditionalIncludeDirectories>../../include</AdditionalIncludeDirectories>
     <PreprocessorDefinitions>MI_DEBUG=3;%(PreprocessorDefinitions);</PreprocessorDefinitions>
     <CompileAs>Default</CompileAs>
+    <SupportJustMyCode>false</SupportJustMyCode>
   </ClCompile>
   <PostBuildEvent>
     <Command>

@@ -140,7 +142,7 @@
     <PreprocessorDefinitions>%(PreprocessorDefinitions);NDEBUG</PreprocessorDefinitions>
     <AssemblerOutput>AssemblyAndSourceCode</AssemblerOutput>
     <AssemblerListingLocation>$(IntDir)</AssemblerListingLocation>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
     <BufferSecurityCheck>false</BufferSecurityCheck>
     <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
     <FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>

@@ -171,7 +173,7 @@
     <PreprocessorDefinitions>%(PreprocessorDefinitions);NDEBUG</PreprocessorDefinitions>
     <AssemblerOutput>AssemblyAndSourceCode</AssemblerOutput>
     <AssemblerListingLocation>$(IntDir)</AssemblerListingLocation>
-    <WholeProgramOptimization>true</WholeProgramOptimization>
+    <WholeProgramOptimization>false</WholeProgramOptimization>
     <BufferSecurityCheck>false</BufferSecurityCheck>
     <InlineFunctionExpansion>AnySuitable</InlineFunctionExpansion>
     <FavorSizeOrSpeed>Neither</FavorSizeOrSpeed>
(atomic operations header)

@@ -42,14 +42,22 @@ static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exch
 // Atomically read a value
 static inline uintptr_t mi_atomic_read(volatile uintptr_t* p);

-// Atomically read a pointer
-static inline void* mi_atomic_read_ptr(volatile void** p);
-
 // Atomically write a value
 static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x);

+// Atomically read a pointer
+static inline void* mi_atomic_read_ptr(volatile void** p) {
+  return (void*)mi_atomic_read( (volatile uintptr_t*)p );
+}
+
 static inline void mi_atomic_yield(void);

+// Atomically write a pointer
+static inline void mi_atomic_write_ptr(volatile void** p, void* x) {
+  mi_atomic_write((volatile uintptr_t*)p, (uintptr_t)x );
+}
+
 // Atomically compare and exchange a pointer; returns `true` if successful.
 static inline bool mi_atomic_compare_exchange_ptr(volatile void** p, void* newp, void* compare) {
   return mi_atomic_compare_exchange((volatile uintptr_t*)p, (uintptr_t)newp, (uintptr_t)compare);

@@ -99,9 +107,6 @@ static inline uintptr_t mi_atomic_exchange(volatile uintptr_t* p, uintptr_t exch
 static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
   return *p;
 }
-static inline void* mi_atomic_read_ptr(volatile void** p) {
-  return (void*)(*p);
-}
 static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
   *p = x;
 }

@@ -171,10 +176,6 @@ static inline uintptr_t mi_atomic_read(volatile uintptr_t* p) {
   MI_USING_STD
   return atomic_load_explicit((volatile atomic_uintptr_t*)p, memory_order_relaxed);
 }
-static inline void* mi_atomic_read_ptr(volatile void** p) {
-  MI_USING_STD
-  return atomic_load_explicit((volatile _Atomic(void*)*)p, memory_order_relaxed);
-}
 static inline void mi_atomic_write(volatile uintptr_t* p, uintptr_t x) {
   MI_USING_STD
   return atomic_store_explicit((volatile atomic_uintptr_t*)p, x, memory_order_relaxed);
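With this change the pointer variants are generic wrappers over the uintptr_t primitives, so each platform backend (MSVC intrinsics, C11 atomics, etc.) only has to provide the integer read/write. A minimal usage sketch, not taken from the commit; the header path and the names `shared_block`, `publisher`, `consumer` are illustrative assumptions:

#include "mimalloc-atomic.h"   // assumed include path for the header patched above
#include <stddef.h>

// A field declared like mimalloc's region->start: a plain pointer to volatile void,
// so &shared_block matches the `volatile void**` parameter of the wrappers.
static volatile void* shared_block = NULL;

void publisher(void* block) {
  // the store goes through the per-platform uintptr_t primitive underneath
  mi_atomic_write_ptr(&shared_block, block);
}

void* consumer(void) {
  // relaxed read; callers add their own retry/ordering logic where needed
  return mi_atomic_read_ptr(&shared_block);
}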
src/memory.c (65 changed lines)
@@ -7,13 +7,16 @@ terms of the MIT license. A copy of the license can be found in the file

 /* ----------------------------------------------------------------------------
 This implements a layer between the raw OS memory (VirtualAlloc/mmap/sbrk/..)
-and the segment and huge object allocation by mimalloc. In contrast to the
-rest of mimalloc, this uses thread-shared "regions" that are accessed using
-atomic operations. We need this layer because of:
+and the segment and huge object allocation by mimalloc. There may be multiple
+implementations of this (one could be the identity going directly to the OS,
+another could be a simple cache etc), but the current one uses large "regions".
+In contrast to the rest of mimalloc, the "regions" are shared between threads and
+need to be accessed using atomic operations.
+We need this memory layer between the raw OS calls because of:
 1. on `sbrk` like systems (like WebAssembly) we need our own memory maps in order
-   to reuse memory
+   to reuse memory effectively.
 2. It turns out that for large objects, between 1MiB and 32MiB (?), the cost of
-   an OS allocation/free is still too expensive relative to the accesses in that
+   an OS allocation/free is still (much) too expensive relative to the accesses in that
    object :-( (`mallloc-large` tests this). This means we need a cheaper way to
    reuse memory.
 3. This layer can help with a NUMA aware allocation in the future.
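The shape of the region layer, as far as it can be read off the rest of this diff: a fixed table of regions, each backed by one large OS allocation (`region->start`) that is carved into segment-sized blocks whose occupancy is tracked in an atomic bitmap (`region->map`). A rough sketch of that idea; the field order and the size constants below are placeholders, not the file's actual definitions:

#include <stdint.h>
#include <stddef.h>

// Sketch only: sizes are illustrative, not mimalloc's real configuration.
#define MI_SEGMENT_SIZE   (4UL * 1024 * 1024)        // one block per bit in the map
#define MI_REGION_SIZE    (64UL * MI_SEGMENT_SIZE)   // one OS allocation covers 64 blocks
#define MI_REGION_MAX     1024

typedef struct mem_region_s {
  volatile uintptr_t map;    // bitmap: one bit per MI_SEGMENT_SIZE block, 1 = claimed
  volatile void*     start;  // start of the MI_REGION_SIZE OS allocation (NULL until reserved)
} mem_region_t;

static mem_region_t regions[MI_REGION_MAX];          // the thread-shared region table
static volatile size_t    regions_count   = 0;       // initialized regions
static volatile uintptr_t region_next_idx = 0;       // good place to start searching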
@@ -34,7 +37,7 @@ Possible issues:

 #include <string.h>  // memset

-// Internal OS interface
+// Internal raw OS interface
 size_t _mi_os_large_page_size();
 bool   _mi_os_protect(void* addr, size_t size);
 bool   _mi_os_unprotect(void* addr, size_t size);
@@ -76,7 +79,7 @@ typedef struct mem_region_s {
 static mem_region_t regions[MI_REGION_MAX];

 static volatile size_t    regions_count   = 0;  // allocated regions
-static volatile uintptr_t region_next_idx = 0;
+static volatile uintptr_t region_next_idx = 0;  // good place to start searching


 /* ----------------------------------------------------------------------------
@@ -105,6 +108,8 @@ static size_t mi_good_commit_size(size_t size) {
   Commit from a region
 -----------------------------------------------------------------------------*/

+#define ALLOCATING  ((void*)1)
+
 // Commit the `blocks` in `region` at `idx` and `bitidx` of a given `size`.
 // Returns `false` on an error (OOM); `true` otherwise. `p` and `id` are only written
 // if the blocks were successfully claimed so ensure they are initialized to NULL/SIZE_MAX before the call.
@@ -115,9 +120,25 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
   mi_assert_internal((mask & mi_atomic_read(&region->map)) == mask);

   // ensure the region is reserved
-  void* start = mi_atomic_read_ptr(&region->start);
-  if (start == NULL) {
+  void* start;
+  do {
+    start = mi_atomic_read_ptr(&region->start);
+    if (start == NULL) {
+      start = ALLOCATING;  // try to start allocating
+    }
+    else if (start == ALLOCATING) {
+      mi_atomic_yield(); // another thead is already allocating.. wait it out
+      continue;
+    }
+  } while( start == ALLOCATING && !mi_atomic_compare_exchange_ptr(&region->start, ALLOCATING, NULL) );
+  mi_assert_internal(start != NULL);
+
+  // allocate the region if needed
+  if (start == ALLOCATING) {
     start = _mi_os_alloc_aligned(MI_REGION_SIZE, MI_SEGMENT_ALIGN, mi_option_is_enabled(mi_option_eager_region_commit), tld);
+    // set the new allocation (or NULL on failure) -- this releases any waiting threads.
+    mi_atomic_write_ptr(&region->start, start);
+
     if (start == NULL) {
       // failure to allocate from the OS! unclaim the blocks and fail
       size_t map;
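This is the change the commit title refers to. Previously every thread that saw `region->start == NULL` called `_mi_os_alloc_aligned`, and the losers of the subsequent compare-and-swap freed their allocation again; now a thread first claims the slot by swapping in the `ALLOCATING` sentinel, and all other threads yield until the real pointer (or NULL on failure) is published, so the OS is asked for the region at most once. A compressed, self-contained sketch of that protocol; `claim_or_wait`, `os_alloc`, and `cpu_yield` are stand-in names, error handling is simplified, and C11 atomics replace mimalloc's wrappers:

#include <stdatomic.h>
#include <stddef.h>
#include <sched.h>

#define ALLOCATING ((void*)1)                 // sentinel: someone is allocating right now

extern void* os_alloc(size_t size);           // stand-in for _mi_os_alloc_aligned
static void cpu_yield(void) { sched_yield(); }  // stand-in for mi_atomic_yield

// Return the region start, allocating it at most once across all racing threads.
static void* claim_or_wait(_Atomic(void*)* start_slot, size_t region_size) {
  for (;;) {
    void* start = atomic_load(start_slot);
    if (start == NULL) {
      // try to claim the right to allocate: NULL -> ALLOCATING
      void* expected = NULL;
      if (atomic_compare_exchange_weak(start_slot, &expected, ALLOCATING)) {
        start = os_alloc(region_size);        // only the claiming thread calls the OS
        atomic_store(start_slot, start);      // publish result (NULL on failure) to waiters
        return start;                         // mimalloc additionally unclaims its bitmap bits on failure
      }
    }
    else if (start == ALLOCATING) {
      cpu_yield();                            // another thread is allocating; wait it out
    }
    else {
      return start;                           // already allocated
    }
  }
}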
@@ -126,22 +147,14 @@ static bool mi_region_commit_blocks(mem_region_t* region, size_t idx, size_t bit
       } while (!mi_atomic_compare_exchange(&region->map, map & ~mask, map));
       return false;
     }
-    // set the newly allocated region
-    if (mi_atomic_compare_exchange_ptr(&region->start, start, NULL)) {
-      // update the region count
-      mi_atomic_increment(&regions_count);
-    }
-    else {
-      // failed, another thread allocated just before us, free our allocated memory
-      // TODO: should we keep the allocated memory and assign it to some other region?
-      _mi_os_free(start, MI_REGION_SIZE, tld->stats);
-      start = mi_atomic_read_ptr(&region->start);
-    }
+    // update the region count if this is a new max idx.
+    mi_atomic_compare_exchange(&regions_count, idx+1, idx);
   }
+  mi_assert_internal(start != NULL && start != ALLOCATING);
+  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));

   // Commit the blocks to memory
-  mi_assert_internal(start == mi_atomic_read_ptr(&region->start));
-  mi_assert_internal(start != NULL);
   void* blocks_start = (uint8_t*)start + (bitidx * MI_SEGMENT_SIZE);
   if (commit && !mi_option_is_enabled(mi_option_eager_region_commit)) {
     _mi_os_commit(blocks_start, mi_good_commit_size(size), tld->stats); // only commit needed size (unless using large OS pages)
@@ -174,7 +187,7 @@ static bool mi_region_alloc_blocks(mem_region_t* region, size_t idx, size_t bloc
     bitidx = 0;
     do {
       // skip ones
-      while ((m&1) == 1) { bitidx++; m>>=1; }
+      while ((m&1) != 0) { bitidx++; m>>=1; }
       // count zeros
       mi_assert_internal((m&1)==0);
       size_t zeros = 1;
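The hunk above is part of the bitmap scan that finds a run of free blocks inside a region: skip over 1-bits (claimed blocks), then count consecutive 0-bits until enough free blocks are found. A standalone sketch of that scan idea, assuming the whole map fits in one word and leaving out the atomic claim step that the real mi_region_alloc_blocks performs afterwards; `find_free_run` is an invented name:

#include <stdint.h>
#include <stddef.h>
#include <stdbool.h>

// Find `blocks` consecutive zero bits in `map`; on success store the starting
// bit index in *bitidx and return true. Illustrative only.
static bool find_free_run(uintptr_t map, size_t blocks, size_t* bitidx) {
  const size_t nbits = sizeof(uintptr_t) * 8;
  size_t idx = 0;
  uintptr_t m = map;
  while (idx + blocks <= nbits) {
    // skip ones (claimed blocks)
    while ((m & 1) != 0) { idx++; m >>= 1; }
    if (idx + blocks > nbits) break;
    // count zeros (free blocks)
    size_t zeros = 0;
    while (zeros < blocks && (m & 1) == 0) { zeros++; idx++; m >>= 1; }
    if (zeros == blocks) { *bitidx = idx - blocks; return true; }
  }
  return false;
}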
@@ -315,14 +328,14 @@ void _mi_mem_free(void* p, size_t size, size_t id, mi_stats_t* stats) {
   // reset: 10x slowdown on malloc-large, decommit: 17x slowdown on malloc-large
   if (!mi_option_is_enabled(mi_option_large_os_pages)) {
     if (mi_option_is_enabled(mi_option_eager_region_commit)) {
-      //_mi_os_reset(p, size, stats); // 10x slowdown on malloc-large
+      //_mi_os_reset(p, size, stats);
     }
     else {
-      //_mi_os_decommit(p, size, stats); // 17x slowdown on malloc-large
+      //_mi_os_decommit(p, size, stats);
     }
   }

-  // TODO: should we free empty regions?
+  // TODO: should we free empty regions? currently only done _mi_mem_collect.
   // this frees up virtual address space which
   // might be useful on 32-bit systems?