merge from dev

daanx 2023-04-24 09:06:56 -07:00
commit 56c0a8025a
9 changed files with 78 additions and 68 deletions

View file

@ -751,8 +751,8 @@ static inline mi_memid_t _mi_memid_none(void) {
static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) {
mi_memid_t memid = _mi_memid_create(MI_MEM_OS);
memid.was_committed = committed;
memid.was_zero = is_zero;
memid.initially_committed = committed;
memid.initially_zero = is_zero;
memid.is_pinned = is_large;
return memid;
}

View file

@ -387,24 +387,24 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
}
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t alignment; // alignment at allocation
void* base; // actual base address of the block (used for offset aligned allocations)
size_t alignment; // alignment at allocation
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
size_t block_index; // index in the arena
mi_arena_id_t id; // arena id (>= 1)
bool is_exclusive; // the arena can only be used for specific arena allocations
size_t block_index; // index in the arena
mi_arena_id_t id; // arena id (>= 1)
bool is_exclusive; // the arena can only be used for specific arena allocations
} mi_memid_arena_info_t;
typedef struct mi_memid_s {
union {
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena;// only used for MI_MEM_ARENA
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
} mem;
bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages)
bool was_committed; // `true` if the memory was originally allocated as committed
bool was_zero; // `true` if the memory was originally zero initialized
bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages)
bool initially_committed;// `true` if the memory was originally allocated as committed
bool initially_zero; // `true` if the memory was originally zero initialized
mi_memkind_t memkind;
} mi_memid_t;
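The renaming reads more naturally at the call sites. As a rough illustration of how such a record is consumed (a minimal self-contained sketch with a hypothetical stand-in struct, not the actual mimalloc header), a consumer branches on `initially_zero` before clearing memory and on `is_pinned` before purging:

```c
#include <stddef.h>
#include <stdbool.h>
#include <string.h>

// Hypothetical stand-in for the fields shown above (not the real mi_memid_t).
typedef struct memid_sketch_s {
  bool is_pinned;            // cannot decommit/reset/protect (e.g. large OS pages)
  bool initially_committed;  // memory was committed when it was handed out
  bool initially_zero;       // memory was zero-initialized when it was handed out
} memid_sketch_t;

// Clear the block only when the OS/arena did not already hand it out zeroed,
// and remember that purging is only allowed when the memory is not pinned.
static void use_block(void* p, size_t size, const memid_sketch_t* memid) {
  if (!memid->initially_zero) {
    memset(p, 0, size);
  }
  bool may_purge = !memid->is_pinned;
  (void)may_purge;  // ... use the block, purge later if allowed ...
}
```

The `init.c` and `segment.c` hunks further below follow exactly this pattern with the renamed fields.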

View file

@ -12,8 +12,8 @@ is a general purpose allocator with excellent [performance](#performance) charac
Initially developed by Daan Leijen for the runtime systems of the
[Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
Latest release tag: `v2.1.1` (2023-04-03).
Latest stable tag: `v1.8.1` (2023-04-03).
Latest release tag: `v2.1.2` (2023-04-24).
Latest stable tag: `v1.8.2` (2023-04-24).
mimalloc is a drop-in replacement for `malloc` and can be used in other programs
without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
@ -43,7 +43,7 @@ It also includes a robust way to override the default allocator in [Windows](#ov
and the chance of contending on a single location will be low -- this is quite
similar to randomized algorithms like skip lists where adding
a random oracle removes the need for a more complex algorithm.
- __eager page reset__: when a "page" becomes empty (with increased chance
- __eager page purging__: when a "page" becomes empty (with increased chance
due to free list sharding) the memory is marked to the OS as unused (reset or decommitted)
reducing (real) memory pressure and fragmentation, especially in long running
programs.
@ -78,6 +78,10 @@ Note: the `v2.x` version has a new algorithm for managing internal mimalloc page
and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance
(see [below](#performance)); please report if you observe any significant performance regression.
* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on FreeBSD, musl, and C17 (UE 5.1.1). Reduced code size/complexity
by removing regions and segment caches, using only arenas with improved memory purging -- this may also improve memory
usage for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking.
* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms.
* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support for dynamic overriding on Windows 11. Improved tracing precision
@ -105,20 +109,6 @@ Note: the `v2.x` version has a new algorithm for managing internal mimalloc page
improved wasm support, faster aligned allocation,
various small fixes.
* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including
M1), improved performance for v2 for large objects, Python integration improvements, more standard
installation directories, various small fixes.
* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix
thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes.
* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental).
* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages.
* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics,
improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes.
* [Older release notes](#older-release-notes)
Special thanks to:
@ -280,43 +270,48 @@ completely and redirect all calls to the _mimalloc_ library instead .
## Environment Options
You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)),
or via environment variables:
You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), or via environment variables (a programmatic sketch follows the option list below):
- `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates.
- `MIMALLOC_VERBOSE=1`: show verbose messages.
- `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages.
- `MIMALLOC_PAGE_RESET=0`: by default, mimalloc will reset (or purge) OS pages that are not in use, to signal to the OS
that the underlying physical memory can be reused. This can reduce memory fragmentation in long running (server)
programs. By setting it to `0` this will no longer be done which can improve performance for batch-like programs.
As an alternative, the `MIMALLOC_RESET_DELAY=`<msecs> can be set higher (100ms by default) to make the page
reset occur less frequently instead of turning it off completely.
Advanced options:
- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge
OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which
can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when
a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher
value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times.
Setting it to `-1` disables purging completely.
- `MIMALLOC_ARENA_EAGER_COMMIT=1`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc
allocates segments and pages. This is by default
only enabled on overcommit systems (e.g. Linux) but enabling it explicitly on other systems (like Windows or macOS)
may improve performance. Note that eager commit only increases the commit but not the actual peak resident set
(rss) so it is generally ok to enable this.
Further options for large workloads and services:
- `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected
at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than
the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA
nodes (but this can happen in any case, as NUMA-local allocation is always best effort and not guaranteed).
- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly
- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly
improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs
to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes
the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that
can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible).
<!--
- `MIMALLOC_EAGER_REGION_COMMIT=1`: on Windows, commit large (256MiB) regions eagerly. On Windows, these regions
show in the working set even though usually just a small part is committed to physical memory. This is why it is
turned off by default on Windows, as it does not look good in the task manager. However, turning it on has no
real drawbacks and may improve performance by a little.
-->
- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB _huge_ OS pages. This reserves the huge pages at
can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible).
- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at
startup and sometimes this can give a large (latency) performance improvement on big workloads.
Usually it is better to not use
`MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. Just like large
OS pages, use with care as reserving
contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at
startup only once).
Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]).
With huge OS pages, it may be beneficial to set the option
`MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB)
of a thread to not allocate in the huge OS pages; this prevents threads that are short lived
and allocate just a little to take up space in the huge OS page area (which cannot be reset).
and allocate just a little to take up space in the huge OS page area (which cannot be purged).
The huge pages are usually allocated evenly among NUMA nodes.
We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the NUMA node (starting at 0) to allocate all
the huge pages at a specific NUMA node instead.
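For completeness, here is a hedged sketch of setting the same knobs programmatically via `mi_option_set`, as mentioned at the start of this section. The option enum names (`mi_option_show_stats`, `mi_option_purge_delay`) are assumed to mirror the environment variable names, and `mi_reserve_huge_os_pages_interleave` is the explicit API for huge-page reservation; double-check both against `mimalloc.h` for the version you build.

```c
#include <mimalloc.h>
#include <stdbool.h>

int main(void) {
  // MIMALLOC_SHOW_STATS=1: print statistics when the program terminates.
  mi_option_set_enabled(mi_option_show_stats, true);

  // MIMALLOC_PURGE_DELAY=100: purge unused OS pages less eagerly, trading a
  // bit of memory for throughput (enum name assumed to mirror the variable).
  mi_option_set(mi_option_purge_delay, 100);

  // MIMALLOC_RESERVE_HUGE_OS_PAGES=2 via the explicit call: reserve two 1GiB
  // huge pages, interleaved over the available NUMA nodes (0 = use all),
  // giving up after at most 10 seconds.
  mi_reserve_huge_os_pages_interleave(2, 0, 10000);

  void* p = mi_malloc(100);
  mi_free(p);
  return 0;
}
```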
@ -794,6 +789,16 @@ provided by the bot. You will only need to do this once across all repos using o
# Older Release Notes
* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including
M1), improved performance for v2 for large objects, Python integration improvements, more standard
installation directories, various small fixes.
* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix
thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes.
* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental).
* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages.
* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics,
improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes.
* 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved
handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call.
* 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations,
@ -815,6 +820,7 @@ provided by the bot. You will only need to do this once across all repos using o
more eager concurrent free, addition of STL allocator, fixed potential memory leak.
* 2020-01-15, `v1.3.0`: stable release 1.3: bug fixes, improved randomness and [stronger
free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) in secure mode.
* 2019-12-22, `v1.2.2`: stable release 1.2: minor updates.
* 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows.
* 2019-10-07, `v1.1.0`: stable release 1.1.

View file

@ -245,11 +245,13 @@ extern "C" {
int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); }
// `aligned_alloc` is only available when __USE_ISOC11 is defined.
// Note: it seems __USE_ISOC11 is not defined in musl (and perhaps other libc's) so we only check
// for it if using glibc.
// Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot
// override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9.
// Fortunately, in the case where `aligned_alloc` is declared as `static inline` it
// uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we can avoid overriding it ourselves.
#if __USE_ISOC11
#if !defined(__GLIBC__) || __USE_ISOC11
void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
#endif
#endif
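To make the effect of the changed guard concrete, a tiny stand-alone sketch (illustrative only; the real check is the one in the hunk above): on musl and other non-glibc libcs `__GLIBC__` is not defined, so the override is now always compiled, while on glibc it still depends on `__USE_ISOC11`.

```c
#include <stdio.h>

int main(void) {
  // Same guard shape as in the hunk above; an undefined macro evaluates to 0 in #if.
#if !defined(__GLIBC__) || __USE_ISOC11
  puts("aligned_alloc override would be compiled in");
#else
  puts("aligned_alloc override would be left out (glibc without __USE_ISOC11)");
#endif
  return 0;
}
```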

View file

@ -237,33 +237,33 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
}
// set the dirty bits (todo: no need for an atomic op here?)
if (arena->memid.was_zero && arena->blocks_dirty != NULL) {
memid->was_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
if (arena->memid.initially_zero && arena->blocks_dirty != NULL) {
memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
}
// set commit state
if (arena->blocks_committed == NULL) {
// always committed
memid->was_committed = true;
memid->initially_committed = true;
}
else if (commit) {
// commit requested, but the range may not be committed as a whole: ensure it is committed now
memid->was_committed = true;
memid->initially_committed = true;
bool any_uncommitted;
_mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
if (any_uncommitted) {
bool commit_zero = false;
if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) {
memid->was_committed = false;
memid->initially_committed = false;
}
else {
if (commit_zero) { memid->was_zero = true; }
if (commit_zero) { memid->initially_zero = true; }
}
}
}
else {
// no need to commit, but check if already fully committed
memid->was_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
}
return p;
@ -752,7 +752,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
if (size < MI_ARENA_BLOCK_SIZE) return false;
if (is_large) {
mi_assert_internal(memid.was_committed && memid.is_pinned);
mi_assert_internal(memid.initially_committed && memid.is_pinned);
}
const size_t bcount = size / MI_ARENA_BLOCK_SIZE;
@ -781,7 +781,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap
// initialize committed bitmap?
if (arena->blocks_committed != NULL && arena->memid.was_committed) {
if (arena->blocks_committed != NULL && arena->memid.initially_committed) {
memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
}
@ -799,8 +799,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL);
memid.was_committed = is_committed;
memid.was_zero = is_zero;
memid.initially_committed = is_committed;
memid.initially_zero = is_zero;
memid.is_pinned = is_large;
return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id);
}
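`mi_manage_os_memory_ex` above is the low-level entry point for memory the embedder allocated itself. For most users the simpler public call `mi_reserve_os_memory` achieves the same end result (an extra arena) by letting mimalloc reserve the OS region itself; a hedged usage sketch (the 256MiB size is just an example):

```c
#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // Reserve a 256 MiB arena up front: committed eagerly, no large OS pages required.
  // Returns 0 on success, an error code otherwise.
  if (mi_reserve_os_memory(256 * 1024 * 1024, /*commit*/ true, /*allow_large*/ false) != 0) {
    fprintf(stderr, "arena reservation failed\n");
  }
  void* p = mi_malloc(1024);   // subsequent allocations can be served from the arena
  mi_free(p);
  return 0;
}
```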

View file

@ -244,7 +244,7 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) {
}
if (td != NULL) {
td->memid = memid;
is_zero = memid.was_zero;
is_zero = memid.initially_zero;
}
}

View file

@ -387,9 +387,11 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
}
int _mi_prim_reset(void* start, size_t size) {
// We always use MADV_DONTNEED if possible even if it may be a bit more expensive as MADV_FREE
// as this guarantees that we see the actual rss reflected in tools like `top`.
#if 0 && defined(MADV_FREE)
// We try to use `MADV_FREE` as that is the fastest. A drawback though is that it
// will not reduce the `rss` stats in tools like `top` even though the memory is available
// to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by
// default `MADV_DONTNEED` is used though.
#if defined(MADV_FREE)
static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
int oadvice = (int)mi_atomic_load_relaxed(&advice);
int err;
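The hunk is cut off before the fallback logic, so here is a self-contained sketch of the general pattern (not the verbatim mimalloc code): try `MADV_FREE` first and, if the kernel rejects it (older kernels return `EINVAL`), switch permanently to `MADV_DONTNEED` via the atomic `advice` variable.

```c
#define _DEFAULT_SOURCE   // expose madvise() on glibc
#include <errno.h>
#include <stdatomic.h>
#include <stddef.h>
#include <sys/mman.h>

static int os_reset(void* start, size_t size) {
#if defined(MADV_FREE)
  static _Atomic int advice = MADV_FREE;
  int oadvice = atomic_load_explicit(&advice, memory_order_relaxed);
  int err = madvise(start, size, oadvice);
  if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
    // MADV_FREE not supported by this kernel: fall back to MADV_DONTNEED for good.
    atomic_store_explicit(&advice, MADV_DONTNEED, memory_order_relaxed);
    err = madvise(start, size, MADV_DONTNEED);
  }
  return err;
#else
  return madvise(start, size, MADV_DONTNEED);
#endif
}
```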

View file

@ -285,9 +285,9 @@ int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {
int _mi_prim_reset(void* addr, size_t size) {
void* p = VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
mi_assert_internal(p == addr);
#if 1
#if 0
if (p != NULL) {
VirtualUnlock(addr,size); // VirtualUnlock after MEM_RESET removes the memory from the working set
VirtualUnlock(addr,size); // VirtualUnlock after MEM_RESET removes the memory directly from the working set
}
#endif
return (p != NULL ? 0 : (int)GetLastError());
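For context, a hedged sketch of the Windows reset path as a whole (illustrative, not the exact mimalloc code): `MEM_RESET` tells the OS the contents are no longer needed while keeping the range committed, and the now-disabled `VirtualUnlock` call additionally dropped the pages from the working set (the call "fails" on unlocked pages by design, but the side effect is documented).

```c
#include <windows.h>

static int os_reset_win(void* addr, size_t size, BOOL drop_from_working_set) {
  // Mark the contents as no longer needed; the range stays committed and usable.
  void* p = VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
  if (p == NULL) return (int)GetLastError();
  if (drop_from_working_set) {
    // Returns FALSE for unlocked pages, but still removes them from the working set.
    VirtualUnlock(addr, size);
  }
  return 0;
}
```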

View file

@ -824,7 +824,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment
// ensure metadata part of the segment is committed
mi_commit_mask_t commit_mask;
if (memid.was_committed) {
if (memid.initially_committed) {
mi_commit_mask_create_full(&commit_mask);
}
else {
@ -878,7 +878,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
if (segment == NULL) return NULL;
// zero the segment info? -- not always needed as it may be zero initialized from the OS
if (!segment->memid.was_zero) {
if (!segment->memid.initially_zero) {
ptrdiff_t ofs = offsetof(mi_segment_t, next);
size_t prefix = offsetof(mi_segment_t, slices) - ofs;
size_t zsize = prefix + (sizeof(mi_slice_t) * (segment_slices + 1)); // one more
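The hunk ends mid-computation, but the pattern it implements is simple: when the backing memory is already known to be zero (`initially_zero`), the segment metadata needs no clearing at all; otherwise only the prefix starting at the `next` field is cleared. A hedged, self-contained sketch with a hypothetical stand-in struct (not the real `mi_segment_t`):

```c
#include <stddef.h>
#include <string.h>

typedef struct seg_sketch_s {   // hypothetical stand-in for the segment header
  int   initially_zero;         // whether the backing memory started out zeroed
  void* next;                   // first field that may still need zeroing
  char  slices[64];             // metadata that follows the header fields
} seg_sketch_t;

static void init_segment_metadata(seg_sketch_t* segment) {
  if (!segment->initially_zero) {
    // Zero only from `next` to the end of the metadata, skipping fields that
    // were already set earlier (such as the memid in the real code).
    ptrdiff_t ofs = offsetof(seg_sketch_t, next);
    size_t zsize  = sizeof(seg_sketch_t) - (size_t)ofs;
    memset((char*)segment + ofs, 0, zsize);
  }
}
```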