merge from dev

daanx 2023-04-24 09:06:56 -07:00
commit 56c0a8025a
9 changed files with 78 additions and 68 deletions

View file

@ -751,8 +751,8 @@ static inline mi_memid_t _mi_memid_none(void) {
static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) {
mi_memid_t memid = _mi_memid_create(MI_MEM_OS);
memid.was_committed = committed;
memid.was_zero = is_zero;
memid.initially_committed = committed;
memid.initially_zero = is_zero;
memid.is_pinned = is_large;
return memid;
}

View file

@ -387,24 +387,24 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
}
typedef struct mi_memid_os_info {
void* base; // actual base address of the block (used for offset aligned allocations)
size_t alignment; // alignment at allocation
void* base; // actual base address of the block (used for offset aligned allocations)
size_t alignment; // alignment at allocation
} mi_memid_os_info_t;
typedef struct mi_memid_arena_info {
size_t block_index; // index in the arena
mi_arena_id_t id; // arena id (>= 1)
bool is_exclusive; // the arena can only be used for specific arena allocations
size_t block_index; // index in the arena
mi_arena_id_t id; // arena id (>= 1)
bool is_exclusive; // the arena can only be used for specific arena allocations
} mi_memid_arena_info_t;
typedef struct mi_memid_s {
union {
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena;// only used for MI_MEM_ARENA
mi_memid_os_info_t os; // only used for MI_MEM_OS
mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA
} mem;
bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages)
bool was_committed; // `true` if the memory was originally allocated as committed
bool was_zero; // `true` if the memory was originally zero initialized
bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large OS pages)
bool initially_committed;// `true` if the memory was originally allocated as committed
bool initially_zero; // `true` if the memory was originally zero initialized
mi_memkind_t memkind;
} mi_memid_t;
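The renaming reads more naturally at the call sites. As a rough illustration of how such a record is consumed (a minimal self-contained sketch with a hypothetical stand-in struct, not the actual mimalloc header), a consumer branches on `initially_zero` before clearing memory and on `is_pinned` before purging:

```c
#include <stddef.h>
#include <stdbool.h>
#include <string.h>

// Hypothetical stand-in for the fields shown above (not the real mi_memid_t).
typedef struct memid_sketch_s {
  bool is_pinned;            // cannot decommit/reset/protect (e.g. large OS pages)
  bool initially_committed;  // memory was committed when it was handed out
  bool initially_zero;       // memory was zero-initialized when it was handed out
} memid_sketch_t;

// Clear the block only when the OS/arena did not already hand it out zeroed,
// and remember that purging is only allowed when the memory is not pinned.
static void use_block(void* p, size_t size, const memid_sketch_t* memid) {
  if (!memid->initially_zero) {
    memset(p, 0, size);
  }
  bool may_purge = !memid->is_pinned;
  (void)may_purge;  // ... use the block, purge later if allowed ...
}
```

The `init.c` and `segment.c` hunks further below follow exactly this pattern with the renamed fields.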

View file

@ -12,8 +12,8 @@ is a general purpose allocator with excellent [performance](#performance) charac
Initially developed by Daan Leijen for the runtime systems of the
[Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
Latest release tag: `v2.1.1` (2023-04-03).
Latest stable tag: `v1.8.1` (2023-04-03).
Latest release tag: `v2.1.2` (2023-04-24).
Latest stable tag: `v1.8.2` (2023-04-24).
mimalloc is a drop-in replacement for `malloc` and can be used in other programs
without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
@ -43,7 +43,7 @@ It also includes a robust way to override the default allocator in [Windows](#ov
and the chance of contending on a single location will be low -- this is quite
similar to randomized algorithms like skip lists where adding
a random oracle removes the need for a more complex algorithm.
- __eager page reset__: when a "page" becomes empty (with increased chance
- __eager page purging__: when a "page" becomes empty (with increased chance
due to free list sharding) the memory is marked to the OS as unused (reset or decommitted)
reducing (real) memory pressure and fragmentation, especially in long running
programs.
@ -78,6 +78,10 @@ Note: the `v2.x` version has a new algorithm for managing internal mimalloc page
and fragmentation compared to mimalloc `v1.x` (especially for large workloads). Should otherwise have similar performance
(see [below](#performance)); please report if you observe any significant performance regression.
* 2023-04-24, `v1.8.2`, `v2.1.2`: Fixes build issues on FreeBSD, musl, and C17 (UE 5.1.1). Reduced code size/complexity
by removing regions and segment caches, using only arenas with improved memory purging -- this may also improve memory
usage for larger services. Renamed options for consistency. Improved Valgrind and ASAN checking.
* 2023-04-03, `v1.8.1`, `v2.1.1`: Fixes build issues on some platforms.
* 2023-03-29, `v1.8.0`, `v2.1.0`: Improved support for dynamic overriding on Windows 11. Improved tracing precision
@ -105,20 +109,6 @@ Note: the `v2.x` version has a new algorithm for managing internal mimalloc page
improved wasm support, faster aligned allocation,
various small fixes.
* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including
M1), improved performance for v2 for large objects, Python integration improvements, more standard
installation directories, various small fixes.
* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix
thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes.
* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental).
* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages.
* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics,
improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes.
* [Older release notes](#older-release-notes)
Special thanks to:
@ -280,43 +270,48 @@ completely and redirect all calls to the _mimalloc_ library instead .
## Environment Options
You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)),
or via environment variables:
You can set further options either programmatically (using [`mi_option_set`](https://microsoft.github.io/mimalloc/group__options.html)), or via environment variables (a programmatic sketch follows the option list below):
- `MIMALLOC_SHOW_STATS=1`: show statistics when the program terminates.
- `MIMALLOC_VERBOSE=1`: show verbose messages.
- `MIMALLOC_SHOW_ERRORS=1`: show error and warning messages.
- `MIMALLOC_PAGE_RESET=0`: by default, mimalloc will reset (or purge) OS pages that are not in use, to signal to the OS
that the underlying physical memory can be reused. This can reduce memory fragmentation in long running (server)
programs. By setting it to `0` this will no longer be done which can improve performance for batch-like programs.
As an alternative, the `MIMALLOC_RESET_DELAY=`<msecs> can be set higher (100ms by default) to make the page
reset occur less frequently instead of turning it off completely.
Advanced options:
- `MIMALLOC_PURGE_DELAY=N`: the delay in `N` milli-seconds (by default `10`) after which mimalloc will purge
OS pages that are not in use. This signals to the OS that the underlying physical memory can be reused which
can reduce memory fragmentation especially in long running (server) programs. Setting `N` to `0` purges immediately when
a page becomes unused which can improve memory usage but also decreases performance. Setting `N` to a higher
value like `100` can improve performance (sometimes by a lot) at the cost of potentially using more memory at times.
Setting it to `-1` disables purging completely.
- `MIMALLOC_ARENA_EAGER_COMMIT=1`: turns on eager commit for the large arenas (usually 1GiB) from which mimalloc
allocates segments and pages. This is by default
only enabled on overcommit systems (e.g. Linux) but enabling it explicitly on other systems (like Windows or macOS)
may improve performance. Note that eager commit only increases the commit but not the actual peak resident set
(rss) so it is generally ok to enable this.
Further options for large workloads and services:
- `MIMALLOC_USE_NUMA_NODES=N`: pretend there are at most `N` NUMA nodes. If not set, the actual NUMA nodes are detected
at runtime. Setting `N` to 1 may avoid problems in some virtual environments. Also, setting it to a lower number than
the actual NUMA nodes is fine and will only cause threads to potentially allocate more memory across actual NUMA
nodes (but this can happen in any case, as NUMA-local allocation is always best effort and not guaranteed).
- `MIMALLOC_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly
- `MIMALLOC_ALLOW_LARGE_OS_PAGES=1`: use large OS pages (2MiB) when available; for some workloads this can significantly
improve performance. Use `MIMALLOC_VERBOSE` to check if the large OS pages are enabled -- usually one needs
to explicitly allow large OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]). However, sometimes
the OS is very slow to reserve contiguous physical memory for large OS pages so use with care on systems that
can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible).
<!--
- `MIMALLOC_EAGER_REGION_COMMIT=1`: on Windows, commit large (256MiB) regions eagerly. On Windows, these regions
show in the working set even though usually just a small part is committed to physical memory. This is why it is
turned off by default on Windows, as it does not look good in the task manager. However, turning it on has no
real drawbacks and may improve performance by a little.
-->
- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where N is the number of 1GiB _huge_ OS pages. This reserves the huge pages at
can have fragmented memory (for that reason, we generally recommend to use `MIMALLOC_RESERVE_HUGE_OS_PAGES` instead whenever possible).
- `MIMALLOC_RESERVE_HUGE_OS_PAGES=N`: where `N` is the number of 1GiB _huge_ OS pages. This reserves the huge pages at
startup and sometimes this can give a large (latency) performance improvement on big workloads.
Usually it is better to not use
`MIMALLOC_LARGE_OS_PAGES` in combination with this setting. Just like large OS pages, use with care as reserving
Usually it is better to not use `MIMALLOC_ALLOW_LARGE_OS_PAGES=1` in combination with this setting. Just like large
OS pages, use with care as reserving
contiguous physical memory can take a long time when memory is fragmented (but reserving the huge pages is done at
startup only once).
Note that we usually need to explicitly enable huge OS pages (as on [Windows][windows-huge] and [Linux][linux-huge]).
With huge OS pages, it may be beneficial to set the option
`MIMALLOC_EAGER_COMMIT_DELAY=N` (`N` is 1 by default) to delay the initial `N` segments (of 4MiB)
of a thread to not allocate in the huge OS pages; this prevents threads that are short lived
and allocate just a little to take up space in the huge OS page area (which cannot be reset).
and allocate just a little to take up space in the huge OS page area (which cannot be purged).
The huge pages are usually allocated evenly among NUMA nodes.
We can use `MIMALLOC_RESERVE_HUGE_OS_PAGES_AT=N` where `N` is the NUMA node (starting at 0) to allocate all
the huge pages at a specific NUMA node instead.
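For completeness, here is a hedged sketch of setting the same knobs programmatically via `mi_option_set`, as mentioned at the start of this section. The option enum names (`mi_option_show_stats`, `mi_option_purge_delay`) are assumed to mirror the environment variable names, and `mi_reserve_huge_os_pages_interleave` is the explicit API for huge-page reservation; double-check both against `mimalloc.h` for the version you build.

```c
#include <mimalloc.h>
#include <stdbool.h>

int main(void) {
  // MIMALLOC_SHOW_STATS=1: print statistics when the program terminates.
  mi_option_set_enabled(mi_option_show_stats, true);

  // MIMALLOC_PURGE_DELAY=100: purge unused OS pages less eagerly, trading a
  // bit of memory for throughput (enum name assumed to mirror the variable).
  mi_option_set(mi_option_purge_delay, 100);

  // MIMALLOC_RESERVE_HUGE_OS_PAGES=2 via the explicit call: reserve two 1GiB
  // huge pages, interleaved over the available NUMA nodes (0 = use all),
  // giving up after at most 10 seconds.
  mi_reserve_huge_os_pages_interleave(2, 0, 10000);

  void* p = mi_malloc(100);
  mi_free(p);
  return 0;
}
```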
@ -794,6 +789,16 @@ provided by the bot. You will only need to do this once across all repos using o
# Older Release Notes
* 2021-11-14, `v1.7.3`, `v2.0.3` (beta): improved WASM support, improved macOS support and performance (including
M1), improved performance for v2 for large objects, Python integration improvements, more standard
installation directories, various small fixes.
* 2021-06-17, `v1.7.2`, `v2.0.2` (beta): support M1, better installation layout on Linux, fix
thread_id on Android, prefer 2-6TiB area for aligned allocation to work better on pre-windows 8, various small fixes.
* 2021-04-06, `v1.7.1`, `v2.0.1` (beta): fix bug in arena allocation for huge pages, improved aslr on large allocations, initial M1 support (still experimental).
* 2021-01-31, `v2.0.0`: beta release 2.0: new slice algorithm for managing internal mimalloc pages.
* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics,
improve macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes.
* 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved
handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call.
* 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations,
@ -815,6 +820,7 @@ provided by the bot. You will only need to do this once across all repos using o
more eager concurrent free, addition of STL allocator, fixed potential memory leak.
* 2020-01-15, `v1.3.0`: stable release 1.3: bug fixes, improved randomness and [stronger
free list encoding](https://github.com/microsoft/mimalloc/blob/783e3377f79ee82af43a0793910a9f2d01ac7863/include/mimalloc-internal.h#L396) in secure mode.
* 2019-12-22, `v1.2.2`: stable release 1.2: minor updates.
* 2019-11-22, `v1.2.0`: stable release 1.2: bug fixes, improved secure mode (free list corruption checks, double free mitigation). Improved dynamic overriding on Windows.
* 2019-10-07, `v1.1.0`: stable release 1.1.

View file

@ -245,11 +245,13 @@ extern "C" {
int posix_memalign(void** p, size_t alignment, size_t size) { return mi_posix_memalign(p, alignment, size); }
// `aligned_alloc` is only available when __USE_ISOC11 is defined.
// Note: it seems __USE_ISOC11 is not defined in musl (and perhaps other libc's) so we only check
// for it if using glibc.
// Note: Conda has a custom glibc where `aligned_alloc` is declared `static inline` and we cannot
// override it, but both _ISOC11_SOURCE and __USE_ISOC11 are undefined in Conda GCC7 or GCC9.
// Fortunately, in the case where `aligned_alloc` is declared as `static inline` it
// uses internally `memalign`, `posix_memalign`, or `_aligned_malloc` so we can avoid overriding it ourselves.
#if __USE_ISOC11
#if !defined(__GLIBC__) || __USE_ISOC11
void* aligned_alloc(size_t alignment, size_t size) { return mi_aligned_alloc(alignment, size); }
#endif
#endif
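To make the effect of the changed guard concrete, a tiny stand-alone sketch (illustrative only; the real check is the one in the hunk above): on musl and other non-glibc libcs `__GLIBC__` is not defined, so the override is now always compiled, while on glibc it still depends on `__USE_ISOC11`.

```c
#include <stdio.h>

int main(void) {
  // Same guard shape as in the hunk above; an undefined macro evaluates to 0 in #if.
#if !defined(__GLIBC__) || __USE_ISOC11
  puts("aligned_alloc override would be compiled in");
#else
  puts("aligned_alloc override would be left out (glibc without __USE_ISOC11)");
#endif
  return 0;
}
```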

View file

@ -237,33 +237,33 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar
}
// set the dirty bits (todo: no need for an atomic op here?)
if (arena->memid.was_zero && arena->blocks_dirty != NULL) {
memid->was_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
if (arena->memid.initially_zero && arena->blocks_dirty != NULL) {
memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL);
}
// set commit state
if (arena->blocks_committed == NULL) {
// always committed
memid->was_committed = true;
memid->initially_committed = true;
}
else if (commit) {
// commit requested, but the range may not be committed as a whole: ensure it is committed now
memid->was_committed = true;
memid->initially_committed = true;
bool any_uncommitted;
_mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted);
if (any_uncommitted) {
bool commit_zero = false;
if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) {
memid->was_committed = false;
memid->initially_committed = false;
}
else {
if (commit_zero) { memid->was_zero = true; }
if (commit_zero) { memid->initially_zero = true; }
}
}
}
else {
// no need to commit, but check if already fully committed
memid->was_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index);
}
return p;
@ -752,7 +752,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
if (size < MI_ARENA_BLOCK_SIZE) return false;
if (is_large) {
mi_assert_internal(memid.was_committed && memid.is_pinned);
mi_assert_internal(memid.initially_committed && memid.is_pinned);
}
const size_t bcount = size / MI_ARENA_BLOCK_SIZE;
@ -781,7 +781,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[2*fields]); // just after dirty bitmap
arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after committed bitmap
// initialize committed bitmap?
if (arena->blocks_committed != NULL && arena->memid.was_committed) {
if (arena->blocks_committed != NULL && arena->memid.initially_committed) {
memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning
}
@ -799,8 +799,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int
bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept {
mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL);
memid.was_committed = is_committed;
memid.was_zero = is_zero;
memid.initially_committed = is_committed;
memid.initially_zero = is_zero;
memid.is_pinned = is_large;
return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id);
}
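`mi_manage_os_memory_ex` above is the low-level entry point for memory the embedder allocated itself. For most users the simpler public call `mi_reserve_os_memory` achieves the same end result (an extra arena) by letting mimalloc reserve the OS region itself; a hedged usage sketch (the 256MiB size is just an example):

```c
#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // Reserve a 256 MiB arena up front: committed eagerly, no large OS pages required.
  // Returns 0 on success, an error code otherwise.
  if (mi_reserve_os_memory(256 * 1024 * 1024, /*commit*/ true, /*allow_large*/ false) != 0) {
    fprintf(stderr, "arena reservation failed\n");
  }
  void* p = mi_malloc(1024);   // subsequent allocations can be served from the arena
  mi_free(p);
  return 0;
}
```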

View file

@ -244,7 +244,7 @@ static mi_thread_data_t* mi_thread_data_zalloc(void) {
}
if (td != NULL) {
td->memid = memid;
is_zero = memid.was_zero;
is_zero = memid.initially_zero;
}
}

View file

@ -387,9 +387,11 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) {
}
int _mi_prim_reset(void* start, size_t size) {
// We always use MADV_DONTNEED if possible even if it may be a bit more expensive as MADV_FREE
// as this guarantees that we see the actual rss reflected in tools like `top`.
#if 0 && defined(MADV_FREE)
// We try to use `MADV_FREE` as that is the fastest. A drawback though is that it
// will not reduce the `rss` stats in tools like `top` even though the memory is available
// to other processes. With the default `MIMALLOC_PURGE_DECOMMITS=1` we ensure that by
// default `MADV_DONTNEED` is used though.
#if defined(MADV_FREE)
static _Atomic(size_t) advice = MI_ATOMIC_VAR_INIT(MADV_FREE);
int oadvice = (int)mi_atomic_load_relaxed(&advice);
int err;
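The hunk is cut off before the fallback logic, so here is a self-contained sketch of the general pattern (not the verbatim mimalloc code): try `MADV_FREE` first and, if the kernel rejects it (older kernels return `EINVAL`), switch permanently to `MADV_DONTNEED` via the atomic `advice` variable.

```c
#define _DEFAULT_SOURCE   // expose madvise() on glibc
#include <errno.h>
#include <stdatomic.h>
#include <stddef.h>
#include <sys/mman.h>

static int os_reset(void* start, size_t size) {
#if defined(MADV_FREE)
  static _Atomic int advice = MADV_FREE;
  int oadvice = atomic_load_explicit(&advice, memory_order_relaxed);
  int err = madvise(start, size, oadvice);
  if (err != 0 && errno == EINVAL && oadvice == MADV_FREE) {
    // MADV_FREE not supported by this kernel: fall back to MADV_DONTNEED for good.
    atomic_store_explicit(&advice, MADV_DONTNEED, memory_order_relaxed);
    err = madvise(start, size, MADV_DONTNEED);
  }
  return err;
#else
  return madvise(start, size, MADV_DONTNEED);
#endif
}
```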

View file

@ -285,9 +285,9 @@ int _mi_prim_decommit(void* addr, size_t size, bool* needs_recommit) {
int _mi_prim_reset(void* addr, size_t size) {
void* p = VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
mi_assert_internal(p == addr);
#if 1
#if 0
if (p != NULL) {
VirtualUnlock(addr,size); // VirtualUnlock after MEM_RESET removes the memory from the working set
VirtualUnlock(addr,size); // VirtualUnlock after MEM_RESET removes the memory directly from the working set
}
#endif
return (p != NULL ? 0 : (int)GetLastError());
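For context, a hedged sketch of the Windows reset path as a whole (illustrative, not the exact mimalloc code): `MEM_RESET` tells the OS the contents are no longer needed while keeping the range committed, and the now-disabled `VirtualUnlock` call additionally dropped the pages from the working set (the call "fails" on unlocked pages by design, but the side effect is documented).

```c
#include <windows.h>

static int os_reset_win(void* addr, size_t size, BOOL drop_from_working_set) {
  // Mark the contents as no longer needed; the range stays committed and usable.
  void* p = VirtualAlloc(addr, size, MEM_RESET, PAGE_READWRITE);
  if (p == NULL) return (int)GetLastError();
  if (drop_from_working_set) {
    // Returns FALSE for unlocked pages, but still removes them from the working set.
    VirtualUnlock(addr, size);
  }
  return 0;
}
```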

View file

@ -824,7 +824,7 @@ static mi_segment_t* mi_segment_os_alloc( size_t required, size_t page_alignment
// ensure metadata part of the segment is committed
mi_commit_mask_t commit_mask;
if (memid.was_committed) {
if (memid.initially_committed) {
mi_commit_mask_create_full(&commit_mask);
}
else {
@ -878,7 +878,7 @@ static mi_segment_t* mi_segment_alloc(size_t required, size_t page_alignment, mi
if (segment == NULL) return NULL;
// zero the segment info? -- not always needed as it may be zero initialized from the OS
if (!segment->memid.was_zero) {
if (!segment->memid.initially_zero) {
ptrdiff_t ofs = offsetof(mi_segment_t, next);
size_t prefix = offsetof(mi_segment_t, slices) - ofs;
size_t zsize = prefix + (sizeof(mi_slice_t) * (segment_slices + 1)); // one more
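The hunk ends mid-computation, but the pattern it implements is simple: when the backing memory is already known to be zero (`initially_zero`), the segment metadata needs no clearing at all; otherwise only the prefix starting at the `next` field is cleared. A hedged, self-contained sketch with a hypothetical stand-in struct (not the real `mi_segment_t`):

```c
#include <stddef.h>
#include <string.h>

typedef struct seg_sketch_s {   // hypothetical stand-in for the segment header
  int   initially_zero;         // whether the backing memory started out zeroed
  void* next;                   // first field that may still need zeroing
  char  slices[64];             // metadata that follows the header fields
} seg_sketch_t;

static void init_segment_metadata(seg_sketch_t* segment) {
  if (!segment->initially_zero) {
    // Zero only from `next` to the end of the metadata, skipping fields that
    // were already set earlier (such as the memid in the real code).
    ptrdiff_t ofs = offsetof(seg_sketch_t, next);
    size_t zsize  = sizeof(seg_sketch_t) - (size_t)ofs;
    memset((char*)segment + ofs, 0, zsize);
  }
}
```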