From 71cfa45e76415343e9d83b483f62d1c44cb821cc Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 28 Nov 2024 19:31:04 -0800 Subject: [PATCH 001/264] wip: initial work on mimalloc3 without segments --- ide/vs2022/mimalloc.vcxproj | 6 +- ide/vs2022/mimalloc.vcxproj.filters | 15 +- include/mimalloc/bits.h | 313 +++++ include/mimalloc/internal.h | 154 +-- include/mimalloc/prim.h | 3 - include/mimalloc/types.h | 61 +- src/bitmap.c | 13 +- src/init.c | 22 +- src/libc.c | 67 +- src/os.c | 12 + src/page-queue.c | 7 +- src/xarena.c | 1777 +++++++++++++++++++++++++++ src/xbitmap.c | 599 +++++++++ src/xbitmap.h | 94 ++ test/main-override-static.c | 147 ++- 15 files changed, 3001 insertions(+), 289 deletions(-) create mode 100644 include/mimalloc/bits.h create mode 100644 src/xarena.c create mode 100644 src/xbitmap.c create mode 100644 src/xbitmap.h diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index dddab777..138acf39 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -120,6 +120,7 @@ CompileAsCpp false stdcpp20 + AdvancedVectorExtensions2 @@ -219,7 +220,6 @@ true true - false @@ -252,17 +252,21 @@ + + + + diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 54ee0fcb..48958be1 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -13,9 +13,6 @@ Sources - - Sources - Sources @@ -64,6 +61,12 @@ Sources + + Sources + + + Sources + @@ -93,6 +96,12 @@ Headers + + Headers + + + Headers + diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h new file mode 100644 index 00000000..642f0f9c --- /dev/null +++ b/include/mimalloc/bits.h @@ -0,0 +1,313 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + Bit operation, and platform dependent definition (MI_INTPTR_SIZE etc) +---------------------------------------------------------------------------- */ + +#pragma once +#ifndef MI_BITS_H +#define MI_BITS_H + + +// ------------------------------------------------------ +// Size of a pointer. +// We assume that `sizeof(void*)==sizeof(intptr_t)` +// and it holds for all platforms we know of. +// +// However, the C standard only requires that: +// p == (void*)((intptr_t)p)) +// but we also need: +// i == (intptr_t)((void*)i) +// or otherwise one might define an intptr_t type that is larger than a pointer... 
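
As an aside, the round-trip assumption spelled out above can be pinned down with a C11 static assertion; a minimal sketch (not part of this patch):

    #include <stdint.h>

    // The code assumes a pointer and an intptr_t have the same size, so that
    // both p == (void*)((intptr_t)p) and i == (intptr_t)((void*)i) can hold.
    _Static_assert(sizeof(void*) == sizeof(intptr_t),
                   "mimalloc assumes sizeof(void*) == sizeof(intptr_t)");
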
+// ------------------------------------------------------ + +#if INTPTR_MAX > INT64_MAX +# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) +#elif INTPTR_MAX == INT64_MAX +# define MI_INTPTR_SHIFT (3) +#elif INTPTR_MAX == INT32_MAX +# define MI_INTPTR_SHIFT (2) +#else +#error platform pointers must be 32, 64, or 128 bits +#endif + +#if SIZE_MAX == UINT64_MAX +# define MI_SIZE_SHIFT (3) +typedef int64_t mi_ssize_t; +#elif SIZE_MAX == UINT32_MAX +# define MI_SIZE_SHIFT (2) +typedef int32_t mi_ssize_t; +#else +#error platform objects must be 32 or 64 bits +#endif + +#if (SIZE_MAX/2) > LONG_MAX +# define MI_ZU(x) x##ULL +# define MI_ZI(x) x##LL +#else +# define MI_ZU(x) x##UL +# define MI_ZI(x) x##L +#endif + +#define MI_INTPTR_SIZE (1< +#endif +#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) +#include +#endif + +#if defined(__AVX2__) && !defined(__BMI2__) // msvc +#define __BMI2__ 1 +#endif +#if (defined(__AVX2__) || defined(__BMI2__)) && !defined(__BMI1__) // msvc +#define __BMI1__ 1 +#endif + +/* -------------------------------------------------------------------------------- + Builtin's +-------------------------------------------------------------------------------- */ + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#define mi_builtin(name) __builtin_##name +#define mi_has_builtin(name) __has_builtin(__builtin_##name) + +#if (LONG_MAX == INT32_MAX) +#define mi_builtin32(name) mi_builtin(name##l) +#define mi_has_builtin32(name) mi_has_builtin(name##l) +#else +#define mi_builtin32(name) mi_builtin(name) +#define mi_has_builtin32(name) mi_has_builtin(name) +#endif +#if (LONG_MAX == INT64_MAX) +#define mi_builtin64(name) mi_builtin(name##l) +#define mi_has_builtin64(name) mi_has_builtin(name##l) +#else +#define mi_builtin64(name) mi_builtin(name##ll) +#define mi_has_builtin64(name) mi_has_builtin(name##ll) +#endif + +#if (MI_SIZE_BITS == 32) +#define mi_builtin_size(name) mi_builtin32(name) +#define mi_has_builtin_size(name) mi_has_builtin32(name) +#elif (MI_SIZE_BITS == 64) +#define mi_builtin_size(name) mi_builtin64(name) +#define mi_has_builtin_size(name) mi_has_builtin64(name) +#endif + + +/* -------------------------------------------------------------------------------- + Count trailing/leading zero's +-------------------------------------------------------------------------------- */ + +size_t _mi_clz_generic(size_t x); +size_t _mi_ctz_generic(size_t x); + +static inline size_t mi_ctz(size_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + uint64_t r; + __asm volatile ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + #elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_tzcnt_u64(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + #if MI_SIZE_BITS==32 + return (_BitScanForward(&idx, x) ? (size_t)idx : 32); + #else + return (_BitScanForward64(&idx, x) ? (size_t)idx : 64); + #endif + #elif mi_has_builtin_size(ctz) + return (x!=0 ? 
(size_t)mi_builtin_size(ctz)(x) : MI_SIZE_BITS); + #else + #define MI_HAS_FAST_BITSCAN 0 + return _mi_ctz_generic(x); + #endif +} + +static inline size_t mi_clz(size_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + uint64_t r; + __asm volatile ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + #elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_lzcnt_u64(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + #if MI_SIZE_BITS==32 + return (_BitScanReverse(&idx, x) ? 31 - (size_t)idx : 32); + #else + return (_BitScanReverse64(&idx, x) ? 63 - (size_t)idx : 64); + #endif + #elif mi_has_builtin_size(clz) + return (x!=0 ? (size_t)mi_builtin_size(clz)(x) : MI_SIZE_BITS); + #else + #define MI_HAS_FAST_BITSCAN 0 + return _mi_clz_generic(x); + #endif +} + +#ifndef MI_HAS_FAST_BITSCAN +#define MI_HAS_FAST_BITSCAN 1 +#endif + +/* -------------------------------------------------------------------------------- + find trailing/leading zero (bit scan forward/reverse) +-------------------------------------------------------------------------------- */ + +// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bsf(size_t x, size_t* idx) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + // on x64 the carry flag is set on zero which gives better codegen + bool is_zero; + __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); + return !is_zero; + #else + *idx = mi_ctz(x); + return (x!=0); + #endif +} + +// Bit scan reverse: find the most significant bit that is set +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bsr(size_t x, size_t* idx) { + #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + #if MI_SIZE_BITS==32 + return (_BitScanReverse(&i, x) ? (*idx = i, true) : false); + #else + return (_BitScanReverse64(&i, x) ? (*idx = i, true) : false); + #endif + #else + const size_t r = mi_clz(x); + *idx = (~r & (MI_SIZE_BITS - 1)); + return (x!=0); + #endif +} + + +/* -------------------------------------------------------------------------------- + find least/most significant bit position +-------------------------------------------------------------------------------- */ + +// Find most significant bit index, or MI_SIZE_BITS if 0 +static inline size_t mi_find_msb(size_t x) { + #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + #if MI_SIZE_BITS==32 + return (_BitScanReverse(&i, x) ? i : 32); + #else + return (_BitScanReverse64(&i, x) ? i : 64); + #endif + #else + return (x==0 ? 
MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x)); + #endif +} + +// Find least significant bit index, or MI_SIZE_BITS if 0 (this equals `mi_ctz`, count trailing zero's) +static inline size_t mi_find_lsb(size_t x) { + return mi_ctz(x); +} + + +/* -------------------------------------------------------------------------------- + rotate +-------------------------------------------------------------------------------- */ + +static inline size_t mi_rotr(size_t x, size_t r) { + #if (mi_has_builtin(rotateright64) && MI_SIZE_BITS==64) + return mi_builtin(rotateright64)(x,r); + #elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32) + return mi_builtin(rotateright32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if MI_BFIELD_SIZE==4 + return _lrotr(x,(int)r); + #else + return _rotr64(x,(int)r); + #endif + #else + // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); + return (x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1))); + #endif +} + +static inline size_t mi_rotl(size_t x, size_t r) { + #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64) + return mi_builtin(rotateleft64)(x,r); + #elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32) + return mi_builtin(rotateleft32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if MI_SIZE_BITS==32 + return _lrotl(x,(int)r); + #else + return _rotl64(x,(int)r); + #endif + #else + // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); + return (x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1))) + #endif +} + +#endif // MI_BITS_H diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 716386d2..b997099e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -16,6 +16,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "types.h" #include "track.h" +#include "bits.h" #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) @@ -23,26 +24,28 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_trace_message(...) 
#endif -#define MI_CACHE_LINE 64 #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) -#define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) +#define mi_decl_align(a) __declspec(align(a)) #define mi_decl_weak #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread -#define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) +#define mi_decl_align(a) __attribute__((aligned(a))) #define mi_decl_weak __attribute__((weak)) #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) -#define mi_decl_cache_align +#define mi_decl_align(a) #define mi_decl_weak #endif +#define mi_decl_cache_align mi_decl_align(64) + + #if defined(__EMSCRIPTEN__) && !defined(__wasi__) #define __wasi__ #endif @@ -89,6 +92,7 @@ void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; +size_t _mi_thread_seq_id(void) mi_attr_noexcept; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); void _mi_heap_guarded_init(mi_heap_t* heap); @@ -96,6 +100,7 @@ void _mi_heap_guarded_init(mi_heap_t* heap); // os.c void _mi_os_init(void); // called from process init void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); @@ -675,15 +680,6 @@ static inline bool mi_is_in_same_page(const void* p, const void* q) { return (idxp == idxq); } -static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) { - shift %= MI_INTPTR_BITS; - return (shift==0 ? x : ((x << shift) | (x >> (MI_INTPTR_BITS - shift)))); -} -static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) { - shift %= MI_INTPTR_BITS; - return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift)))); -} - static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]); return (p==null ? 
NULL : p); @@ -821,112 +817,6 @@ static inline size_t _mi_os_numa_node_count(void) { } - -// ----------------------------------------------------------------------- -// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero) -// ----------------------------------------------------------------------- - -#if defined(__GNUC__) - -#include // LONG_MAX -#define MI_HAVE_FAST_BITSCAN -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (INTPTR_MAX == LONG_MAX) - return __builtin_clzl(x); -#else - return __builtin_clzll(x); -#endif -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (INTPTR_MAX == LONG_MAX) - return __builtin_ctzl(x); -#else - return __builtin_ctzll(x); -#endif -} - -#elif defined(_MSC_VER) - -#include // LONG_MAX -#include // BitScanReverse64 -#define MI_HAVE_FAST_BITSCAN -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; - unsigned long idx; -#if (INTPTR_MAX == LONG_MAX) - _BitScanReverse(&idx, x); -#else - _BitScanReverse64(&idx, x); -#endif - return ((MI_INTPTR_BITS - 1) - idx); -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; - unsigned long idx; -#if (INTPTR_MAX == LONG_MAX) - _BitScanForward(&idx, x); -#else - _BitScanForward64(&idx, x); -#endif - return idx; -} - -#else -static inline size_t mi_ctz32(uint32_t x) { - // de Bruijn multiplication, see - static const unsigned char debruijn[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, - 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 - }; - if (x==0) return 32; - return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; -} -static inline size_t mi_clz32(uint32_t x) { - // de Bruijn multiplication, see - static const uint8_t debruijn[32] = { - 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, - 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 - }; - if (x==0) return 32; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; -} - -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_clz32((uint32_t)x); -#else - size_t count = mi_clz32((uint32_t)(x >> 32)); - if (count < 32) return count; - return (32 + mi_clz32((uint32_t)x)); -#endif -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_ctz32((uint32_t)x); -#else - size_t count = mi_ctz32((uint32_t)x); - if (count < 32) return count; - return (32 + mi_ctz32((uint32_t)(x>>32))); -#endif -} - -#endif - -// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero) -static inline size_t mi_bsr(uintptr_t x) { - return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x)); -} - - // --------------------------------------------------------------------------------- // Provide our own `_mi_memcpy` for potential performance optimizations. 
// @@ -947,20 +837,20 @@ static inline void _mi_memcpy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } } -static inline void _mi_memzero(void* dst, size_t n) { +static inline void _mi_memset(void* dst, int val, size_t n) { if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { - __stosb((unsigned char*)dst, 0, n); + __stosb((unsigned char*)dst, (uint8_t)val, n); } else { - memset(dst, 0, n); + memset(dst, val, n); } } #else static inline void _mi_memcpy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } -static inline void _mi_memzero(void* dst, size_t n) { - memset(dst, 0, n); +static inline void _mi_memset(void* dst, int val, size_t n) { + memset(dst, val, n); } #endif @@ -978,10 +868,10 @@ static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { _mi_memcpy(adst, asrc, n); } -static inline void _mi_memzero_aligned(void* dst, size_t n) { +static inline void _mi_memset_aligned(void* dst, int val, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); - _mi_memzero(adst, n); + _mi_memset(adst, val, n); } #else // Default fallback on `_mi_memcpy` @@ -990,11 +880,19 @@ static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { _mi_memcpy(dst, src, n); } -static inline void _mi_memzero_aligned(void* dst, size_t n) { +static inline void _mi_memset_aligned(void* dst, int val, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); - _mi_memzero(dst, n); + _mi_memset(dst, val, n); } #endif +static inline void _mi_memzero(void* dst, size_t n) { + _mi_memset(dst, 0, n); +} + +static inline void _mi_memzero_aligned(void* dst, size_t n) { + _mi_memset_aligned(dst, 0, n); +} + #endif diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 56715df4..8a627438 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -369,7 +369,4 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { #endif // mi_prim_get_default_heap() - - - #endif // MIMALLOC_PRIM_H diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 44074450..e8705991 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -23,6 +23,7 @@ terms of the MIT license. A copy of the license can be found in the file #include // ptrdiff_t #include // uintptr_t, uint16_t, etc +#include "bits.h" // bit ops, size defines #include "atomic.h" // _Atomic #ifdef _MSC_VER @@ -106,61 +107,6 @@ terms of the MIT license. A copy of the license can be found in the file // #define MI_HUGE_PAGE_ABANDON 1 -// ------------------------------------------------------ -// Platform specific values -// ------------------------------------------------------ - -// ------------------------------------------------------ -// Size of a pointer. -// We assume that `sizeof(void*)==sizeof(intptr_t)` -// and it holds for all platforms we know of. -// -// However, the C standard only requires that: -// p == (void*)((intptr_t)p)) -// but we also need: -// i == (intptr_t)((void*)i) -// or otherwise one might define an intptr_t type that is larger than a pointer... 
-// ------------------------------------------------------ - -#if INTPTR_MAX > INT64_MAX -# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) -#elif INTPTR_MAX == INT64_MAX -# define MI_INTPTR_SHIFT (3) -#elif INTPTR_MAX == INT32_MAX -# define MI_INTPTR_SHIFT (2) -#else -#error platform pointers must be 32, 64, or 128 bits -#endif - -#if SIZE_MAX == UINT64_MAX -# define MI_SIZE_SHIFT (3) -typedef int64_t mi_ssize_t; -#elif SIZE_MAX == UINT32_MAX -# define MI_SIZE_SHIFT (2) -typedef int32_t mi_ssize_t; -#else -#error platform objects must be 32 or 64 bits -#endif - -#if (SIZE_MAX/2) > LONG_MAX -# define MI_ZU(x) x##ULL -# define MI_ZI(x) x##LL -#else -# define MI_ZU(x) x##UL -# define MI_ZI(x) x##L -#endif - -#define MI_INTPTR_SIZE (1<= 655360) #error "mimalloc internal: define more bins" @@ -461,8 +410,6 @@ typedef struct mi_page_queue_s { size_t block_size; } mi_page_queue_t; -#define MI_BIN_FULL (MI_BIN_HUGE+1) - // Random context typedef struct mi_random_cxt_s { uint32_t input[16]; diff --git a/src/bitmap.c b/src/bitmap.c index 976ba72c..3e6311dc 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -18,6 +18,7 @@ between the fields. (This is used in arena allocation) #include "mimalloc.h" #include "mimalloc/internal.h" +#include "mimalloc/bits.h" #include "bitmap.h" /* ----------------------------------------------------------- @@ -53,7 +54,7 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ const size_t mask = mi_bitmap_mask_(count, 0); const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; -#ifdef MI_HAVE_FAST_BITSCAN +#if MI_HAS_FAST_BITSCAN size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible #else size_t bitidx = 0; // otherwise start at 0 @@ -79,7 +80,7 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ } else { // on to the next bit range -#ifdef MI_HAVE_FAST_BITSCAN +#if MI_HAS_FAST_BITSCAN mi_assert_internal(mapm != 0); const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); mi_assert_internal(shift > 0 && shift <= count); @@ -146,7 +147,7 @@ static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size return ((field & mask) == mask); } -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. +// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. // Returns `true` if successful when all previous `count` bits were 0. 
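
For readers new to this pattern, here is a self-contained sketch of the same claim-by-CAS idea on a single bit field, written with plain C11 atomics instead of the mimalloc wrappers (illustrative names only; the patch's `_mi_bitmap_try_claim` follows below and is essentially this loop expressed with `mi_atomic_cas_strong_acq_rel` and `mi_bitmap_mask_`):

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stddef.h>

    #define FIELD_BITS (8*sizeof(size_t))

    // Mask of `count` consecutive 1-bits starting at `bitidx`
    // (requires 1 <= count and count + bitidx <= FIELD_BITS).
    static size_t bit_mask(size_t count, size_t bitidx) {
      return (count >= FIELD_BITS ? ~(size_t)0 : (((size_t)1 << count) - 1)) << bitidx;
    }

    // Atomically flip the masked bits from 0 to 1; fail if any of them was already set.
    static bool try_claim_bits(_Atomic(size_t)* field, size_t bitidx, size_t count) {
      const size_t mask = bit_mask(count, bitidx);
      size_t expected = atomic_load_explicit(field, memory_order_relaxed);
      do {
        if ((expected & mask) != 0) return false;  // someone else holds part of the range
      } while (!atomic_compare_exchange_weak_explicit(field, &expected, expected | mask,
                                                      memory_order_acq_rel, memory_order_relaxed));
      return true;
    }
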
bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { const size_t idx = mi_bitmap_index_field(bitmap_idx); @@ -154,9 +155,9 @@ bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count const size_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); - do { + do { if ((expected & mask) != 0) return false; - } + } while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); mi_assert_internal((expected & mask) == 0); return true; @@ -194,7 +195,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit if (initial == 0) return false; if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries - + // scan ahead size_t found = initial; size_t mask = 0; // mask bits for the final field diff --git a/src/init.c b/src/init.c index a90818a4..2544f097 100644 --- a/src/init.c +++ b/src/init.c @@ -124,6 +124,18 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } +// Thread sequence number +static _Atomic(size_t) mi_tcount; +static mi_decl_thread size_t mi_tseq; + +size_t _mi_thread_seq_id(void) mi_attr_noexcept { + size_t tseq = mi_tseq; + if (tseq == 0) { + mi_tseq = tseq = mi_atomic_add_acq_rel(&mi_tcount,1); + } + return tseq; +} + // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; @@ -169,8 +181,8 @@ mi_stats_t _mi_stats_main = { MI_STATS_NULL }; #if MI_GUARDED mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { heap->guarded_sample_seed = seed; - if (heap->guarded_sample_seed == 0) { - heap->guarded_sample_seed = _mi_heap_random_next(heap); + if (heap->guarded_sample_seed == 0) { + heap->guarded_sample_seed = _mi_heap_random_next(heap); } heap->guarded_sample_rate = sample_rate; if (heap->guarded_sample_rate >= 1) { @@ -188,9 +200,9 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { mi_heap_guarded_set_sample_rate(heap, (size_t)mi_option_get_clamp(mi_option_guarded_sample_rate, 0, LONG_MAX), (size_t)mi_option_get(mi_option_guarded_sample_seed)); - mi_heap_guarded_set_size_bound(heap, + mi_heap_guarded_set_size_bound(heap, (size_t)mi_option_get_clamp(mi_option_guarded_min, 0, LONG_MAX), - (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); + (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); } #else mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { @@ -602,7 +614,7 @@ static void mi_detect_cpu_features(void) { } #else static void mi_detect_cpu_features(void) { - // nothing + // nothing } #endif diff --git a/src/libc.c b/src/libc.c index ce541f1b..05ed7b02 100644 --- a/src/libc.c +++ b/src/libc.c @@ -7,7 +7,7 @@ terms of the MIT license. 
A copy of the license can be found in the file // -------------------------------------------------------- // This module defines various std libc functions to reduce -// the dependency on libc, and also prevent errors caused +// the dependency on libc, and also prevent errors caused // by some libc implementations when called before `main` // executes (due to malloc redirection) // -------------------------------------------------------- @@ -83,7 +83,7 @@ bool _mi_getenv(const char* name, char* result, size_t result_size) { // Define our own limited `_mi_vsnprintf` and `_mi_snprintf` // This is mostly to avoid calling these when libc is not yet // initialized (and to reduce dependencies) -// +// // format: d i, p x u, s // prec: z l ll L // width: 10 @@ -130,7 +130,7 @@ static void mi_out_alignright(char fill, char* start, size_t len, size_t extra, } -static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) +static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) { if (x == 0 || base == 0 || base > 16) { if (prefix != 0) { mi_outc(prefix, out, end); } @@ -144,8 +144,8 @@ static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* mi_outc((digit <= 9 ? '0' + digit : 'A' + digit - 10),out,end); x = x / base; } - if (prefix != 0) { - mi_outc(prefix, out, end); + if (prefix != 0) { + mi_outc(prefix, out, end); } size_t len = *out - start; // and reverse in-place @@ -181,7 +181,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { size_t width = 0; char numtype = 'd'; char numplus = 0; - bool alignright = true; + bool alignright = true; if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); } if (c == '-') { alignright = false; MI_NEXTC(); } if (c == '0') { fill = '0'; MI_NEXTC(); } @@ -191,7 +191,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { width = (10 * width) + (c - '0'); MI_NEXTC(); } if (c == 0) break; // extra check due to while - } + } if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); } else if (c == 'l') { numtype = c; MI_NEXTC(); @@ -273,3 +273,56 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) 
{ _mi_vsnprintf(buf, buflen, fmt, args); va_end(args); } + + + +// -------------------------------------------------------- +// generic trailing and leading zero count +// -------------------------------------------------------- + +static inline size_t mi_ctz_generic32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; + if (x==0) return 32; + return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; +} + +static inline size_t mi_clz_generic32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, + 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 + }; + if (x==0) return 32; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; +} + +size_t _mi_clz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_clz_generic32((uint32_t)x); + #else + const size_t count = mi_clz_generic32((uint32_t)(x >> 32)); + if (count < 32) return count; + return (32 + mi_clz_generic32((uint32_t)x)); + #endif +} + +size_t _mi_ctz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_ctz_generic32((uint32_t)x); + #else + const size_t count = mi_ctz_generic32((uint32_t)x); + if (count < 32) return count; + return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + #endif +} diff --git a/src/os.c b/src/os.c index a7130b90..36b167cb 100644 --- a/src/os.c +++ b/src/os.c @@ -359,6 +359,18 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo return p; } +void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { + void* p = _mi_os_alloc(size, memid, &_mi_stats_main); + if (p == NULL) return NULL; + + // zero the OS memory if needed + if (!memid->initially_zero) { + _mi_memzero_aligned(p, size); + memid->initially_zero = true; + } + return p; +} + /* ----------------------------------------------------------- OS aligned allocation with an offset. This is used for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc diff --git a/src/page-queue.c b/src/page-queue.c index 9796f3dc..0a791adb 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -83,9 +83,10 @@ static inline uint8_t mi_bin(size_t size) { #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif - wsize--; - // find the highest bit - uint8_t b = (uint8_t)mi_bsr(wsize); // note: wsize != 0 + wsize--; + mi_assert_internal(wsize!=0); + // find the highest bit position + uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin diff --git a/src/xarena.c b/src/xarena.c new file mode 100644 index 00000000..42943f84 --- /dev/null +++ b/src/xarena.c @@ -0,0 +1,1777 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +"Arenas" are fixed area's of OS memory from which we can allocate +large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB). +In contrast to the rest of mimalloc, the arenas are shared between +threads and need to be accessed using atomic operations. + +Arenas are also used to for huge OS page (1GiB) reservations or for reserving +OS memory upfront which can be improve performance or is sometimes needed +on embedded devices. We can also employ this with WASI or `sbrk` systems +to reserve large arenas upfront and be able to reuse the memory more effectively. + +The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "xbitmap.h" + + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + +#define MI_ARENA_BLOCK_SIZE (MI_SMALL_PAGE_SIZE) // 64KiB +#define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) // 64KiB +#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) + +#define MI_ARENA_MIN_OBJ_SIZE MI_ARENA_BLOCK_SIZE +#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_CHUNK_BITS * MI_ARENA_BLOCK_SIZE) // for now, cannot cross chunk boundaries + +// A memory arena descriptor +typedef struct mi_arena_s { + mi_arena_id_t id; // arena id; 0 for non-specific + mi_memid_t memid; // memid of the memory area + // _Atomic(uint8_t*) start; // the start of the memory area + // size_t meta_size; // size of the arena structure itself (including its bitmaps) + // mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + int numa_node; // associated NUMA node + bool exclusive; // only allow allocations if specifically for this arena + bool is_large; // memory area consists of large- or huge OS pages (always committed) + mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + + mi_bitmap_t blocks_free; // is the block free? + mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) + mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free) + mi_bitmap_t blocks_dirty; // is the block potentially non-zero? + mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + // the full queue contains abandoned full pages +} mi_arena_t; + +#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) + +// The available arenas +static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; +static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 + + +/* ----------------------------------------------------------- + Arena id's + id = arena_index + 1 +----------------------------------------------------------- */ + +size_t mi_arena_id_index(mi_arena_id_t id) { + return (size_t)(id <= 0 ? 
MI_MAX_ARENAS : id - 1); +} + +static mi_arena_id_t mi_arena_id_create(size_t arena_index) { + mi_assert_internal(arena_index < MI_MAX_ARENAS); + return (int)arena_index + 1; +} + +mi_arena_id_t _mi_arena_id_none(void) { + return 0; +} + +static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { + return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || + (arena_id == req_arena_id)); +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + if (memid.memkind == MI_MEM_ARENA) { + return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + } + else { + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + } +} + +size_t mi_arena_get_count(void) { + return mi_atomic_load_relaxed(&mi_arena_count); +} + +mi_arena_t* mi_arena_from_index(size_t idx) { + mi_assert_internal(idx < mi_arena_get_count()); + return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); +} + + + +/* ----------------------------------------------------------- + Util +----------------------------------------------------------- */ + +// Blocks needed for a given byte size +static size_t mi_block_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); +} + +// Byte size of a number of blocks +static size_t mi_size_of_blocks(size_t bcount) { + return (bcount * MI_ARENA_BLOCK_SIZE); +} + +// Size of an arena +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_size_of_blocks(arena->block_count); +} + +static size_t mi_arena_info_blocks(void) { + const size_t os_page_size = _mi_os_page_size(); + const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page + const size_t info_blocks = mi_block_count_of_size(info_size); + return info_blocks; +} + + +// Start of the arena memory area +static uint8_t* mi_arena_start(mi_arena_t* arena) { + return ((uint8_t*)arena); +} + +// Start of a block +void* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { + return (mi_arena_start(arena) + mi_size_of_blocks(block_index)); +} + +// Arena area +void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { + if (size != NULL) *size = 0; + const size_t arena_index = mi_arena_id_index(arena_id); + if (arena_index >= MI_MAX_ARENAS) return NULL; + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + if (arena == NULL) return NULL; + if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + return mi_arena_start(arena); +} + + +// Create an arena memid +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) { + mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); + memid.mem.arena.id = id; + memid.mem.arena.block_index = block_index; + memid.mem.arena.is_exclusive = is_exclusive; + return memid; +} + +// returns if the arena is exclusive +bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { + mi_assert_internal(memid.memkind == MI_MEM_ARENA); + *arena_index = mi_arena_id_index(memid.mem.arena.id); + *block_index = memid.mem.arena.block_index; + return memid.mem.arena.is_exclusive; +} + + + +/* ----------------------------------------------------------- + Arena Allocation +----------------------------------------------------------- */ + +static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, + bool commit, size_t 
tseq, mi_memid_t* memid, mi_os_tld_t* tld) +{ + MI_UNUSED(arena_index); + mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); + + size_t block_index; + if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL; + + // claimed it! + void* p = mi_arena_block_start(arena, block_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index); + memid->is_pinned = arena->memid.is_pinned; + + // set the dirty bits + if (arena->memid.initially_zero) { + memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL); + } + + // set commit state + if (commit) { + // commit requested, but the range may not be committed as a whole: ensure it is committed now + memid->initially_committed = true; + + bool all_already_committed; + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); + if (!all_already_committed) { + bool commit_zero = false; + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + memid->initially_committed = false; + } + else { + if (commit_zero) { memid->initially_zero = true; } + } + } + } + else { + // no need to commit, but check if already fully committed + memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); + } + + return p; +} + +// allocate in a speficic arena +static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, + size_t size, size_t alignment, + bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; + + const size_t bcount = mi_block_count_of_size(size); + const size_t arena_index = mi_arena_id_index(arena_id); + mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); + mi_assert_internal(size <= mi_size_of_blocks(bcount)); + + // Check arena suitability + mi_arena_t* arena = mi_arena_from_index(arena_index); + if (arena == NULL) return NULL; + if (!allow_large && arena->is_large) return NULL; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; + if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (match_numa_node) { if (!numa_suitable) return NULL; } + else { if (numa_suitable) return NULL; } + } + + // try to allocate + void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld); + mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); + return p; +} + + +// allocate from an arena with fallback to the OS +static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; + + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + if mi_likely(max_arena == 0) return NULL; + + if (req_arena_id != _mi_arena_id_none()) { + // try a specific arena if requested + if (mi_arena_id_index(req_arena_id) < max_arena) { + void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, 
memid, tld); + if (p != NULL) return p; + } + } + else { + // try numa affine allocation + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + + // try from another numa node instead.. + if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + } + } + return NULL; +} + +// try to reserve a fresh arena space +static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) +{ + if (_mi_preloading()) return false; // use OS only while pre loading + if (req_arena_id != _mi_arena_id_none()) return false; + + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + if (arena_count > (MI_MAX_ARENAS - 4)) return false; + + // calc reserve + size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); + if (arena_reserve == 0) return false; + + if (!_mi_os_has_virtual_reserve()) { + arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) + } + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + + if (arena_count >= 8 && arena_count <= 128) { + // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } + } + + // check arena bounds + const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1); + const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE; + if (arena_reserve < min_reserve) { + arena_reserve = min_reserve; + } + else if (arena_reserve > max_reserve) { + arena_reserve = max_reserve; + } + + if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size + + // commit eagerly? + bool arena_commit = false; + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } + + return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); +} + + +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + size_t tseq = _mi_thread_seq_id(); + *memid = _mi_memid_none(); + + const int numa_node = _mi_os_numa_node(tld); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + } + } + } + + // if we cannot use OS allocation, return NULL + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { + errno = ENOMEM; + return NULL; + } + + // finally, fall back to the OS + if (align_offset > 0) { + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + } + else { + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + } +} + +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); +} + + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats); +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); + +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(committed_size <= size); + if (p==NULL) return; + if (size==0) return; + const bool all_committed = (committed_size == size); + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
+ mi_track_mem_undefined(p, size); + + if (mi_memkind_is_os(memid.memkind)) { + // was a direct OS allocation, pass through + if (!all_committed && committed_size > 0) { + // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + _mi_os_free(p, size, memid, stats); + } + else if (memid.memkind == MI_MEM_ARENA) { + // allocated in an arena + size_t arena_idx; + size_t block_idx; + mi_arena_memid_indices(memid, &arena_idx, &block_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + const size_t blocks = mi_block_count_of_size(size); + + // checks + if (arena == NULL) { + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + mi_assert_internal(block_idx < arena->block_count); + mi_assert_internal(block_idx > mi_arena_info_blocks()); + if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) { + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + + // potentially decommit + if (arena->memid.is_pinned || arena->memid.initially_committed) { + mi_assert_internal(all_committed); + } + else { + if (!all_committed) { + // mark the entire range as no longer committed (so we recommit the full range when re-using) + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + mi_track_mem_noaccess(p, size); + if (committed_size > 0) { + // if partially committed, adjust the committed stats (is it will be recommitted when re-using) + // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + // note: if not all committed, it may be that the purge will reset/decommit the entire range + // that contains already decommitted parts. Since purge consistently uses reset or decommit that + // works (as we should never reset decommitted parts). + } + // (delay) purge the entire range + mi_arena_schedule_purge(arena, block_idx, blocks, stats); + } + + // and make it available to others again + bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL); + if (!all_inuse) { + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); + return; + }; + } + else { + // arena was none, external, or static; nothing to do + mi_assert_internal(memid.memkind < MI_MEM_OS); + } + + // purge expired decommits + mi_arenas_try_purge(false, false, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. 
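
The free path above only schedules purging (`mi_arena_schedule_purge`) and later collects expired ranges (`mi_arenas_try_purge`); their definitions fall outside this excerpt. Purely to illustrate the delayed-purge idea behind the `purge_expire` and `blocks_purge` fields, here is a simplified, self-contained sketch with hypothetical names (not this patch's implementation); the unsafe-destroy helper documented above follows right after it:

    #include <stdatomic.h>
    #include <stdbool.h>
    #include <stdint.h>

    // Hypothetical, simplified per-arena purge bookkeeping (illustration only).
    typedef struct purge_info_s {
      _Atomic(int64_t) purge_expire;  // 0 = no purge pending, otherwise expiration time in msecs
      // ...the real arena additionally tracks *which* blocks are purgeable in a bitmap
    } purge_info_t;

    // On free: arm a single expiration time if none is pending yet
    // (and mark the freed range as purgeable in the bitmap).
    static void schedule_purge(purge_info_t* info, int64_t now_msecs, int64_t delay_msecs) {
      int64_t expected = 0;
      atomic_compare_exchange_strong(&info->purge_expire, &expected, now_msecs + delay_msecs);
    }

    // Later (or when forced): decommit the marked blocks once the expiration has passed.
    static bool purge_is_due(purge_info_t* info, int64_t now_msecs, bool force) {
      const int64_t expire = atomic_load(&info->purge_expire);
      return (expire != 0 && (force || now_msecs >= expire));
    }
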
+static void mi_arenas_unsafe_destroy(void) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + size_t new_max_arena = 0; + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); + if (mi_memkind_is_os(arena->memid.memkind)) { + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main); + } + } + } + + // try to lower the max arena. + size_t expected = max_arena; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); +} + +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { + mi_arenas_unsafe_destroy(); + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + return true; + } + } + return false; +} + + +/* ----------------------------------------------------------- + Add an arena. +----------------------------------------------------------- */ + +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { + mi_assert_internal(arena != NULL); + mi_assert_internal(arena->block_count > 0); + if (arena_id != NULL) { *arena_id = -1; } + + size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + if (i >= MI_MAX_ARENAS) { + mi_atomic_decrement_acq_rel(&mi_arena_count); + return false; + } + _mi_stat_counter_increase(&stats->arena_count,1); + arena->id = mi_arena_id_create(i); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena->id; } + return true; +} + +static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +{ + mi_assert(!is_large || memid.initially_committed && memid.is_pinned); + mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)); + mi_assert(start!=NULL); + if (start==NULL) return false; + if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) { + // todo: use alignment in memid to align to blocksize first? 
+ _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start); + return false; + } + + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + + const size_t info_blocks = mi_arena_info_blocks(); + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down + if (bcount < info_blocks+1) { + _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB); + return false; + } + if (bcount > MI_BITMAP_MAX_BITS) { + // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) + _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB); + return false; + } + mi_arena_t* arena = (mi_arena_t*)start; + + // commit & zero if needed + bool is_zero = memid.initially_zero; + if (!memid.initially_committed) { + _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main); + } + if (!is_zero) { + _mi_memzero(arena, mi_size_of_blocks(info_blocks)); + } + + // init + arena->id = _mi_arena_id_none(); + arena->memid = memid; + arena->exclusive = exclusive; + arena->block_count = bcount; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->is_large = is_large; + arena->purge_expire = 0; + mi_lock_init(&arena->abandoned_visit_lock); + + // init bitmaps + mi_bitmap_init(&arena->blocks_free,true); + mi_bitmap_init(&arena->blocks_committed,true); + mi_bitmap_init(&arena->blocks_dirty,true); + mi_bitmap_init(&arena->blocks_purge,true); + for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { + mi_bitmap_init(&arena->blocks_abandoned[i],true); + } + + // reserve our meta info (and reserve blocks outside the memory area) + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks); + if (memid.initially_committed) { + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count); + } + else { + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL); + } + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL); + + return mi_arena_add(arena, arena_id, &_mi_stats_main); +} + + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + mi_memid_t memid; + void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + if (start == NULL) return ENOMEM; + const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
+ if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); + return ENOMEM; + } + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + return 0; +} + + +// Manage a range of regular OS memory +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { + return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { + return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); +} + + +/* ----------------------------------------------------------- + Debugging +----------------------------------------------------------- */ +static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { + size_t bit_set_count = 0; + for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { + bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); + if (is_set) bit_set_count++; + buf[bit] = (is_set ? 'x' : '.'); + } + return bit_set_count; +} + +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bit_count = 0; + size_t bit_set_count = 0; + for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { + char buf[MI_BITMAP_CHUNK_BITS + 1]; + mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; + for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + if (bit_count < block_count) { + bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS); + } + else { + _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS); + } + bit_count += MI_BFIELD_BITS; + } + buf[MI_BITMAP_CHUNK_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); + } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); + return bit_set_count; +} + +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { + MI_UNUSED(show_abandoned); + size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t free_total = 0; + size_t block_total = 0; + //size_t abandoned_total = 0; + size_t purge_total = 0; + for (size_t i = 0; i < max_arenas; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; + block_total += arena->block_count; + _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? 
", pinned" : "")); + if (show_inuse) { + free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); + } + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed); + // todo: abandoned blocks + if (show_purge) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge); + } + } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total); + // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); +} + + +/* ----------------------------------------------------------- + Reserve a huge page arena. +----------------------------------------------------------- */ +// reserve at a specific numa node +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = -1; + if (pages==0) return 0; + if (numa_node < -1) numa_node = -1; + if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); + size_t hsize = 0; + size_t pages_reserved = 0; + mi_memid_t memid; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); + if (p==NULL || pages_reserved==0) { + _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); + return ENOMEM; + } + _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); + + if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + _mi_os_free(p, hsize, memid, &_mi_stats_main); + return ENOMEM; + } + return 0; +} + +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { + return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); +} + +// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { + if (pages == 0) return 0; + + // pages per numa node + size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) numa_count = 1; + const size_t pages_per = pages / numa_count; + const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs==0 ? 
0 : (timeout_msecs / numa_count) + 50); + + // reserve evenly among numa nodes + for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + size_t node_pages = pages_per; // can be 0 + if (numa_node < pages_mod) node_pages++; + int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); + if (err) return err; + if (pages < node_pages) { + pages = 0; + } + else { + pages -= node_pages; + } + } + + return 0; +} + +int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + MI_UNUSED(max_secs); + _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); + if (pages_reserved != NULL) *pages_reserved = 0; + int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); + if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; + return err; +} + + + +/* ----------------------------------------------------------- + Arena purge +----------------------------------------------------------- */ + +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} + +// reset or decommit in an arena and update the committed/decommit bitmaps +// assumes we own the area (i.e. blocks_free is claimed by us) +static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(!arena->memid.is_pinned); + const size_t size = mi_size_of_blocks(blocks); + void* const p = mi_arena_block_start(arena, block_idx); + bool needs_recommit; + if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) { + // all blocks are committed, we can purge freely + needs_recommit = _mi_os_purge(p, size, stats); + } + else { + // some blocks are not committed -- this can happen when a partially committed block is freed + // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // and also undo the decommit stats (as it was already adjusted) + mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); + if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + } + + // clear the purged blocks + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, blocks, block_idx, NULL); + + // update committed bitmap + if (needs_recommit) { + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + } +} + + +// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. +// Note: assumes we (still) own the area as we may purge immediately +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { + const long delay = mi_arena_purge_delay(); + if (delay < 0) return; // is purging allowed at all? 
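+  // note: a fully delayed purge would set `arena->purge_expire` and mark the range in
+  // `arena->blocks_purge` (compare the disabled implementation further below); at the
+  // moment only the immediate path is wired up.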
+ + if (_mi_preloading() || delay == 0) { + // decommit directly + mi_arena_purge(arena, block_idx, blocks, stats); + } + else { + // schedule decommit + _mi_error_message(EFAULT, "purging not yet implemented\n"); + } +} + + +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { + if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + if (max_arena == 0) return; + + _mi_error_message(EFAULT, "purging not yet implemented\n"); + MI_UNUSED(stats); + MI_UNUSED(visit_all); + MI_UNUSED(force); +} + + +#if 0 + +#define MI_IN_ARENA_C +#include "arena-abandon.c" +#undef MI_IN_ARENA_C + +/* ----------------------------------------------------------- + Arena id's + id = arena_index + 1 +----------------------------------------------------------- */ + +size_t mi_arena_id_index(mi_arena_id_t id) { + return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); +} + +static mi_arena_id_t mi_arena_id_create(size_t arena_index) { + mi_assert_internal(arena_index < MI_MAX_ARENAS); + return (int)arena_index + 1; +} + +mi_arena_id_t _mi_arena_id_none(void) { + return 0; +} + +static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { + return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || + (arena_id == req_arena_id)); +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + if (memid.memkind == MI_MEM_ARENA) { + return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + } + else { + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + } +} + +size_t mi_arena_get_count(void) { + return mi_atomic_load_relaxed(&mi_arena_count); +} + +mi_arena_t* mi_arena_from_index(size_t idx) { + mi_assert_internal(idx < mi_arena_get_count()); + return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); +} + + +/* ----------------------------------------------------------- + Arena allocations get a (currently) 16-bit memory id where the + lower 8 bits are the arena id, and the upper bits the block index. 
+----------------------------------------------------------- */ + +static size_t mi_block_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); +} + +static size_t mi_size_of_blocks(size_t bcount) { + return (bcount * MI_ARENA_BLOCK_SIZE); +} + +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_size_of_blocks(arena->block_count); +} + +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { + mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); + memid.mem.arena.id = id; + memid.mem.arena.block_index = bitmap_index; + memid.mem.arena.is_exclusive = is_exclusive; + return memid; +} + +bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { + mi_assert_internal(memid.memkind == MI_MEM_ARENA); + *arena_index = mi_arena_id_index(memid.mem.arena.id); + *bitmap_index = memid.mem.arena.block_index; + return memid.mem.arena.is_exclusive; +} + + + +/* ----------------------------------------------------------- + Special static area for mimalloc internal structures + to avoid OS calls (for example, for the arena metadata (~= 256b)) +----------------------------------------------------------- */ + +#define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit + +static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 +static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; + +static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { + *memid = _mi_memid_none(); + if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; + const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); + if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; + + // try to claim space + if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } + const size_t oversize = size + alignment - 1; + if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; + const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); + size_t top = oldtop + oversize; + if (top > MI_ARENA_STATIC_MAX) { + // try to roll back, ok if this fails + mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); + return NULL; + } + + // success + *memid = _mi_memid_create(MI_MEM_STATIC); + memid->initially_zero = true; + const size_t start = _mi_align_up(oldtop, alignment); + uint8_t* const p = &mi_arena_static[start]; + _mi_memzero_aligned(p, size); + return p; +} + +void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { + *memid = _mi_memid_none(); + + // try static + void* p = mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); + if (p != NULL) return p; + + // or fall back to the OS + p = _mi_os_alloc(size, memid, &_mi_stats_main); + if (p == NULL) return NULL; + + // zero the OS memory if needed + if (!memid->initially_zero) { + _mi_memzero_aligned(p, size); + memid->initially_zero = true; + } + return p; +} + +void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { + if (mi_memkind_is_os(memid.memkind)) { + _mi_os_free(p, size, memid, &_mi_stats_main); + } + else { + mi_assert(memid.memkind == MI_MEM_STATIC); + } +} + +void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { + return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex))); +} + + +/* ----------------------------------------------------------- + Thread safe allocation in an arena +----------------------------------------------------------- */ + +// claim 
the `blocks_inuse` bits +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats) +{ + size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter + if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { + mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around + return true; + }; + return false; +} + + +/* ----------------------------------------------------------- + Arena Allocation +----------------------------------------------------------- */ + +static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, + bool commit, mi_memid_t* memid, mi_os_tld_t* tld) +{ + MI_UNUSED(arena_index); + mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); + + mi_bitmap_index_t bitmap_index; + if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return NULL; + + // claimed it! + void* p = mi_arena_block_start(arena, bitmap_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); + memid->is_pinned = arena->memid.is_pinned; + + // none of the claimed blocks should be scheduled for a decommit + if (arena->blocks_purge != NULL) { + // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`). + _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index); + } + + // set the dirty bits (todo: no need for an atomic op here?) + if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { + memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); + } + + // set commit state + if (arena->blocks_committed == NULL) { + // always committed + memid->initially_committed = true; + } + else if (commit) { + // commit requested, but the range may not be committed as a whole: ensure it is committed now + memid->initially_committed = true; + bool any_uncommitted; + _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); + if (any_uncommitted) { + bool commit_zero = false; + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + memid->initially_committed = false; + } + else { + if (commit_zero) { memid->initially_zero = true; } + } + } + } + else { + // no need to commit, but check if already fully committed + memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); + } + + return p; +} + +// allocate in a speficic arena +static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) +{ + MI_UNUSED_RELEASE(alignment); + mi_assert(alignment <= MI_SEGMENT_ALIGN); + const size_t bcount = mi_block_count_of_size(size); + const size_t arena_index = mi_arena_id_index(arena_id); + mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); + mi_assert_internal(size <= mi_size_of_blocks(bcount)); + + // Check arena suitability + mi_arena_t* arena = mi_arena_from_index(arena_index); + if (arena == NULL) return NULL; + if (!allow_large && 
arena->is_large) return NULL; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; + if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (match_numa_node) { if (!numa_suitable) return NULL; } + else { if (numa_suitable) return NULL; } + } + + // try to allocate + void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld); + mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); + return p; +} + + +// allocate from an arena with fallback to the OS +static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) +{ + MI_UNUSED(alignment); + mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + if mi_likely(max_arena == 0) return NULL; + + if (req_arena_id != _mi_arena_id_none()) { + // try a specific arena if requested + if (mi_arena_id_index(req_arena_id) < max_arena) { + void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + else { + // try numa affine allocation + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + + // try from another numa node instead.. + if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + } + return NULL; +} + +// try to reserve a fresh arena space +static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) +{ + if (_mi_preloading()) return false; // use OS only while pre loading + if (req_arena_id != _mi_arena_id_none()) return false; + + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + if (arena_count > (MI_MAX_ARENAS - 4)) return false; + + size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); + if (arena_reserve == 0) return false; + + if (!_mi_os_has_virtual_reserve()) { + arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) + } + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); + if (arena_count >= 8 && arena_count <= 128) { + // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } + } + if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size + + // commit eagerly? 
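+  // (option `arena_eager_commit`: 2 = commit eagerly only if the OS overcommits,
+  //  1 = always commit eagerly, and otherwise commit on demand)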
+ bool arena_commit = false; + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } + + return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); +} + + +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + *memid = _mi_memid_none(); + + const int numa_node = _mi_os_numa_node(tld); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? + if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + } + } + + // if we cannot use OS allocation, return NULL + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { + errno = ENOMEM; + return NULL; + } + + // finally, fall back to the OS + if (align_offset > 0) { + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + } + else { + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + } +} + +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); +} + + +void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { + if (size != NULL) *size = 0; + size_t arena_index = mi_arena_id_index(arena_id); + if (arena_index >= MI_MAX_ARENAS) return NULL; + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + if (arena == NULL) return NULL; + if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + return arena->start; +} + + +/* ----------------------------------------------------------- + Arena purge +----------------------------------------------------------- */ + +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} + +// reset or decommit in an arena and update the committed/decommit bitmaps +// assumes we own the area (i.e. 
blocks_in_use is claimed by us) +static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(arena->blocks_committed != NULL); + mi_assert_internal(arena->blocks_purge != NULL); + mi_assert_internal(!arena->memid.is_pinned); + const size_t size = mi_size_of_blocks(blocks); + void* const p = mi_arena_block_start(arena, bitmap_idx); + bool needs_recommit; + if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { + // all blocks are committed, we can purge freely + needs_recommit = _mi_os_purge(p, size, stats); + } + else { + // some blocks are not committed -- this can happen when a partially committed block is freed + // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // and also undo the decommit stats (as it was already adjusted) + mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); + if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + } + + // clear the purged blocks + _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); + // update committed bitmap + if (needs_recommit) { + _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + } +} + +// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. +// Note: assumes we (still) own the area as we may purge immediately +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(arena->blocks_purge != NULL); + const long delay = mi_arena_purge_delay(); + if (delay < 0) return; // is purging allowed at all? + + if (_mi_preloading() || delay == 0) { + // decommit directly + mi_arena_purge(arena, bitmap_idx, blocks, stats); + } + else { + // schedule decommit + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire != 0) { + mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay + } + else { + mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + } + _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); + } +} + +// purge a range of blocks +// return true if the full range was purged. +// assumes we own the area (i.e. 
blocks_in_use is claimed by us) +static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) { + const size_t endidx = startseqx + bitlen; + size_t bitseqx = startseqx; + bool all_purged = false; + while (bitseqx < endidx) { + // count consecutive ones in the purge mask + size_t count = 0; + while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) { + count++; + } + if (count > 0) { + // found range to be purged + const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx); + mi_arena_purge(arena, range_idx, count, stats); + if (count == bitlen) { + all_purged = true; + } + } + bitseqx += (count+1); // +1 to skip the zero bit (or end) + } + return all_purged; +} + +// returns true if anything was purged +static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) +{ + if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire == 0) return false; + if (!force && expire > now) return false; + + // reset expire (if not already set concurrently) + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + + // potential purges scheduled, walk through the bitmap + bool any_purged = false; + bool full_purge = true; + for (size_t i = 0; i < arena->field_count; i++) { + size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); + if (purge != 0) { + size_t bitseqx = 0; + while (bitseqx < MI_BITMAP_FIELD_BITS) { + // find consecutive range of ones in the purge mask + size_t bitlen = 0; + while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) { + bitlen++; + } + // temporarily claim the purge range as "in-use" to be thread-safe with allocation + // try to claim the longest range of corresponding in_use bits + const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx); + while( bitlen > 0 ) { + if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { + break; + } + bitlen--; + } + // actual claimed bits at `in_use` + if (bitlen > 0) { + // read purge again now that we have the in_use bits + purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); + if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) { + full_purge = false; + } + any_purged = true; + // release the claimed `in_use` bits again + _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); + } + bitseqx += (bitlen+1); // +1 to skip the zero (or end) + } // while bitseqx + } // purge != 0 + } + // if not fully purged, make sure to purge again in the future + if (!full_purge) { + const long delay = mi_arena_purge_delay(); + mi_msecs_t expected = 0; + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); + } + return any_purged; +} + +static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { + if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + if (max_arena == 0) return; + + // allow only one thread to purge at a time + static mi_atomic_guard_t purge_guard; + mi_atomic_guard(&purge_guard) + { + mi_msecs_t now = _mi_clock_now(); + size_t max_purge_count = (visit_all ? 
max_arena : 1); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + if (mi_arena_try_purge(arena, now, force, stats)) { + if (max_purge_count <= 1) break; + max_purge_count--; + } + } + } + } +} + + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ + +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(committed_size <= size); + if (p==NULL) return; + if (size==0) return; + const bool all_committed = (committed_size == size); + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) + mi_track_mem_undefined(p,size); + + if (mi_memkind_is_os(memid.memkind)) { + // was a direct OS allocation, pass through + if (!all_committed && committed_size > 0) { + // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + _mi_os_free(p, size, memid, stats); + } + else if (memid.memkind == MI_MEM_ARENA) { + // allocated in an arena + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + const size_t blocks = mi_block_count_of_size(size); + + // checks + if (arena == NULL) { + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); + if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + + // potentially decommit + if (arena->memid.is_pinned || arena->blocks_committed == NULL) { + mi_assert_internal(all_committed); + } + else { + mi_assert_internal(arena->blocks_committed != NULL); + mi_assert_internal(arena->blocks_purge != NULL); + + if (!all_committed) { + // mark the entire range as no longer committed (so we recommit the full range when re-using) + _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + mi_track_mem_noaccess(p,size); + if (committed_size > 0) { + // if partially committed, adjust the committed stats (is it will be recommitted when re-using) + // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + // note: if not all committed, it may be that the purge will reset/decommit the entire range + // that contains already decommitted parts. Since purge consistently uses reset or decommit that + // works (as we should never reset decommitted parts). 
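+        // (the later purge compensates for this in `mi_arena_purge`: when it decommits a range
+        //  that is not fully marked as committed, it re-increases the committed statistic.)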
+ } + // (delay) purge the entire range + mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); + } + + // and make it available to others again + bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); + if (!all_inuse) { + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); + return; + }; + } + else { + // arena was none, external, or static; nothing to do + mi_assert_internal(memid.memkind < MI_MEM_OS); + } + + // purge expired decommits + mi_arenas_try_purge(false, false, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +static void mi_arenas_unsafe_destroy(void) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + size_t new_max_arena = 0; + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); + if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); + } + else { + new_max_arena = i; + } + _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); + } + } + + // try to lower the max arena. + size_t expected = max_arena; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); +} + +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { + mi_arenas_unsafe_destroy(); + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + return true; + } + } + return false; +} + +/* ----------------------------------------------------------- + Add an arena. 
+----------------------------------------------------------- */ + +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { + mi_assert_internal(arena != NULL); + mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); + mi_assert_internal(arena->block_count > 0); + if (arena_id != NULL) { *arena_id = -1; } + + size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + if (i >= MI_MAX_ARENAS) { + mi_atomic_decrement_acq_rel(&mi_arena_count); + return false; + } + _mi_stat_counter_increase(&stats->arena_count,1); + arena->id = mi_arena_id_create(i); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena->id; } + return true; +} + +static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +{ + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + if (size < MI_ARENA_BLOCK_SIZE) return false; + + if (is_large) { + mi_assert_internal(memid.initially_committed && memid.is_pinned); + } + + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; + const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); + const size_t bitmaps = (memid.is_pinned ? 3 : 5); + const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); + mi_memid_t meta_memid; + mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); + if (arena == NULL) return false; + + // already zero'd due to zalloc + // _mi_memzero(arena, asize); + arena->id = _mi_arena_id_none(); + arena->memid = memid; + arena->exclusive = exclusive; + arena->meta_size = asize; + arena->meta_memid = meta_memid; + arena->block_count = bcount; + arena->field_count = fields; + arena->start = (uint8_t*)start; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->is_large = is_large; + arena->purge_expire = 0; + arena->search_idx = 0; + mi_lock_init(&arena->abandoned_visit_lock); + // consecutive bitmaps + arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap + arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap + arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap + arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap + // initialize committed bitmap? 
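+  // (when the memory is already committed, mark all blocks committed up-front so that
+  //  `mi_arena_try_alloc_at` can skip commit-on-demand; otherwise the bitmap stays zero
+  //  and blocks are committed lazily on first use)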
+ if (arena->blocks_committed != NULL && arena->memid.initially_committed) { + memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning + } + + // and claim leftover blocks if needed (so we never allocate there) + ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; + mi_assert_internal(post >= 0); + if (post > 0) { + // don't use leftover bits at the end + mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); + _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL); + } + return mi_arena_add(arena, arena_id, &_mi_stats_main); + +} + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + mi_memid_t memid; + void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + if (start == NULL) return ENOMEM; + const bool is_large = memid.is_pinned; // todo: use separate is_large field? + if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); + return ENOMEM; + } + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + return 0; +} + + +// Manage a range of regular OS memory +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { + return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { + return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); +} + + +/* ----------------------------------------------------------- + Debugging +----------------------------------------------------------- */ + +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bcount = 0; + size_t inuse_count = 0; + for (size_t i = 0; i < field_count; i++) { + char buf[MI_BITMAP_FIELD_BITS + 1]; + uintptr_t field = mi_atomic_load_relaxed(&fields[i]); + for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { + if (bcount < block_count) { + bool inuse = ((((uintptr_t)1 << bit) & field) != 0); + if (inuse) inuse_count++; + buf[bit] = (inuse ? 
'x' : '.'); + } + else { + buf[bit] = ' '; + } + } + buf[MI_BITMAP_FIELD_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); + } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); + return inuse_count; +} + +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { + size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t inuse_total = 0; + size_t abandoned_total = 0; + size_t purge_total = 0; + for (size_t i = 0; i < max_arenas; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; + _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); + if (show_inuse) { + inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); + } + if (arena->blocks_committed != NULL) { + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); + } + if (show_abandoned) { + abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); + } + if (show_purge && arena->blocks_purge != NULL) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); + } + } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); + if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); +} + + +/* ----------------------------------------------------------- + Reserve a huge page arena. +----------------------------------------------------------- */ +// reserve at a specific numa node +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = -1; + if (pages==0) return 0; + if (numa_node < -1) numa_node = -1; + if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); + size_t hsize = 0; + size_t pages_reserved = 0; + mi_memid_t memid; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); + if (p==NULL || pages_reserved==0) { + _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); + return ENOMEM; + } + _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); + + if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + _mi_os_free(p, hsize, memid, &_mi_stats_main); + return ENOMEM; + } + return 0; +} + +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { + return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); +} + +// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { + if (pages == 0) return 0; + + // pages per numa node + size_t numa_count = (numa_nodes > 0 ? 
numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) numa_count = 1; + const size_t pages_per = pages / numa_count; + const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); + + // reserve evenly among numa nodes + for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + size_t node_pages = pages_per; // can be 0 + if (numa_node < pages_mod) node_pages++; + int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); + if (err) return err; + if (pages < node_pages) { + pages = 0; + } + else { + pages -= node_pages; + } + } + + return 0; +} + +int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + MI_UNUSED(max_secs); + _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); + if (pages_reserved != NULL) *pages_reserved = 0; + int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); + if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; + return err; +} + + +#endif \ No newline at end of file diff --git a/src/xbitmap.c b/src/xbitmap.c new file mode 100644 index 00000000..68525c84 --- /dev/null +++ b/src/xbitmap.c @@ -0,0 +1,599 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Concurrent bitmap that can set/reset sequences of bits atomically +---------------------------------------------------------------------------- */ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/bits.h" +#include "xbitmap.h" + +/* -------------------------------------------------------------------------------- + bfields +-------------------------------------------------------------------------------- */ + +static inline size_t mi_bfield_ctz(mi_bfield_t x) { + return mi_ctz(x); +} + +static inline size_t mi_bfield_clz(mi_bfield_t x) { + return mi_clz(x); +} + +// find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { + return mi_bsf(x,idx); +} + +static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { + return mi_rotr(x,r); +} + +// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). +static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[i], idx); +} + +static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { + mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); +} + +// Set/clear a sequence of `n` bits within a chunk. 
Returns true if all bits transitioned from 0 to 1 (or 1 to 0) +static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_transition = true; + bool all_already_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); + all_already_xset = all_already_xset && already_xset; + // next field + field++; + idx = 0; + n -= m; + } + *palready_xset = all_already_xset; + return all_transition; +} + +// Check if a sequence of `n` bits within a chunk are all set/cleared. +static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask); + // next field + field++; + idx = 0; + n -= m; + } + return all_xset; +} + +// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving all bit fields as is. +static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + if (n==0) return true; + size_t start_idx = cidx % MI_BFIELD_BITS; + size_t start_field = cidx / MI_BFIELD_BITS; + size_t end_field = MI_BITMAP_CHUNK_FIELDS; + size_t mask_mid = 0; + size_t mask_end = 0; + + // first field + size_t field = start_field; + size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); + mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask_start)) return false; + + // done? + n -= m; + if (n==0) return true; + + // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields + + // mid fields + while (n >= MI_BFIELD_BITS) { + field++; + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mask_mid = ~MI_ZU(0); + if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore; + n -= MI_BFIELD_BITS; + } + + // last field + if (n > 0) { + mi_assert_internal(n < MI_BFIELD_BITS); + field++; + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + end_field = field; + mask_end = (MI_ZU(1)<bfields[field], mask_end)) goto restore; + } + + return true; + +restore: + // field is on the field that failed to set atomically; we need to restore all previous fields + mi_assert_internal(field > start_field); + while( field > start_field) { + field--; + const size_t mask = (field == start_field ? mask_start : (field == end_field ? 
mask_end : mask_mid)); + bool already_xset; + mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset); + } + return false; +} + + +// find least 1-bit in a chunk and try unset it atomically +// set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while(true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ? + const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? -1 : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) + mi_assert_internal(mask != 0); + const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically + *pidx = (i*MI_BFIELD_BITS + idx); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + } + return false; + #endif +} + + +// find least byte in a chunk with all bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while(true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? 
-1 : 0) + const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte + if (mask == 0) return false; + const size_t i = _tzcnt_u32(mask); + mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS); + const size_t chunk_idx = i / MI_BFIELD_SIZE; + const size_t byte_idx = i % MI_BFIELD_SIZE; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + const mi_bfield_t x = chunk->bfields[i]; + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + const size_t byte_idx = idx/8; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS); + return true; + } + // else continue + } + } + return false; + #endif +} + + +// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. +// todo: try avx2 and neon version +// todo: allow spanning across bfield boundaries? +static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger? + const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1); + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + mi_bfield_t b = chunk->bfields[i]; + size_t bshift = 0; + size_t idx; + while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + b >>= idx; + bshift += idx; + if (bshift + n >= MI_BFIELD_BITS) break; + + if ((b&mask) == mask) { // found a match + mi_assert_internal( ((mask << bshift) >> bshift) == mask ); + if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i] >> bshift); + } + } + else { + // advance + const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) + mi_assert_internal(ones>0); + bshift += ones; + b >>= ones; + } + } + } + return false; +} + + +// are all bits in a bitmap chunk set? +static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_test_all_ones(vec); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x & chunk->bfields[i]; + } + return (~x == 0); + #endif +} + +// are all bits in a bitmap chunk clear? 
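+// (this OR-reduces all fields without an early exit so the compiler can vectorize the loop,
+//  mirroring the AND-reduction used in `mi_bitmap_chunk_all_are_set` above)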
+static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_testz_si256( vec, vec ); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x | chunk->bfields[i]; + } + return (x == 0); + #endif +} + +/* -------------------------------------------------------------------------------- + bitmap +-------------------------------------------------------------------------------- */ +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { + if (!already_zero) { + _mi_memzero_aligned(bitmap, sizeof(*bitmap)); + } +} + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); + + // first chunk + size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + size_t m = MI_BITMAP_CHUNK_BITS - cidx; + if (m > n) { m = n; } + bool already_xset; + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); + + // n can be large so use memset for efficiency for all in-between chunks + chunk_idx++; + n -= m; + const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; + if (mid_chunks > 0) { + _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8); + chunk_idx += mid_chunks; + n -= mid_chunks * MI_BITMAP_CHUNK_BITS; + } + + // last chunk + if (n > 0) { + mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); + } +} + + +// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving the bitmask as is. +bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); +} + +// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise leaving the bitmask as is. +bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + mi_assert_internal(idx%8 == 0); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; + return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
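+// For illustration: a bit index is decomposed into a chunk number and an offset inside
+// that chunk, and for the atomic N-bit operations the whole range must stay inside one
+// 256-bit chunk. For example (values chosen for illustration only):
+//
+//   idx = 700, n = 40:  chunk_idx = 700/MI_BITMAP_CHUNK_BITS = 2,
+//                       cidx = 700%MI_BITMAP_CHUNK_BITS = 188, and 188+40 <= 256: ok
+//   idx = 250, n = 40:  would straddle chunk 0 and chunk 1 and is not supported here;
+//                       only the non-atomic `mi_bitmap_unsafe_xsetN` above splits such
+//                       a range over a first, middle, and last chunk.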
+bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); } + if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + bool local_already_xset; + if (already_xset==NULL) { already_xset = &local_already_xset; } + // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } + // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); +} + +// Is a sequence of n bits already all set/cleared? +bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); +} + + +#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ + { size_t _set_idx; \ + size_t _start = start % MI_BFIELD_BITS; \ + mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ + while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ + decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; + +#define mi_bitmap_forall_set_chunks_end() \ + _start += _set_idx+1; /* so chunk_idx stays valid */ \ + _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ + _any_set >>= 1; \ + } \ + } + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). 
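+// For example, a caller would typically pass a per-thread value (such as a thread
+// sequence number) as `start` so that different threads begin scanning in different
+// chunks. A hypothetical caller (`tseq` assumed to be such a per-thread counter):
+//
+//   size_t bit_idx;
+//   if (mi_bitmap_try_find_and_clear(&arena->blocks_free, &bit_idx, tseq)) {
+//     // `bit_idx` was atomically cleared and is now owned by this thread
+//   }
+//
+// Note that `any_set` is only a conservative summary: a set bit means the corresponding
+// chunk *may* contain set bits; it is cleared lazily below when a scan finds the chunk
+// to be fully clear.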
+bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS); + return true; + } + else { + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } + } + mi_bitmap_forall_set_chunks_end(); + return false; +} + + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. +bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8); + mi_assert_internal((*pidx % 8) == 0); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } + } + mi_bitmap_forall_set_chunks_end(); + return false; +} + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { + // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger + // TODO: allow spanning across chunk boundaries + if (n == 0 || n > MI_BFIELD_BITS) return false; + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } + } + mi_bitmap_forall_set_chunks_end(); + return false; +} diff --git a/src/xbitmap.h b/src/xbitmap.h new file mode 100644 index 00000000..869db2a2 --- /dev/null +++ b/src/xbitmap.h @@ -0,0 +1,94 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Concurrent bitmap that can set/reset sequences of bits atomically +---------------------------------------------------------------------------- */ +#pragma once +#ifndef MI_XBITMAP_H +#define MI_XBITMAP_H + +/* -------------------------------------------------------------------------------- + Definitions +-------------------------------------------------------------------------------- */ + +typedef size_t mi_bfield_t; + +#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) +#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) +#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) +#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) +#define MI_BFIELD_LO_BIT8 ((~(mi_bfield_t(0)))/0xFF) // 0x01010101 .. +#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. + +#define MI_BITMAP_CHUNK_BITS_SHIFT (8) // 2^8 = 256 bits per chunk +#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) +#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) +#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) + +typedef mi_decl_align(32) struct mi_bitmap_chunk_s { + _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; +} mi_bitmap_chunk_t; + + +typedef mi_decl_align(32) struct mi_bitmap_s { + mi_bitmap_chunk_t chunks[MI_BFIELD_BITS]; + _Atomic(mi_bfield_t)any_set; +} mi_bitmap_t; + +#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit + +/* -------------------------------------------------------------------------------- + Bitmap +-------------------------------------------------------------------------------- */ + +typedef bool mi_bit_t; +#define MI_BIT_SET (true) +#define MI_BIT_CLEAR (false) + +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset); + +// Is a sequence of n bits already all set/cleared? +bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +// and false otherwise leaving the bitmask as is. +mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); + +// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise leaving the bitmask as is. +mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); + +// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
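+// For scale (assuming a 64-bit build): MI_BFIELD_BITS is 64, so a chunk consists of 4
+// bfields (256 bits) and a bitmap of 64 chunks covers MI_BITMAP_MAX_BITS = 16384 bits
+// in roughly 2KiB of memory. With the 64KiB MI_ARENA_BLOCK_SIZE used by xarena.c in this
+// commit, a single bitmap can therefore track 16384 * 64KiB = 1GiB of arena memory.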
+mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); + +#endif // MI_XBITMAP_H diff --git a/test/main-override-static.c b/test/main-override-static.c index 4ad76d6a..a8e30f69 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -7,6 +7,8 @@ #include #include // redefines malloc etc. +static void mi_bins(void); + static void double_free1(); static void double_free2(); static void corrupt_free(); @@ -33,7 +35,7 @@ int main() { // corrupt_free(); // block_overflow1(); // block_overflow2(); - test_canary_leak(); + // test_canary_leak(); // test_aslr(); // invalid_free(); // test_reserved(); @@ -41,6 +43,9 @@ int main() { // test_heap_walk(); // alloc_huge(); + mi_bins(); + + void* p1 = malloc(78); void* p2 = malloc(24); free(p1); @@ -73,7 +78,7 @@ int main() { static void invalid_free() { free((void*)0xBADBEEF); - realloc((void*)0xBADBEEF,10); + realloc((void*)0xBADBEEF, 10); } static void block_overflow1() { @@ -171,7 +176,7 @@ static void test_process_info(void) { size_t peak_commit = 0; size_t page_faults = 0; for (int i = 0; i < 100000; i++) { - void* p = calloc(100,10); + void* p = calloc(100, 10); free(p); } mi_process_info(&elapsed, &user_msecs, &system_msecs, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); @@ -229,8 +234,8 @@ static void test_heap_walk(void) { } static void test_canary_leak(void) { - char* p = mi_mallocn_tp(char,23); - for(int i = 0; i < 23; i++) { + char* p = mi_mallocn_tp(char, 23); + for (int i = 0; i < 23; i++) { p[i] = '0'+i; } puts(p); @@ -248,15 +253,15 @@ static void test_canary_leak(void) { static void test_large_pages(void) { mi_memid_t memid; - #if 0 +#if 0 size_t pages_reserved; size_t page_size; uint8_t* p = (uint8_t*)_mi_os_alloc_huge_os_pages(1, -1, 30000, &pages_reserved, &page_size, &memid); const size_t req_size = pages_reserved * page_size; - #else +#else const size_t req_size = 64*MI_MiB; - uint8_t* p = (uint8_t*)_mi_os_alloc(req_size,&memid,NULL); - #endif + uint8_t* p = (uint8_t*)_mi_os_alloc(req_size, &memid, NULL); +#endif p[0] = 1; @@ -276,63 +281,16 @@ static void test_large_pages(void) { // bin size experiments // ------------------------------ -#if 0 +#if 1 #include #include +#include -#define MI_INTPTR_SIZE 8 #define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE) #define MI_BIN_HUGE 100 //#define MI_ALIGN2W -// Bit scan reverse: return the index of the highest bit. 
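+// (The local 32-bit `mi_bsr32` helpers below are removed; the experiment now uses the
+// shared bit-scan helpers `mi_bsr`/`mi_clz`, presumably from the new mimalloc/bits.h.
+// As a worked example of the resulting bin mapping in `mi_bin` further below: for
+// wsize = 100 we get b = 6 (the highest bit of 99) and
+//   bin = ((6 << 2) + ((99 >> 4) & 0x03)) - 3 = 23,
+// the bin that groups wsizes 97..112, i.e. each power-of-two range is split into 4 bins.)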
-static inline uint8_t mi_bsr32(uint32_t x); - -#if defined(_MSC_VER) -#include -#include -static inline uint8_t mi_bsr32(uint32_t x) { - uint32_t idx; - _BitScanReverse((DWORD*)&idx, x); - return idx; -} -#elif defined(__GNUC__) || defined(__clang__) -static inline uint8_t mi_bsr32(uint32_t x) { - return (31 - __builtin_clz(x)); -} -#else -static inline uint8_t mi_bsr32(uint32_t x) { - // de Bruijn multiplication, see - static const uint8_t debruijn[32] = { - 31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12, - 30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13, - }; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - x++; - return debruijn[(x*0x076be629) >> 27]; -} -#endif - -/* -// Bit scan reverse: return the index of the highest bit. -uint8_t _mi_bsr(uintptr_t x) { - if (x == 0) return 0; - #if MI_INTPTR_SIZE==8 - uint32_t hi = (x >> 32); - return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi)); - #elif MI_INTPTR_SIZE==4 - return mi_bsr32(x); - #else - # error "define bsr for non-32 or 64-bit platforms" - #endif -} -*/ - static inline size_t _mi_wsize_from_size(size_t size) { return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); @@ -370,7 +328,9 @@ extern inline uint8_t _mi_bin8(size_t size) { #endif wsize--; // find the highest bit - uint8_t b = mi_bsr32((uint32_t)wsize); + size_t idx; + mi_bsr(wsize, &idx); + uint8_t b = (uint8_t)idx; // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin @@ -402,44 +362,79 @@ static inline uint8_t _mi_bin4(size_t size) { bin = MI_BIN_HUGE; } else { - uint8_t b = mi_bsr32((uint32_t)wsize); + size_t idx; + mi_bsr(wsize, &idx); + uint8_t b = (uint8_t)idx; bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3; } return bin; } -static size_t _mi_binx4(size_t bsize) { - if (bsize==0) return 0; - uint8_t b = mi_bsr32((uint32_t)bsize); - if (b <= 1) return bsize; - size_t bin = ((b << 1) | (bsize >> (b - 1))&0x01); +static size_t _mi_binx4(size_t wsize) { + size_t bin; + if (wsize <= 1) { + bin = 1; + } + else if (wsize <= 8) { + // bin = (wsize+1)&~1; // round to double word sizes + bin = (uint8_t)wsize; + } + else { + size_t idx; + mi_bsr(wsize, &idx); + uint8_t b = (uint8_t)idx; + if (b <= 1) return wsize; + bin = ((b << 1) | (wsize >> (b - 1))&0x01) + 3; + } return bin; } static size_t _mi_binx8(size_t bsize) { if (bsize<=1) return bsize; - uint8_t b = mi_bsr32((uint32_t)bsize); + size_t idx; + mi_bsr(bsize, &idx); + uint8_t b = (uint8_t)idx; if (b <= 2) return bsize; size_t bin = ((b << 2) | (bsize >> (b - 2))&0x03) - 5; return bin; } + +static inline size_t mi_bin(size_t wsize) { + uint8_t bin; + if (wsize <= 1) { + bin = 1; + } + else if (wsize <= 8) { + // bin = (wsize+1)&~1; // round to double word sizes + bin = (uint8_t)wsize; + } + else { + wsize--; + assert(wsize>0); + // find the highest bit + uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); + + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). 
+ // - adjust with 3 because we use do not round the first 8 sizes + // which each get an exact bin + bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3; + } + return bin; +} + + static void mi_bins(void) { //printf(" QNULL(1), /* 0 */ \\\n "); size_t last_bin = 0; - size_t min_bsize = 0; - size_t last_bsize = 0; - for (size_t bsize = 1; bsize < 2*1024; bsize++) { - size_t size = bsize * 64 * 1024; - size_t bin = _mi_binx8(bsize); + for (size_t wsize = 1; wsize <= (4*1024*1024) / 8 + 1024; wsize++) { + size_t bin = mi_bin(wsize); if (bin != last_bin) { - printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_bsize, last_bsize, last_bin); - //printf("QNULL(%6zd), ", wsize); - //if (last_bin%8 == 0) printf("/* %i */ \\\n ", last_bin); + //printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_wsize, last_wsize, last_bin); + printf("QNULL(%6zd), ", wsize-1); + if (last_bin%8 == 0) printf("/* %zu */ \\\n ", last_bin); last_bin = bin; - min_bsize = bsize; } - last_bsize = bsize; } } #endif From 441d4fed9fd302bb2a2b326bc8b134c8a15982bb Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 10:40:18 -0800 Subject: [PATCH 002/264] wip: further progress on removing segments --- CMakeLists.txt | 1 + ide/vs2022/mimalloc.vcxproj | 12 +- ide/vs2022/mimalloc.vcxproj.filters | 13 +- include/mimalloc/bits.h | 6 + include/mimalloc/internal.h | 183 +++--- include/mimalloc/types.h | 271 +++----- src/alloc.c | 2 +- src/{xarena.c => arena-old.c} | 875 ++------------------------ src/arena.c | 871 ++++++++++++++++++++++++-- src/bitmap-old.c | 419 +++++++++++++ src/bitmap-old.h | 110 ++++ src/bitmap.c | 940 +++++++++++++++++----------- src/bitmap.h | 154 ++--- src/free.c | 118 ++-- src/heap.c | 5 +- src/os.c | 55 +- src/page-map.c | 90 +++ src/page.c | 67 +- src/static.c | 3 +- src/xbitmap.c | 599 ------------------ src/xbitmap.h | 94 --- 21 files changed, 2396 insertions(+), 2492 deletions(-) rename src/{xarena.c => arena-old.c} (53%) create mode 100644 src/bitmap-old.c create mode 100644 src/bitmap-old.h create mode 100644 src/page-map.c delete mode 100644 src/xbitmap.c delete mode 100644 src/xbitmap.h diff --git a/CMakeLists.txt b/CMakeLists.txt index 5fc1808e..5cb05840 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -55,6 +55,7 @@ set(mi_sources src/options.c src/os.c src/page.c + src/page-map.c src/random.c src/segment.c src/segment-map.c diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 138acf39..3dd7326f 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -214,12 +214,7 @@ - - true - true - true - true - + false @@ -232,6 +227,7 @@ + true @@ -248,12 +244,8 @@ - - - - diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 48958be1..2eed7e90 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -43,12 +43,6 @@ Sources - - Sources - - - Sources - Sources @@ -58,13 +52,10 @@ Sources - + Sources - - Sources - - + Sources diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 642f0f9c..ad7ea3e6 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -100,6 +100,10 @@ typedef int32_t mi_ssize_t; #define __BMI1__ 1 #endif +// Define big endian if needed +// #define MI_BIG_ENDIAN 1 + + /* -------------------------------------------------------------------------------- Builtin's -------------------------------------------------------------------------------- */ @@ -310,4 +314,6 @@ static inline size_t mi_rotl(size_t x, size_t r) { 
#endif } + + #endif // MI_BITS_H diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index b997099e..2713c0ac 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -108,6 +108,7 @@ size_t _mi_os_page_size(void); size_t _mi_os_good_alloc_size(size_t size); bool _mi_os_has_overcommit(void); bool _mi_os_has_virtual_reserve(void); +size_t _mi_os_virtual_address_bits(void); bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats); bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); @@ -136,12 +137,11 @@ bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment); -void _mi_arena_segment_mark_abandoned(mi_segment_t* segment); void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); +/* typedef struct mi_arena_field_cursor_s { // abstract struct size_t os_list_count; // max entries to visit in the OS abandoned list size_t start; // start arena idx (may need to be wrapped) @@ -154,27 +154,12 @@ typedef struct mi_arena_field_cursor_s { // abstract struct void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current); mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current); +*/ -// "segment-map.c" -void _mi_segment_map_allocated_at(const mi_segment_t* segment); -void _mi_segment_map_freed_at(const mi_segment_t* segment); +// "page-map.c" +void _mi_page_map_register(mi_page_t* page); +void _mi_page_map_unregister(mi_page_t* page); -// "segment.c" -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld); -void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld); -void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld); -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); - -#if MI_HUGE_PAGE_ABANDON -void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); -#else -void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block); -#endif - -void _mi_segments_collect(bool force, mi_segments_tld_t* tld); -void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld); -bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment); -bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -226,7 +211,7 @@ void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, siz void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); -void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration +// void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* 
page, const mi_block_t* block, const size_t min_size); // "libc.c" @@ -338,8 +323,8 @@ static inline uintptr_t _mi_align_up(uintptr_t sz, size_t alignment) { // Align a pointer upwards -static inline void* mi_align_up_ptr(void* p, size_t alignment) { - return (void*)_mi_align_up((uintptr_t)p, alignment); +static inline uint8_t* _mi_align_up_ptr(void* p, size_t alignment) { + return (uint8_t*)_mi_align_up((uintptr_t)p, alignment); } @@ -445,68 +430,44 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si return heap->pages_free_direct[idx]; } -// Segment that contains the pointer -// Large aligned blocks may be aligned at N*MI_SEGMENT_SIZE (inside a huge segment > MI_SEGMENT_SIZE), -// and we need align "down" to the segment info which is `MI_SEGMENT_SIZE` bytes before it; -// therefore we align one byte before `p`. -// We check for NULL afterwards on 64-bit systems to improve codegen for `mi_free`. -static inline mi_segment_t* _mi_ptr_segment(const void* p) { - mi_segment_t* const segment = (mi_segment_t*)(((uintptr_t)p - 1) & ~MI_SEGMENT_MASK); - #if MI_INTPTR_SIZE <= 4 - return (p==NULL ? NULL : segment); - #else - return ((intptr_t)segment <= 0 ? NULL : segment); + +extern signed char* _mi_page_map; + +#define MI_PAGE_PTR_INVALID ((mi_page_t*)(1)) + +static inline mi_page_t* _mi_ptr_page(const void* p) { + const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_BLOCK_SHIFT; + const ptrdiff_t ofs = _mi_page_map[up]; + #if MI_DEBUG + if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; #endif + return (mi_page_t*)((up + ofs - 1) << MI_ARENA_BLOCK_SHIFT); } -// Segment belonging to a page -static inline mi_segment_t* _mi_page_segment(const mi_page_t* page) { - mi_assert_internal(page!=NULL); - mi_segment_t* segment = _mi_ptr_segment(page); - mi_assert_internal(segment == NULL || page == &segment->pages[page->segment_idx]); - return segment; -} -// used internally -static inline size_t _mi_segment_page_idx_of(const mi_segment_t* segment, const void* p) { - // if (segment->page_size > MI_SEGMENT_SIZE) return &segment->pages[0]; // huge pages - ptrdiff_t diff = (uint8_t*)p - (uint8_t*)segment; - mi_assert_internal(diff >= 0 && (size_t)diff <= MI_SEGMENT_SIZE /* for huge alignment it can be equal */); - size_t idx = (size_t)diff >> segment->page_shift; - mi_assert_internal(idx < segment->capacity); - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || idx == 0); - return idx; -} - -// Get the page containing the pointer -static inline mi_page_t* _mi_segment_page_of(const mi_segment_t* segment, const void* p) { - size_t idx = _mi_segment_page_idx_of(segment, p); - return &((mi_segment_t*)segment)->pages[idx]; -} - -// Quick page start for initialized pages -static inline uint8_t* mi_page_start(const mi_page_t* page) { - mi_assert_internal(page->page_start != NULL); - mi_assert_expensive(_mi_segment_page_start(_mi_page_segment(page),page,NULL) == page->page_start); - return page->page_start; -} - -// Get the page containing the pointer -static inline mi_page_t* _mi_ptr_page(void* p) { - mi_assert_internal(p!=NULL); - return _mi_segment_page_of(_mi_ptr_segment(p), p); -} - -// Get the block size of a page (special case for huge objects) +// Get the block size of a page static inline size_t mi_page_block_size(const mi_page_t* page) { mi_assert_internal(page->block_size > 0); return page->block_size; } -static inline bool mi_page_is_huge(const mi_page_t* page) { - mi_assert_internal((page->is_huge && _mi_page_segment(page)->page_kind == MI_PAGE_HUGE) || - (!page->is_huge 
&& _mi_page_segment(page)->page_kind != MI_PAGE_HUGE)); - return page->is_huge; +// Page start +static inline uint8_t* mi_page_start(const mi_page_t* page) { + mi_assert(sizeof(mi_page_t) <= MI_PAGE_INFO_SIZE); + return (uint8_t*)page + MI_PAGE_INFO_SIZE; +} + +static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { + if (size) { *size = mi_page_block_size(page) * page->reserved; } + return mi_page_start(page); +} + +static inline bool mi_page_is_in_arena(const mi_page_t* page) { + return (page->memid.memkind == MI_MEM_ARENA); +} + +static inline bool mi_page_is_singleton(const mi_page_t* page) { + return (page->reserved == 1); } // Get the usable block size of a page without fixed padding. @@ -515,11 +476,6 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) { return mi_page_block_size(page) - MI_PADDING_SIZE; } -// size of a segment -static inline size_t mi_segment_size(mi_segment_t* segment) { - return segment->segment_size; -} - // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3); @@ -534,10 +490,20 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap)); } +static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { + return mi_atomic_load_relaxed(&page->xthread_id); +} + static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); mi_atomic_store_release(&page->xheap,(uintptr_t)heap); - if (heap != NULL) { page->heap_tag = heap->tag; } + if (heap != NULL) { + page->heap_tag = heap->tag; + mi_atomic_store_release(&page->xthread_id, heap->thread_id); + } + else { + mi_atomic_store_release(&page->xthread_id,0); + } } // Thread free flag helpers @@ -576,6 +542,21 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) { return (page->free != NULL); } + +// is the page not yet used up to its reserved space? +static inline bool mi_page_is_expandable(const mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(page->capacity <= page->reserved); + return (page->capacity < page->reserved); +} + + +static inline bool mi_page_is_full(mi_page_t* page) { + bool full = (page->reserved == page->used); + mi_assert_internal(!full || page->free == NULL); + return full; +} + // is more than 7/8th of a page in use? static inline bool mi_page_mostly_used(const mi_page_t* page) { if (page==NULL) return true; @@ -583,6 +564,15 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { return (page->reserved - page->used <= frac); } +static inline bool mi_page_is_abandoned(mi_page_t* page) { + return (mi_page_thread_id(page) == 0); +} + +static inline bool mi_page_is_huge(mi_page_t* page) { + return (page->block_size > MI_LARGE_MAX_OBJ_SIZE); +} + + static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; } @@ -667,17 +657,8 @@ We also pass a separate `null` value to be used as `NULL` or otherwise `(k2<<= 655360) -#error "mimalloc internal: define more bins" -#endif - -// Maximum block size for which blocks are guaranteed to be block size aligned. 
(see `segment.c:_mi_segment_page_start`) -#define MI_MAX_ALIGN_GUARANTEE (MI_MEDIUM_OBJ_SIZE_MAX) - -// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated huge page segments -#define MI_BLOCK_ALIGNMENT_MAX (MI_SEGMENT_SIZE >> 1) +// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated orphan pages +#define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_BLOCK_ALIGN) // We never allocate more than PTRDIFF_MAX (see also ) -#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX +#define MI_MAX_ALLOC_SIZE PTRDIFF_MAX + + +// --------------------------------------------------------------- +// a memory id tracks the provenance of arena/OS allocated memory +// --------------------------------------------------------------- + +// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. +typedef enum mi_memkind_e { + MI_MEM_NONE, // not allocated + MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) + MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) + MI_MEM_OS, // allocated from the OS + MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) + MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) + MI_MEM_ARENA // allocated from an arena (the usual case) +} mi_memkind_t; + +static inline bool mi_memkind_is_os(mi_memkind_t memkind) { + return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); +} + +typedef struct mi_memid_os_info { + void* base; // actual base address of the block (used for offset aligned allocations) + size_t alignment; // alignment at allocation +} mi_memid_os_info_t; + +typedef struct mi_memid_arena_info { + size_t block_index; // index in the arena + mi_arena_id_t id; // arena id (>= 1) + bool is_exclusive; // this arena can only be used for specific arena allocations +} mi_memid_arena_info_t; + +typedef struct mi_memid_s { + union { + mi_memid_os_info_t os; // only used for MI_MEM_OS + mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA + } mem; + bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) + bool initially_committed;// `true` if the memory was originally allocated as committed + bool initially_zero; // `true` if the memory was originally zero initialized + mi_memkind_t memkind; +} mi_memid_t; + // ------------------------------------------------------ // Mimalloc pages contain allocated blocks @@ -223,6 +248,10 @@ typedef union mi_page_flags_s { // We use the bottom 2 bits of the pointer for mi_delayed_t flags typedef uintptr_t mi_thread_free_t; +// Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) +typedef struct mi_subproc_s mi_subproc_t; + + // A page contains blocks of one specific size (`block_size`). 
// Each page has three list of free blocks: // `free` for blocks that can be allocated, @@ -242,8 +271,6 @@ typedef uintptr_t mi_thread_free_t; // Notes: // - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - The size is 10 words on 64-bit which helps the page index calculations -// (and 12 words on 32-bit, and encoded free lists add 2 words) // - `xthread_free` uses the bottom bits as a delayed-free flags to optimize // concurrent frees where only the first concurrent free adds to the owning // heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). @@ -252,15 +279,8 @@ typedef uintptr_t mi_thread_free_t; // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { - // "owned" by the segment - uint8_t segment_idx; // index in the segment `pages` array, `page == &segment->pages[page->segment_idx]` - uint8_t segment_in_use:1; // `true` if the segment allocated this page - uint8_t is_committed:1; // `true` if the page virtual memory is committed - uint8_t is_zero_init:1; // `true` if the page was initially zero initialized - uint8_t is_huge:1; // `true` if the page is in a huge segment - - // layout like this to optimize access in `mi_malloc` and `mi_free` - uint16_t capacity; // number of blocks committed, must be the first field, see `segment.c:page_clear` + mi_memid_t memid; // provenance of the page memory + uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) uint16_t reserved; // number of blocks reserved in memory mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized @@ -272,120 +292,54 @@ typedef struct mi_page_s { uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding - size_t block_size; // size available in each block (always `>0`) - uint8_t* page_start; // start of the page area containing the blocks + size_t block_size; // size available in each block (always `>0`) #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - _Atomic(uintptr_t) xheap; + _Atomic(uintptr_t) xheap; // heap this threads belong to. + _Atomic(mi_threadid_t)xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - - #if MI_INTPTR_SIZE==4 // pad to 12 words on 32-bit - void* padding[1]; - #endif } mi_page_t; +// ------------------------------------------------------ +// Object sizes +// ------------------------------------------------------ + +#define MI_PAGE_ALIGN (64) +#define MI_PAGE_INFO_SIZE (MI_SIZE_SHIFT*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) + +// The max object size are checked to not waste more than 12.5% internally over the page sizes. 
+// (Except for large pages since huge objects are allocated in 4MiB chunks) +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~16KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // ~2MiB +#define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) + + +#if (MI_LARGE_MAX_OBJ_WSIZE >= 655360) +#error "mimalloc internal: define more bins" +#endif + // ------------------------------------------------------ -// Mimalloc segments contain mimalloc pages +// Page kinds // ------------------------------------------------------ typedef enum mi_page_kind_e { - MI_PAGE_SMALL, // small blocks go into 64KiB pages inside a segment - MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages inside a segment - MI_PAGE_LARGE, // larger blocks go into a single page spanning a whole segment - MI_PAGE_HUGE // a huge page is a single page in a segment of variable size (but still 2MiB aligned) + MI_PAGE_SMALL, // small blocks go into 64KiB pages + MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages + MI_PAGE_LARGE, // larger blocks go into 4MiB pages + MI_PAGE_SINGLETON // page containing a single block. // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. } mi_page_kind_t; -// --------------------------------------------------------------- -// a memory id tracks the provenance of arena/OS allocated memory -// --------------------------------------------------------------- - -// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. -typedef enum mi_memkind_e { - MI_MEM_NONE, // not allocated - MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) - MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) - MI_MEM_OS, // allocated from the OS - MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) - MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) - MI_MEM_ARENA // allocated from an arena (the usual case) -} mi_memkind_t; - -static inline bool mi_memkind_is_os(mi_memkind_t memkind) { - return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); -} - -typedef struct mi_memid_os_info { - void* base; // actual base address of the block (used for offset aligned allocations) - size_t alignment; // alignment at allocation -} mi_memid_os_info_t; - -typedef struct mi_memid_arena_info { - size_t block_index; // index in the arena - mi_arena_id_t id; // arena id (>= 1) - bool is_exclusive; // this arena can only be used for specific arena allocations -} mi_memid_arena_info_t; - -typedef struct mi_memid_s { - union { - mi_memid_os_info_t os; // only used for MI_MEM_OS - mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA - } mem; - bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. 
when allocated using large (2Mib) or huge (1GiB) OS pages) - bool initially_committed;// `true` if the memory was originally allocated as committed - bool initially_zero; // `true` if the memory was originally zero initialized - mi_memkind_t memkind; -} mi_memid_t; - - -// --------------------------------------------------------------- -// Segments contain mimalloc pages -// --------------------------------------------------------------- -typedef struct mi_subproc_s mi_subproc_t; - -// Segments are large allocated memory blocks (2MiB on 64 bit) from the OS. -// Inside segments we allocated fixed size _pages_ that contain blocks. -typedef struct mi_segment_s { - // constant fields - mi_memid_t memid; // memory id to track provenance - bool allow_decommit; - bool allow_purge; - size_t segment_size; // for huge pages this may be different from `MI_SEGMENT_SIZE` - mi_subproc_t* subproc; // segment belongs to sub process - - // segment fields - struct mi_segment_s* next; // must be the first (non-constant) segment field -- see `segment.c:segment_init` - struct mi_segment_s* prev; - bool was_reclaimed; // true if it was reclaimed (used to limit reclaim-on-free reclamation) - bool dont_free; // can be temporarily true to ensure the segment is not freed - - size_t abandoned; // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`) - size_t abandoned_visits; // count how often this segment is visited for reclaiming (to force reclaim if it is too long) - - size_t used; // count of pages in use (`used <= capacity`) - size_t capacity; // count of available pages (`#free + used`) - size_t segment_info_size;// space we are using from the first page for segment meta-data and possible guard pages. - uintptr_t cookie; // verify addresses in secure mode: `_mi_ptr_cookie(segment) == segment->cookie` - - struct mi_segment_s* abandoned_os_next; // only used for abandoned segments outside arena's, and only if `mi_option_visit_abandoned` is enabled - struct mi_segment_s* abandoned_os_prev; - - // layout like this to optimize access in `mi_free` - _Atomic(mi_threadid_t) thread_id; // unique id of the thread owning this segment - size_t page_shift; // `1 << page_shift` == the page sizes == `page->block_size * page->reserved` (unless the first page, then `-segment_info_size`). 
- mi_page_kind_t page_kind; // kind of pages: small, medium, large, or huge - mi_page_t pages[1]; // up to `MI_SMALL_PAGES_PER_SEGMENT` pages -} mi_segment_t; - // ------------------------------------------------------ // Heaps @@ -522,21 +476,18 @@ typedef struct mi_stat_counter_s { } mi_stat_counter_t; typedef struct mi_stats_s { - mi_stat_count_t segments; mi_stat_count_t pages; mi_stat_count_t reserved; mi_stat_count_t committed; mi_stat_count_t reset; mi_stat_count_t purged; mi_stat_count_t page_committed; - mi_stat_count_t segments_abandoned; mi_stat_count_t pages_abandoned; mi_stat_count_t threads; mi_stat_count_t normal; mi_stat_count_t huge; mi_stat_count_t giant; mi_stat_count_t malloc; - mi_stat_count_t segments_cache; mi_stat_counter_t pages_extended; mi_stat_counter_t mmap_calls; mi_stat_counter_t commit_calls; @@ -581,12 +532,12 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); // ------------------------------------------------------ struct mi_subproc_s { - _Atomic(size_t) abandoned_count; // count of abandoned segments for this sub-process - _Atomic(size_t) abandoned_os_list_count; // count of abandoned segments in the os-list - mi_lock_t abandoned_os_lock; // lock for the abandoned os segment list (outside of arena's) (this lock protect list operations) + _Atomic(size_t) abandoned_count; // count of abandoned pages for this sub-process + _Atomic(size_t) abandoned_os_list_count; // count of abandoned pages in the os-list + mi_lock_t abandoned_os_lock; // lock for the abandoned os pages list (outside of arena's) (this lock protect list operations) mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list - mi_segment_t* abandoned_os_list; // doubly-linked list of abandoned segments outside of arena's (in OS allocated memory) - mi_segment_t* abandoned_os_list_tail; // the tail-end of the list + mi_page_t* abandoned_os_list; // doubly-linked list of abandoned pages outside of arena's (in OS allocated memory) + mi_page_t* abandoned_os_list_tail; // the tail-end of the list mi_memid_t memid; // provenance of this memory block }; @@ -597,11 +548,6 @@ struct mi_subproc_s { // Milliseconds as in `int64_t` to avoid overflows typedef int64_t mi_msecs_t; -// Queue of segments -typedef struct mi_segment_queue_s { - mi_segment_t* first; - mi_segment_t* last; -} mi_segment_queue_t; // OS thread local data typedef struct mi_os_tld_s { @@ -609,28 +555,13 @@ typedef struct mi_os_tld_s { mi_stats_t* stats; // points to tld stats } mi_os_tld_t; -// Segments thread local data -typedef struct mi_segments_tld_s { - mi_segment_queue_t small_free; // queue of segments with free small pages - mi_segment_queue_t medium_free; // queue of segments with free medium pages - mi_page_queue_t pages_purge; // queue of freed pages that are delay purged - size_t count; // current number of segments; - size_t peak_count; // peak number of segments - size_t current_size; // current size of all segments - size_t peak_size; // peak size of all segments - size_t reclaim_count;// number of reclaimed (abandoned) segments - mi_subproc_t* subproc; // sub-process this thread belongs to. - mi_stats_t* stats; // points to tld stats - mi_os_tld_t* os; // points to os tld -} mi_segments_tld_t; - // Thread local data struct mi_tld_s { unsigned long long heartbeat; // monotonic heartbeat count bool recurse; // true if deferred was called; used to prevent infinite recursion. 
mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - mi_segments_tld_t segments; // segment tld + mi_subproc_t* subproc; // sub-process this thread belongs to. mi_os_tld_t os; // os tld mi_stats_t stats; // statistics }; diff --git a/src/alloc.c b/src/alloc.c index a093f108..00f6d1a4 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -82,7 +82,7 @@ extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_ #if (MI_STAT>0) const size_t bsize = mi_page_usable_block_size(page); - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_increase(heap, normal, bsize); mi_heap_stat_counter_increase(heap, normal_count, 1); #if (MI_STAT>1) diff --git a/src/xarena.c b/src/arena-old.c similarity index 53% rename from src/xarena.c rename to src/arena-old.c index 42943f84..8ca5aaf3 100644 --- a/src/xarena.c +++ b/src/arena-old.c @@ -21,834 +21,46 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo #include "mimalloc.h" #include "mimalloc/internal.h" -#include "xbitmap.h" +#include "mimalloc/atomic.h" +#include "bitmap.h" /* ----------------------------------------------------------- Arena allocation ----------------------------------------------------------- */ -#define MI_ARENA_BLOCK_SIZE (MI_SMALL_PAGE_SIZE) // 64KiB -#define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) // 64KiB -#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) - -#define MI_ARENA_MIN_OBJ_SIZE MI_ARENA_BLOCK_SIZE -#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_CHUNK_BITS * MI_ARENA_BLOCK_SIZE) // for now, cannot cross chunk boundaries - // A memory arena descriptor typedef struct mi_arena_s { mi_arena_id_t id; // arena id; 0 for non-specific mi_memid_t memid; // memid of the memory area - // _Atomic(uint8_t*) start; // the start of the memory area - // size_t meta_size; // size of the arena structure itself (including its bitmaps) - // mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) + _Atomic(uint8_t*)start; // the start of the memory area size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) + size_t meta_size; // size of the arena structure itself (including its bitmaps) + mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) int numa_node; // associated NUMA node bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited - _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - - mi_bitmap_t blocks_free; // is the block free? - mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) - mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free) - mi_bitmap_t blocks_dirty; // is the block potentially non-zero? 
- mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) - // the full queue contains abandoned full pages + _Atomic(size_t)search_idx; // optimization to start the search for free blocks + _Atomic(mi_msecs_t)purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? + mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) + mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) + mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) + mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) + // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. } mi_arena_t; -#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) + +#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB +#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - -/* ----------------------------------------------------------- - Arena id's - id = arena_index + 1 ------------------------------------------------------------ */ - -size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? 
MI_MAX_ARENAS : id - 1); -} - -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; -} - -mi_arena_id_t _mi_arena_id_none(void) { - return 0; -} - -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); -} - -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { - if (memid.memkind == MI_MEM_ARENA) { - return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); - } - else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); - } -} - -size_t mi_arena_get_count(void) { - return mi_atomic_load_relaxed(&mi_arena_count); -} - -mi_arena_t* mi_arena_from_index(size_t idx) { - mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); -} - - - -/* ----------------------------------------------------------- - Util ------------------------------------------------------------ */ - -// Blocks needed for a given byte size -static size_t mi_block_count_of_size(size_t size) { - return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); -} - -// Byte size of a number of blocks -static size_t mi_size_of_blocks(size_t bcount) { - return (bcount * MI_ARENA_BLOCK_SIZE); -} - -// Size of an arena -static size_t mi_arena_size(mi_arena_t* arena) { - return mi_size_of_blocks(arena->block_count); -} - -static size_t mi_arena_info_blocks(void) { - const size_t os_page_size = _mi_os_page_size(); - const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page - const size_t info_blocks = mi_block_count_of_size(info_size); - return info_blocks; -} - - -// Start of the arena memory area -static uint8_t* mi_arena_start(mi_arena_t* arena) { - return ((uint8_t*)arena); -} - -// Start of a block -void* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { - return (mi_arena_start(arena) + mi_size_of_blocks(block_index)); -} - -// Arena area -void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { - if (size != NULL) *size = 0; - const size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); - if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } - return mi_arena_start(arena); -} - - -// Create an arena memid -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) { - mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); - memid.mem.arena.id = id; - memid.mem.arena.block_index = block_index; - memid.mem.arena.is_exclusive = is_exclusive; - return memid; -} - -// returns if the arena is exclusive -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { - mi_assert_internal(memid.memkind == MI_MEM_ARENA); - *arena_index = mi_arena_id_index(memid.mem.arena.id); - *block_index = memid.mem.arena.block_index; - return memid.mem.arena.is_exclusive; -} - - - -/* ----------------------------------------------------------- - Arena Allocation ------------------------------------------------------------ */ - -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, size_t 
tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - - size_t block_index; - if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL; - - // claimed it! - void* p = mi_arena_block_start(arena, block_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index); - memid->is_pinned = arena->memid.is_pinned; - - // set the dirty bits - if (arena->memid.initially_zero) { - memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL); - } - - // set commit state - if (commit) { - // commit requested, but the range may not be committed as a whole: ensure it is committed now - memid->initially_committed = true; - - bool all_already_committed; - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); - if (!all_already_committed) { - bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { - memid->initially_committed = false; - } - else { - if (commit_zero) { memid->initially_zero = true; } - } - } - } - else { - // no need to commit, but check if already fully committed - memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); - } - - return p; -} - -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, - size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_size_of_blocks(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_arena_from_index(arena_index); - if (arena == NULL) return NULL; - if (!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } - - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} - - -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, 
memid, tld); - if (p != NULL) return p; - } - } - else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - } - return NULL; -} - -// try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) -{ - if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); - if (arena_count > (MI_MAX_ARENAS - 4)) return false; - - // calc reserve - size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); - if (arena_reserve == 0) return false; - - if (!_mi_os_has_virtual_reserve()) { - arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) - } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); - - if (arena_count >= 8 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); - size_t reserve = 0; - if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { - arena_reserve = reserve; - } - } - - // check arena bounds - const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1); - const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE; - if (arena_reserve < min_reserve) { - arena_reserve = min_reserve; - } - else if (arena_reserve > max_reserve) { - arena_reserve = max_reserve; - } - - if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size - - // commit eagerly? - bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } - else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); -} - - -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - size_t tseq = _mi_thread_seq_id(); - *memid = _mi_memid_none(); - - const int numa_node = _mi_os_numa_node(tld); // current numa node - - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
- if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - } - } - - // if we cannot use OS allocation, return NULL - if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { - errno = ENOMEM; - return NULL; - } - - // finally, fall back to the OS - if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); - } - else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); - } -} - -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); -} - - -/* ----------------------------------------------------------- - Arena free ------------------------------------------------------------ */ -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats); -static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); - -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); - mi_assert_internal(committed_size <= size); - if (p==NULL) return; - if (size==0) return; - const bool all_committed = (committed_size == size); - - // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
- mi_track_mem_undefined(p, size); - - if (mi_memkind_is_os(memid.memkind)) { - // was a direct OS allocation, pass through - if (!all_committed && committed_size > 0) { - // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - _mi_os_free(p, size, memid, stats); - } - else if (memid.memkind == MI_MEM_ARENA) { - // allocated in an arena - size_t arena_idx; - size_t block_idx; - mi_arena_memid_indices(memid, &arena_idx, &block_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const size_t blocks = mi_block_count_of_size(size); - - // checks - if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - mi_assert_internal(block_idx < arena->block_count); - mi_assert_internal(block_idx > mi_arena_info_blocks()); - if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) { - _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - - // potentially decommit - if (arena->memid.is_pinned || arena->memid.initially_committed) { - mi_assert_internal(all_committed); - } - else { - if (!all_committed) { - // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); - mi_track_mem_noaccess(p, size); - if (committed_size > 0) { - // if partially committed, adjust the committed stats (is it will be recommitted when re-using) - // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - // note: if not all committed, it may be that the purge will reset/decommit the entire range - // that contains already decommitted parts. Since purge consistently uses reset or decommit that - // works (as we should never reset decommitted parts). - } - // (delay) purge the entire range - mi_arena_schedule_purge(arena, block_idx, blocks, stats); - } - - // and make it available to others again - bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL); - if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); - return; - }; - } - else { - // arena was none, external, or static; nothing to do - mi_assert_internal(memid.memkind < MI_MEM_OS); - } - - // purge expired decommits - mi_arenas_try_purge(false, false, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. 
-static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - size_t new_max_arena = 0; - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - mi_lock_done(&arena->abandoned_visit_lock); - if (mi_memkind_is_os(arena->memid.memkind)) { - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main); - } - } - } - - // try to lower the max arena. - size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); -} - -// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { - mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas -} - -// Is a pointer inside any of our arenas? -bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { - return true; - } - } - return false; -} - - -/* ----------------------------------------------------------- - Add an arena. ------------------------------------------------------------ */ - -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { - mi_assert_internal(arena != NULL); - mi_assert_internal(arena->block_count > 0); - if (arena_id != NULL) { *arena_id = -1; } - - size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); - if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); - return false; - } - _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } - return true; -} - -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept -{ - mi_assert(!is_large || memid.initially_committed && memid.is_pinned); - mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)); - mi_assert(start!=NULL); - if (start==NULL) return false; - if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) { - // todo: use alignment in memid to align to blocksize first? 
- _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start); - return false; - } - - if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } - - const size_t info_blocks = mi_arena_info_blocks(); - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down - if (bcount < info_blocks+1) { - _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB); - return false; - } - if (bcount > MI_BITMAP_MAX_BITS) { - // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) - _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB); - return false; - } - mi_arena_t* arena = (mi_arena_t*)start; - - // commit & zero if needed - bool is_zero = memid.initially_zero; - if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main); - } - if (!is_zero) { - _mi_memzero(arena, mi_size_of_blocks(info_blocks)); - } - - // init - arena->id = _mi_arena_id_none(); - arena->memid = memid; - arena->exclusive = exclusive; - arena->block_count = bcount; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) - arena->is_large = is_large; - arena->purge_expire = 0; - mi_lock_init(&arena->abandoned_visit_lock); - - // init bitmaps - mi_bitmap_init(&arena->blocks_free,true); - mi_bitmap_init(&arena->blocks_committed,true); - mi_bitmap_init(&arena->blocks_dirty,true); - mi_bitmap_init(&arena->blocks_purge,true); - for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { - mi_bitmap_init(&arena->blocks_abandoned[i],true); - } - - // reserve our meta info (and reserve blocks outside the memory area) - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks); - if (memid.initially_committed) { - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count); - } - else { - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL); - } - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL); - - return mi_arena_add(arena, arena_id, &_mi_stats_main); -} - - -bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); - memid.initially_committed = is_committed; - memid.initially_zero = is_zero; - memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block - mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); - if (start == NULL) return ENOMEM; - const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
- if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); - _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); - return ENOMEM; - } - _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); - return 0; -} - - -// Manage a range of regular OS memory -bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { - return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { - return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); -} - - -/* ----------------------------------------------------------- - Debugging ------------------------------------------------------------ */ -static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { - size_t bit_set_count = 0; - for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { - bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); - if (is_set) bit_set_count++; - buf[bit] = (is_set ? 'x' : '.'); - } - return bit_set_count; -} - -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) { - _mi_verbose_message("%s%s:\n", prefix, header); - size_t bit_count = 0; - size_t bit_set_count = 0; - for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { - char buf[MI_BITMAP_CHUNK_BITS + 1]; - mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; - for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { - if (bit_count < block_count) { - bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS); - } - else { - _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS); - } - bit_count += MI_BFIELD_BITS; - } - buf[MI_BITMAP_CHUNK_BITS] = 0; - _mi_verbose_message("%s %s\n", prefix, buf); - } - _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); - return bit_set_count; -} - -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { - MI_UNUSED(show_abandoned); - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); - size_t free_total = 0; - size_t block_total = 0; - //size_t abandoned_total = 0; - size_t purge_total = 0; - for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena == NULL) break; - block_total += arena->block_count; - _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? 
", pinned" : "")); - if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); - } - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed); - // todo: abandoned blocks - if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge); - } - } - if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total); - // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); -} - - -/* ----------------------------------------------------------- - Reserve a huge page arena. ------------------------------------------------------------ */ -// reserve at a specific numa node -int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; - if (pages==0) return 0; - if (numa_node < -1) numa_node = -1; - if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); - size_t hsize = 0; - size_t pages_reserved = 0; - mi_memid_t memid; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); - if (p==NULL || pages_reserved==0) { - _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); - return ENOMEM; - } - _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); - return ENOMEM; - } - return 0; -} - -int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { - return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); -} - -// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) -int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { - if (pages == 0) return 0; - - // pages per numa node - size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); - if (numa_count <= 0) numa_count = 1; - const size_t pages_per = pages / numa_count; - const size_t pages_mod = pages % numa_count; - const size_t timeout_per = (timeout_msecs==0 ? 
0 : (timeout_msecs / numa_count) + 50); - - // reserve evenly among numa nodes - for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { - size_t node_pages = pages_per; // can be 0 - if (numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); - if (err) return err; - if (pages < node_pages) { - pages = 0; - } - else { - pages -= node_pages; - } - } - - return 0; -} - -int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { - MI_UNUSED(max_secs); - _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); - if (pages_reserved != NULL) *pages_reserved = 0; - int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); - if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; - return err; -} - - - -/* ----------------------------------------------------------- - Arena purge ------------------------------------------------------------ */ - -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} - -// reset or decommit in an arena and update the committed/decommit bitmaps -// assumes we own the area (i.e. blocks_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_blocks(blocks); - void* const p = mi_arena_block_start(arena, block_idx); - bool needs_recommit; - if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) { - // all blocks are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); - } - else { - // some blocks are not committed -- this can happen when a partially committed block is freed - // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), - // and also undo the decommit stats (as it was already adjusted) - mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } - } - - // clear the purged blocks - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, blocks, block_idx, NULL); - - // update committed bitmap - if (needs_recommit) { - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); - } -} - - -// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. -// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { - const long delay = mi_arena_purge_delay(); - if (delay < 0) return; // is purging allowed at all? 
- - if (_mi_preloading() || delay == 0) { - // decommit directly - mi_arena_purge(arena, block_idx, blocks, stats); - } - else { - // schedule decommit - _mi_error_message(EFAULT, "purging not yet implemented\n"); - } -} - - -static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { - if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - - const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); - if (max_arena == 0) return; - - _mi_error_message(EFAULT, "purging not yet implemented\n"); - MI_UNUSED(stats); - MI_UNUSED(visit_all); - MI_UNUSED(force); -} - - -#if 0 - #define MI_IN_ARENA_C #include "arena-abandon.c" #undef MI_IN_ARENA_C @@ -904,12 +116,12 @@ static size_t mi_block_count_of_size(size_t size) { return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); } -static size_t mi_size_of_blocks(size_t bcount) { +static size_t mi_arena_block_size(size_t bcount) { return (bcount * MI_ARENA_BLOCK_SIZE); } static size_t mi_arena_size(mi_arena_t* arena) { - return mi_size_of_blocks(arena->block_count); + return mi_arena_block_size(arena->block_count); } static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { @@ -995,7 +207,7 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { } void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { - return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex))); + return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); } @@ -1004,7 +216,7 @@ void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { ----------------------------------------------------------- */ // claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats) +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { @@ -1056,7 +268,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); if (any_uncommitted) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) { memid->initially_committed = false; } else { @@ -1081,7 +293,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no const size_t bcount = mi_block_count_of_size(size); const size_t arena_index = mi_arena_id_index(arena_id); mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_size_of_blocks(bcount)); + mi_assert_internal(size <= mi_arena_block_size(bcount)); // Check arena suitability mi_arena_t* arena = mi_arena_from_index(arena_index); @@ -1227,7 +439,7 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { if (arena_index >= MI_MAX_ARENAS) return NULL; mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + if (size != NULL) { *size = 
mi_arena_block_size(arena->block_count); } return arena->start; } @@ -1247,7 +459,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_assert_internal(arena->blocks_committed != NULL); mi_assert_internal(arena->blocks_purge != NULL); mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_blocks(blocks); + const size_t size = mi_arena_block_size(blocks); void* const p = mi_arena_block_start(arena, bitmap_idx); bool needs_recommit; if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { @@ -1299,25 +511,25 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t // purge a range of blocks // return true if the full range was purged. // assumes we own the area (i.e. blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) { - const size_t endidx = startseqx + bitlen; - size_t bitseqx = startseqx; +static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) { + const size_t endidx = startidx + bitlen; + size_t bitidx = startidx; bool all_purged = false; - while (bitseqx < endidx) { + while (bitidx < endidx) { // count consecutive ones in the purge mask size_t count = 0; - while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) { + while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { count++; } if (count > 0) { // found range to be purged - const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx); + const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); mi_arena_purge(arena, range_idx, count, stats); if (count == bitlen) { all_purged = true; } } - bitseqx += (count+1); // +1 to skip the zero bit (or end) + bitidx += (count+1); // +1 to skip the zero bit (or end) } return all_purged; } @@ -1339,16 +551,16 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi for (size_t i = 0; i < arena->field_count; i++) { size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); if (purge != 0) { - size_t bitseqx = 0; - while (bitseqx < MI_BITMAP_FIELD_BITS) { + size_t bitidx = 0; + while (bitidx < MI_BITMAP_FIELD_BITS) { // find consecutive range of ones in the purge mask size_t bitlen = 0; - while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) { + while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { bitlen++; } // temporarily claim the purge range as "in-use" to be thread-safe with allocation // try to claim the longest range of corresponding in_use bits - const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx); + const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); while( bitlen > 0 ) { if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { break; @@ -1359,15 +571,15 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (bitlen > 0) { // read purge again now that we have the in_use bits purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) { + if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) { full_purge = false; } any_purged = true; // release the claimed `in_use` bits again 
_mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); } - bitseqx += (bitlen+1); // +1 to skip the zero (or end) - } // while bitseqx + bitidx += (bitlen+1); // +1 to skip the zero (or end) + } // while bitidx } // purge != 0 } // if not fully purged, make sure to purge again in the future @@ -1530,7 +742,7 @@ bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { return true; } } @@ -1606,8 +818,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_assert_internal(post >= 0); if (post > 0) { // don't use leftover bits at the end - mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL); + mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); + _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); } return mi_arena_add(arena, arena_id, &_mi_stats_main); @@ -1774,4 +986,3 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv } -#endif \ No newline at end of file diff --git a/src/arena.c b/src/arena.c index 8ca5aaf3..28ad61f1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -21,7 +21,6 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "bitmap.h" @@ -29,38 +28,823 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo Arena allocation ----------------------------------------------------------- */ +#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) + + // A memory arena descriptor typedef struct mi_arena_s { - mi_arena_id_t id; // arena id; 0 for non-specific mi_memid_t memid; // memid of the memory area - _Atomic(uint8_t*)start; // the start of the memory area + mi_arena_id_t id; // arena id; 0 for non-specific + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) - size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) - size_t meta_size; // size of the arena structure itself (including its bitmaps) - mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) int numa_node; // associated NUMA node bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited - _Atomic(size_t)search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t)purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? - mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. 
(can be NULL for memory that cannot be (reset) decommitted) - mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) - mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) - // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + + mi_bitmap_t blocks_free; // is the block free? + mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) + mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free) + mi_bitmap_t blocks_dirty; // is the block potentially non-zero? + mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + // the full queue contains abandoned full pages } mi_arena_t; - -#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) +#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 + +/* ----------------------------------------------------------- + Arena id's + id = arena_index + 1 +----------------------------------------------------------- */ + +size_t mi_arena_id_index(mi_arena_id_t id) { + return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); +} + +static mi_arena_id_t mi_arena_id_create(size_t arena_index) { + mi_assert_internal(arena_index < MI_MAX_ARENAS); + return (int)arena_index + 1; +} + +mi_arena_id_t _mi_arena_id_none(void) { + return 0; +} + +static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { + return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || + (arena_id == req_arena_id)); +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + if (memid.memkind == MI_MEM_ARENA) { + return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + } + else { + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + } +} + +size_t mi_arena_get_count(void) { + return mi_atomic_load_relaxed(&mi_arena_count); +} + +mi_arena_t* mi_arena_from_index(size_t idx) { + mi_assert_internal(idx < mi_arena_get_count()); + return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); +} + + + +/* ----------------------------------------------------------- + Util +----------------------------------------------------------- */ + + +// Size of an arena +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_size_of_blocks(arena->block_count); +} + +static size_t mi_arena_info_blocks(void) { + const size_t os_page_size = _mi_os_page_size(); + const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page + const size_t info_blocks = mi_block_count_of_size(info_size); + return info_blocks; +} + + +// Start of the arena memory area +static uint8_t* mi_arena_start(mi_arena_t* arena) { + return ((uint8_t*)arena); +} + +// Start of a block +void* mi_arena_block_start(mi_arena_t* arena, size_t 
block_index) {
+  return (mi_arena_start(arena) + mi_size_of_blocks(block_index));
+}
+
+// Arena area
+void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) {
+  if (size != NULL) *size = 0;
+  const size_t arena_index = mi_arena_id_index(arena_id);
+  if (arena_index >= MI_MAX_ARENAS) return NULL;
+  mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]);
+  if (arena == NULL) return NULL;
+  if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); }
+  return mi_arena_start(arena);
+}
+
+
+// Create an arena memid
+static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) {
+  mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA);
+  memid.mem.arena.id = id;
+  memid.mem.arena.block_index = block_index;
+  memid.mem.arena.is_exclusive = is_exclusive;
+  return memid;
+}
+
+// returns whether the arena is exclusive
+bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) {
+  mi_assert_internal(memid.memkind == MI_MEM_ARENA);
+  *arena_index = mi_arena_id_index(memid.mem.arena.id);
+  *block_index = memid.mem.arena.block_index;
+  return memid.mem.arena.is_exclusive;
+}
+
+
+
+/* -----------------------------------------------------------
+  Arena Allocation
+----------------------------------------------------------- */
+
+static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount,
+                                                    bool commit, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+  MI_UNUSED(arena_index);
+  mi_assert_internal(mi_arena_id_index(arena->id) == arena_index);
+
+  size_t block_index;
+  if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL;
+
+  // claimed it!
+  void* p = mi_arena_block_start(arena, block_index);
+  *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index);
+  memid->is_pinned = arena->memid.is_pinned;
+
+  // set the dirty bits
+  if (arena->memid.initially_zero) {
+    memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL);
+  }
+
+  // set commit state
+  if (commit) {
+    // commit requested, but the range may not be committed as a whole: ensure it is committed now
+    memid->initially_committed = true;
+
+    bool all_already_committed;
+    mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed);
+    if (!all_already_committed) {
+      bool commit_zero = false;
+      if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) {
+        memid->initially_committed = false;
+      }
+      else {
+        if (commit_zero) { memid->initially_zero = true; }
+      }
+    }
+  }
+  else {
+    // no need to commit, but check if already fully committed
+    memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount);
+  }
+
+  return p;
+}
+
+// allocate in a specific arena
+static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node,
+                                      size_t size, size_t alignment,
+                                      bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN);
+  if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL;
+
+  const size_t bcount = mi_block_count_of_size(size);
+  const size_t arena_index = mi_arena_id_index(arena_id);
+  mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count));
+  mi_assert_internal(size <= mi_size_of_blocks(bcount));
+
+  // Check arena 
suitability
+  mi_arena_t* arena = mi_arena_from_index(arena_index);
+  if (arena == NULL) return NULL;
+  if (!allow_large && arena->is_large) return NULL;
+  if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL;
+  if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity
+    const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node);
+    if (match_numa_node) { if (!numa_suitable) return NULL; }
+    else { if (numa_suitable) return NULL; }
+  }
+
+  // try to allocate
+  void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld);
+  mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment));
+  return p;
+}
+
+
+// try to allocate from any of the arenas (the caller falls back to the OS if this fails)
+static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment,
+                                                 bool commit, bool allow_large,
+                                                 mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN);
+  if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL;
+
+  const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count);
+  if mi_likely(max_arena == 0) return NULL;
+
+  if (req_arena_id != _mi_arena_id_none()) {
+    // try a specific arena if requested
+    if (mi_arena_id_index(req_arena_id) < max_arena) {
+      void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+      if (p != NULL) return p;
+    }
+  }
+  else {
+    // try numa affine allocation
+    for (size_t i = 0; i < max_arena; i++) {
+      void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+      if (p != NULL) return p;
+    }
+
+    // try from another numa node instead..
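+    // (second pass: this only retries the arenas that were skipped above because they are
+    //  on a different numa node, so every arena is attempted at most once in total)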
+    if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arenas have been tried already
+      for (size_t i = 0; i < max_arena; i++) {
+        void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld);
+        if (p != NULL) return p;
+      }
+    }
+  }
+  return NULL;
+}
+
+// try to reserve a fresh arena space
+static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id)
+{
+  if (_mi_preloading()) return false; // use the OS only while preloading
+  if (req_arena_id != _mi_arena_id_none()) return false;
+
+  const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count);
+  if (arena_count > (MI_MAX_ARENAS - 4)) return false;
+
+  // calc reserve
+  size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve);
+  if (arena_reserve == 0) return false;
+
+  if (!_mi_os_has_virtual_reserve()) {
+    arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example)
+  }
+  arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE);
+
+  if (arena_count >= 8 && arena_count <= 128) {
+    // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB)
+    const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16);
+    size_t reserve = 0;
+    if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) {
+      arena_reserve = reserve;
+    }
+  }
+
+  // check arena bounds
+  const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1);
+  const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE;
+  if (arena_reserve < min_reserve) {
+    arena_reserve = min_reserve;
+  }
+  else if (arena_reserve > max_reserve) {
+    arena_reserve = max_reserve;
+  }
+
+  if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size
+
+  // commit eagerly?
+  bool arena_commit = false;
+  if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); }
+  else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; }
+
+  return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0);
+}
+
+
+void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large,
+                              mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld)
+{
+  mi_assert_internal(memid != NULL && tld != NULL);
+  mi_assert_internal(size > 0);
+  size_t tseq = _mi_thread_seq_id();
+  *memid = _mi_memid_none();
+
+  const int numa_node = _mi_os_numa_node(tld); // current numa node
+
+  // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) or too large
+  if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed?
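+    // order of attempts: existing arenas first, then (if no specific arena was requested)
+    // reserve a fresh arena and retry, and only then fall back to the OS further below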
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + } + } + } + + // if we cannot use OS allocation, return NULL + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { + errno = ENOMEM; + return NULL; + } + + // finally, fall back to the OS + if (align_offset > 0) { + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + } + else { + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + } +} + +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); +} + + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats); +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); + +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(committed_size <= size); + if (p==NULL) return; + if (size==0) return; + const bool all_committed = (committed_size == size); + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
+  mi_track_mem_undefined(p, size);
+
+  if (mi_memkind_is_os(memid.memkind)) {
+    // was a direct OS allocation, pass through
+    if (!all_committed && committed_size > 0) {
+      // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size)
+      _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
+    }
+    _mi_os_free(p, size, memid, stats);
+  }
+  else if (memid.memkind == MI_MEM_ARENA) {
+    // allocated in an arena
+    size_t arena_idx;
+    size_t block_idx;
+    mi_arena_memid_indices(memid, &arena_idx, &block_idx);
+    mi_assert_internal(arena_idx < MI_MAX_ARENAS);
+    mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]);
+    mi_assert_internal(arena != NULL);
+    const size_t blocks = mi_block_count_of_size(size);
+
+    // checks
+    if (arena == NULL) {
+      _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+    mi_assert_internal(block_idx < arena->block_count);
+    mi_assert_internal(block_idx > mi_arena_info_blocks());
+    if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) {
+      _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid);
+      return;
+    }
+
+    // potentially decommit
+    if (arena->memid.is_pinned || arena->memid.initially_committed) {
+      mi_assert_internal(all_committed);
+    }
+    else {
+      if (!all_committed) {
+        // mark the entire range as no longer committed (so we recommit the full range when re-using)
+        mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, block_idx, blocks, NULL);
+        mi_track_mem_noaccess(p, size);
+        if (committed_size > 0) {
+          // if partially committed, adjust the committed stats (as it will be recommitted when re-using)
+          // the delayed purge should then not count a decommit if the range is no longer marked as committed.
+          _mi_stat_decrease(&_mi_stats_main.committed, committed_size);
+        }
+        // note: if not all committed, it may be that the purge will reset/decommit the entire range
+        // that contains already decommitted parts. Since purge consistently uses reset or decommit that
+        // works (as we should never reset decommitted parts).
+      }
+      // (delay) purge the entire range
+      mi_arena_schedule_purge(arena, block_idx, blocks, stats);
+    }
+
+    // and make it available to others again
+    bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL);
+    if (!all_inuse) {
+      _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size);
+      return;
+    }
+  }
+  else {
+    // arena was none, external, or static; nothing to do
+    mi_assert_internal(memid.memkind < MI_MEM_OS);
+  }
+
+  // purge expired decommits
+  mi_arenas_try_purge(false, false, stats);
+}
+
+// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit`
+// for dynamic libraries that are unloaded and need to release all their allocated memory.
+static void mi_arenas_unsafe_destroy(void) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + size_t new_max_arena = 0; + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); + if (mi_memkind_is_os(arena->memid.memkind)) { + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main); + } + } + } + + // try to lower the max arena. + size_t expected = max_arena; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); +} + +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { + mi_arenas_unsafe_destroy(); + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + return true; + } + } + return false; +} + + +/* ----------------------------------------------------------- + Add an arena. +----------------------------------------------------------- */ + +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { + mi_assert_internal(arena != NULL); + mi_assert_internal(arena->block_count > 0); + if (arena_id != NULL) { *arena_id = -1; } + + size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + if (i >= MI_MAX_ARENAS) { + mi_atomic_decrement_acq_rel(&mi_arena_count); + return false; + } + _mi_stat_counter_increase(&stats->arena_count,1); + arena->id = mi_arena_id_create(i); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena->id; } + return true; +} + +static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +{ + mi_assert(!is_large || memid.initially_committed && memid.is_pinned); + mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)); + mi_assert(start!=NULL); + if (start==NULL) return false; + if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) { + // todo: use alignment in memid to align to blocksize first? 
+ _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start); + return false; + } + + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + + const size_t info_blocks = mi_arena_info_blocks(); + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down + if (bcount < info_blocks+1) { + _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB); + return false; + } + if (bcount > MI_BITMAP_MAX_BITS) { + // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) + _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB); + return false; + } + mi_arena_t* arena = (mi_arena_t*)start; + + // commit & zero if needed + bool is_zero = memid.initially_zero; + if (!memid.initially_committed) { + _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main); + } + if (!is_zero) { + _mi_memzero(arena, mi_size_of_blocks(info_blocks)); + } + + // init + arena->id = _mi_arena_id_none(); + arena->memid = memid; + arena->exclusive = exclusive; + arena->block_count = bcount; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->is_large = is_large; + arena->purge_expire = 0; + mi_lock_init(&arena->abandoned_visit_lock); + + // init bitmaps + mi_bitmap_init(&arena->blocks_free,true); + mi_bitmap_init(&arena->blocks_committed,true); + mi_bitmap_init(&arena->blocks_dirty,true); + mi_bitmap_init(&arena->blocks_purge,true); + for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { + mi_bitmap_init(&arena->blocks_abandoned[i],true); + } + + // reserve our meta info (and reserve blocks outside the memory area) + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks); + if (memid.initially_committed) { + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count); + } + else { + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL); + } + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL); + + return mi_arena_add(arena, arena_id, &_mi_stats_main); +} + + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + mi_memid_t memid; + void* start = _mi_os_alloc_aligned(size, MI_ARENA_BLOCK_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + if (start == NULL) return ENOMEM; + const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
+ if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); + return ENOMEM; + } + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + return 0; +} + + +// Manage a range of regular OS memory +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { + return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { + return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); +} + + +/* ----------------------------------------------------------- + Debugging +----------------------------------------------------------- */ +static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { + size_t bit_set_count = 0; + for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { + bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); + if (is_set) bit_set_count++; + buf[bit] = (is_set ? 'x' : '.'); + } + return bit_set_count; +} + +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bit_count = 0; + size_t bit_set_count = 0; + for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { + char buf[MI_BITMAP_CHUNK_BITS + 1]; + mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; + for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + if (bit_count < block_count) { + bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS); + } + else { + _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS); + } + bit_count += MI_BFIELD_BITS; + } + buf[MI_BITMAP_CHUNK_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); + } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); + return bit_set_count; +} + +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { + MI_UNUSED(show_abandoned); + size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t free_total = 0; + size_t block_total = 0; + //size_t abandoned_total = 0; + size_t purge_total = 0; + for (size_t i = 0; i < max_arenas; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; + block_total += arena->block_count; + _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? 
", pinned" : "")); + if (show_inuse) { + free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); + } + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed); + // todo: abandoned blocks + if (show_purge) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge); + } + } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total); + // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); +} + + +/* ----------------------------------------------------------- + Reserve a huge page arena. +----------------------------------------------------------- */ +// reserve at a specific numa node +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = -1; + if (pages==0) return 0; + if (numa_node < -1) numa_node = -1; + if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); + size_t hsize = 0; + size_t pages_reserved = 0; + mi_memid_t memid; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); + if (p==NULL || pages_reserved==0) { + _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); + return ENOMEM; + } + _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); + + if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + _mi_os_free(p, hsize, memid, &_mi_stats_main); + return ENOMEM; + } + return 0; +} + +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { + return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); +} + +// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { + if (pages == 0) return 0; + + // pages per numa node + size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) numa_count = 1; + const size_t pages_per = pages / numa_count; + const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs==0 ? 
0 : (timeout_msecs / numa_count) + 50);
+
+  // reserve evenly among numa nodes
+  for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
+    size_t node_pages = pages_per;  // can be 0
+    if (numa_node < pages_mod) node_pages++;
+    int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
+    if (err) return err;
+    if (pages < node_pages) {
+      pages = 0;
+    }
+    else {
+      pages -= node_pages;
+    }
+  }
+
+  return 0;
+}
+
+int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept {
+  MI_UNUSED(max_secs);
+  _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n");
+  if (pages_reserved != NULL) *pages_reserved = 0;
+  int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0));
+  if (err==0 && pages_reserved!=NULL) *pages_reserved = pages;
+  return err;
+}
+
+
+
+/* -----------------------------------------------------------
+  Abandoned pages
+----------------------------------------------------------- */
+
+void mi_arena_page_abandon(mi_page_t* page) {
+  mi_assert_internal(mi_page_is_abandoned(page));
+  if (mi_page_is_full(page)) {}
+}
+
+
+
+/* -----------------------------------------------------------
+  Arena purge
+----------------------------------------------------------- */
+
+static long mi_arena_purge_delay(void) {
+  // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay
+  return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult));
+}
+
+// reset or decommit in an arena and update the committed/decommit bitmaps
+// assumes we own the area (i.e. blocks_free is claimed by us)
+static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) {
+  mi_assert_internal(!arena->memid.is_pinned);
+  const size_t size = mi_size_of_blocks(blocks);
+  void* const p = mi_arena_block_start(arena, block_idx);
+  bool needs_recommit;
+  if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) {
+    // all blocks are committed, we can purge freely
+    needs_recommit = _mi_os_purge(p, size, stats);
+  }
+  else {
+    // some blocks are not committed -- this can happen when a partially committed block is freed
+    // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge
+    // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory),
+    // and also undo the decommit stats (as it was already adjusted)
+    mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits));
+    needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats);
+    if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); }
+  }
+
+  // clear the purged blocks
+  mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_idx, blocks, NULL);
+
+  // update committed bitmap
+  if (needs_recommit) {
+    mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, block_idx, blocks, NULL);
+  }
+}
+
+
+// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls.
+// Note: assumes we (still) own the area as we may purge immediately
+static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) {
+  const long delay = mi_arena_purge_delay();
+  if (delay < 0) return;  // is purging allowed at all?
+ + if (_mi_preloading() || delay == 0) { + // decommit directly + mi_arena_purge(arena, block_idx, blocks, stats); + } + else { + // schedule decommit + _mi_error_message(EFAULT, "purging not yet implemented\n"); + } +} + + +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { + if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + if (max_arena == 0) return; + + _mi_error_message(EFAULT, "purging not yet implemented\n"); + MI_UNUSED(stats); + MI_UNUSED(visit_all); + MI_UNUSED(force); +} + + +#if 0 + #define MI_IN_ARENA_C #include "arena-abandon.c" #undef MI_IN_ARENA_C @@ -116,12 +900,12 @@ static size_t mi_block_count_of_size(size_t size) { return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); } -static size_t mi_arena_block_size(size_t bcount) { +static size_t mi_size_of_blocks(size_t bcount) { return (bcount * MI_ARENA_BLOCK_SIZE); } static size_t mi_arena_size(mi_arena_t* arena) { - return mi_arena_block_size(arena->block_count); + return mi_size_of_blocks(arena->block_count); } static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { @@ -207,7 +991,7 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { } void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { - return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); + return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex))); } @@ -216,7 +1000,7 @@ void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { ----------------------------------------------------------- */ // claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats) { size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { @@ -268,7 +1052,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); if (any_uncommitted) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) { + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { memid->initially_committed = false; } else { @@ -293,7 +1077,7 @@ static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_no const size_t bcount = mi_block_count_of_size(size); const size_t arena_index = mi_arena_id_index(arena_id); mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_arena_block_size(bcount)); + mi_assert_internal(size <= mi_size_of_blocks(bcount)); // Check arena suitability mi_arena_t* arena = mi_arena_from_index(arena_index); @@ -439,7 +1223,7 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { if (arena_index >= MI_MAX_ARENAS) return NULL; mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_arena_block_size(arena->block_count); } + if (size != NULL) { *size = 
mi_size_of_blocks(arena->block_count); } return arena->start; } @@ -459,7 +1243,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_assert_internal(arena->blocks_committed != NULL); mi_assert_internal(arena->blocks_purge != NULL); mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_arena_block_size(blocks); + const size_t size = mi_size_of_blocks(blocks); void* const p = mi_arena_block_start(arena, bitmap_idx); bool needs_recommit; if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { @@ -511,25 +1295,25 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t // purge a range of blocks // return true if the full range was purged. // assumes we own the area (i.e. blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) { - const size_t endidx = startidx + bitlen; - size_t bitidx = startidx; +static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) { + const size_t endidx = startseqx + bitlen; + size_t bitseqx = startseqx; bool all_purged = false; - while (bitidx < endidx) { + while (bitseqx < endidx) { // count consecutive ones in the purge mask size_t count = 0; - while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { + while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) { count++; } if (count > 0) { // found range to be purged - const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); + const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx); mi_arena_purge(arena, range_idx, count, stats); if (count == bitlen) { all_purged = true; } } - bitidx += (count+1); // +1 to skip the zero bit (or end) + bitseqx += (count+1); // +1 to skip the zero bit (or end) } return all_purged; } @@ -551,16 +1335,16 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi for (size_t i = 0; i < arena->field_count; i++) { size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); if (purge != 0) { - size_t bitidx = 0; - while (bitidx < MI_BITMAP_FIELD_BITS) { + size_t bitseqx = 0; + while (bitseqx < MI_BITMAP_FIELD_BITS) { // find consecutive range of ones in the purge mask size_t bitlen = 0; - while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { + while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) { bitlen++; } // temporarily claim the purge range as "in-use" to be thread-safe with allocation // try to claim the longest range of corresponding in_use bits - const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); + const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx); while( bitlen > 0 ) { if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { break; @@ -571,15 +1355,15 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi if (bitlen > 0) { // read purge again now that we have the in_use bits purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) { + if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) { full_purge = false; } any_purged = true; // release the claimed `in_use` bits again 
_mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); } - bitidx += (bitlen+1); // +1 to skip the zero (or end) - } // while bitidx + bitseqx += (bitlen+1); // +1 to skip the zero (or end) + } // while bitseqx } // purge != 0 } // if not fully purged, make sure to purge again in the future @@ -742,7 +1526,7 @@ bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { return true; } } @@ -818,8 +1602,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_assert_internal(post >= 0); if (post > 0) { // don't use leftover bits at the end - mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); + mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); + _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL); } return mi_arena_add(arena, arena_id, &_mi_stats_main); @@ -986,3 +1770,4 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv } +#endif \ No newline at end of file diff --git a/src/bitmap-old.c b/src/bitmap-old.c new file mode 100644 index 00000000..3e6311dc --- /dev/null +++ b/src/bitmap-old.c @@ -0,0 +1,419 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Concurrent bitmap that can set/reset sequences of bits atomically, +represented as an array of fields where each field is a machine word (`size_t`) + +There are two api's; the standard one cannot have sequences that cross +between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). + +The `_across` postfixed functions do allow sequences that can cross over +between the fields. (This is used in arena allocation) +---------------------------------------------------------------------------- */ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/bits.h" +#include "bitmap.h" + +/* ----------------------------------------------------------- + Bitmap definition +----------------------------------------------------------- */ + +// The bit mask for a given number of blocks at a specified bit index. 
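// For example (illustrative): with `count == 3` and `bitidx == 5` the mask below is
// ((1<<3)-1) << 5 == 0xE0 (bits 5..7 set), and a full field (`count == MI_BITMAP_FIELD_BITS`)
// yields MI_BITMAP_FIELD_FULL.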
+static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) { + mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); + mi_assert_internal(count > 0); + if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; + if (count == 0) return 0; + return ((((size_t)1 << count) - 1) << bitidx); +} + + + +/* ----------------------------------------------------------- + Claim a bit sequence atomically +----------------------------------------------------------- */ + +// Try to atomically claim a sequence of `count` bits in a single +// field at `idx` in `bitmap`. Returns `true` on success. +bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) +{ + mi_assert_internal(bitmap_idx != NULL); + mi_assert_internal(count <= MI_BITMAP_FIELD_BITS); + mi_bitmap_field_t* field = &bitmap[idx]; + size_t map = mi_atomic_load_relaxed(field); + if (map==MI_BITMAP_FIELD_FULL) return false; // short cut + + // search for 0-bit sequence of length count + const size_t mask = mi_bitmap_mask_(count, 0); + const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; + +#if MI_HAS_FAST_BITSCAN + size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible +#else + size_t bitidx = 0; // otherwise start at 0 +#endif + size_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx + + // scan linearly for a free range of zero bits + while (bitidx <= bitidx_max) { + const size_t mapm = (map & m); + if (mapm == 0) { // are the mask bits free at bitidx? + mi_assert_internal((m >> bitidx) == mask); // no overflow? + const size_t newmap = (map | m); + mi_assert_internal((newmap^map) >> bitidx == mask); + if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here? + // no success, another thread claimed concurrently.. keep going (with updated `map`) + continue; + } + else { + // success, we claimed the bits! + *bitmap_idx = mi_bitmap_index_create(idx, bitidx); + return true; + } + } + else { + // on to the next bit range +#if MI_HAS_FAST_BITSCAN + mi_assert_internal(mapm != 0); + const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); + mi_assert_internal(shift > 0 && shift <= count); +#else + const size_t shift = 1; +#endif + bitidx += shift; + m <<= shift; + } + } + // no bits found + return false; +} + + +// Starts at idx, and wraps around to search in all `bitmap_fields` fields. +// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. +bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { + size_t idx = start_field_idx; + for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { + if (idx >= bitmap_fields) { idx = 0; } // wrap + if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { + return true; + } + } + return false; +} + + +// Set `count` bits at `bitmap_idx` to 0 atomically +// Returns `true` if all `count` bits were 1 previously. 
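// For illustration, a sketch (editorial example, not part of this change) of how
// `_mi_bitmap_unclaim` below pairs with the claim functions above; `example_fields`
// is a hypothetical two-field bitmap used only here.
static void mi_bitmap_old_claim_example(void) {
  static mi_bitmap_field_t example_fields[2];   // zero-initialized: every bit starts out free
  mi_bitmap_index_t idx;
  if (_mi_bitmap_try_find_from_claim(example_fields, 2, 0, 4, &idx)) {
    // the 4 bits starting at `idx` are now exclusively ours ...
    _mi_bitmap_unclaim(example_fields, 2, 4, idx);   // ... and are released again
  }
}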
+bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + // mi_assert_internal((bitmap[idx] & mask) == mask); + const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); + return ((prev & mask) == mask); +} + + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. +bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); + size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); + if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); } + return ((prev & mask) == 0); +} + +// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one. +static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + const size_t field = mi_atomic_load_relaxed(&bitmap[idx]); + if (any_ones != NULL) { *any_ones = ((field & mask) != 0); } + return ((field & mask) == mask); +} + +// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. +// Returns `true` if successful when all previous `count` bits were 0. +bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + const size_t idx = mi_bitmap_index_field(bitmap_idx); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + const size_t mask = mi_bitmap_mask_(count, bitidx); + mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); + size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); + do { + if ((expected & mask) != 0) return false; + } + while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); + mi_assert_internal((expected & mask) == 0); + return true; +} + + +bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL); +} + +bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + bool any_ones; + mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); + return any_ones; +} + + +//-------------------------------------------------------------------------- +// the `_across` functions work on bitmaps where sequences can cross over +// between the fields. 
This is used in arena allocation +//-------------------------------------------------------------------------- + +// Try to atomically claim a sequence of `count` bits starting from the field +// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. +// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) +static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) +{ + mi_assert_internal(bitmap_idx != NULL); + + // check initial trailing zeros + mi_bitmap_field_t* field = &bitmap[idx]; + size_t map = mi_atomic_load_relaxed(field); + const size_t initial = mi_clz(map); // count of initial zeros starting at idx + mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS); + if (initial == 0) return false; + if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) + if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries + + // scan ahead + size_t found = initial; + size_t mask = 0; // mask bits for the final field + while(found < count) { + field++; + map = mi_atomic_load_relaxed(field); + const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found)); + mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS); + mask = mi_bitmap_mask_(mask_bits, 0); + if ((map & mask) != 0) return false; // some part is already claimed + found += mask_bits; + } + mi_assert_internal(field < &bitmap[bitmap_fields]); + + // we found a range of contiguous zeros up to the final field; mask contains mask in the final field + // now try to claim the range atomically + mi_bitmap_field_t* const final_field = field; + const size_t final_mask = mask; + mi_bitmap_field_t* const initial_field = &bitmap[idx]; + const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial; + const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx); + + // initial field + size_t newmap; + field = initial_field; + map = mi_atomic_load_relaxed(field); + do { + newmap = (map | initial_mask); + if ((map & initial_mask) != 0) { goto rollback; }; + } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); + + // intermediate fields + while (++field < final_field) { + newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); + map = 0; + if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } + } + + // final field + mi_assert_internal(field == final_field); + map = mi_atomic_load_relaxed(field); + do { + newmap = (map | final_mask); + if ((map & final_mask) != 0) { goto rollback; } + } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); + + // claimed! 
+ mi_stat_counter_increase(stats->arena_crossover_count,1); + *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); + return true; + +rollback: + // roll back intermediate fields + // (we just failed to claim `field` so decrement first) + while (--field > initial_field) { + newmap = 0; + map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); + mi_assert_internal(mi_atomic_load_relaxed(field) == map); + mi_atomic_store_release(field, newmap); + } + if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`) + map = mi_atomic_load_relaxed(field); + do { + mi_assert_internal((map & initial_mask) == initial_mask); + newmap = (map & ~initial_mask); + } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); + } + mi_stat_counter_increase(stats->arena_rollback_count,1); + // retry? (we make a recursive call instead of goto to be able to use const declarations) + if (retries <= 2) { + return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); + } + else { + return false; + } +} + + +// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. +// Starts at idx, and wraps around to search in all `bitmap_fields` fields. +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { + mi_assert_internal(count > 0); + if (count <= 2) { + // we don't bother with crossover fields for small counts + return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx); + } + + // visit the fields + size_t idx = start_field_idx; + for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { + if (idx >= bitmap_fields) { idx = 0; } // wrap + // first try to claim inside a field + /* + if (count <= MI_BITMAP_FIELD_BITS) { + if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { + return true; + } + } + */ + // if that fails, then try to claim across fields + if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { + return true; + } + } + return false; +} + +// Helper for masks across fields; returns the mid count, post_mask may be 0 +static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) { + MI_UNUSED(bitmap_fields); + const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); + if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) { + *pre_mask = mi_bitmap_mask_(count, bitidx); + *mid_mask = 0; + *post_mask = 0; + mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields); + return 0; + } + else { + const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx; + mi_assert_internal(pre_bits < count); + *pre_mask = mi_bitmap_mask_(pre_bits, bitidx); + count -= pre_bits; + const size_t mid_count = (count / MI_BITMAP_FIELD_BITS); + *mid_mask = MI_BITMAP_FIELD_FULL; + count %= MI_BITMAP_FIELD_BITS; + *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0)); + mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields); + return mid_count; + } +} + +// Set `count` bits at `bitmap_idx` to 0 atomically +// Returns `true` if all `count` bits were 1 previously. 
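// For example (illustrative): with 64-bit fields, a range of `count == 100` bits that starts
// at bit 30 of its first field is decomposed by `mi_bitmap_mask_across` above into a 34-bit
// pre mask, one full mid field, and a 2-bit post mask (34 + 64 + 2 == 100).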
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + size_t idx = mi_bitmap_index_field(bitmap_idx); + size_t pre_mask; + size_t mid_mask; + size_t post_mask; + size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); + bool all_one = true; + mi_bitmap_field_t* field = &bitmap[idx]; + size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part + if ((prev & pre_mask) != pre_mask) all_one = false; + while(mid_count-- > 0) { + prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part + if ((prev & mid_mask) != mid_mask) all_one = false; + } + if (post_mask!=0) { + prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part + if ((prev & post_mask) != post_mask) all_one = false; + } + return all_one; +} + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. +bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) { + size_t idx = mi_bitmap_index_field(bitmap_idx); + size_t pre_mask; + size_t mid_mask; + size_t post_mask; + size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); + bool all_zero = true; + bool any_zero = false; + _Atomic(size_t)*field = &bitmap[idx]; + size_t prev = mi_atomic_or_acq_rel(field++, pre_mask); + if ((prev & pre_mask) != 0) all_zero = false; + if ((prev & pre_mask) != pre_mask) any_zero = true; + while (mid_count-- > 0) { + prev = mi_atomic_or_acq_rel(field++, mid_mask); + if ((prev & mid_mask) != 0) all_zero = false; + if ((prev & mid_mask) != mid_mask) any_zero = true; + } + if (post_mask!=0) { + prev = mi_atomic_or_acq_rel(field, post_mask); + if ((prev & post_mask) != 0) all_zero = false; + if ((prev & post_mask) != post_mask) any_zero = true; + } + if (pany_zero != NULL) { *pany_zero = any_zero; } + return all_zero; +} + + +// Returns `true` if all `count` bits were 1. +// `any_ones` is `true` if there was at least one bit set to one. 
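// For example (illustrative): if only 3 of the 4 bits in a range are set, the function below
// returns false (not all ones) while reporting `*pany_ones == true`; the wrappers
// `_mi_bitmap_is_claimed_across` and `_mi_bitmap_is_any_claimed_across` expose exactly these two results.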
+static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { + size_t idx = mi_bitmap_index_field(bitmap_idx); + size_t pre_mask; + size_t mid_mask; + size_t post_mask; + size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); + bool all_ones = true; + bool any_ones = false; + mi_bitmap_field_t* field = &bitmap[idx]; + size_t prev = mi_atomic_load_relaxed(field++); + if ((prev & pre_mask) != pre_mask) all_ones = false; + if ((prev & pre_mask) != 0) any_ones = true; + while (mid_count-- > 0) { + prev = mi_atomic_load_relaxed(field++); + if ((prev & mid_mask) != mid_mask) all_ones = false; + if ((prev & mid_mask) != 0) any_ones = true; + } + if (post_mask!=0) { + prev = mi_atomic_load_relaxed(field); + if ((prev & post_mask) != post_mask) all_ones = false; + if ((prev & post_mask) != 0) any_ones = true; + } + if (pany_ones != NULL) { *pany_ones = any_ones; } + return all_ones; +} + +bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL); +} + +bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { + bool any_ones; + mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); + return any_ones; +} diff --git a/src/bitmap-old.h b/src/bitmap-old.h new file mode 100644 index 00000000..f8898935 --- /dev/null +++ b/src/bitmap-old.h @@ -0,0 +1,110 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Concurrent bitmap that can set/reset sequences of bits atomically, +represented as an array of fields where each field is a machine word (`size_t`) + +There are two api's; the standard one cannot have sequences that cross +between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). +(this is used in region allocation) + +The `_across` postfixed functions do allow sequences that can cross over +between the fields. (This is used in arena allocation) +---------------------------------------------------------------------------- */ +#pragma once +#ifndef MI_BITMAP_H +#define MI_BITMAP_H + +/* ----------------------------------------------------------- + Bitmap definition +----------------------------------------------------------- */ + +#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE) +#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set + +// An atomic bitmap of `size_t` fields +typedef _Atomic(size_t) mi_bitmap_field_t; +typedef mi_bitmap_field_t* mi_bitmap_t; + +// A bitmap index is the index of the bit in a bitmap. +typedef size_t mi_bitmap_index_t; + +// Create a bit index. 
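// For example (illustrative): with 64-bit fields, mi_bitmap_index_create(2, 5) == 2*64 + 5 == 133;
// mi_bitmap_index_field(133) == 2, mi_bitmap_index_bit_in_field(133) == 5, and
// mi_bitmap_index_bit(133) == 133.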
+static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) { + mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS); + return (idx*MI_BITMAP_FIELD_BITS) + bitidx; +} +static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { + mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); + return mi_bitmap_index_create_ex(idx,bitidx); +} + +// Get the field index from a bit index. +static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { + return (bitmap_idx / MI_BITMAP_FIELD_BITS); +} + +// Get the bit index in a bitmap field +static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) { + return (bitmap_idx % MI_BITMAP_FIELD_BITS); +} + +// Get the full bit index +static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { + return bitmap_idx; +} + +/* ----------------------------------------------------------- + Claim a bit sequence atomically +----------------------------------------------------------- */ + +// Try to atomically claim a sequence of `count` bits in a single +// field at `idx` in `bitmap`. Returns `true` on success. +bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx); + +// Starts at idx, and wraps around to search in all `bitmap_fields` fields. +// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. +bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); + +// Set `count` bits at `bitmap_idx` to 0 atomically +// Returns `true` if all `count` bits were 1 previously. +bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + +// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. +// Returns `true` if successful when all previous `count` bits were 0. +bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. +bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero); + +bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + + +//-------------------------------------------------------------------------- +// the `_across` functions work on bitmaps where sequences can cross over +// between the fields. This is used in arena allocation +//-------------------------------------------------------------------------- + +// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. +// Starts at idx, and wraps around to search in all `bitmap_fields` fields. +bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); + +// Set `count` bits at `bitmap_idx` to 0 atomically +// Returns `true` if all `count` bits were 1 previously. 
+bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + +// Set `count` bits at `bitmap_idx` to 1 atomically +// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. +bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero); + +bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); + +#endif diff --git a/src/bitmap.c b/src/bitmap.c index 3e6311dc..463d74c7 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1,19 +1,12 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ /* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. (This is used in arena allocation) +Concurrent bitmap that can set/reset sequences of bits atomically ---------------------------------------------------------------------------- */ #include "mimalloc.h" @@ -21,399 +14,586 @@ between the fields. (This is used in arena allocation) #include "mimalloc/bits.h" #include "bitmap.h" -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ +/* -------------------------------------------------------------------------------- + bfields +-------------------------------------------------------------------------------- */ -// The bit mask for a given number of blocks at a specified bit index. -static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) { - mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); - mi_assert_internal(count > 0); - if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; - if (count == 0) return 0; - return ((((size_t)1 << count) - 1) << bitidx); +static inline size_t mi_bfield_ctz(mi_bfield_t x) { + return mi_ctz(x); +} + +static inline size_t mi_bfield_clz(mi_bfield_t x) { + return mi_clz(x); +} + +// find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { + return mi_bsf(x,idx); +} + +static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { + return mi_rotr(x,r); +} + +// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). 
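// For intuition, an equivalent portable formulation (an illustrative sketch only; the actual
// implementation below may differ) is an atomic or/and of a single-bit mask:
//
//   const mi_bfield_t mask = ((mi_bfield_t)1) << idx;
//   const mi_bfield_t old  = (set==MI_BIT_SET ? mi_atomic_or_acq_rel(b, mask)
//                                             : mi_atomic_and_acq_rel(b, ~mask));
//   return (set==MI_BIT_SET ? (old & mask) == 0 : (old & mask) != 0);   // did the bit transition?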
+static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = ((mi_bfield_t)1)<> bitidx) == mask); // no overflow? - const size_t newmap = (map | m); - mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here? - // no success, another thread claimed concurrently.. keep going (with updated `map`) - continue; +static bool mi_bitmap_chunk_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx ) { + mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_try_xset( set, &chunk->bfields[i], idx); +} + +static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { + mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); +} + +// Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0) +static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_transition = true; + bool all_already_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); + all_already_xset = all_already_xset && already_xset; + // next field + field++; + idx = 0; + n -= m; + } + *palready_xset = all_already_xset; + return all_transition; +} + +// Check if a sequence of `n` bits within a chunk are all set/cleared. +static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask); + // next field + field++; + idx = 0; + n -= m; + } + return all_xset; +} + +// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving all bit fields as is. 
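// For example (illustrative): in a 256-bit chunk with 64-bit bfields, trying to set the 16 bits
// [56..72) touches two bfields; if the second bfield cannot be claimed, the bits already set in
// the first one are rolled back (see `restore:` below) and false is returned with the chunk unchanged.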
+static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) {
+  mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS);
+  mi_assert_internal(n>0);
+  if (n==0) return true;
+  size_t start_idx = cidx % MI_BFIELD_BITS;
+  size_t start_field = cidx / MI_BFIELD_BITS;
+  size_t end_field = MI_BITMAP_CHUNK_FIELDS;
+  size_t mask_mid = 0;
+  size_t mask_end = 0;
+
+  // first field
+  size_t field = start_field;
+  size_t m = MI_BFIELD_BITS - start_idx;   // m is the bits to xset in this field
+  if (m > n) { m = n; }
+  mi_assert_internal(start_idx + m <= MI_BFIELD_BITS);
+  mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS);
+  const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<<m)-1)<<start_idx);
+  if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start)) return false;
+
+  // done?
+  n -= m;
+  if (n==0) return true;
+
+  // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields
+
+  // mid fields
+  while (n >= MI_BFIELD_BITS) {
+    field++;
+    mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
+    mask_mid = ~MI_ZU(0);
+    if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore;
+    n -= MI_BFIELD_BITS;
+  }
+
+  // last field
+  if (n > 0) {
+    mi_assert_internal(n < MI_BFIELD_BITS);
+    field++;
+    mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS);
+    end_field = field;
+    mask_end = (MI_ZU(1)<<n)-1;
+    if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end)) goto restore;
+  }
+
+  return true;
+
+restore:
+  // field is on the field that failed to set atomically; we need to restore all previous fields
+  mi_assert_internal(field > start_field);
+  while( field > start_field) {
+    field--;
+    const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid));
+    bool already_xset;
+    mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset);
+  }
+  return false;
+}
+
+
+// find least 1-bit in a chunk and try unset it atomically
+// set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success.
+// todo: try neon version
+static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) {
+  #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
+  while(true) {
+    const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
+    if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ?
+    const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ?
-1 : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) + mi_assert_internal(mask != 0); + const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically + *pidx = (i*MI_BFIELD_BITS + idx); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + } + return false; + #endif +} + + +// find least byte in a chunk with all bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while(true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? -1 : 0) + const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte + if (mask == 0) return false; + const size_t i = _tzcnt_u32(mask); + mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS); + const size_t chunk_idx = i / MI_BFIELD_SIZE; + const size_t byte_idx = i % MI_BFIELD_SIZE; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + const mi_bfield_t x = chunk->bfields[i]; + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + const size_t byte_idx = idx/8; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS); + return true; + } + // else continue + } + } + return false; + #endif +} + + +// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. +// todo: try avx2 and neon version +// todo: allow spanning across bfield boundaries? +static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger? 
+ const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1); + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + mi_bfield_t b = chunk->bfields[i]; + size_t bshift = 0; + size_t idx; + while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + b >>= idx; + bshift += idx; + if (bshift + n >= MI_BFIELD_BITS) break; + + if ((b&mask) == mask) { // found a match + mi_assert_internal( ((mask << bshift) >> bshift) == mask ); + if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i] >> bshift); + } } else { - // success, we claimed the bits! - *bitmap_idx = mi_bitmap_index_create(idx, bitidx); - return true; + // advance + const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) + mi_assert_internal(ones>0); + bshift += ones; + b >>= ones; } } + } + return false; +} + + +// are all bits in a bitmap chunk set? +static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_test_all_ones(vec); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x & chunk->bfields[i]; + } + return (~x == 0); + #endif +} + +// are all bits in a bitmap chunk clear? +static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_testz_si256( vec, vec ); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x | chunk->bfields[i]; + } + return (x == 0); + #endif +} + +/* -------------------------------------------------------------------------------- + bitmap +-------------------------------------------------------------------------------- */ +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { + if (!already_zero) { + _mi_memzero_aligned(bitmap, sizeof(*bitmap)); + } +} + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); + + // first chunk + size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + size_t m = MI_BITMAP_CHUNK_BITS - cidx; + if (m > n) { m = n; } + bool already_xset; + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); + + // n can be large so use memset for efficiency for all in-between chunks + chunk_idx++; + n -= m; + const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; + if (mid_chunks > 0) { + _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8); + chunk_idx += mid_chunks; + n -= mid_chunks * MI_BITMAP_CHUNK_BITS; + } + + // last chunk + if (n > 0) { + mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); + } +} + + +// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving the bitmask as is. 
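// For illustration, a sketch (editorial example, not part of this change) that uses
// `mi_bitmap_try_xset` below together with `mi_bitmap_xsetN`; `mi_example_bitmap` is a
// hypothetical bitmap used only here.
static mi_bitmap_t mi_example_bitmap;   // static so it is zero-initialized
static void mi_bitmap_usage_example(void) {
  mi_bitmap_init(&mi_example_bitmap, true /* already zero? */);
  mi_bitmap_xsetN(MI_BIT_SET, &mi_example_bitmap, 0, 8, NULL);     // bits 0..7 become 1
  if (mi_bitmap_try_xset(MI_BIT_CLEAR, &mi_example_bitmap, 3)) {   // atomically claim bit 3 (1 -> 0)
    // ... bit 3 is now exclusively ours ...
    mi_bitmap_xsetN(MI_BIT_SET, &mi_example_bitmap, 3, 1, NULL);   // and release it again
  }
}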
+bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); +} + +// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise leaving the bitmask as is. +bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + mi_assert_internal(idx%8 == 0); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; + return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); } + if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + bool local_already_xset; + if (already_xset==NULL) { already_xset = &local_already_xset; } + // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } + // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); +} + +// Is a sequence of n bits already all set/cleared? 
+bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); +} + + +#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ + { size_t _set_idx; \ + size_t _start = start % MI_BFIELD_BITS; \ + mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ + while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ + decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; + +#define mi_bitmap_forall_set_chunks_end() \ + _start += _set_idx+1; /* so chunk_idx stays valid */ \ + _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ + _any_set >>= 1; \ + } \ + } + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). +bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS); + return true; + } else { - // on to the next bit range -#if MI_HAS_FAST_BITSCAN - mi_assert_internal(mapm != 0); - const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); - mi_assert_internal(shift > 0 && shift <= count); -#else - const size_t shift = 1; -#endif - bitidx += shift; - m <<= shift; - } - } - // no bits found - return false; -} - - -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - return true; - } - } - return false; -} - - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - // mi_assert_internal((bitmap[idx] & mask) == mask); - const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); - return ((prev & mask) == mask); -} - - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. 
-bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); - size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); - if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); } - return ((prev & mask) == 0); -} - -// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one. -static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - const size_t field = mi_atomic_load_relaxed(&bitmap[idx]); - if (any_ones != NULL) { *any_ones = ((field & mask) != 0); } - return ((field & mask) == mask); -} - -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. -bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); - do { - if ((expected & mask) != 0) return false; - } - while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); - mi_assert_internal((expected & mask) == 0); - return true; -} - - -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL); -} - -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; -} - - -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. This is used in arena allocation -//-------------------------------------------------------------------------- - -// Try to atomically claim a sequence of `count` bits starting from the field -// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. 
-// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) -static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) -{ - mi_assert_internal(bitmap_idx != NULL); - - // check initial trailing zeros - mi_bitmap_field_t* field = &bitmap[idx]; - size_t map = mi_atomic_load_relaxed(field); - const size_t initial = mi_clz(map); // count of initial zeros starting at idx - mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS); - if (initial == 0) return false; - if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) - if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries - - // scan ahead - size_t found = initial; - size_t mask = 0; // mask bits for the final field - while(found < count) { - field++; - map = mi_atomic_load_relaxed(field); - const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found)); - mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS); - mask = mi_bitmap_mask_(mask_bits, 0); - if ((map & mask) != 0) return false; // some part is already claimed - found += mask_bits; - } - mi_assert_internal(field < &bitmap[bitmap_fields]); - - // we found a range of contiguous zeros up to the final field; mask contains mask in the final field - // now try to claim the range atomically - mi_bitmap_field_t* const final_field = field; - const size_t final_mask = mask; - mi_bitmap_field_t* const initial_field = &bitmap[idx]; - const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial; - const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx); - - // initial field - size_t newmap; - field = initial_field; - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | initial_mask); - if ((map & initial_mask) != 0) { goto rollback; }; - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - - // intermediate fields - while (++field < final_field) { - newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); - map = 0; - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } - } - - // final field - mi_assert_internal(field == final_field); - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | final_mask); - if ((map & final_mask) != 0) { goto rollback; } - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - - // claimed! - mi_stat_counter_increase(stats->arena_crossover_count,1); - *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); - return true; - -rollback: - // roll back intermediate fields - // (we just failed to claim `field` so decrement first) - while (--field > initial_field) { - newmap = 0; - map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); - mi_assert_internal(mi_atomic_load_relaxed(field) == map); - mi_atomic_store_release(field, newmap); - } - if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`) - map = mi_atomic_load_relaxed(field); - do { - mi_assert_internal((map & initial_mask) == initial_mask); - newmap = (map & ~initial_mask); - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - } - mi_stat_counter_increase(stats->arena_rollback_count,1); - // retry? 
(we make a recursive call instead of goto to be able to use const declarations) - if (retries <= 2) { - return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); - } - else { - return false; - } -} - - -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { - mi_assert_internal(count > 0); - if (count <= 2) { - // we don't bother with crossover fields for small counts - return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx); - } - - // visit the fields - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - // first try to claim inside a field - /* - if (count <= MI_BITMAP_FIELD_BITS) { - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - return true; + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); } } - */ - // if that fails, then try to claim across fields - if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { - return true; - } } + mi_bitmap_forall_set_chunks_end(); return false; } -// Helper for masks across fields; returns the mid count, post_mask may be 0 -static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) { - MI_UNUSED(bitmap_fields); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) { - *pre_mask = mi_bitmap_mask_(count, bitidx); - *mid_mask = 0; - *post_mask = 0; - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields); - return 0; - } - else { - const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx; - mi_assert_internal(pre_bits < count); - *pre_mask = mi_bitmap_mask_(pre_bits, bitidx); - count -= pre_bits; - const size_t mid_count = (count / MI_BITMAP_FIELD_BITS); - *mid_mask = MI_BITMAP_FIELD_FULL; - count %= MI_BITMAP_FIELD_BITS; - *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0)); - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields); - return mid_count; + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. 
+bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8); + mi_assert_internal((*pidx % 8) == 0); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } } + mi_bitmap_forall_set_chunks_end(); + return false; } -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_one = true; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part - if ((prev & pre_mask) != pre_mask) all_one = false; - while(mid_count-- > 0) { - prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part - if ((prev & mid_mask) != mid_mask) all_one = false; +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { + // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger + // TODO: allow spanning across chunk boundaries + if (n == 0 || n > MI_BFIELD_BITS) return false; + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } } - if (post_mask!=0) { - prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part - if ((prev & post_mask) != post_mask) all_one = false; - } - return all_one; -} - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. 
-bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_zero = true; - bool any_zero = false; - _Atomic(size_t)*field = &bitmap[idx]; - size_t prev = mi_atomic_or_acq_rel(field++, pre_mask); - if ((prev & pre_mask) != 0) all_zero = false; - if ((prev & pre_mask) != pre_mask) any_zero = true; - while (mid_count-- > 0) { - prev = mi_atomic_or_acq_rel(field++, mid_mask); - if ((prev & mid_mask) != 0) all_zero = false; - if ((prev & mid_mask) != mid_mask) any_zero = true; - } - if (post_mask!=0) { - prev = mi_atomic_or_acq_rel(field, post_mask); - if ((prev & post_mask) != 0) all_zero = false; - if ((prev & post_mask) != post_mask) any_zero = true; - } - if (pany_zero != NULL) { *pany_zero = any_zero; } - return all_zero; -} - - -// Returns `true` if all `count` bits were 1. -// `any_ones` is `true` if there was at least one bit set to one. -static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_ones = true; - bool any_ones = false; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_load_relaxed(field++); - if ((prev & pre_mask) != pre_mask) all_ones = false; - if ((prev & pre_mask) != 0) any_ones = true; - while (mid_count-- > 0) { - prev = mi_atomic_load_relaxed(field++); - if ((prev & mid_mask) != mid_mask) all_ones = false; - if ((prev & mid_mask) != 0) any_ones = true; - } - if (post_mask!=0) { - prev = mi_atomic_load_relaxed(field); - if ((prev & post_mask) != post_mask) all_ones = false; - if ((prev & post_mask) != 0) any_ones = true; - } - if (pany_ones != NULL) { *pany_ones = any_ones; } - return all_ones; -} - -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL); -} - -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; + mi_bitmap_forall_set_chunks_end(); + return false; } diff --git a/src/bitmap.h b/src/bitmap.h index f8898935..198a2902 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -6,105 +6,87 @@ terms of the MIT license. A copy of the license can be found in the file -----------------------------------------------------------------------------*/ /* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). -(this is used in region allocation) - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. 
(This is used in arena allocation) +Concurrent bitmap that can set/reset sequences of bits atomically ---------------------------------------------------------------------------- */ #pragma once #ifndef MI_BITMAP_H #define MI_BITMAP_H -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ +/* -------------------------------------------------------------------------------- + Definitions +-------------------------------------------------------------------------------- */ -#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE) -#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set +typedef size_t mi_bfield_t; -// An atomic bitmap of `size_t` fields -typedef _Atomic(size_t) mi_bitmap_field_t; -typedef mi_bitmap_field_t* mi_bitmap_t; +#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) +#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) +#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) +#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) +#define MI_BFIELD_LO_BIT8 ((~(mi_bfield_t(0)))/0xFF) // 0x01010101 .. +#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. -// A bitmap index is the index of the bit in a bitmap. -typedef size_t mi_bitmap_index_t; +#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) +#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) -// Create a bit index. -static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) { - mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS); - return (idx*MI_BITMAP_FIELD_BITS) + bitidx; -} -static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { - mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); - return mi_bitmap_index_create_ex(idx,bitidx); -} - -// Get the field index from a bit index. -static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx / MI_BITMAP_FIELD_BITS); -} - -// Get the bit index in a bitmap field -static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx % MI_BITMAP_FIELD_BITS); -} - -// Get the full bit index -static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { - return bitmap_idx; -} - -/* ----------------------------------------------------------- - Claim a bit sequence atomically ------------------------------------------------------------ */ - -// Try to atomically claim a sequence of `count` bits in a single -// field at `idx` in `bitmap`. Returns `true` on success. -bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx); - -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. 
-bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero); - -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +typedef mi_decl_align(32) struct mi_bitmap_chunk_s { + _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; +} mi_bitmap_chunk_t; -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. This is used in arena allocation -//-------------------------------------------------------------------------- +typedef mi_decl_align(32) struct mi_bitmap_s { + mi_bitmap_chunk_t chunks[MI_BFIELD_BITS]; + _Atomic(mi_bfield_t)any_set; +} mi_bitmap_t; -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); +#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +/* -------------------------------------------------------------------------------- + Bitmap +-------------------------------------------------------------------------------- */ -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero); +typedef bool mi_bit_t; +#define MI_BIT_SET (true) +#define MI_BIT_CLEAR (false) -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); -#endif +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset); + +// Is a sequence of n bits already all set/cleared? 
+bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +// and false otherwise leaving the bitmask as is. +mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); + +// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise leaving the bitmask as is. +mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); + +// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); + +#endif // MI_XBITMAP_H diff --git a/src/free.c b/src/free.c index f2e30b65..e1cc9276 100644 --- a/src/free.c +++ b/src/free.c @@ -24,7 +24,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); // ------------------------------------------------------ // forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block); +static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_block_t* block); // regular free of a (thread local) block pointer // fast path written carefully to prevent spilling on the stack @@ -57,7 +57,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p) { mi_assert_internal(page!=NULL && p!=NULL); - size_t diff = (uint8_t*)p - page->page_start; + size_t diff = (uint8_t*)p - mi_page_start(page); size_t adjust; if mi_likely(page->block_size_shift != 0) { adjust = diff & (((size_t)1 << page->block_size_shift) - 1); @@ -82,72 +82,55 @@ static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, vo #endif // free a local pointer (page parameter comes first for better codegen) -static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { - MI_UNUSED(segment); +static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? 
_mi_page_ptr_unalign(page, p) : (mi_block_t*)p); mi_block_check_unguard(page, block, p); mi_free_block_local(page, block, true /* track stats */, true /* check for a full page */); } // free a pointer owned by another thread (page parameter comes first for better codegen) -static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, mi_segment_t* segment, void* p) mi_attr_noexcept { +static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_block_check_unguard(page, block, p); - mi_free_block_mt(page, segment, block); + mi_free_block_mt(page, block); } // generic free (for runtime integration) -void mi_decl_noinline _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { - if (is_local) mi_free_generic_local(page,segment,p); - else mi_free_generic_mt(page,segment,p); +void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) mi_attr_noexcept { + if (is_local) mi_free_generic_local(page,p); + else mi_free_generic_mt(page,p); } // Get the segment data belonging to a pointer // This is just a single `and` in release mode but does further checks in debug mode // (and secure mode) to see if this was a valid pointer. -static inline mi_segment_t* mi_checked_ptr_segment(const void* p, const char* msg) +static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) { - MI_UNUSED(msg); - - #if (MI_DEBUG>0) + MI_UNUSED_RELEASE(msg); + #if MI_DEBUG if mi_unlikely(((uintptr_t)p & (MI_INTPTR_SIZE - 1)) != 0 && !mi_option_is_enabled(mi_option_guarded_precise)) { _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); return NULL; } #endif - - mi_segment_t* const segment = _mi_ptr_segment(p); - if mi_unlikely(segment==NULL) return segment; - - #if (MI_DEBUG>0) - if mi_unlikely(!mi_is_in_heap_region(p)) { - _mi_warning_message("%s: pointer might not point to a valid heap region: %p\n" - "(this may still be a valid very large allocation (over 64MiB))\n", msg, p); - if mi_likely(_mi_ptr_cookie(segment) == segment->cookie) { - _mi_warning_message("(yes, the previous pointer %p was valid after all)\n", p); - } + mi_page_t* const page = _mi_ptr_page(p); + #if MI_DEBUG + if (page == MI_PAGE_PTR_INVALID) { + _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); } #endif - #if (MI_DEBUG>0 || MI_SECURE>=4) - if mi_unlikely(_mi_ptr_cookie(segment) != segment->cookie) { - _mi_error_message(EINVAL, "%s: pointer does not point to a valid heap space: %p\n", msg, p); - return NULL; - } - #endif - - return segment; + return page; } // Free a block // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { - mi_segment_t* const segment = mi_checked_ptr_segment(p,"mi_free"); - if mi_unlikely(segment==NULL) return; - - const bool is_local = (_mi_prim_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_page_t* const page = _mi_segment_page_of(segment, p); + mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); + if mi_unlikely(page==NULL) return; + + const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); if mi_likely(is_local) { // thread-local free? 
if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page @@ -156,12 +139,12 @@ void mi_free(void* p) mi_attr_noexcept } else { // page is full or contains (inner) aligned blocks; use generic path - mi_free_generic_local(page, segment, p); + mi_free_generic_local(page, p); } } else { // not thread-local; use generic path - mi_free_generic_mt(page, segment, p); + mi_free_generic_mt(page, p); } } @@ -169,10 +152,8 @@ void mi_free(void* p) mi_attr_noexcept bool _mi_free_delayed_block(mi_block_t* block) { // get segment and page mi_assert_internal(block!=NULL); - const mi_segment_t* const segment = _mi_ptr_segment(block); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(_mi_thread_id() == segment->thread_id); - mi_page_t* const page = _mi_segment_page_of(segment, block); + mi_page_t* const page = mi_checked_ptr_page(block,"_mi_free_delayed_block"); + mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); // Clear the no-delayed flag so delayed freeing is used again for this page. // This must be done before collecting the free lists on this page -- otherwise @@ -242,20 +223,19 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block } // Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* segment, mi_block_t* block) +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { - // first see if the segment was abandoned and if we can reclaim it into our thread - if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 && - #if MI_HUGE_PAGE_ABANDON - segment->page_kind != MI_PAGE_HUGE && - #endif - mi_atomic_load_relaxed(&segment->thread_id) == 0 && // segment is abandoned? 
+ // first see if the page was abandoned and if we can reclaim it into our thread + if (mi_page_is_abandoned(page) && + (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || + mi_page_is_singleton(page) // only one block, and we are free-ing it + ) && mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) { - // the segment is abandoned, try to reclaim it into our heap - if (_mi_segment_attempt_reclaim(mi_heap_get_default(), segment)) { - mi_assert_internal(_mi_thread_id() == mi_atomic_load_relaxed(&segment->thread_id)); - mi_assert_internal(mi_heap_get_default()->tld->segments.subproc == segment->subproc); + // the page is abandoned, try to reclaim it into our heap + if (_mi_heap_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue + mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); + // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); mi_free(block); // recursively free as now it will be a local free in our heap return; } @@ -272,17 +252,12 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_segment_t* seg // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection _mi_padding_shrink(page, block, sizeof(mi_block_t)); - if (segment->page_kind == MI_PAGE_HUGE) { - #if MI_HUGE_PAGE_ABANDON - // huge page segments are always abandoned and can be freed immediately - _mi_segment_huge_page_free(segment, page, block); - return; - #else + if (mi_page_is_huge(page)) { + mi_assert_internal(mi_page_is_singleton(page)); // huge pages are special as they occupy the entire segment // as these are large we reset the memory occupied by the page so it is available to other threads // (as the owning thread needs to actually free the memory later). 
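+    // note: `_mi_os_reset` tells the OS the contents are no longer needed but the
+    // address range itself stays mapped, so the owning thread can still free it later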
- _mi_segment_huge_page_reset(segment, page, block); - #endif + _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively } else { #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading @@ -316,9 +291,8 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p } static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - const mi_segment_t* const segment = mi_checked_ptr_segment(p, msg); - if mi_unlikely(segment==NULL) return 0; - const mi_page_t* const page = _mi_segment_page_of(segment, p); + const mi_page_t* const page = mi_checked_ptr_page(p,msg); + if mi_unlikely(page==NULL) return 0; if mi_likely(!mi_page_has_aligned(page)) { const mi_block_t* block = (const mi_block_t*)p; return mi_page_usable_size_of(page, block); @@ -514,20 +488,20 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { // only maintain stats for smaller objects if requested #if (MI_STAT>0) static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { -#if (MI_STAT < 2) + #if (MI_STAT < 2) MI_UNUSED(block); -#endif + #endif mi_heap_t* const heap = mi_heap_get_default(); const size_t bsize = mi_page_usable_block_size(page); -#if (MI_STAT>1) + #if (MI_STAT>1) const size_t usize = mi_page_usable_size_of(page, block); mi_heap_stat_decrease(heap, malloc, usize); -#endif - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + #endif + if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, normal, bsize); -#if (MI_STAT > 1) + #if (MI_STAT > 1) mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], 1); -#endif + #endif } else { const size_t bpsize = mi_page_block_size(page); // match stat in page.c:mi_huge_page_alloc diff --git a/src/heap.c b/src/heap.c index 581b3f71..e4955ba7 100644 --- a/src/heap.c +++ b/src/heap.c @@ -7,11 +7,8 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" -#include "mimalloc/atomic.h" #include "mimalloc/prim.h" // mi_prim_get_default_heap -#include // memset, memcpy - #if defined(_MSC_VER) && (_MSC_VER < 1920) #pragma warning(disable:4204) // non-constant aggregate initializer #endif @@ -258,7 +255,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) { mi_assert_internal(heap != NULL); mi_assert_internal(mi_heap_is_initialized(heap)); // TODO: copy full empty heap instead? - memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct)); + _mi_memset(&heap->pages_free_direct, 0, sizeof(heap->pages_free_direct)); _mi_memcpy_aligned(&heap->pages, &_mi_heap_empty.pages, sizeof(heap->pages)); heap->thread_delayed_free = NULL; heap->page_count = 0; diff --git a/src/os.c b/src/os.c index 36b167cb..83521766 100644 --- a/src/os.c +++ b/src/os.c @@ -59,6 +59,10 @@ size_t _mi_os_large_page_size(void) { return (mi_os_mem_config.large_page_size != 0 ? 
mi_os_mem_config.large_page_size : _mi_os_page_size()); } +size_t _mi_os_virtual_address_bits(void) { + return mi_os_mem_config.virtual_address_bits; +} + bool _mi_os_use_large_page(size_t size, size_t alignment) { // if we have access, check the size and alignment requirements if (mi_os_mem_config.large_page_size == 0 || !mi_option_is_enabled(mi_option_allow_large_os_pages)) return false; @@ -103,58 +107,10 @@ static void* mi_align_down_ptr(void* p, size_t alignment) { return (void*)_mi_align_down((uintptr_t)p, alignment); } - -/* ----------------------------------------------------------- - aligned hinting --------------------------------------------------------------- */ - -// On systems with enough virtual address bits, we can do efficient aligned allocation by using -// the 2TiB to 30TiB area to allocate those. If we have at least 46 bits of virtual address -// space (64TiB) we use this technique. (but see issue #939) -#if (MI_INTPTR_SIZE >= 8) && !defined(MI_NO_ALIGNED_HINT) -static mi_decl_cache_align _Atomic(uintptr_t)aligned_base; - -// Return a MI_SEGMENT_SIZE aligned address that is probably available. -// If this returns NULL, the OS will determine the address but on some OS's that may not be -// properly aligned which can be more costly as it needs to be adjusted afterwards. -// For a size > 1GiB this always returns NULL in order to guarantee good ASLR randomization; -// (otherwise an initial large allocation of say 2TiB has a 50% chance to include (known) addresses -// in the middle of the 2TiB - 6TiB address range (see issue #372)) - -#define MI_HINT_BASE ((uintptr_t)2 << 40) // 2TiB start -#define MI_HINT_AREA ((uintptr_t)4 << 40) // upto 6TiB (since before win8 there is "only" 8TiB available to processes) -#define MI_HINT_MAX ((uintptr_t)30 << 40) // wrap after 30TiB (area after 32TiB is used for huge OS pages) - -void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) -{ - if (try_alignment <= 1 || try_alignment > MI_SEGMENT_SIZE) return NULL; - if (mi_os_mem_config.virtual_address_bits < 46) return NULL; // < 64TiB virtual address space - size = _mi_align_up(size, MI_SEGMENT_SIZE); - if (size > 1*MI_GiB) return NULL; // guarantee the chance of fixed valid address is at most 1/(MI_HINT_AREA / 1<<30) = 1/4096. - #if (MI_SECURE>0) - size += MI_SEGMENT_SIZE; // put in `MI_SEGMENT_SIZE` virtual gaps between hinted blocks; this splits VLA's but increases guarded areas. 
- #endif - - uintptr_t hint = mi_atomic_add_acq_rel(&aligned_base, size); - if (hint == 0 || hint > MI_HINT_MAX) { // wrap or initialize - uintptr_t init = MI_HINT_BASE; - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode - uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); - init = init + ((MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)) % MI_HINT_AREA); // (randomly 20 bits)*4MiB == 0 to 4TiB - #endif - uintptr_t expected = hint + size; - mi_atomic_cas_strong_acq_rel(&aligned_base, &expected, init); - hint = mi_atomic_add_acq_rel(&aligned_base, size); // this may still give 0 or > MI_HINT_MAX but that is ok, it is a hint after all - } - if (hint%try_alignment != 0) return NULL; - return (void*)hint; -} -#else void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { MI_UNUSED(try_alignment); MI_UNUSED(size); return NULL; } -#endif /* ----------------------------------------------------------- @@ -380,12 +336,10 @@ void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { ----------------------------------------------------------- */ void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { - mi_assert(offset <= MI_SEGMENT_SIZE); mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); *memid = _mi_memid_none(); if (stats == NULL) stats = &_mi_stats_main; - if (offset > MI_SEGMENT_SIZE) return NULL; if (offset == 0) { // regular aligned allocation return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats); @@ -605,7 +559,6 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { #endif } end = start + size; - mi_assert_internal(end % MI_SEGMENT_SIZE == 0); } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end)); if (total_size != NULL) *total_size = size; diff --git a/src/page-map.c b/src/page-map.c new file mode 100644 index 00000000..d3fcef79 --- /dev/null +++ b/src/page-map.c @@ -0,0 +1,90 @@ +/*---------------------------------------------------------------------------- +Copyright (c) 2023-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" + +mi_decl_cache_align signed char* _mi_page_map = NULL; +static bool mi_page_map_all_committed = false; +static size_t mi_blocks_per_commit_bit = 1; +static mi_memid_t mi_page_map_memid; +static mi_bitmap_t mi_page_map_commit; + +static bool mi_page_map_init(void) { + size_t vbits = _mi_os_virtual_address_bits(); + if (vbits >= 48) vbits = 47; + // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) + // 64 KiB for 4 GiB address space (on 32-bit) + const size_t page_map_size = (MI_ZU(1) << (vbits >> MI_ARENA_BLOCK_SHIFT)); + + const size_t min_commit_size = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + mi_blocks_per_commit_bit = mi_block_count_of_size(min_commit_size); + + mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems + _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 0, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); + if (_mi_page_map==NULL) { + _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); + return false; + } + if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { + _mi_warning_message("the page map was committed on-demand but not zero initialized!\n"); + _mi_memzero_aligned(_mi_page_map, page_map_size); + } + return true; +} + +static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* block_count) { + size_t page_size; + *page_start = mi_page_area(page, &page_size); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE; } // furthest interior pointer + *block_count = mi_block_count_of_size(page_size); + return ((uintptr_t)*page_start >> MI_ARENA_BLOCK_SHIFT); +} + + + +void _mi_page_map_register(mi_page_t* page) { + if mi_unlikely(_mi_page_map == NULL) { + if (!mi_page_map_init()) return; + } + uint8_t* page_start; + size_t block_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); + + // is the page map area that contains the page address committed? 
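+  // (each bit in `mi_page_map_commit` covers `mi_blocks_per_commit_bit` page-map
+  //  entries; bits are tested and committed one at a time so a test never spans
+  //  bitmap chunks, and a racing commit at worst commits the same range twice)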
+ if (!mi_page_map_all_committed) { + const size_t commit_bit_count = _mi_divide_up(block_count, mi_blocks_per_commit_bit); + const size_t commit_bit_idx = idx / mi_blocks_per_commit_bit; + for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { + // this may race, in which case we do multiple commits (which is ok) + _mi_os_commit(page_start + (i*mi_blocks_per_commit_bit*MI_ARENA_BLOCK_SIZE), mi_blocks_per_commit_bit* MI_ARENA_BLOCK_SIZE, NULL, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); + } + } + } + + // set the offsets + for (int i = 0; i < block_count; i++) { + mi_assert_internal(i < 128); + _mi_page_map[idx + i] = (int8_t)(-i-1); + } +} + + +void _mi_page_map_unregister(mi_page_t* page) { + mi_assert_internal(_mi_page_map != NULL); + + // get index and count + uint8_t* page_start; + size_t block_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); + + // unset the offsets + _mi_memzero(_mi_page_map + idx, block_count); +} diff --git a/src/page.c b/src/page.c index c681d6d0..a00ff615 100644 --- a/src/page.c +++ b/src/page.c @@ -59,7 +59,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page) { static bool mi_page_list_is_valid(mi_page_t* page, mi_block_t* p) { size_t psize; - uint8_t* page_area = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + uint8_t* page_area = mi_page_area(page, &psize); mi_block_t* start = (mi_block_t*)page_area; mi_block_t* end = (mi_block_t*)(page_area + psize); while(p != NULL) { @@ -83,10 +83,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->capacity <= page->reserved); // const size_t bsize = mi_page_block_size(page); - mi_segment_t* segment = _mi_page_segment(page); uint8_t* start = mi_page_start(page); - mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL)); - mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE)); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -122,15 +119,11 @@ bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(page->keys[0] != 0); #endif if (mi_page_heap(page)!=NULL) { - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_internal(!_mi_process_is_initialized || segment->thread_id == mi_page_heap(page)->thread_id || segment->thread_id==0); - #if MI_HUGE_PAGE_ABANDON - if (segment->page_kind != MI_PAGE_HUGE) - #endif + mi_assert_internal(!_mi_process_is_initialized || page->thread_id == mi_page_heap(page)->thread_id || page->thread_id==0); { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); - mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_OBJ_SIZE_MAX || mi_page_is_in_full(page)); + mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_MAX_OBJ_SIZE || mi_page_is_in_full(page)); mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); } } @@ -274,16 +267,13 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size #if !MI_HUGE_PAGE_ABANDON mi_assert_internal(pq != NULL); mi_assert_internal(mi_heap_contains_queue(heap, pq)); - mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_OBJ_SIZE_MAX || block_size == pq->block_size); + 
mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); #endif - mi_page_t* page = _mi_segment_page_alloc(heap, block_size, page_alignment, &heap->tld->segments, &heap->tld->os); + mi_page_t* page = _mi_heap_page_alloc(heap, block_size, page_alignment); if (page == NULL) { // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) return NULL; } - #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(pq==NULL || _mi_page_segment(page)->page_kind != MI_PAGE_HUGE); - #endif mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); // a fresh page was found, initialize it const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc @@ -384,7 +374,6 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { mi_heap_t* pheap = mi_page_heap(page); // remove from our page list - mi_segments_tld_t* segments_tld = &pheap->tld->segments; mi_page_queue_remove(pq, page); // page is no longer associated with our heap @@ -399,8 +388,8 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { #endif // and abandon it - mi_assert_internal(mi_page_heap(page) == NULL); - _mi_segment_page_abandon(page,segments_tld); + mi_assert_internal(mi_page_is_abandoned(page)); + _mi_arena_page_abandon(page,&pheap->tld); } // force abandon a page @@ -411,8 +400,7 @@ void _mi_page_force_abandon(mi_page_t* page) { // ensure this page is no longer in the heap delayed free list _mi_heap_delayed_free_all(heap); - // We can still access the page meta-info even if it is freed as we ensure - // in `mi_segment_force_abandon` that the segment is not freed (yet) + // TODO: can we still access the page meta-info even if it is freed? if (page->capacity == 0) return; // it may have been freed now // and now unlink it from the page queue and abandon (or free) @@ -433,17 +421,18 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { mi_assert_internal(mi_page_all_free(page)); mi_assert_internal(mi_page_thread_free_flag(page)!=MI_DELAYED_FREEING); + mi_heap_t* pheap = mi_page_heap(page); + // no more aligned blocks in here mi_page_set_has_aligned(page, false); // remove from the page list // (no need to do _mi_heap_delayed_free first as all blocks are already free) - mi_segments_tld_t* segments_tld = &mi_page_heap(page)->tld->segments; mi_page_queue_remove(pq, page); // and free it mi_page_set_heap(page,NULL); - _mi_segment_page_free(page, force, segments_tld); + _mi_arena_page_free(page, force, &pheap->tld); } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE @@ -474,7 +463,7 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (bsize <= MI_SMALL_OBJ_SIZE_MAX ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); + page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; @@ -639,7 +628,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) size_t page_size; //uint8_t* page_start = - _mi_segment_page_start(_mi_page_segment(page), page, &page_size); + mi_page_area(page, &page_size); mi_stat_counter_increase(tld->stats.pages_extended, 1); // calculate the extend count @@ -676,15 +665,13 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) // Initialize a fresh page static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) { mi_assert(page != NULL); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert(segment != NULL); mi_assert_internal(block_size > 0); // set fields mi_page_set_heap(page, heap); page->block_size = block_size; size_t page_size; - page->page_start = _mi_segment_page_start(segment, page, &page_size); - mi_track_mem_noaccess(page->page_start,page_size); + uint8_t* page_start = mi_page_area(page, &page_size); + mi_track_mem_noaccess(page_start,page_size); mi_assert_internal(page_size / block_size < (1L<<16)); page->reserved = (uint16_t)(page_size / block_size); mi_assert_internal(page->reserved > 0); @@ -692,15 +679,15 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi page->keys[0] = _mi_heap_random_next(heap); page->keys[1] = _mi_heap_random_next(heap); #endif - page->free_is_zero = page->is_zero_init; + page->free_is_zero = page->memid.initially_zero; #if MI_DEBUG>2 - if (page->is_zero_init) { + if (page->memid.initially_zero) { mi_track_mem_defined(page->page_start, page_size); - mi_assert_expensive(mi_mem_is_zero(page->page_start, page_size)); + mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); } #endif if (block_size > 0 && _mi_is_power_of_two(block_size)) { - page->block_size_shift = (uint8_t)(mi_ctz((uintptr_t)block_size)); + page->block_size_shift = (uint8_t)mi_ctz(block_size); } else { page->block_size_shift = 0; @@ -734,13 +721,6 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi // search for a best next page to use for at most N pages (often cut short if immediate blocks are available) #define MI_MAX_CANDIDATE_SEARCH (8) -// is the page not yet used up to its reserved space? -static bool mi_page_is_expandable(const mi_page_t* page) { - mi_assert_internal(page != NULL); - mi_assert_internal(page->capacity <= page->reserved); - return (page->capacity < page->reserved); -} - // Find a page with free blocks of `page->block_size`. 
static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) @@ -907,7 +887,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a #if MI_HUGE_PAGE_ABANDON mi_page_queue_t* pq = NULL; #else - mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_OBJ_SIZE_MAX+1); // always in the huge queue regardless of the block size + mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size mi_assert_internal(mi_page_queue_is_huge(pq)); #endif mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); @@ -915,10 +895,9 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a mi_assert_internal(mi_page_block_size(page) >= size); mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_is_huge(page)); - mi_assert_internal(_mi_page_segment(page)->page_kind == MI_PAGE_HUGE); - mi_assert_internal(_mi_page_segment(page)->used==1); + mi_assert_internal(mi_page_is_singleton(page)); #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->thread_id==0); // abandoned, not in the huge queue + mi_assert_internal(mi_page_is_abandoned(page)); mi_page_set_heap(page, NULL); #endif mi_heap_stat_increase(heap, huge, mi_page_block_size(page)); @@ -933,7 +912,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { // huge allocation? const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - if mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) || huge_alignment > 0) { + if mi_unlikely(req_size > (MI_LARGE_MAX_OBJ_SIZE - MI_PADDING_SIZE) || huge_alignment > 0) { if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; diff --git a/src/static.c b/src/static.c index 9e06ce05..b34d5d42 100644 --- a/src/static.c +++ b/src/static.c @@ -20,7 +20,7 @@ terms of the MIT license. A copy of the license can be found in the file // containing the whole library. If it is linked first // it will override all the standard library allocation // functions (on Unix's). -#include "alloc.c" // includes alloc-override.c +#include "alloc.c" // includes alloc-override.c and free.c #include "alloc-aligned.c" #include "alloc-posix.c" #include "arena.c" @@ -31,6 +31,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "options.c" #include "os.c" #include "page.c" // includes page-queue.c +#include "page-map.c" #include "random.c" #include "segment.c" #include "segment-map.c" diff --git a/src/xbitmap.c b/src/xbitmap.c deleted file mode 100644 index 68525c84..00000000 --- a/src/xbitmap.c +++ /dev/null @@ -1,599 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024 Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. 
------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically ----------------------------------------------------------------------------- */ - -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/bits.h" -#include "xbitmap.h" - -/* -------------------------------------------------------------------------------- - bfields --------------------------------------------------------------------------------- */ - -static inline size_t mi_bfield_ctz(mi_bfield_t x) { - return mi_ctz(x); -} - -static inline size_t mi_bfield_clz(mi_bfield_t x) { - return mi_clz(x); -} - -// find the least significant bit that is set (i.e. count trailing zero's) -// return false if `x==0` (with `*idx` undefined) and true otherwise, -// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { - return mi_bsf(x,idx); -} - -static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { - return mi_rotr(x,r); -} - -// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). -static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[i], idx); -} - -static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { - mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); -} - -// Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0) -static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(n>0); - bool all_transition = true; - bool all_already_xset = true; - size_t idx = cidx % MI_BFIELD_BITS; - size_t field = cidx / MI_BFIELD_BITS; - while (n > 0) { - size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field - if (m > n) { m = n; } - mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); - all_already_xset = all_already_xset && already_xset; - // next field - field++; - idx = 0; - n -= m; - } - *palready_xset = all_already_xset; - return all_transition; -} - -// Check if a sequence of `n` bits within a chunk are all set/cleared. -static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(n>0); - bool all_xset = true; - size_t idx = cidx % MI_BFIELD_BITS; - size_t field = cidx / MI_BFIELD_BITS; - while (n > 0) { - size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field - if (m > n) { m = n; } - mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask = (m == MI_BFIELD_BITS ? 
~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask); - // next field - field++; - idx = 0; - n -= m; - } - return all_xset; -} - -// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0), -// and false otherwise leaving all bit fields as is. -static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(n>0); - if (n==0) return true; - size_t start_idx = cidx % MI_BFIELD_BITS; - size_t start_field = cidx / MI_BFIELD_BITS; - size_t end_field = MI_BITMAP_CHUNK_FIELDS; - size_t mask_mid = 0; - size_t mask_end = 0; - - // first field - size_t field = start_field; - size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field - if (m > n) { m = n; } - mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); - mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask_start)) return false; - - // done? - n -= m; - if (n==0) return true; - - // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields - - // mid fields - while (n >= MI_BFIELD_BITS) { - field++; - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - mask_mid = ~MI_ZU(0); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore; - n -= MI_BFIELD_BITS; - } - - // last field - if (n > 0) { - mi_assert_internal(n < MI_BFIELD_BITS); - field++; - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - end_field = field; - mask_end = (MI_ZU(1)<bfields[field], mask_end)) goto restore; - } - - return true; - -restore: - // field is on the field that failed to set atomically; we need to restore all previous fields - mi_assert_internal(field > start_field); - while( field > start_field) { - field--; - const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid)); - bool already_xset; - mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset); - } - return false; -} - - -// find least 1-bit in a chunk and try unset it atomically -// set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. -// todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - while(true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ? - const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? 
-1 : 0) - const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) - mi_assert_internal(mask != 0); - const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - size_t cidx; - if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - // try again - } - #else - size_t idx; - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically - *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - } - return false; - #endif -} - - -// find least byte in a chunk with all bits set, and try unset it atomically -// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. -// todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - while(true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? -1 : 0) - const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte - if (mask == 0) return false; - const size_t i = _tzcnt_u32(mask); - mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS); - const size_t chunk_idx = i / MI_BFIELD_SIZE; - const size_t byte_idx = i % MI_BFIELD_SIZE; - if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - // try again - } - #else - size_t idx; - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - const mi_bfield_t x = chunk->bfields[i]; - // has_set8 has low bit in each byte set if the byte in x == 0xFF - const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F - (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 - >> 7; // shift high bit to low bit - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit - mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); - mi_assert_internal((idx%8)==0); - const size_t byte_idx = idx/8; - if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS); - return true; - } - // else continue - } - } - return false; - #endif -} - - -// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically -// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. -// todo: try avx2 and neon version -// todo: allow spanning across bfield boundaries? -static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { - if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger? 
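/* Editor's aside (illustration only, not part of the patch): the scalar path of
   mi_bitmap_chunk_find_and_try_clear8 above detects 0xFF bytes with a bit trick.
   A minimal self-contained version, using my own hypothetical names LO_BIT8,
   HI_BIT8 and full_bytes (the deleted code calls these MI_BFIELD_LO_BIT8 and
   MI_BFIELD_HI_BIT8): */
#include <stdint.h>
#define LO_BIT8 ((~(uint64_t)0)/0xFF)   // 0x0101010101010101
#define HI_BIT8 (LO_BIT8 << 7)          // 0x8080808080808080

// Low bit of each byte of the result is set where the corresponding byte of x
// is 0xFF. A borrow can only originate from a lower byte that is itself 0xFF,
// so the least-significant match (the one the search takes) is always exact.
static inline uint64_t full_bytes(uint64_t x) {
  return ((~x - LO_BIT8) & (x & HI_BIT8)) >> 7;
}
// Example: full_bytes(0x00FF7F80FF000001) == 0x0001000001000000,
// i.e. bytes 3 and 6 (counting from the least-significant byte) are 0xFF;
// 0x80 and 0x7F bytes are not flagged.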
- const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1); - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - mi_bfield_t b = chunk->bfields[i]; - size_t bshift = 0; - size_t idx; - while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - b >>= idx; - bshift += idx; - if (bshift + n >= MI_BFIELD_BITS) break; - - if ((b&mask) == mask) { // found a match - mi_assert_internal( ((mask << bshift) >> bshift) == mask ); - if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i] >> bshift); - } - } - else { - // advance - const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) - mi_assert_internal(ones>0); - bshift += ones; - b >>= ones; - } - } - } - return false; -} - - -// are all bits in a bitmap chunk set? -static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return _mm256_test_all_ones(vec); - #else - // written like this for vectorization - mi_bfield_t x = chunk->bfields[0]; - for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { - x = x & chunk->bfields[i]; - } - return (~x == 0); - #endif -} - -// are all bits in a bitmap chunk clear? -static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return _mm256_testz_si256( vec, vec ); - #else - // written like this for vectorization - mi_bfield_t x = chunk->bfields[0]; - for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { - x = x | chunk->bfields[i]; - } - return (x == 0); - #endif -} - -/* -------------------------------------------------------------------------------- - bitmap --------------------------------------------------------------------------------- */ -// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { - if (!already_zero) { - _mi_memzero_aligned(bitmap, sizeof(*bitmap)); - } -} - -// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. -void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); - - // first chunk - size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - size_t m = MI_BITMAP_CHUNK_BITS - cidx; - if (m > n) { m = n; } - bool already_xset; - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); - - // n can be large so use memset for efficiency for all in-between chunks - chunk_idx++; - n -= m; - const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; - if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8); - chunk_idx += mid_chunks; - n -= mid_chunks * MI_BITMAP_CHUNK_BITS; - } - - // last chunk - if (n > 0) { - mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); - } -} - - -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), -// and false otherwise leaving the bitmask as is. 
-bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < MI_BITMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); -} - -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < MI_BITMAP_MAX_BITS); - mi_assert_internal(idx%8 == 0); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; - return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); -} - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); } - if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); } - - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); -} - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - bool local_already_xset; - if (already_xset==NULL) { already_xset = &local_already_xset; } - // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } - // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } - - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); -} - -// Is a sequence of n bits already all set/cleared? 
-bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); -} - - -#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ - { size_t _set_idx; \ - size_t _start = start % MI_BFIELD_BITS; \ - mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ - while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ - decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; - -#define mi_bitmap_forall_set_chunks_end() \ - _start += _set_idx+1; /* so chunk_idx stays valid */ \ - _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ - _any_set >>= 1; \ - } \ - } - -// Find a set bit in a bitmap and atomically unset it. Returns true on success, -// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. -// The low `MI_BFIELD_BITS` of start are used to set the start point of the search -// (to reduce thread contention). -bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS); - return true; - } - else { - // we may find that all are unset only on a second iteration but that is ok as - // _any_set is a conservative approximation. - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); - } - } - } - mi_bitmap_forall_set_chunks_end(); - return false; -} - - -// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8); - mi_assert_internal((*pidx % 8) == 0); - return true; - } - else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); - } - } - } - mi_bitmap_forall_set_chunks_end(); - return false; -} - -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { - // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger - // TODO: allow spanning across chunk boundaries - if (n == 0 || n > MI_BFIELD_BITS) return false; - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n); - return true; - } - else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); - } - } - } - mi_bitmap_forall_set_chunks_end(); - return false; -} diff --git a/src/xbitmap.h b/src/xbitmap.h deleted file mode 100644 index 869db2a2..00000000 --- a/src/xbitmap.h +++ /dev/null @@ -1,94 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically ----------------------------------------------------------------------------- */ -#pragma once -#ifndef MI_XBITMAP_H -#define MI_XBITMAP_H - -/* -------------------------------------------------------------------------------- - Definitions --------------------------------------------------------------------------------- */ - -typedef size_t mi_bfield_t; - -#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) -#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) -#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) -#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) -#define MI_BFIELD_LO_BIT8 ((~(mi_bfield_t(0)))/0xFF) // 0x01010101 .. -#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. - -#define MI_BITMAP_CHUNK_BITS_SHIFT (8) // 2^8 = 256 bits per chunk -#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) -#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) -#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) - -typedef mi_decl_align(32) struct mi_bitmap_chunk_s { - _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; -} mi_bitmap_chunk_t; - - -typedef mi_decl_align(32) struct mi_bitmap_s { - mi_bitmap_chunk_t chunks[MI_BFIELD_BITS]; - _Atomic(mi_bfield_t)any_set; -} mi_bitmap_t; - -#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit - -/* -------------------------------------------------------------------------------- - Bitmap --------------------------------------------------------------------------------- */ - -typedef bool mi_bit_t; -#define MI_BIT_SET (true) -#define MI_BIT_CLEAR (false) - -// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); - -// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
-void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset); - -// Is a sequence of n bits already all set/cleared? -bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -// and false otherwise leaving the bitmask as is. -mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); - -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); - -// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -// Find a set bit in a bitmap and atomically unset it. Returns true on success, -// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. -// The low `MI_BFIELD_BITS` of start are used to set the start point of the search -// (to reduce thread contention). -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); - -// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); - -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); - -#endif // MI_XBITMAP_H From 46afcbe06cd0000eeda5400fba7eb23453237b8c Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 14:28:34 -0800 Subject: [PATCH 003/264] wip: further progress on segment removal; arena allocation --- include/mimalloc/internal.h | 7 +- include/mimalloc/types.h | 17 +- src/arena-page.c | 20 ++ src/arena.c | 368 ++++++++++++++++++++++++++---------- src/bitmap.c | 16 +- src/bitmap.h | 6 +- src/page-map.c | 8 +- src/page.c | 56 +++--- 8 files changed, 344 insertions(+), 154 deletions(-) create mode 100644 src/arena-page.c diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 2713c0ac..d60b0c15 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -137,6 +137,9 @@ bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); +void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld); +void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld); void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); @@ -181,6 +184,7 @@ void _mi_deferred_free(mi_heap_t* heap, bool force); void _mi_page_free_collect(mi_page_t* page,bool force); void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments +void _mi_page_init(mi_heap_t* heap, mi_page_t* page); size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats @@ -453,8 +457,7 @@ static inline size_t mi_page_block_size(const mi_page_t* page) { // Page start static inline uint8_t* mi_page_start(const mi_page_t* page) { - mi_assert(sizeof(mi_page_t) <= MI_PAGE_INFO_SIZE); - return (uint8_t*)page + MI_PAGE_INFO_SIZE; + return page->page_start; } static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 98664020..591cb603 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -127,8 +127,11 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) #define MI_BITMAP_CHUNK_BITS (MI_ZU(1) << MI_BITMAP_CHUNK_BITS_SHIFT) -#define MI_ARENA_MIN_OBJ_SIZE MI_ARENA_BLOCK_SIZE -#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_CHUNK_BITS * MI_ARENA_BLOCK_SIZE) // for now, cannot cross chunk boundaries +#define MI_ARENA_MIN_OBJ_BLOCKS (1) +#define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries + +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) +#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) #define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) @@ -141,7 +144,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_BIN_COUNT (MI_BIN_FULL+1) -// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in dedicated orphan pages +// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in singleton pages #define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_BLOCK_ALIGN) // We never allocate more than PTRDIFF_MAX (see also ) @@ -279,7 +282,6 @@ typedef struct mi_subproc_s mi_subproc_t; // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { - mi_memid_t memid; // provenance of the page memory uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) uint16_t reserved; // number of blocks reserved in memory mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) @@ -293,6 +295,7 @@ typedef struct mi_page_s { uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the blocks #if (MI_ENCODE_FREELIST || MI_PADDING) uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary @@ -304,6 +307,7 @@ typedef struct mi_page_s { struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -312,7 +316,7 @@ typedef struct mi_page_s { // ------------------------------------------------------ #define MI_PAGE_ALIGN (64) -#define MI_PAGE_INFO_SIZE (MI_SIZE_SHIFT*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) +#define MI_PAGE_INFO_SIZE (2*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) // The max object size are checked to not waste more than 12.5% internally over the page sizes. // (Except for large pages since huge objects are allocated in 4MiB chunks) @@ -532,7 +536,7 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); // ------------------------------------------------------ struct mi_subproc_s { - _Atomic(size_t) abandoned_count; // count of abandoned pages for this sub-process + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // count of abandoned pages for this sub-process _Atomic(size_t) abandoned_os_list_count; // count of abandoned pages in the os-list mi_lock_t abandoned_os_lock; // lock for the abandoned os pages list (outside of arena's) (this lock protect list operations) mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list @@ -562,6 +566,7 @@ struct mi_tld_s { mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) mi_subproc_t* subproc; // sub-process this thread belongs to. + size_t tseq; // thread sequence id mi_os_tld_t os; // os tld mi_stats_t stats; // statistics }; diff --git a/src/arena-page.c b/src/arena-page.c new file mode 100644 index 00000000..93d25dbf --- /dev/null +++ b/src/arena-page.c @@ -0,0 +1,20 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" + + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + diff --git a/src/arena.c b/src/arena.c index 28ad61f1..c9f8400b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,6 +42,7 @@ typedef struct mi_arena_s { bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + mi_subproc_t* subproc; mi_bitmap_t blocks_free; // is the block free? mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) @@ -99,6 +100,9 @@ mi_arena_t* mi_arena_from_index(size_t idx) { return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); } +mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { + return mi_arena_from_index(mi_arena_id_index(id)); +} /* ----------------------------------------------------------- @@ -164,14 +168,11 @@ bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block Arena Allocation ----------------------------------------------------------- */ -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - +static mi_decl_noinline void* mi_arena_try_alloc_at( + mi_arena_t* arena, size_t needed_bcount, bool commit, size_t tseq, mi_memid_t* memid) +{ size_t block_index; - if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL; + if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, needed_bcount, tseq, &block_index)) return NULL; // claimed it! 
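/* Editor's aside (hedged sketch, not part of the patch): mi_block_count_of_size
   and mi_size_of_blocks are used throughout this hunk but defined elsewhere.
   Conceptually they round a byte size up to whole arena blocks and back, where
   MI_ARENA_BLOCK_SIZE is the 64 KiB granularity implied by the "512 KiB"
   medium-page comment earlier in this patch. The helpers below are my own
   names, shown only to make the surrounding arithmetic easier to follow: */
static inline size_t block_count_of_size(size_t size) {   // hypothetical helper
  return (size + MI_ARENA_BLOCK_SIZE - 1) / MI_ARENA_BLOCK_SIZE;  // round up to whole blocks
}
static inline size_t size_of_blocks(size_t bcount) {      // hypothetical helper
  return bcount * MI_ARENA_BLOCK_SIZE;                    // block count back to bytes
}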
void* p = mi_arena_block_start(arena, block_index); @@ -192,7 +193,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); if (!all_already_committed) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, NULL)) { memid->initially_committed = false; } else { @@ -205,75 +206,14 @@ static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t ar memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); } + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, needed_bcount)); + if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount)); } + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount)); + // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, needed_bcount)); + return p; } -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, - size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_size_of_blocks(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_arena_from_index(arena_index); - if (arena == NULL) return NULL; - if (!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } - - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} - - -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, 
commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } - } - return NULL; -} // try to reserve a fresh arena space static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) @@ -323,56 +263,286 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) + + +/* ----------------------------------------------------------- + Arena iteration +----------------------------------------------------------- */ + +static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_arena_id, mi_subproc_t* subproc, int numa_node, bool allow_large) { + if (subproc != NULL && arena->subproc != subproc) return false; + if (!allow_large && arena->is_large) return false; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return false; + if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (!numa_suitable) return false; + } + return true; +} + +#define MI_THREADS_PER_ARENA (16) + +#define mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, var_arena_id, var_arena) \ + { \ + size_t _max_arena; \ + size_t _start; \ + if (req_arena_id == _mi_arena_id_none()) { \ + _max_arena = mi_atomic_load_relaxed(&mi_arena_count); \ + _start = (_max_arena <= 1 ? 
0 : (tseq / MI_THREADS_PER_ARENA) % _max_arena); \ + } \ + else { \ + _max_arena = 1; \ + _start = mi_arena_id_index(req_arena_id); \ + mi_assert_internal(mi_atomic_load_relaxed(&mi_arena_count) > _start); \ + } \ + for (size_t i = 0; i < _max_arena; i++) { \ + size_t _idx = i + _start; \ + if (_idx >= _max_arena) { _idx -= _max_arena; } \ + const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); \ + mi_arena_t* const var_arena = mi_arena_from_index(_idx); \ + if (mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ + { + +#define mi_forall_arenas_end() }}} + + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + +// allocate blocks from the arenas +static mi_decl_noinline void* mi_arena_try_find_free( + size_t block_count, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - size_t tseq = _mi_thread_seq_id(); - *memid = _mi_memid_none(); + mi_assert_internal(block_count <= mi_block_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; - const int numa_node = _mi_os_numa_node(tld); // current numa node + // search arena's + mi_subproc_t* const subproc = tld->subproc; + const size_t tseq = tld->tseq; + mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) + { + void* p = mi_arena_try_alloc_at(arena, block_count, commit, tseq, memid); + if (p != NULL) return p; + } + mi_forall_arenas_end(); + return NULL; +} - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
- if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); +// Allocate blocks from the arena's -- potentially allocating a fresh arena +static mi_decl_noinline void* mi_arena_try_alloc( + size_t block_count, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +{ + mi_assert(block_count <= MI_ARENA_MAX_OBJ_BLOCKS); + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + + void* p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(mi_size_of_blocks(block_count), allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); - if (p != NULL) return p; - } - } } } +} +// Allocate from the OS (if allowed) +static void* mi_arena_os_alloc_aligned( + size_t size, size_t alignment, size_t align_offset, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +{ // if we cannot use OS allocation, return NULL if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { errno = ENOMEM; return NULL; } - // finally, fall back to the OS if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, &tld->stats); } else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, &tld->stats); } } + +// Allocate large sized memory +void* _mi_arena_alloc_aligned( + size_t size, size_t alignment, size_t align_offset, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + + // *memid = _mi_memid_none(); + // const int numa_node = _mi_os_numa_node(&tld->os); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? + req_arena_id == _mi_arena_id_none() && // not a specific arena? 
+ size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large + alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) // and good alignment + { + const size_t block_count = mi_block_count_of_size(size); + void* p = mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + + // fall back to the OS + return mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); +} + void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) { return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } + +/* ----------------------------------------------------------- + Arena page allocation +----------------------------------------------------------- */ + +static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +{ + const size_t bin = _mi_bin(block_size); + mi_assert_internal(bin < MI_BIN_COUNT); + + // any abandoned in our size class? + mi_subproc_t* const subproc = tld->subproc; + if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) return NULL; + + // search arena's + const bool allow_large = true; + size_t tseq = tld->tseq; + mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) + { + size_t block_index; + if (mi_bitmap_try_find_and_clear(&arena->blocks_abandoned[bin], tseq, &block_index)) { + // found an abandoned page of the right size + mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); + mi_page_t* page = (mi_page_t*)mi_arena_block_start(arena, block_index); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, block_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, block_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, block_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, block_count)); + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(!mi_page_is_full(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + return page; + } + } + mi_forall_arenas_end(); + return false; +} + +static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +{ + const bool allow_large = true; + const bool commit = true; + const size_t alignment = MI_ARENA_BLOCK_ALIGN; + + // try to allocate from free space in arena's + mi_memid_t memid; + mi_page_t* page = NULL; + if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) { + page = (mi_page_t*)mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, &memid, tld); + } + + // otherwise fall back to the OS + if (page == NULL) { + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_blocks(block_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + } + + if (page == NULL) return NULL; + + // claimed free blocks: initialize the page partly + _mi_memzero_aligned(page, sizeof(*page)); + mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN)); + const size_t reserved = (mi_size_of_blocks(block_count) - MI_PAGE_INFO_SIZE) / block_size; + mi_assert_internal(reserved > 0 && reserved < UINT16_MAX); + 
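// Editor's aside (worked example, not in the patch): with the 64 KiB small page
// and MI_PAGE_INFO_SIZE = 2*64 = 128 bytes implied by the definitions earlier in
// this patch, a fresh small page for 64-byte blocks reserves
//   (65536 - 128) / 64 = 1022
// blocks, comfortably below the UINT16_MAX bound that the assertion above checks.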
page->reserved = reserved; + page->page_start = (uint8_t*)page + MI_PAGE_INFO_SIZE; + page->block_size = block_size; + page->memid = memid; + page->free_is_zero = memid.initially_zero; + if (block_size > 0 && _mi_is_power_of_two(block_size)) { + page->block_size_shift = (uint8_t)mi_ctz(block_size); + } + else { + page->block_size_shift = 0; + } + + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(mi_page_is_abandoned(page)); + return page; +} + +// block_count: arena block count for the page +// block size : page block size +static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size_t block_size) { + const size_t req_arena_id = heap->arena_id; + mi_tld_t* const tld = heap->tld; + + // 1. look for an abandoned page + mi_page_t* page = mi_arena_page_try_find_abandoned(block_count, block_size, req_arena_id, tld); + if (page != NULL) { + _mi_page_reclaim(heap,page); + return page; + } + + // 2. find a free block, potentially allocating a new arena + page = mi_arena_page_alloc_fresh(block_count, block_size, req_arena_id, tld); + if (page != NULL) { + _mi_page_init(heap, page); + return page; + } + + return NULL; +} + + +static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { + _mi_error_message(EINVAL, "singleton page is not yet implemented\n"); + return NULL; +} + + +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { + mi_page_t* page; + if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { + mi_assert_internal(_mi_is_power_of_two(page_alignment)); + page = mi_singleton_page_alloc(heap, block_size, page_alignment); + } + else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { + page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + } + else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { + page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + } + else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { + page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + } + else { + page = mi_singleton_page_alloc(heap, block_size, page_alignment); + } + // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); + return page; +} + + /* ----------------------------------------------------------- Arena free ----------------------------------------------------------- */ diff --git a/src/bitmap.c b/src/bitmap.c index 463d74c7..9faa9ae9 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -512,9 +512,9 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) } -#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ +#define mi_bitmap_forall_set_chunks(bitmap,tseq,decl_chunk_idx) \ { size_t _set_idx; \ - size_t _start = start % MI_BFIELD_BITS; \ + size_t _start = tseq % MI_BFIELD_BITS; \ mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; @@ -530,8 +530,8 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). 
-bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) +bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { @@ -554,8 +554,8 @@ bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t star // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) +bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { @@ -576,11 +576,11 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pi // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { +bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries if (n == 0 || n > MI_BFIELD_BITS) return false; - mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { diff --git a/src/bitmap.h b/src/bitmap.h index 198a2902..fcadc213 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -79,14 +79,14 @@ mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, si // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ); // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); #endif // MI_XBITMAP_H diff --git a/src/page-map.c b/src/page-map.c index d3fcef79..cb527886 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -32,9 +32,13 @@ static bool mi_page_map_init(void) { return false; } if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { - _mi_warning_message("the page map was committed on-demand but not zero initialized!\n"); + _mi_warning_message("the page map was committed but not zero initialized!\n"); _mi_memzero_aligned(_mi_page_map, page_map_size); } + // commit the first part so NULL pointers get resolved without an access violation + if (!mi_page_map_all_committed) { + _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL); + } return true; } @@ -72,7 +76,7 @@ void _mi_page_map_register(mi_page_t* page) { // set the offsets for (int i = 0; i < block_count; i++) { mi_assert_internal(i < 128); - _mi_page_map[idx + i] = (int8_t)(-i-1); + _mi_page_map[idx + i] = (signed char)(-i-1); } } diff --git a/src/page.c b/src/page.c index a00ff615..fa006085 100644 --- a/src/page.c +++ b/src/page.c @@ -119,7 +119,7 @@ bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(page->keys[0] != 0); #endif if (mi_page_heap(page)!=NULL) { - mi_assert_internal(!_mi_process_is_initialized || page->thread_id == mi_page_heap(page)->thread_id || page->thread_id==0); + mi_assert_internal(!_mi_process_is_initialized || mi_page_thread_id(page) == mi_page_heap(page)->thread_id || mi_page_thread_id(page)==0); { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); @@ -249,19 +249,22 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { // called from segments when reclaiming abandoned pages void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { + mi_page_set_heap(page, heap); + _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) + _mi_page_free_collect(page, false); // ensure used count is up to date + mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); - #if MI_HUGE_PAGE_ABANDON - mi_assert_internal(_mi_page_segment(page)->page_kind != MI_PAGE_HUGE); - #endif - + // TODO: push on full queue immediately if it is full? 
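The `_mi_page_map_register` loop above tags each block of a multi-block page with a small negative offset. One consistent way to use that encoding is that, for any block index `b`, `b + map[b] + 1` yields the index of the page's first block; the toy below demonstrates that round trip. It is only an illustration of the encoding shown above, not a claim about the exact lookup code elsewhere in the patch:

    #include <assert.h>
    #include <stddef.h>
    #include <stdint.h>

    #define MAP_SIZE 4096
    static int8_t page_map[MAP_SIZE];   // toy stand-in for _mi_page_map

    // Encode: same scheme as the loop above, offsets -1, -2, ... for the page's blocks.
    static void map_register(size_t first_block, size_t block_count) {
      for (size_t i = 0; i < block_count; i++) {
        assert(i < 128);                           // offset must fit in a signed byte
        page_map[first_block + i] = (int8_t)(-(int)i - 1);
      }
    }

    // Decode (toy): recover the first block index from any block of the page.
    static size_t map_page_start(size_t block) {
      return (size_t)((ptrdiff_t)block + page_map[block] + 1);
    }

    // Example: a page spanning blocks 10..13 maps every block back to 10.
    //   map_register(10, 4);  assert(map_page_start(12) == 10);
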
mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_push(heap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); } + + // allocate a fresh page from a segment static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size_t block_size, size_t page_alignment) { #if !MI_HUGE_PAGE_ABANDON @@ -269,16 +272,12 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_assert_internal(mi_heap_contains_queue(heap, pq)); mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); #endif - mi_page_t* page = _mi_heap_page_alloc(heap, block_size, page_alignment); + mi_page_t* page = _mi_arena_page_alloc(heap, block_size, page_alignment); if (page == NULL) { // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) return NULL; } mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); - // a fresh page was found, initialize it - const size_t full_block_size = (pq == NULL || mi_page_is_huge(page) ? mi_page_block_size(page) : block_size); // see also: mi_segment_huge_page_alloc - mi_assert_internal(full_block_size >= block_size); - mi_page_init(heap, page, full_block_size, heap->tld); mi_heap_stat_increase(heap, pages, 1); if (pq != NULL) { mi_page_queue_push(heap, pq, page); } mi_assert_expensive(_mi_page_is_valid(page)); @@ -389,7 +388,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { // and abandon it mi_assert_internal(mi_page_is_abandoned(page)); - _mi_arena_page_abandon(page,&pheap->tld); + _mi_arena_page_abandon(page, pheap->tld); } // force abandon a page @@ -432,7 +431,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { // and free it mi_page_set_heap(page,NULL); - _mi_arena_page_free(page, force, &pheap->tld); + _mi_arena_page_free(page, pheap->tld); } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE @@ -617,7 +616,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co // Note: we also experimented with "bump" allocation on the first // allocations but this did not speed up any benchmark (due to an // extra test in malloc? or cache effects?) 
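`mi_page_extend_free` (next) grows a page's free list in increments rather than threading every block up front, which keeps the first allocations cheap. A toy version of the linking step that `mi_page_free_list_extend` performs, assuming `extend >= 1`, `bsize >= sizeof(block_t)`, and ignoring the secure/encoded free-list variant (field and function names are illustrative):

    #include <stddef.h>
    #include <stdint.h>

    typedef struct block_s { struct block_s* next; } block_t;

    // Link `extend` fresh blocks of `bsize` bytes each, starting right after the
    // currently used capacity, and push them in front of the existing free list.
    static block_t* free_list_extend(uint8_t* page_area, size_t capacity, size_t bsize,
                                     size_t extend, block_t* free_head) {
      block_t* const start = (block_t*)(page_area + capacity * bsize);
      block_t* last = start;
      for (size_t i = 1; i < extend; i++) {
        block_t* next = (block_t*)((uint8_t*)last + bsize);
        last->next = next;                 // thread the blocks into a singly linked list
        last = next;
      }
      last->next = free_head;              // the old free list follows the new blocks
      return start;                        // new head; the caller also bumps page->capacity
    }
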
-static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) { +static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_expensive(mi_page_is_valid_init(page)); #if (MI_SECURE<=2) mi_assert(page->free == NULL); @@ -629,7 +628,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) size_t page_size; //uint8_t* page_start = mi_page_area(page, &page_size); - mi_stat_counter_increase(tld->stats.pages_extended, 1); + mi_heap_stat_counter_increase(heap, pages_extended, 1); // calculate the extend count const size_t bsize = mi_page_block_size(page); @@ -651,48 +650,37 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld) // and append the extend the free list if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) { - mi_page_free_list_extend(page, bsize, extend, &tld->stats ); + mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats ); } else { - mi_page_free_list_extend_secure(heap, page, bsize, extend, &tld->stats); + mi_page_free_list_extend_secure(heap, page, bsize, extend, &heap->tld->stats); } // enable the new free list page->capacity += (uint16_t)extend; - mi_stat_increase(tld->stats.page_committed, extend * bsize); + mi_heap_stat_increase(heap, page_committed, extend * bsize); mi_assert_expensive(mi_page_is_valid_init(page)); } -// Initialize a fresh page -static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi_tld_t* tld) { +// Initialize a fresh page (that is already partially initialized) +void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); - mi_assert_internal(block_size > 0); - // set fields mi_page_set_heap(page, heap); - page->block_size = block_size; size_t page_size; uint8_t* page_start = mi_page_area(page, &page_size); mi_track_mem_noaccess(page_start,page_size); - mi_assert_internal(page_size / block_size < (1L<<16)); - page->reserved = (uint16_t)(page_size / block_size); + mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16)); mi_assert_internal(page->reserved > 0); #if (MI_PADDING || MI_ENCODE_FREELIST) page->keys[0] = _mi_heap_random_next(heap); page->keys[1] = _mi_heap_random_next(heap); #endif - page->free_is_zero = page->memid.initially_zero; #if MI_DEBUG>2 if (page->memid.initially_zero) { mi_track_mem_defined(page->page_start, page_size); mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); } #endif - if (block_size > 0 && _mi_is_power_of_two(block_size)) { - page->block_size_shift = (uint8_t)mi_ctz(block_size); - } - else { - page->block_size_shift = 0; - } - + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); @@ -705,11 +693,11 @@ static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t block_size, mi mi_assert_internal(page->keys[0] != 0); mi_assert_internal(page->keys[1] != 0); #endif - mi_assert_internal(page->block_size_shift == 0 || (block_size == ((size_t)1 << page->block_size_shift))); + mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); // initialize an initial free list - mi_page_extend_free(heap,page,tld); + mi_page_extend_free(heap,page); mi_assert(mi_page_immediate_available(page)); } From 68f5fb2f4b857681f80789ac0902bb39535bd072 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 15:08:06 -0800 Subject: [PATCH 004/264] wip: further 
progress on segment removal; arena allocation --- ide/vs2022/mimalloc-override.vcxproj | 1 - ide/vs2022/mimalloc-override.vcxproj.filters | 3 -- ide/vs2022/mimalloc.vcxproj | 7 ++- ide/vs2022/mimalloc.vcxproj.filters | 6 +-- include/mimalloc/bits.h | 2 +- include/mimalloc/internal.h | 6 ++- src/alloc-aligned.c | 4 +- src/arena.c | 16 +++--- src/bitmap.c | 6 +-- src/free.c | 2 +- src/heap.c | 23 ++++---- src/init.c | 57 +++++++------------- src/os.c | 9 ++-- src/page-map.c | 2 +- src/page-queue.c | 10 ++-- src/page.c | 9 ++-- src/prim/windows/prim.c | 6 +-- src/stats.c | 9 ++-- 18 files changed, 80 insertions(+), 98 deletions(-) diff --git a/ide/vs2022/mimalloc-override.vcxproj b/ide/vs2022/mimalloc-override.vcxproj index 4383d886..32bd97d1 100644 --- a/ide/vs2022/mimalloc-override.vcxproj +++ b/ide/vs2022/mimalloc-override.vcxproj @@ -265,7 +265,6 @@ - diff --git a/ide/vs2022/mimalloc-override.vcxproj.filters b/ide/vs2022/mimalloc-override.vcxproj.filters index a9f66c35..6656c16d 100644 --- a/ide/vs2022/mimalloc-override.vcxproj.filters +++ b/ide/vs2022/mimalloc-override.vcxproj.filters @@ -46,9 +46,6 @@ Sources - - Sources - Sources diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 3dd7326f..41fe0b46 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -214,6 +214,12 @@ + + true + true + true + true + false @@ -258,7 +264,6 @@ - diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 2eed7e90..237ef1ed 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,6 +58,9 @@ Sources + + Sources + @@ -87,9 +90,6 @@ Headers - - Headers - Headers diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index ad7ea3e6..d6695a00 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -282,7 +282,7 @@ static inline size_t mi_rotr(size_t x, size_t r) { #elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32) return mi_builtin(rotateright32)(x,r); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - #if MI_BFIELD_SIZE==4 + #if MI_SIZE_BITS==32 return _lrotr(x,(int)r); #else return _rotr64(x,(int)r); diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index d60b0c15..515acfc1 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -140,6 +140,8 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld); void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld); +bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page); +void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap); void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); @@ -567,11 +569,11 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { return (page->reserved - page->used <= frac); } -static inline bool mi_page_is_abandoned(mi_page_t* page) { +static inline bool mi_page_is_abandoned(const mi_page_t* page) { return (mi_page_thread_id(page) == 0); } -static inline bool mi_page_is_huge(mi_page_t* page) { +static inline bool mi_page_is_huge(const mi_page_t* page) { return (page->block_size > MI_LARGE_MAX_OBJ_SIZE); } diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index b4da4ded..43dc2d36 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -16,12 +16,12 @@ terms of 
the MIT license. A copy of the license can be found in the file // ------------------------------------------------------ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { - // objects up to `MI_MAX_ALIGN_GUARANTEE` are allocated aligned to their size (see `segment.c:_mi_segment_page_start`). + // objects up to `MI_PAGE_ALIGN` are allocated aligned to their size mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); if (alignment > size) return false; if (alignment <= MI_MAX_ALIGN_SIZE) return true; const size_t bsize = mi_good_size(size); - return (bsize <= MI_MAX_ALIGN_GUARANTEE && (bsize & (alignment-1)) == 0); + return (bsize <= MI_PAGE_ALIGN && (bsize & (alignment-1)) == 0); } #if MI_GUARDED diff --git a/src/arena.c b/src/arena.c index c9f8400b..0db8acf3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -298,7 +298,7 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are for (size_t i = 0; i < _max_arena; i++) { \ size_t _idx = i + _start; \ if (_idx >= _max_arena) { _idx -= _max_arena; } \ - const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); \ + const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); MI_UNUSED(var_arena_id);\ mi_arena_t* const var_arena = mi_arena_from_index(_idx); \ if (mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ { @@ -341,6 +341,7 @@ static mi_decl_noinline void* mi_arena_try_alloc( mi_assert(block_count <= MI_ARENA_MAX_OBJ_BLOCKS); mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + // try to find free blocks in the arena's void* p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; @@ -354,6 +355,8 @@ static mi_decl_noinline void* mi_arena_try_alloc( if (p != NULL) return p; } } + + return NULL; } // Allocate from the OS (if allowed) @@ -445,7 +448,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t bl } } mi_forall_arenas_end(); - return false; + return NULL; } static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) @@ -455,7 +458,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz const size_t alignment = MI_ARENA_BLOCK_ALIGN; // try to allocate from free space in arena's - mi_memid_t memid; + mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) { page = (mi_page_t*)mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, &memid, tld); @@ -472,8 +475,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz _mi_memzero_aligned(page, sizeof(*page)); mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN)); const size_t reserved = (mi_size_of_blocks(block_count) - MI_PAGE_INFO_SIZE) / block_size; - mi_assert_internal(reserved > 0 && reserved < UINT16_MAX); - page->reserved = reserved; + mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); + page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + MI_PAGE_INFO_SIZE; page->block_size = block_size; page->memid = memid; @@ -493,7 +496,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz // block_count: arena block count for the page // block size : page block size static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size_t block_size) { - 
const size_t req_arena_id = heap->arena_id; + const mi_arena_id_t req_arena_id = heap->arena_id; mi_tld_t* const tld = heap->tld; // 1. look for an abandoned page @@ -515,6 +518,7 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { + MI_UNUSED(heap); MI_UNUSED(block_size); MI_UNUSED(page_alignment); _mi_error_message(EINVAL, "singleton page is not yet implemented\n"); return NULL; } diff --git a/src/bitmap.c b/src/bitmap.c index 9faa9ae9..24c0d9c9 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -149,7 +149,7 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); all_already_xset = all_already_xset && already_xset; // next field @@ -268,7 +268,6 @@ static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, // try again } #else - size_t idx; for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { size_t idx; if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit @@ -306,7 +305,6 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, // try again } #else - size_t idx; for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { const mi_bfield_t x = chunk->bfields[i]; // has_set8 has low bit in each byte set if the byte in x == 0xFF @@ -374,7 +372,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, // are all bits in a bitmap chunk set? -static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { +static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return _mm256_test_all_ones(vec); diff --git a/src/free.c b/src/free.c index e1cc9276..224070fe 100644 --- a/src/free.c +++ b/src/free.c @@ -233,7 +233,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) { // the page is abandoned, try to reclaim it into our heap - if (_mi_heap_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue + if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); mi_free(block); // recursively free as now it will be a local free in our heap diff --git a/src/heap.c b/src/heap.c index e4955ba7..8ee66055 100644 --- a/src/heap.c +++ b/src/heap.c @@ -54,9 +54,7 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ MI_UNUSED(arg1); MI_UNUSED(arg2); MI_UNUSED(pq); - mi_assert_internal(mi_page_heap(page) == heap); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_internal(segment->thread_id == heap->thread_id); + mi_assert_internal(mi_page_heap(page) == heap); mi_assert_expensive(_mi_page_is_valid(page)); return true; } @@ -135,7 +133,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // the main thread is abandoned 
(end-of-program), try to reclaim all abandoned segments. // if all memory is freed by now, all segments should be freed. // note: this only collects in the current subprocess - _mi_abandoned_reclaim_all(heap, &heap->tld->segments); + _mi_arena_reclaim_all_abandoned(heap); } // if abandoning, mark all pages to no longer add to delayed_free @@ -155,7 +153,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); // collect segments (purge pages, this can be expensive so don't force on abandonment) - _mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); + // _mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); // if forced, collect thread data cache on program-exit (or shared library unload) if (force && is_main_thread && mi_heap_is_backing(heap)) { @@ -320,13 +318,13 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // stats const size_t bsize = mi_page_block_size(page); - if (bsize > MI_LARGE_OBJ_SIZE_MAX) { + if (bsize > MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, huge, bsize); } #if (MI_STAT) _mi_page_free_collect(page, false); // update used count const size_t inuse = page->used; - if (bsize <= MI_LARGE_OBJ_SIZE_MAX) { + if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, normal, bsize * inuse); #if (MI_STAT>1) mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse); @@ -343,7 +341,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // mi_page_free(page,false); page->next = NULL; page->prev = NULL; - _mi_segment_page_free(page,false /* no force? */, &heap->tld->segments); + _mi_arena_page_free(page,heap->tld); return true; // keep going } @@ -483,11 +481,8 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { // static since it is not thread safe to access heaps from other threads. static mi_heap_t* mi_heap_of_block(const void* p) { if (p == NULL) return NULL; - mi_segment_t* segment = _mi_ptr_segment(p); - bool valid = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(valid); - if mi_unlikely(!valid) return NULL; - return mi_page_heap(_mi_segment_page_of(segment,p)); + mi_page_t* page = _mi_ptr_page(p); // TODO: check pointer validity? + return mi_page_heap(page); } bool mi_heap_contains_block(mi_heap_t* heap, const void* p) { @@ -562,7 +557,7 @@ bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_ if (page->used == 0) return true; size_t psize; - uint8_t* const pstart = _mi_segment_page_start(_mi_page_segment(page), page, &psize); + uint8_t* const pstart = mi_page_area(page, &psize); mi_heap_t* const heap = mi_page_heap(page); const size_t bsize = mi_page_block_size(page); const size_t ubsize = mi_page_usable_block_size(page); // without padding diff --git a/src/init.c b/src/init.c index 2544f097..215d6be8 100644 --- a/src/init.c +++ b/src/init.c @@ -14,8 +14,6 @@ terms of the MIT license. 
A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - 0, - false, false, false, false, 0, // capacity 0, // reserved capacity { 0 }, // flags @@ -33,10 +31,9 @@ const mi_page_t _mi_page_empty = { #endif MI_ATOMIC_VAR_INIT(0), // xthread_free MI_ATOMIC_VAR_INIT(0), // xheap - NULL, NULL - #if MI_INTPTR_SIZE==4 - , { NULL } - #endif + MI_ATOMIC_VAR_INIT(0), // xthread_id + NULL, NULL, // next, prev + { { NULL, 0}, false, false, false, MI_MEM_NONE } // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -63,8 +60,8 @@ const mi_page_t _mi_page_empty = { QNULL( 10240), QNULL( 12288), QNULL( 14336), QNULL( 16384), QNULL( 20480), QNULL( 24576), QNULL( 28672), QNULL( 32768), /* 56 */ \ QNULL( 40960), QNULL( 49152), QNULL( 57344), QNULL( 65536), QNULL( 81920), QNULL( 98304), QNULL(114688), QNULL(131072), /* 64 */ \ QNULL(163840), QNULL(196608), QNULL(229376), QNULL(262144), QNULL(327680), QNULL(393216), QNULL(458752), QNULL(524288), /* 72 */ \ - QNULL(MI_LARGE_OBJ_WSIZE_MAX + 1 /* 655360, Huge queue */), \ - QNULL(MI_LARGE_OBJ_WSIZE_MAX + 2) /* Full queue */ } + QNULL(MI_LARGE_MAX_OBJ_WSIZE + 1 /* 655360, Huge queue */), \ + QNULL(MI_LARGE_MAX_OBJ_WSIZE + 2) /* Full queue */ } #define MI_STAT_COUNT_NULL() {0,0,0,0} @@ -82,8 +79,6 @@ const mi_page_t _mi_page_empty = { MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), MI_STAT_COUNT_NULL(), \ - MI_STAT_COUNT_NULL(), \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ @@ -101,10 +96,10 @@ const mi_page_t _mi_page_empty = { mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, - MI_ATOMIC_VAR_INIT(NULL), - 0, // tid + MI_ATOMIC_VAR_INIT(NULL), // thread delayed free + 0, // thread_id + 0, // arena_id 0, // cookie - 0, // arena id { 0, 0 }, // keys { {0}, {0}, 0, true }, // random 0, // page count @@ -124,17 +119,6 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } -// Thread sequence number -static _Atomic(size_t) mi_tcount; -static mi_decl_thread size_t mi_tseq; - -size_t _mi_thread_seq_id(void) mi_attr_noexcept { - size_t tseq = mi_tseq; - if (tseq == 0) { - mi_tseq = tseq = mi_atomic_add_acq_rel(&mi_tcount,1); - } - return tseq; -} // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; @@ -146,12 +130,10 @@ static mi_decl_cache_align mi_subproc_t mi_subproc_default; static mi_decl_cache_align mi_tld_t tld_main = { 0, false, &_mi_heap_main, &_mi_heap_main, - { { NULL, NULL }, {NULL ,NULL}, {NULL ,NULL, 0}, - 0, 0, 0, 0, 0, &mi_subproc_default, - &tld_main.stats, &tld_main.os - }, // segments + NULL, // subproc + 0, // tseq { 0, &tld_main.stats }, // os - { MI_STATS_NULL } // stats + { MI_STATS_NULL } // stats }; mi_decl_cache_align mi_heap_t _mi_heap_main = { @@ -287,9 +269,9 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { mi_heap_t* heap = mi_heap_get_default(); if (heap == NULL) return; - mi_assert(heap->tld->segments.subproc == &mi_subproc_default); - if (heap->tld->segments.subproc != &mi_subproc_default) return; - heap->tld->segments.subproc = _mi_subproc_from_id(subproc_id); + mi_assert(heap->tld->subproc == &mi_subproc_default); + if (heap->tld->subproc != &mi_subproc_default) 
return; + heap->tld->subproc = _mi_subproc_from_id(subproc_id); } @@ -405,14 +387,16 @@ static bool _mi_thread_heap_init(void) { return false; } +// Thread sequence number +static _Atomic(size_t) mi_tcount; + // initialize thread local data void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { _mi_memzero_aligned(tld,sizeof(mi_tld_t)); tld->heap_backing = bheap; tld->heaps = NULL; - tld->segments.subproc = &mi_subproc_default; - tld->segments.stats = &tld->stats; - tld->segments.os = &tld->os; + tld->subproc = &mi_subproc_default; + tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } @@ -449,8 +433,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { _mi_stats_done(&heap->tld->stats); // free if not the main thread - if (heap != &_mi_heap_main) { - mi_assert_internal(heap->tld->segments.count == 0 || heap->thread_id != _mi_thread_id()); + if (heap != &_mi_heap_main) { mi_thread_data_free((mi_thread_data_t*)heap); } else { diff --git a/src/os.c b/src/os.c index 83521766..da41d152 100644 --- a/src/os.c +++ b/src/os.c @@ -245,7 +245,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // note: this is dangerous on Windows as VirtualFree needs the actual base pointer // this is handled though by having the `base` field in the memid's *base = p; // remember the base - p = mi_align_up_ptr(p, alignment); + p = _mi_align_up_ptr(p, alignment); // explicitly commit only the aligned part if (commit) { @@ -258,7 +258,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (p == NULL) return NULL; // and selectively unmap parts around the over-allocated area. - void* aligned_p = mi_align_up_ptr(p, alignment); + void* aligned_p = _mi_align_up_ptr(p, alignment); size_t pre_size = (uint8_t*)aligned_p - (uint8_t*)p; size_t mid_size = _mi_align_up(size, _mi_os_page_size()); size_t post_size = over_size - pre_size - mid_size; @@ -316,6 +316,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo } void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { + MI_UNUSED(stats); void* p = _mi_os_alloc(size, memid, &_mi_stats_main); if (p == NULL) return NULL; @@ -373,10 +374,10 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, if (size == 0 || addr == NULL) return NULL; // page align conservatively within the range - void* start = (conservative ? mi_align_up_ptr(addr, _mi_os_page_size()) + void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size()) : mi_align_down_ptr(addr, _mi_os_page_size())); void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size()) - : mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size())); + : _mi_align_up_ptr((uint8_t*)addr + size, _mi_os_page_size())); ptrdiff_t diff = (uint8_t*)end - (uint8_t*)start; if (diff <= 0) return NULL; diff --git a/src/page-map.c b/src/page-map.c index cb527886..d70c3ee6 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -74,7 +74,7 @@ void _mi_page_map_register(mi_page_t* page) { } // set the offsets - for (int i = 0; i < block_count; i++) { + for (int i = 0; i < (int)block_count; i++) { mi_assert_internal(i < 128); _mi_page_map[idx + i] = (signed char)(-i-1); } diff --git a/src/page-queue.c b/src/page-queue.c index 0a791adb..c6b19985 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -38,15 +38,15 @@ terms of the MIT license. 
A copy of the license can be found in the file static inline bool mi_page_queue_is_huge(const mi_page_queue_t* pq) { - return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+sizeof(uintptr_t))); + return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+sizeof(uintptr_t))); } static inline bool mi_page_queue_is_full(const mi_page_queue_t* pq) { - return (pq->block_size == (MI_LARGE_OBJ_SIZE_MAX+(2*sizeof(uintptr_t)))); + return (pq->block_size == (MI_LARGE_MAX_OBJ_SIZE+(2*sizeof(uintptr_t)))); } static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { - return (pq->block_size > MI_LARGE_OBJ_SIZE_MAX); + return (pq->block_size > MI_LARGE_MAX_OBJ_SIZE); } /* ----------------------------------------------------------- @@ -76,7 +76,7 @@ static inline uint8_t mi_bin(size_t size) { bin = (uint8_t)wsize; } #endif - else if (wsize > MI_LARGE_OBJ_WSIZE_MAX) { + else if (wsize > MI_LARGE_MAX_OBJ_WSIZE) { bin = MI_BIN_HUGE; } else { @@ -113,7 +113,7 @@ size_t _mi_bin_size(uint8_t bin) { // Good size for allocation size_t mi_good_size(size_t size) mi_attr_noexcept { - if (size <= MI_LARGE_OBJ_SIZE_MAX) { + if (size <= MI_LARGE_MAX_OBJ_SIZE) { return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } else { diff --git a/src/page.c b/src/page.c index fa006085..122b4324 100644 --- a/src/page.c +++ b/src/page.c @@ -36,8 +36,8 @@ static inline mi_block_t* mi_page_block_at(const mi_page_t* page, void* page_sta return (mi_block_t*)((uint8_t*)page_start + (i * block_size)); } -static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld); -static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page, mi_tld_t* tld); +//static void mi_page_init(mi_heap_t* heap, mi_page_t* page, size_t size, mi_tld_t* tld); +static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page); #if (MI_DEBUG>=3) static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { @@ -83,7 +83,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(page->capacity <= page->reserved); // const size_t bsize = mi_page_block_size(page); - uint8_t* start = mi_page_start(page); + // uint8_t* start = mi_page_start(page); //mi_assert_internal(start + page->capacity*page->block_size == page->top); mi_assert_internal(mi_page_list_is_valid(page,page->free)); @@ -414,6 +414,7 @@ void _mi_page_force_abandon(mi_page_t* page) { // Free a page with no more free blocks void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq, bool force) { + MI_UNUSED(force); mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(pq == mi_page_queue_of(page)); @@ -784,7 +785,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p } if (page != NULL && !mi_page_immediate_available(page)) { mi_assert_internal(mi_page_is_expandable(page)); - mi_page_extend_free(heap, page, heap->tld); + mi_page_extend_free(heap, page); } if (page == NULL) { diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 1d3d6f41..418c950f 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -127,7 +127,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) ULONGLONG memInKiB = 0; if (GetPhysicallyInstalledSystemMemory(&memInKiB)) { if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) { - config->physical_memory = memInKiB * MI_KiB; + config->physical_memory = (size_t)(memInKiB * MI_KiB); } } // get the VirtualAlloc2 function @@ -175,7 +175,7 @@ int _mi_prim_free(void* addr, size_t size ) { // the start of the region. 
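The renamed `MI_LARGE_MAX_OBJ_SIZE`/`_WSIZE` constants in the page-queue changes above keep the existing trick that the huge and full queues are identified purely by sentinel block sizes just above the large-object maximum. The predicates reduce to the following sketch, with a hypothetical stand-in constant (the real value comes from the mimalloc headers):

    #include <stdbool.h>
    #include <stddef.h>
    #include <stdint.h>

    #define LARGE_MAX_OBJ_SIZE (512*1024)   // hypothetical stand-in for MI_LARGE_MAX_OBJ_SIZE

    typedef struct queue_s { size_t block_size; } queue_t;

    static bool queue_is_huge(const queue_t* pq)    { return pq->block_size == LARGE_MAX_OBJ_SIZE + sizeof(uintptr_t); }
    static bool queue_is_full(const queue_t* pq)    { return pq->block_size == LARGE_MAX_OBJ_SIZE + 2*sizeof(uintptr_t); }
    static bool queue_is_special(const queue_t* pq) { return pq->block_size > LARGE_MAX_OBJ_SIZE; }
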
MEMORY_BASIC_INFORMATION info = { 0 }; VirtualQuery(addr, &info, sizeof(info)); - if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)MI_SEGMENT_SIZE) { + if (info.AllocationBase < addr && ((uint8_t*)addr - (uint8_t*)info.AllocationBase) < (ptrdiff_t)(4*MI_MiB)) { errcode = 0; err = (VirtualFree(info.AllocationBase, 0, MEM_RELEASE) == 0); if (err) { errcode = GetLastError(); } @@ -239,7 +239,7 @@ static void* win_virtual_alloc_prim(void* addr, size_t size, size_t try_alignmen // success, return the address return p; } - else if (max_retry_msecs > 0 && (try_alignment <= 2*MI_SEGMENT_ALIGN) && + else if (max_retry_msecs > 0 && (try_alignment <= 8*MI_MiB) && (flags&MEM_COMMIT) != 0 && (flags&MEM_LARGE_PAGES) == 0 && win_is_out_of_memory_error(GetLastError())) { // if committing regular memory and being out-of-memory, diff --git a/src/stats.c b/src/stats.c index 29376ace..14489937 100644 --- a/src/stats.c +++ b/src/stats.c @@ -90,7 +90,6 @@ static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t // must be thread safe as it is called from stats_merge static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { if (stats==src) return; - mi_stat_add(&stats->segments, &src->segments,1); mi_stat_add(&stats->pages, &src->pages,1); mi_stat_add(&stats->reserved, &src->reserved, 1); mi_stat_add(&stats->committed, &src->committed, 1); @@ -99,11 +98,9 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_add(&stats->page_committed, &src->page_committed, 1); mi_stat_add(&stats->pages_abandoned, &src->pages_abandoned, 1); - mi_stat_add(&stats->segments_abandoned, &src->segments_abandoned, 1); mi_stat_add(&stats->threads, &src->threads, 1); mi_stat_add(&stats->malloc, &src->malloc, 1); - mi_stat_add(&stats->segments_cache, &src->segments_cache, 1); mi_stat_add(&stats->normal, &src->normal, 1); mi_stat_add(&stats->huge, &src->huge, 1); mi_stat_add(&stats->giant, &src->giant, 1); @@ -329,9 +326,9 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_peak_print(&stats->reset, "reset", 1, out, arg ); mi_stat_peak_print(&stats->purged, "purged", 1, out, arg ); mi_stat_print(&stats->page_committed, "touched", 1, out, arg); - mi_stat_print(&stats->segments, "segments", -1, out, arg); - mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); - mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); + //mi_stat_print(&stats->segments, "segments", -1, out, arg); + //mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); + //mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); mi_stat_print(&stats->pages, "pages", -1, out, arg); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); From 9603fe8b50121dd0ee8b4f9748faba9e749569bf Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 16:27:58 -0800 Subject: [PATCH 005/264] can compile without missing functions --- ide/vs2022/mimalloc-override.vcxproj | 8 +- ide/vs2022/mimalloc-override.vcxproj.filters | 8 +- ide/vs2022/mimalloc.vcxproj | 6 - ide/vs2022/mimalloc.vcxproj.filters | 3 - include/mimalloc/internal.h | 6 +- src/arena.c | 971 ++----------------- src/page-map.c | 39 +- 7 files changed, 145 insertions(+), 896 deletions(-) diff --git a/ide/vs2022/mimalloc-override.vcxproj b/ide/vs2022/mimalloc-override.vcxproj index 32bd97d1..a5d5c34c 100644 --- a/ide/vs2022/mimalloc-override.vcxproj +++ 
b/ide/vs2022/mimalloc-override.vcxproj @@ -236,17 +236,18 @@ - + + + true true true true - - + true @@ -264,7 +265,6 @@ - diff --git a/ide/vs2022/mimalloc-override.vcxproj.filters b/ide/vs2022/mimalloc-override.vcxproj.filters index 6656c16d..60c7a1fb 100644 --- a/ide/vs2022/mimalloc-override.vcxproj.filters +++ b/ide/vs2022/mimalloc-override.vcxproj.filters @@ -46,16 +46,16 @@ Sources - - Sources - Sources Sources - + + Sources + + Sources diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 41fe0b46..8606faf3 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -214,12 +214,6 @@ - - true - true - true - true - false diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 237ef1ed..a47efddd 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,9 +58,6 @@ Sources - - Sources - diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 515acfc1..3c8216ec 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -130,8 +130,8 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t ma // arena.c mi_arena_id_t _mi_arena_id_none(void); void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld); +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); @@ -503,7 +503,7 @@ static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); mi_atomic_store_release(&page->xheap,(uintptr_t)heap); if (heap != NULL) { - page->heap_tag = heap->tag; + page->heap_tag = heap->tag; mi_atomic_store_release(&page->xthread_id, heap->thread_id); } else { diff --git a/src/arena.c b/src/arena.c index 0db8acf3..9dbf73d6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -155,15 +155,26 @@ static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, siz } // returns if the arena is exclusive -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { +static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); *arena_index = mi_arena_id_index(memid.mem.arena.id); *block_index = memid.mem.arena.block_index; return memid.mem.arena.is_exclusive; } +// get the arena and block index +static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* block_index) { + size_t arena_index; + mi_arena_memid_indices(memid, &arena_index, block_index); + return mi_arena_from_index(arena_index); +} +static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index) { + // todo: maybe store the arena* directly in the page? 
+ return mi_arena_from_memid(page->memid, block_index); +} + /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ @@ -407,7 +418,7 @@ void* _mi_arena_alloc_aligned( return mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); } -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } @@ -546,6 +557,95 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ return page; } +static uint8_t* mi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { + // todo: record real allocated size instead of trying to recalculate? + size_t page_size; + uint8_t* const pstart = mi_page_area(page, &page_size); + const size_t diff = pstart - (uint8_t*)page; + const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); + if (psize != NULL) { *psize = size; } + return pstart; +} + +void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { + size_t size; + uint8_t* pstart = mi_arena_page_allocated_area(page, &size); + _mi_arena_free(pstart, size, size, page->memid, &tld->stats); +} + +/* ----------------------------------------------------------- + Arena abandon +----------------------------------------------------------- */ + +void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(page->next==NULL); + + if (mi_page_all_free(page)) { + _mi_arena_page_free(page, tld); + } + else if (mi_page_is_full(page)) { // includes singleton pages + // leave as is; it will be reclaimed on free + } + else if (mi_memkind_is_os(page->memid.memkind)) { + _mi_error_message(EINVAL, "implement page abandon for OS allocated pages\n"); + // leave as is; it will be reclaimed on the first free + } + else if (page->memid.memkind==MI_MEM_ARENA) { + size_t size; + mi_arena_page_allocated_area(page, &size); + size_t bin = _mi_bin(mi_page_block_size(page)); + size_t block_index; + mi_arena_t* arena = mi_page_arena(page, &block_index); + bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_abandoned[bin], block_index, 1, NULL); + MI_UNUSED(were_zero); mi_assert_internal(were_zero); + mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); + } + else { + _mi_error_message(EINVAL, "implement page abandon for external allocated pages\n"); + // leave as is; it will be reclaimed on the first free + } +} + +bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { + mi_assert_internal(mi_page_is_abandoned(page)); + // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned + mi_memid_t memid = page->memid; + if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's + + if mi_likely(memid.memkind == MI_MEM_ARENA) { + size_t block_index; + mi_arena_t* arena = mi_page_arena(page, &block_index); + if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess + + // don't reclaim more from a `free` call than half the current segments + // this is to prevent a pure free-ing thread to start owning too many segments + // (but not for 
out-of-arena segments as that is the main way to be reclaimed for those) + // if (segment->memid.memkind == MI_MEM_ARENA && heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) { + // return false; + // } + const size_t bin = _mi_bin(page->block_size); + if (mi_bitmap_try_xsetN(MI_BIT_CLEAR, &arena->blocks_abandoned[bin], block_index, 1)) { + // we got it atomically + _mi_page_reclaim(heap, page); + mi_assert_internal(!mi_page_is_abandoned(page)); + return true; + } + } + else { + _mi_warning_message("implement reclaim for OS allocated pages\n"); + } + + + return false; +} + +void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { + MI_UNUSED(heap); + // TODO: implement this + return; +} + /* ----------------------------------------------------------- Arena free @@ -1017,97 +1117,15 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { } -#if 0 - -#define MI_IN_ARENA_C -#include "arena-abandon.c" -#undef MI_IN_ARENA_C - -/* ----------------------------------------------------------- - Arena id's - id = arena_index + 1 ------------------------------------------------------------ */ - -size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); -} - -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; -} - -mi_arena_id_t _mi_arena_id_none(void) { - return 0; -} - -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); -} - -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { - if (memid.memkind == MI_MEM_ARENA) { - return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); - } - else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); - } -} - -size_t mi_arena_get_count(void) { - return mi_atomic_load_relaxed(&mi_arena_count); -} - -mi_arena_t* mi_arena_from_index(size_t idx) { - mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); -} - - -/* ----------------------------------------------------------- - Arena allocations get a (currently) 16-bit memory id where the - lower 8 bits are the arena id, and the upper bits the block index. 
------------------------------------------------------------ */ - -static size_t mi_block_count_of_size(size_t size) { - return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); -} - -static size_t mi_size_of_blocks(size_t bcount) { - return (bcount * MI_ARENA_BLOCK_SIZE); -} - -static size_t mi_arena_size(mi_arena_t* arena) { - return mi_size_of_blocks(arena->block_count); -} - -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { - mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); - memid.mem.arena.id = id; - memid.mem.arena.block_index = bitmap_index; - memid.mem.arena.is_exclusive = is_exclusive; - return memid; -} - -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { - mi_assert_internal(memid.memkind == MI_MEM_ARENA); - *arena_index = mi_arena_id_index(memid.mem.arena.id); - *bitmap_index = memid.mem.arena.block_index; - return memid.mem.arena.is_exclusive; -} - - - /* ----------------------------------------------------------- Special static area for mimalloc internal structures - to avoid OS calls (for example, for the arena metadata (~= 256b)) + to avoid OS calls (for example, for the subproc metadata (~= 721b)) ----------------------------------------------------------- */ #define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 -static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; +static mi_decl_cache_align _Atomic(size_t)mi_arena_static_top; static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { *memid = _mi_memid_none(); @@ -1164,784 +1182,9 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { } } -void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { - return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex))); -} - - -/* ----------------------------------------------------------- - Thread safe allocation in an arena ------------------------------------------------------------ */ - -// claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats) -{ - size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter - if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { - mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around - return true; - }; +bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); + _mi_error_message(EINVAL, "implement mi_abandon_visit_blocks\n"); return false; } - -/* ----------------------------------------------------------- - Arena Allocation ------------------------------------------------------------ */ - -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, mi_memid_t* memid, mi_os_tld_t* tld) -{ - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - - mi_bitmap_index_t bitmap_index; - if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, 
tld->stats)) return NULL; - - // claimed it! - void* p = mi_arena_block_start(arena, bitmap_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); - memid->is_pinned = arena->memid.is_pinned; - - // none of the claimed blocks should be scheduled for a decommit - if (arena->blocks_purge != NULL) { - // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`). - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index); - } - - // set the dirty bits (todo: no need for an atomic op here?) - if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { - memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); - } - - // set commit state - if (arena->blocks_committed == NULL) { - // always committed - memid->initially_committed = true; - } - else if (commit) { - // commit requested, but the range may not be committed as a whole: ensure it is committed now - memid->initially_committed = true; - bool any_uncommitted; - _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); - if (any_uncommitted) { - bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { - memid->initially_committed = false; - } - else { - if (commit_zero) { memid->initially_zero = true; } - } - } - } - else { - // no need to commit, but check if already fully committed - memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); - } - - return p; -} - -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED_RELEASE(alignment); - mi_assert(alignment <= MI_SEGMENT_ALIGN); - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_size_of_blocks(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_arena_from_index(arena_index); - if (arena == NULL) return NULL; - if (!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } - - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} - - -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED(alignment); - mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // 
try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - } - return NULL; -} - -// try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) -{ - if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); - if (arena_count > (MI_MAX_ARENAS - 4)) return false; - - size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); - if (arena_reserve == 0) return false; - - if (!_mi_os_has_virtual_reserve()) { - arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) - } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); - arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); - if (arena_count >= 8 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); - size_t reserve = 0; - if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { - arena_reserve = reserve; - } - } - if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size - - // commit eagerly? - bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } - else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); -} - - -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - *memid = _mi_memid_none(); - - const int numa_node = _mi_os_numa_node(tld); // current numa node - - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
- if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - } - } - - // if we cannot use OS allocation, return NULL - if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { - errno = ENOMEM; - return NULL; - } - - // finally, fall back to the OS - if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); - } - else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); - } -} - -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); -} - - -void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { - if (size != NULL) *size = 0; - size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); - if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } - return arena->start; -} - - -/* ----------------------------------------------------------- - Arena purge ------------------------------------------------------------ */ - -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} - -// reset or decommit in an arena and update the committed/decommit bitmaps -// assumes we own the area (i.e. blocks_in_use is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_blocks(blocks); - void* const p = mi_arena_block_start(arena, bitmap_idx); - bool needs_recommit; - if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { - // all blocks are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); - } - else { - // some blocks are not committed -- this can happen when a partially committed block is freed - // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), - // and also undo the decommit stats (as it was already adjusted) - mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? 
*/, stats); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } - } - - // clear the purged blocks - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); - // update committed bitmap - if (needs_recommit) { - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); - } -} - -// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. -// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_purge != NULL); - const long delay = mi_arena_purge_delay(); - if (delay < 0) return; // is purging allowed at all? - - if (_mi_preloading() || delay == 0) { - // decommit directly - mi_arena_purge(arena, bitmap_idx, blocks, stats); - } - else { - // schedule decommit - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire != 0) { - mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay - } - else { - mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); - } - _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); - } -} - -// purge a range of blocks -// return true if the full range was purged. -// assumes we own the area (i.e. blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) { - const size_t endidx = startseqx + bitlen; - size_t bitseqx = startseqx; - bool all_purged = false; - while (bitseqx < endidx) { - // count consecutive ones in the purge mask - size_t count = 0; - while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) { - count++; - } - if (count > 0) { - // found range to be purged - const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx); - mi_arena_purge(arena, range_idx, count, stats); - if (count == bitlen) { - all_purged = true; - } - } - bitseqx += (count+1); // +1 to skip the zero bit (or end) - } - return all_purged; -} - -// returns true if anything was purged -static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) -{ - if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire == 0) return false; - if (!force && expire > now) return false; - - // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); - - // potential purges scheduled, walk through the bitmap - bool any_purged = false; - bool full_purge = true; - for (size_t i = 0; i < arena->field_count; i++) { - size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); - if (purge != 0) { - size_t bitseqx = 0; - while (bitseqx < MI_BITMAP_FIELD_BITS) { - // find consecutive range of ones in the purge mask - size_t bitlen = 0; - while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) { - bitlen++; - } - // temporarily claim the purge range as "in-use" to be thread-safe with allocation - // try to claim the longest range of corresponding in_use bits - const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx); - while( bitlen > 0 ) { - if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, 
bitlen, bitmap_index)) { - break; - } - bitlen--; - } - // actual claimed bits at `in_use` - if (bitlen > 0) { - // read purge again now that we have the in_use bits - purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) { - full_purge = false; - } - any_purged = true; - // release the claimed `in_use` bits again - _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); - } - bitseqx += (bitlen+1); // +1 to skip the zero (or end) - } // while bitseqx - } // purge != 0 - } - // if not fully purged, make sure to purge again in the future - if (!full_purge) { - const long delay = mi_arena_purge_delay(); - mi_msecs_t expected = 0; - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); - } - return any_purged; -} - -static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { - if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - - const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); - if (max_arena == 0) return; - - // allow only one thread to purge at a time - static mi_atomic_guard_t purge_guard; - mi_atomic_guard(&purge_guard) - { - mi_msecs_t now = _mi_clock_now(); - size_t max_purge_count = (visit_all ? max_arena : 1); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - if (mi_arena_try_purge(arena, now, force, stats)) { - if (max_purge_count <= 1) break; - max_purge_count--; - } - } - } - } -} - - -/* ----------------------------------------------------------- - Arena free ------------------------------------------------------------ */ - -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); - mi_assert_internal(committed_size <= size); - if (p==NULL) return; - if (size==0) return; - const bool all_committed = (committed_size == size); - - // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
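/* A compact sketch (hypothetical names, plain C11 atomics, not part of the
   patch) of the expiry pattern used by the purger above: scheduling a purge
   stores a deadline in a single atomic timestamp, and a purger only proceeds
   once that deadline has passed, resetting it so the scan is not repeated.
   The patch's version additionally honors `force` and re-arms the deadline
   when the purge was only partial. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef int64_t example_msecs_t;

static bool example_should_purge(_Atomic(example_msecs_t)* purge_expire,
                                 example_msecs_t now, bool force) {
  example_msecs_t expire = atomic_load_explicit(purge_expire, memory_order_relaxed);
  if (expire == 0) return false;            // nothing scheduled
  if (!force && expire > now) return false; // scheduled, but not expired yet
  // reset the deadline; at most one concurrent caller wins and does the scan
  return atomic_compare_exchange_strong(purge_expire, &expire, 0);
}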
- mi_track_mem_undefined(p,size); - - if (mi_memkind_is_os(memid.memkind)) { - // was a direct OS allocation, pass through - if (!all_committed && committed_size > 0) { - // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - _mi_os_free(p, size, memid, stats); - } - else if (memid.memkind == MI_MEM_ARENA) { - // allocated in an arena - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const size_t blocks = mi_block_count_of_size(size); - - // checks - if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); - if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { - _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - - // potentially decommit - if (arena->memid.is_pinned || arena->blocks_committed == NULL) { - mi_assert_internal(all_committed); - } - else { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - - if (!all_committed) { - // mark the entire range as no longer committed (so we recommit the full range when re-using) - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); - mi_track_mem_noaccess(p,size); - if (committed_size > 0) { - // if partially committed, adjust the committed stats (is it will be recommitted when re-using) - // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - // note: if not all committed, it may be that the purge will reset/decommit the entire range - // that contains already decommitted parts. Since purge consistently uses reset or decommit that - // works (as we should never reset decommitted parts). - } - // (delay) purge the entire range - mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); - } - - // and make it available to others again - bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); - if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); - return; - }; - } - else { - // arena was none, external, or static; nothing to do - mi_assert_internal(memid.memkind < MI_MEM_OS); - } - - // purge expired decommits - mi_arenas_try_purge(false, false, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. 
-static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - size_t new_max_arena = 0; - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - mi_lock_done(&arena->abandoned_visit_lock); - if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); - } - else { - new_max_arena = i; - } - _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); - } - } - - // try to lower the max arena. - size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); -} - -// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { - mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas -} - -// Is a pointer inside any of our arenas? -bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { - return true; - } - } - return false; -} - -/* ----------------------------------------------------------- - Add an arena. ------------------------------------------------------------ */ - -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { - mi_assert_internal(arena != NULL); - mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); - mi_assert_internal(arena->block_count > 0); - if (arena_id != NULL) { *arena_id = -1; } - - size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); - if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); - return false; - } - _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } - return true; -} - -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept -{ - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - if (size < MI_ARENA_BLOCK_SIZE) return false; - - if (is_large) { - mi_assert_internal(memid.initially_committed && memid.is_pinned); - } - - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; - const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); - const size_t bitmaps = (memid.is_pinned ? 
3 : 5); - const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); - mi_memid_t meta_memid; - mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); - if (arena == NULL) return false; - - // already zero'd due to zalloc - // _mi_memzero(arena, asize); - arena->id = _mi_arena_id_none(); - arena->memid = memid; - arena->exclusive = exclusive; - arena->meta_size = asize; - arena->meta_memid = meta_memid; - arena->block_count = bcount; - arena->field_count = fields; - arena->start = (uint8_t*)start; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) - arena->is_large = is_large; - arena->purge_expire = 0; - arena->search_idx = 0; - mi_lock_init(&arena->abandoned_visit_lock); - // consecutive bitmaps - arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap - arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap - arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap - arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap - // initialize committed bitmap? - if (arena->blocks_committed != NULL && arena->memid.initially_committed) { - memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning - } - - // and claim leftover blocks if needed (so we never allocate there) - ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; - mi_assert_internal(post >= 0); - if (post > 0) { - // don't use leftover bits at the end - mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL); - } - return mi_arena_add(arena, arena_id, &_mi_stats_main); - -} - -bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); - memid.initially_committed = is_committed; - memid.initially_zero = is_zero; - memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block - mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); - if (start == NULL) return ENOMEM; - const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); - _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); - return ENOMEM; - } - _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? 
" (in large os pages)" : ""); - return 0; -} - - -// Manage a range of regular OS memory -bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { - return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { - return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); -} - - -/* ----------------------------------------------------------- - Debugging ------------------------------------------------------------ */ - -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { - _mi_verbose_message("%s%s:\n", prefix, header); - size_t bcount = 0; - size_t inuse_count = 0; - for (size_t i = 0; i < field_count; i++) { - char buf[MI_BITMAP_FIELD_BITS + 1]; - uintptr_t field = mi_atomic_load_relaxed(&fields[i]); - for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { - if (bcount < block_count) { - bool inuse = ((((uintptr_t)1 << bit) & field) != 0); - if (inuse) inuse_count++; - buf[bit] = (inuse ? 'x' : '.'); - } - else { - buf[bit] = ' '; - } - } - buf[MI_BITMAP_FIELD_BITS] = 0; - _mi_verbose_message("%s %s\n", prefix, buf); - } - _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); - return inuse_count; -} - -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); - size_t inuse_total = 0; - size_t abandoned_total = 0; - size_t purge_total = 0; - for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena == NULL) break; - _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); - if (show_inuse) { - inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); - } - if (arena->blocks_committed != NULL) { - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); - } - if (show_abandoned) { - abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); - } - if (show_purge && arena->blocks_purge != NULL) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); - } - } - if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); - if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); -} - - -/* ----------------------------------------------------------- - Reserve a huge page arena. 
------------------------------------------------------------ */ -// reserve at a specific numa node -int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; - if (pages==0) return 0; - if (numa_node < -1) numa_node = -1; - if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); - size_t hsize = 0; - size_t pages_reserved = 0; - mi_memid_t memid; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); - if (p==NULL || pages_reserved==0) { - _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); - return ENOMEM; - } - _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); - return ENOMEM; - } - return 0; -} - -int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { - return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); -} - -// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) -int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { - if (pages == 0) return 0; - - // pages per numa node - size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); - if (numa_count <= 0) numa_count = 1; - const size_t pages_per = pages / numa_count; - const size_t pages_mod = pages % numa_count; - const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); - - // reserve evenly among numa nodes - for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { - size_t node_pages = pages_per; // can be 0 - if (numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); - if (err) return err; - if (pages < node_pages) { - pages = 0; - } - else { - pages -= node_pages; - } - } - - return 0; -} - -int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { - MI_UNUSED(max_secs); - _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); - if (pages_reserved != NULL) *pages_reserved = 0; - int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); - if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; - return err; -} - - -#endif \ No newline at end of file diff --git a/src/page-map.c b/src/page-map.c index d70c3ee6..dc0145f2 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -42,6 +42,21 @@ static bool mi_page_map_init(void) { return true; } +static void mi_page_map_ensure_committed(void* p, size_t idx, size_t block_count) { + // is the page map area that contains the page address committed? 
+ if (!mi_page_map_all_committed) { + const size_t commit_bit_count = _mi_divide_up(block_count, mi_blocks_per_commit_bit); + const size_t commit_bit_idx = idx / mi_blocks_per_commit_bit; + for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { + // this may race, in which case we do multiple commits (which is ok) + _mi_os_commit((uint8_t*)p + (i*mi_blocks_per_commit_bit*MI_ARENA_BLOCK_SIZE), mi_blocks_per_commit_bit* MI_ARENA_BLOCK_SIZE, NULL, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); + } + } + } +} + static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* block_count) { size_t page_size; *page_start = mi_page_area(page, &page_size); @@ -60,18 +75,7 @@ void _mi_page_map_register(mi_page_t* page) { size_t block_count; const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); - // is the page map area that contains the page address committed? - if (!mi_page_map_all_committed) { - const size_t commit_bit_count = _mi_divide_up(block_count, mi_blocks_per_commit_bit); - const size_t commit_bit_idx = idx / mi_blocks_per_commit_bit; - for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { - // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit(page_start + (i*mi_blocks_per_commit_bit*MI_ARENA_BLOCK_SIZE), mi_blocks_per_commit_bit* MI_ARENA_BLOCK_SIZE, NULL, NULL); - mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); - } - } - } + mi_page_map_ensure_committed(page, idx, block_count); // set the offsets for (int i = 0; i < (int)block_count; i++) { @@ -92,3 +96,14 @@ void _mi_page_map_unregister(mi_page_t* page) { // unset the offsets _mi_memzero(_mi_page_map + idx, block_count); } + + +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); + if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_blocks_per_commit_bit, 1)) { + return (_mi_page_map[idx] != 0); + } + else { + return false; + } +} \ No newline at end of file From e0152ab82fbe0d8a94d1068fdaed2947d3900284 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 16:58:52 -0800 Subject: [PATCH 006/264] wip: update any_set --- src/bitmap.c | 70 +++++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 56 insertions(+), 14 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 24c0d9c9..5ac4ca08 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -76,7 +76,6 @@ static bool mi_bfield_atomic_try_xset( mi_bit_t set, _Atomic(mi_bfield_t)*b, siz return mi_bfield_atomic_xset(set, b, idx); } - // Tries to (un)set a mask atomically, and returns true if the mask bits atomically transitioned from 0 to mask (or mask to 0) // and false otherwise (leaving the bit field as is). static bool mi_bfield_atomic_try_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask ) { @@ -97,6 +96,15 @@ static bool mi_bfield_atomic_try_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)* b } } +// Tries to set/clear a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise (leaving the bit field as is). 
+static bool mi_bfield_atomic_try_xset8(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t byte_idx) { + mi_assert_internal(byte_idx < MI_BFIELD_SIZE); + const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); + return mi_bfield_atomic_try_xset_mask(set, b, mask); +} + + // Check if all bits corresponding to a mask are set/cleared. static bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask) { mi_assert_internal(mask != 0); @@ -108,12 +116,11 @@ static bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, } } -// Tries to set/clear a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise (leaving the bit field as is). -static bool mi_bfield_atomic_try_xset8(mi_bit_t set, _Atomic(mi_bfield_t)* b, size_t byte_idx ) { - mi_assert_internal(byte_idx < MI_BFIELD_SIZE); - const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); - return mi_bfield_atomic_try_xset_mask(set,b,mask); +// Check if a bit is set/clear +static inline bool mi_bfield_atomic_is_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = ((mi_bfield_t)1)<any_set, idx); + } + else { // clear + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR, &bitmap->any_set, idx); + } + } +} + // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { if (!already_zero) { @@ -423,6 +441,7 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ if (m > n) { m = n; } bool already_xset; mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); + mi_bitmap_update_anyset(set, bitmap, chunk_idx); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; @@ -430,8 +449,12 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; if (mid_chunks > 0) { _mi_memset(&bitmap->chunks[chunk_idx], (set ? 
~0 : 0), MI_BITMAP_CHUNK_BITS/8); - chunk_idx += mid_chunks; - n -= mid_chunks * MI_BITMAP_CHUNK_BITS; + const size_t end_chunk = chunk_idx + mid_chunks; + while (chunk_idx < end_chunk) { + mi_bitmap_update_anyset(set, bitmap, chunk_idx); + chunk_idx++; + } + n -= (mid_chunks * MI_BITMAP_CHUNK_BITS); } // last chunk @@ -439,6 +462,7 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); + mi_bitmap_update_anyset(set, bitmap, chunk_idx); } } @@ -449,7 +473,9 @@ bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < MI_BITMAP_MAX_BITS); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); + bool ok = mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); + if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } + return ok; } // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) @@ -459,7 +485,9 @@ bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx%8 == 0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; - return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); + bool ok = mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); + if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } + return ok; } // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) @@ -475,8 +503,12 @@ bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); + + bool ok = mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); + if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } + return ok; } // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). 
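The hunks above and below maintain a one-bit-per-chunk summary: setters mark the chunk's bit in `any_set`, and a clear drops it only once the whole chunk is observed to be clear, so searches can skip chunks that are known to be empty. Below is a standalone sketch of the same idea, reduced to plain C11 atomics with hypothetical names; the summary here is only a best-effort hint, whereas the patch handles concurrent updates more carefully.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define EX_CHUNKS 8

typedef struct example_bitmap_s {
  _Atomic(uint64_t) chunk[EX_CHUNKS];  // the real bits (one 64-bit field per chunk here)
  _Atomic(uint8_t)  any_set;           // summary: bit i set => chunk i may contain set bits
} example_bitmap_t;

static void example_set(example_bitmap_t* bm, size_t idx) {
  const size_t ci = idx / 64;
  atomic_fetch_or(&bm->chunk[ci], (uint64_t)1 << (idx % 64));
  atomic_fetch_or(&bm->any_set, (uint8_t)(1u << ci));     // chunk is certainly non-empty now
}

static void example_clear(example_bitmap_t* bm, size_t idx) {
  const size_t ci = idx / 64;
  atomic_fetch_and(&bm->chunk[ci], ~((uint64_t)1 << (idx % 64)));
  if (atomic_load(&bm->chunk[ci]) == 0) {                  // can race with a concurrent set;
    atomic_fetch_and(&bm->any_set, (uint8_t)~(1u << ci));  // treat the summary as a hint only
  }
}

static bool example_find_set(example_bitmap_t* bm, size_t* idx) {
  const uint8_t any = atomic_load(&bm->any_set);
  for (size_t ci = 0; ci < EX_CHUNKS; ci++) {
    if ((any & (1u << ci)) == 0) continue;                 // skip chunks known to be empty
    const uint64_t b = atomic_load(&bm->chunk[ci]);
    for (size_t j = 0; j < 64; j++) {
      if (b & ((uint64_t)1 << j)) { *idx = ci*64 + j; return true; }
    }
  }
  return false;
}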
@@ -488,13 +520,17 @@ bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bo if (already_xset==NULL) { already_xset = &local_already_xset; } // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); + + const bool allx = mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); + mi_bitmap_update_anyset(set, bitmap, chunk_idx); + return allx; } // Is a sequence of n bits already all set/cleared? @@ -502,10 +538,13 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); } @@ -578,6 +617,9 @@ bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, s // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries if (n == 0 || n > MI_BFIELD_BITS) return false; + if (n == 1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); + if (n == 8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); + mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) { size_t cidx; From 0f635413d678608bd04600faf4e6d558507f7284 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 17:50:37 -0800 Subject: [PATCH 007/264] wip: can run initial test --- include/mimalloc/internal.h | 5 +++-- src/arena.c | 9 ++++++--- src/bitmap.c | 2 +- src/init.c | 2 +- src/page-map.c | 20 ++++++++++---------- test/test-stress.c | 4 ++++ 6 files changed, 25 insertions(+), 17 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3c8216ec..47301e79 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -447,7 +447,7 @@ static inline mi_page_t* _mi_ptr_page(const void* p) { #if MI_DEBUG if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; #endif - return (mi_page_t*)((up + ofs - 1) << MI_ARENA_BLOCK_SHIFT); + return (mi_page_t*)((up + ofs + 1) << MI_ARENA_BLOCK_SHIFT); } @@ -663,7 +663,8 @@ We also pass a separate `null` value to be used as `NULL` or otherwise ------------------------------------------------------------------- */ static inline bool mi_is_in_same_page(const void* p, const void* q) { - return (_mi_ptr_page(p) == _mi_ptr_page(q)); + // return (_mi_ptr_page(p) == _mi_ptr_page(q)); + return ((uintptr_t)p / MI_LARGE_PAGE_SIZE) == ((uintptr_t)q / MI_LARGE_PAGE_SIZE); } static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { diff --git a/src/arena.c b/src/arena.c index 9dbf73d6..a8dff8a5 100644 --- a/src/arena.c +++ b/src/arena.c @@ -415,7 
+415,8 @@ void* _mi_arena_alloc_aligned( } // fall back to the OS - return mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); + void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); + return p; } void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) @@ -498,6 +499,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz else { page->block_size_shift = 0; } + _mi_page_map_register(page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); @@ -564,12 +566,13 @@ static uint8_t* mi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { const size_t diff = pstart - (uint8_t*)page; const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); if (psize != NULL) { *psize = size; } - return pstart; + return (uint8_t*)page; } void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { size_t size; uint8_t* pstart = mi_arena_page_allocated_area(page, &size); + _mi_page_map_unregister(page); _mi_arena_free(pstart, size, size, page->memid, &tld->stats); } @@ -1110,7 +1113,7 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); if (max_arena == 0) return; - _mi_error_message(EFAULT, "purging not yet implemented\n"); + // _mi_error_message(EFAULT, "purging not yet implemented\n"); MI_UNUSED(stats); MI_UNUSED(visit_all); MI_UNUSED(force); diff --git a/src/bitmap.c b/src/bitmap.c index 5ac4ca08..175bc0ec 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -144,7 +144,7 @@ static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, si // Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0) static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; bool all_already_xset = true; diff --git a/src/init.c b/src/init.c index 215d6be8..d11f5b5a 100644 --- a/src/init.c +++ b/src/init.c @@ -130,7 +130,7 @@ static mi_decl_cache_align mi_subproc_t mi_subproc_default; static mi_decl_cache_align mi_tld_t tld_main = { 0, false, &_mi_heap_main, &_mi_heap_main, - NULL, // subproc + &mi_subproc_default, // subproc 0, // tseq { 0, &tld_main.stats }, // os { MI_STATS_NULL } // stats diff --git a/src/page-map.c b/src/page-map.c index dc0145f2..8dfd2f26 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -11,7 +11,7 @@ terms of the MIT license. 
A copy of the license can be found in the file mi_decl_cache_align signed char* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; -static size_t mi_blocks_per_commit_bit = 1; +static size_t mi_size_per_commit_bit = MI_ARENA_BLOCK_SIZE; static mi_memid_t mi_page_map_memid; static mi_bitmap_t mi_page_map_commit; @@ -20,13 +20,12 @@ static bool mi_page_map_init(void) { if (vbits >= 48) vbits = 47; // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) // 64 KiB for 4 GiB address space (on 32-bit) - const size_t page_map_size = (MI_ZU(1) << (vbits >> MI_ARENA_BLOCK_SHIFT)); + const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_BLOCK_SHIFT)); - const size_t min_commit_size = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); - mi_blocks_per_commit_bit = mi_block_count_of_size(min_commit_size); + mi_size_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems - _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 0, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); + _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); return false; @@ -38,6 +37,7 @@ static bool mi_page_map_init(void) { // commit the first part so NULL pointers get resolved without an access violation if (!mi_page_map_all_committed) { _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL); + _mi_page_map[0] = -1; // so _mi_ptr_page(NULL) == NULL } return true; } @@ -45,12 +45,12 @@ static bool mi_page_map_init(void) { static void mi_page_map_ensure_committed(void* p, size_t idx, size_t block_count) { // is the page map area that contains the page address committed? 
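/* Worked numbers for the sizing above (illustrative, not part of the patch;
   assumes 47 usable address bits as in the comment above and 64 KiB arena
   blocks, i.e. MI_ARENA_BLOCK_SHIFT == 16). The commit bitmap then divides the
   map into MI_BITMAP_MAX_BITS equal slices, and `mi_page_map_ensure_committed`
   commits a slice the first time a page falls into it. */
#include <stdint.h>

static uint64_t example_page_map_bytes(void) {
  const uint64_t vbits = 47;                     // assumed usable virtual address bits
  const uint64_t block_shift = 16;               // assumed 64 KiB arena blocks
  return (uint64_t)1 << (vbits - block_shift);   // 2^31 bytes = 2 GiB: one byte per block
}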
if (!mi_page_map_all_committed) { - const size_t commit_bit_count = _mi_divide_up(block_count, mi_blocks_per_commit_bit); - const size_t commit_bit_idx = idx / mi_blocks_per_commit_bit; + const size_t commit_bit_count = _mi_divide_up(block_count, mi_size_per_commit_bit); + const size_t commit_bit_idx = idx / mi_size_per_commit_bit; for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit((uint8_t*)p + (i*mi_blocks_per_commit_bit*MI_ARENA_BLOCK_SIZE), mi_blocks_per_commit_bit* MI_ARENA_BLOCK_SIZE, NULL, NULL); + _mi_os_commit(_mi_page_map + ((commit_bit_idx + i)*mi_size_per_commit_bit), mi_size_per_commit_bit, NULL, NULL); mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); } } @@ -75,7 +75,7 @@ void _mi_page_map_register(mi_page_t* page) { size_t block_count; const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); - mi_page_map_ensure_committed(page, idx, block_count); + mi_page_map_ensure_committed(page_start, idx, block_count); // set the offsets for (int i = 0; i < (int)block_count; i++) { @@ -100,7 +100,7 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_blocks_per_commit_bit, 1)) { + if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_size_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { diff --git a/test/test-stress.c b/test/test-stress.c index 1e70e699..c7288b1a 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,6 +40,10 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; +#elif 1 +static int THREADS = 1; +static int SCALE = 10; +static int ITER = 10; #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor From 978d844e156b0455bdc39837fade4788f5c34d5e Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 29 Nov 2024 20:23:39 -0800 Subject: [PATCH 008/264] wip: bug fixes --- include/mimalloc/types.h | 5 +- src/arena.c | 114 ++++++++++++++++++++------------------- src/bitmap.c | 6 +-- src/options.c | 2 +- src/page.c | 11 ++-- test/test-stress.c | 5 +- 6 files changed, 78 insertions(+), 65 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 591cb603..e3c0786c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -171,12 +171,13 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) { } typedef struct mi_memid_os_info { - void* base; // actual base address of the block (used for offset aligned allocations) + void* base; // actual base address of the block (used for offset aligned allocations) size_t alignment; // alignment at allocation } mi_memid_os_info_t; typedef struct mi_memid_arena_info { - size_t block_index; // index in the arena + uint32_t block_index; // base index in the arena + uint32_t block_count; // allocated blocks mi_arena_id_t id; // arena id (>= 1) bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; diff --git a/src/arena.c b/src/arena.c index a8dff8a5..c5d8b14a 100644 --- a/src/arena.c +++ b/src/arena.c @@ 
-129,7 +129,7 @@ static uint8_t* mi_arena_start(mi_arena_t* arena) { } // Start of a block -void* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { +uint8_t* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { return (mi_arena_start(arena) + mi_size_of_blocks(block_index)); } @@ -146,35 +146,40 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { // Create an arena memid -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) { +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index, size_t block_count) { + mi_assert_internal(block_index < UINT32_MAX); + mi_assert_internal(block_count < UINT32_MAX); mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); memid.mem.arena.id = id; - memid.mem.arena.block_index = block_index; + memid.mem.arena.block_index = (uint32_t)block_index; + memid.mem.arena.block_count = (uint32_t)block_count; memid.mem.arena.is_exclusive = is_exclusive; return memid; } // returns if the arena is exclusive -static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { +static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index, size_t* block_count) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); *arena_index = mi_arena_id_index(memid.mem.arena.id); - *block_index = memid.mem.arena.block_index; + if (block_index) *block_index = memid.mem.arena.block_index; + if (block_count) *block_count = memid.mem.arena.block_count; return memid.mem.arena.is_exclusive; } // get the arena and block index -static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* block_index) { +static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* block_index, size_t* block_count) { size_t arena_index; - mi_arena_memid_indices(memid, &arena_index, block_index); + mi_arena_memid_indices(memid, &arena_index, block_index, block_count); return mi_arena_from_index(arena_index); } -static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index) { +static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index, size_t* block_count) { // todo: maybe store the arena* directly in the page? - return mi_arena_from_memid(page->memid, block_index); + return mi_arena_from_memid(page->memid, block_index, block_count); } + /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ @@ -187,7 +192,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // claimed it! void* p = mi_arena_block_start(arena, block_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index, needed_bcount); memid->is_pinned = arena->memid.is_pinned; // set the dirty bits @@ -424,7 +429,15 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } - +static uint8_t* xmi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { + // todo: record real allocated size instead of trying to recalculate? 
+ size_t page_size; + uint8_t* const pstart = mi_page_area(page, &page_size); + const size_t diff = pstart - (uint8_t*)page; + const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); + if (psize != NULL) { *psize = size; } + return (uint8_t*)page; +} /* ----------------------------------------------------------- Arena page allocation @@ -467,7 +480,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz { const bool allow_large = true; const bool commit = true; - const size_t alignment = MI_ARENA_BLOCK_ALIGN; + const size_t alignment = 1; // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); @@ -515,13 +528,13 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size // 1. look for an abandoned page mi_page_t* page = mi_arena_page_try_find_abandoned(block_count, block_size, req_arena_id, tld); if (page != NULL) { - _mi_page_reclaim(heap,page); - return page; + return page; // return as abandoned } // 2. find a free block, potentially allocating a new arena page = mi_arena_page_alloc_fresh(block_count, block_size, req_arena_id, tld); - if (page != NULL) { + if (page != NULL) { + mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.block_count == block_count); _mi_page_init(heap, page); return page; } @@ -559,21 +572,11 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ return page; } -static uint8_t* mi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { - // todo: record real allocated size instead of trying to recalculate? - size_t page_size; - uint8_t* const pstart = mi_page_area(page, &page_size); - const size_t diff = pstart - (uint8_t*)page; - const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); - if (psize != NULL) { *psize = size; } - return (uint8_t*)page; -} + void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { - size_t size; - uint8_t* pstart = mi_arena_page_allocated_area(page, &size); _mi_page_map_unregister(page); - _mi_arena_free(pstart, size, size, page->memid, &tld->stats); + _mi_arena_free(page, 0, 0, page->memid, &tld->stats); } /* ----------------------------------------------------------- @@ -595,11 +598,9 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { // leave as is; it will be reclaimed on the first free } else if (page->memid.memkind==MI_MEM_ARENA) { - size_t size; - mi_arena_page_allocated_area(page, &size); - size_t bin = _mi_bin(mi_page_block_size(page)); + size_t bin = _mi_bin(mi_page_block_size(page)); size_t block_index; - mi_arena_t* arena = mi_page_arena(page, &block_index); + mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_abandoned[bin], block_index, 1, NULL); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); @@ -618,7 +619,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { if mi_likely(memid.memkind == MI_MEM_ARENA) { size_t block_index; - mi_arena_t* arena = mi_page_arena(page, &block_index); + mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess // don't reclaim more from a `free` call than half the current segments @@ -657,7 +658,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t static void mi_arenas_try_purge(bool force, bool visit_all, 
mi_stats_t* stats); void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(size >= 0 && stats != NULL); mi_assert_internal(committed_size <= size); if (p==NULL) return; if (size==0) return; @@ -676,21 +677,19 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } else if (memid.memkind == MI_MEM_ARENA) { // allocated in an arena - size_t arena_idx; + size_t block_count; size_t block_idx; - mi_arena_memid_indices(memid, &arena_idx, &block_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const size_t blocks = mi_block_count_of_size(size); - + mi_arena_t* arena = mi_arena_from_memid(memid, &block_idx, &block_count); + mi_assert_internal(size==1); + mi_assert_internal(mi_arena_block_start(arena,block_idx) <= p); + mi_assert_internal(mi_arena_block_start(arena,block_idx) + mi_size_of_blocks(block_count) > p); // checks if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } mi_assert_internal(block_idx < arena->block_count); - mi_assert_internal(block_idx > mi_arena_info_blocks()); + mi_assert_internal(block_idx >= mi_arena_info_blocks()); if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) { _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; @@ -703,7 +702,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi else { if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, block_idx, block_count, NULL); mi_track_mem_noaccess(p, size); if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) @@ -715,13 +714,13 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // works (as we should never reset decommitted parts). 
} // (delay) purge the entire range - mi_arena_schedule_purge(arena, block_idx, blocks, stats); + mi_arena_schedule_purge(arena, block_idx, block_count, stats); } // and make it available to others again - bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL); + bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, block_count, NULL); if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_block_start(arena,block_idx), mi_size_of_blocks(block_count)); return; }; } @@ -846,7 +845,11 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; mi_lock_init(&arena->abandoned_visit_lock); - + mi_heap_t* heap = mi_heap_get_default(); + if (heap != NULL) { + arena->subproc = heap->tld->subproc; + } + // init bitmaps mi_bitmap_init(&arena->blocks_free,true); mi_bitmap_init(&arena->blocks_committed,true); @@ -925,18 +928,21 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ size_t bit_count = 0; size_t bit_set_count = 0; for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { - char buf[MI_BITMAP_CHUNK_BITS + 1]; + char buf[MI_BITMAP_CHUNK_BITS + 32]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; - for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + for (int j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { if (bit_count < block_count) { - bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS); + bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + k); + k += MI_BFIELD_BITS; + buf[k++] = ' '; } else { - _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS); - } - bit_count += MI_BFIELD_BITS; + _mi_memset(buf + k, ' ', MI_BFIELD_BITS); + k += MI_BFIELD_BITS; + } + bit_count += MI_BFIELD_BITS; } - buf[MI_BITMAP_CHUNK_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); } _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); @@ -954,7 +960,7 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; block_total += arena->block_count; - _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_verbose_message("arena %zu: %zu blocks (%zu MiB)%s\n", i, arena->block_count, mi_size_of_blocks(arena->block_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); } diff --git a/src/bitmap.c b/src/bitmap.c index 175bc0ec..1a1bb031 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -170,7 +170,7 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t // Check if a sequence of `n` bits within a chunk are all set/cleared. 
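/* Sketch of the index decomposition used by the chunked bitmap operations
   below (illustrative; the 512-bit chunks and 64-bit fields are assumed here,
   the real constants come from the bitmap header). Because a run of n bits may
   not yet cross a chunk boundary, callers assert `cidx + n <= chunk bits`. */
#include <stddef.h>

typedef struct example_bit_pos_s {
  size_t chunk_idx;   // which chunk of the bitmap
  size_t field_idx;   // which field inside that chunk
  size_t bit_idx;     // which bit inside that field
} example_bit_pos_t;

static example_bit_pos_t example_decompose(size_t idx) {
  const size_t chunk_bits = 512, field_bits = 64;  // assumed sizes
  example_bit_pos_t p;
  p.chunk_idx = idx / chunk_bits;
  const size_t cidx = idx % chunk_bits;
  p.field_idx = cidx / field_bits;
  p.bit_idx   = cidx % field_bits;
  return p;
}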
static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_xset = true; size_t idx = cidx % MI_BFIELD_BITS; @@ -350,7 +350,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit b >>= idx; bshift += idx; - if (bshift + n >= MI_BFIELD_BITS) break; + if (bshift + n > MI_BFIELD_BITS) break; if ((b&mask) == mask) { // found a match mi_assert_internal( ((mask << bshift) >> bshift) == mask ); @@ -448,7 +448,7 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ n -= m; const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8); + _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); const size_t end_chunk = chunk_idx + mid_chunks; while (chunk_idx < end_chunk) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); diff --git a/src/options.c b/src/options.c index d565e269..2eaf29a3 100644 --- a/src/options.c +++ b/src/options.c @@ -132,7 +132,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 10, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { -1, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose diff --git a/src/page.c b/src/page.c index 122b4324..3f145347 100644 --- a/src/page.c +++ b/src/page.c @@ -274,12 +274,17 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size #endif mi_page_t* page = _mi_arena_page_alloc(heap, block_size, page_alignment); if (page == NULL) { - // this may be out-of-memory, or an abandoned page was reclaimed (and in our queue) + // out-of-memory return NULL; } - mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); + if (mi_page_is_abandoned(page)) { + _mi_page_reclaim(heap, page); + } + else if (pq != NULL) { + mi_page_queue_push(heap, pq, page); + } mi_heap_stat_increase(heap, pages, 1); - if (pq != NULL) { mi_page_queue_push(heap, pq, page); } + mi_assert_internal(pq!=NULL || mi_page_block_size(page) >= block_size); mi_assert_expensive(_mi_page_is_valid(page)); return page; } diff --git a/test/test-stress.c b/test/test-stress.c index c7288b1a..2d7557b8 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -244,7 +244,8 @@ static void test_stress(void) { //mi_debug_show_arenas(); #endif #if !defined(NDEBUG) || defined(MI_TSAN) - if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } + if (true) // (n + 1) % 10 == 0) + { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif } } @@ -276,7 +277,7 @@ int main(int argc, char** argv) { mi_option_enable(mi_option_visit_abandoned); #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) - mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); #endif #ifndef USE_STD_MALLOC mi_stats_reset(); From 9d904e864395da8c493e4d6c997f1296bcc1f5e2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 10:39:30 -0800 Subject: [PATCH 009/264] wip: bug fixes --- ide/vs2022/mimalloc.vcxproj | 1 - include/mimalloc/internal.h | 9 ++++++++- src/arena.c | 33 ++++++++++++++++++--------------- src/init.c | 2 +- src/page-map.c | 12 ++++++------ src/page.c | 10 ++++++---- src/prim/windows/prim.c | 9 +++++++-- test/test-stress.c | 12 ++++++------ 8 files changed, 52 insertions(+), 36 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 8606faf3..9e8dab78 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -120,7 +120,6 @@ CompileAsCpp false stdcpp20 - AdvancedVectorExtensions2 diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 47301e79..119b7b93 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -467,6 +467,12 @@ static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { return mi_page_start(page); } +static inline bool mi_page_contains_address(const mi_page_t* page, const void* p) { + size_t psize; + uint8_t* start = mi_page_area(page, &psize); + return (start <= p && p < start + psize); +} + static inline bool mi_page_is_in_arena(const mi_page_t* page) { return (page->memid.memkind == MI_MEM_ARENA); } @@ -663,8 +669,9 @@ We also pass a separate `null` value to be used as `NULL` or otherwise ------------------------------------------------------------------- */ static inline bool mi_is_in_same_page(const void* p, const void* q) { + mi_page_t* page = _mi_ptr_page(p); + return 
mi_page_contains_address(page,q); // return (_mi_ptr_page(p) == _mi_ptr_page(q)); - return ((uintptr_t)p / MI_LARGE_PAGE_SIZE) == ((uintptr_t)q / MI_LARGE_PAGE_SIZE); } static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { diff --git a/src/arena.c b/src/arena.c index c5d8b14a..632c7a2a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -316,7 +316,7 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are if (_idx >= _max_arena) { _idx -= _max_arena; } \ const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); MI_UNUSED(var_arena_id);\ mi_arena_t* const var_arena = mi_arena_from_index(_idx); \ - if (mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ + if (var_arena != NULL && mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ { #define mi_forall_arenas_end() }}} @@ -576,7 +576,7 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { _mi_page_map_unregister(page); - _mi_arena_free(page, 0, 0, page->memid, &tld->stats); + _mi_arena_free(page, 1, 1, page->memid, &tld->stats); } /* ----------------------------------------------------------- @@ -590,14 +590,8 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { if (mi_page_all_free(page)) { _mi_arena_page_free(page, tld); } - else if (mi_page_is_full(page)) { // includes singleton pages - // leave as is; it will be reclaimed on free - } - else if (mi_memkind_is_os(page->memid.memkind)) { - _mi_error_message(EINVAL, "implement page abandon for OS allocated pages\n"); - // leave as is; it will be reclaimed on the first free - } else if (page->memid.memkind==MI_MEM_ARENA) { + // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); size_t block_index; mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); @@ -606,14 +600,14 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); } else { - _mi_error_message(EINVAL, "implement page abandon for external allocated pages\n"); - // leave as is; it will be reclaimed on the first free + // page is full (or a singleton), page is OS/externally allocated + // leave as is; it will be reclaimed when an object is free'd in the page } } bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { - mi_assert_internal(mi_page_is_abandoned(page)); - // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned + if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_is_abandoned(page)); } + if (!mi_page_is_abandoned(page)) return false; // it is not abandoned mi_memid_t memid = page->memid; if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's @@ -637,7 +631,16 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { } } else { - _mi_warning_message("implement reclaim for OS allocated pages\n"); + // A page in OS or external memory + // we use the thread_id to atomically grab ownership + // TODO: respect the subproc -- do we need to add this to the page? 
+ mi_threadid_t abandoned_thread_id = 0; + if (mi_atomic_cas_strong_acq_rel(&page->xthread_id, &abandoned_thread_id, heap->thread_id)) { + // we unabandoned partly + _mi_page_reclaim(heap, page); + mi_assert_internal(!mi_page_is_abandoned(page)); + return true; + } } @@ -1193,7 +1196,7 @@ void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); - _mi_error_message(EINVAL, "implement mi_abandon_visit_blocks\n"); + _mi_error_message(EINVAL, "implement mi_abandoned_visit_blocks\n"); return false; } diff --git a/src/init.c b/src/init.c index d11f5b5a..40bc5c4a 100644 --- a/src/init.c +++ b/src/init.c @@ -396,7 +396,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } diff --git a/src/page-map.c b/src/page-map.c index 8dfd2f26..e803a367 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file mi_decl_cache_align signed char* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; -static size_t mi_size_per_commit_bit = MI_ARENA_BLOCK_SIZE; +static size_t mi_page_map_size_per_commit_bit = MI_ARENA_BLOCK_SIZE; static mi_memid_t mi_page_map_memid; static mi_bitmap_t mi_page_map_commit; @@ -22,7 +22,7 @@ static bool mi_page_map_init(void) { // 64 KiB for 4 GiB address space (on 32-bit) const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_BLOCK_SHIFT)); - mi_size_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + mi_page_map_size_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); @@ -45,12 +45,12 @@ static bool mi_page_map_init(void) { static void mi_page_map_ensure_committed(void* p, size_t idx, size_t block_count) { // is the page map area that contains the page address committed? 
if (!mi_page_map_all_committed) { - const size_t commit_bit_count = _mi_divide_up(block_count, mi_size_per_commit_bit); - const size_t commit_bit_idx = idx / mi_size_per_commit_bit; + const size_t commit_bit_count = _mi_divide_up(block_count, mi_page_map_size_per_commit_bit); + const size_t commit_bit_idx = idx / mi_page_map_size_per_commit_bit; for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit(_mi_page_map + ((commit_bit_idx + i)*mi_size_per_commit_bit), mi_size_per_commit_bit, NULL, NULL); + _mi_os_commit(_mi_page_map + ((commit_bit_idx + i)*mi_page_map_size_per_commit_bit), mi_page_map_size_per_commit_bit, NULL, NULL); mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); } } @@ -100,7 +100,7 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_size_per_commit_bit, 1)) { + if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_size_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { diff --git a/src/page.c b/src/page.c index 3f145347..b6af4fd0 100644 --- a/src/page.c +++ b/src/page.c @@ -713,7 +713,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { -------------------------------------------------------------*/ // search for a best next page to use for at most N pages (often cut short if immediate blocks are available) -#define MI_MAX_CANDIDATE_SEARCH (8) +#define MI_MAX_CANDIDATE_SEARCH (0) // Find a page with free blocks of `page->block_size`. 
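The page-map hunk above commits the map lazily, one commit-bit region at a time. A simplified, self-contained sketch of that scheme (a plain bool array and a stub stand in for the atomic commit bitmap and `_mi_os_commit`; the inclusive lo/hi range form shown here is the one this series settles on a couple of patches later):

#include <stddef.h>
#include <stdbool.h>

#define ENTRIES_PER_COMMIT_BIT  4096   /* page-map bytes covered per commit bit (illustrative) */
static bool commit_bit[1024];          /* one flag per committable region (illustrative) */

static void os_commit_region(size_t i) { (void)i; }   /* placeholder for _mi_os_commit */

/* ensure page-map entries [idx, idx+count) are backed by committed memory */
static void ensure_committed(size_t idx, size_t count) {
  const size_t lo = idx / ENTRIES_PER_COMMIT_BIT;
  const size_t hi = (idx + count - 1) / ENTRIES_PER_COMMIT_BIT;  /* inclusive upper region */
  for (size_t i = lo; i <= hi; i++) {   /* per region, so a range never straddles bitmap chunks */
    if (!commit_bit[i]) {               /* a racing commit is benign: committing twice is fine */
      os_commit_region(i);
      commit_bit[i] = true;
    }
  }
}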
@@ -788,9 +788,11 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p if (page_candidate != NULL) { page = page_candidate; } - if (page != NULL && !mi_page_immediate_available(page)) { - mi_assert_internal(mi_page_is_expandable(page)); - mi_page_extend_free(heap, page); + if (page != NULL) { + if (!mi_page_immediate_available(page)) { + mi_assert_internal(mi_page_is_expandable(page)); + mi_page_extend_free(heap, page); + } } if (page == NULL) { diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 418c950f..276da85c 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -108,6 +108,8 @@ static bool win_enable_large_os_pages(size_t* large_page_size) // Initialize //--------------------------------------------- +static DWORD win_allocation_granularity = 64*MI_KiB; + void _mi_prim_mem_init( mi_os_mem_config_t* config ) { config->has_overcommit = false; @@ -117,7 +119,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) SYSTEM_INFO si; GetSystemInfo(&si); if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; } - if (si.dwAllocationGranularity > 0) { config->alloc_granularity = si.dwAllocationGranularity; } + if (si.dwAllocationGranularity > 0) { + config->alloc_granularity = si.dwAllocationGranularity; + win_allocation_granularity = si.dwAllocationGranularity; + } // get virtual address bits if ((uintptr_t)si.lpMaximumApplicationAddress > 0) { const size_t vbits = MI_INTPTR_BITS - mi_clz((uintptr_t)si.lpMaximumApplicationAddress); @@ -203,7 +208,7 @@ static void* win_virtual_alloc_prim_once(void* addr, size_t size, size_t try_ali } #endif // on modern Windows try use VirtualAlloc2 for aligned allocation - if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { + if (addr == NULL && try_alignment > win_allocation_granularity && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) { MI_MEM_ADDRESS_REQUIREMENTS reqs = { 0, 0, 0 }; reqs.Alignment = try_alignment; MI_MEM_EXTENDED_PARAMETER param = { {0, 0}, {0} }; diff --git a/test/test-stress.c b/test/test-stress.c index 2d7557b8..e287cfa7 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,10 +40,10 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 1 -static int THREADS = 1; -static int SCALE = 10; -static int ITER = 10; +#elif 0 +static int THREADS = 4; +static int SCALE = 20; +static int ITER = 20; #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor @@ -69,7 +69,7 @@ static bool main_participates = false; // main thread participates as a #define custom_realloc(p,s) mi_realloc(p,s) #define custom_free(p) mi_free(p) #ifndef NDEBUG -#define HEAP_WALK // walk the heap objects? +#define xHEAP_WALK // walk the heap objects? 
#endif #endif @@ -323,7 +323,7 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,true,true); mi_collect(true); #endif - mi_stats_print(NULL); + // mi_stats_print(NULL); #endif //bench_end_program(); return 0; From 188294a0dfd26493f64e6049f438de715969e6cb Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 11:12:39 -0800 Subject: [PATCH 010/264] wip: bug fixes --- include/mimalloc/internal.h | 1 + src/arena.c | 39 +++++++++++++++++-------------------- src/options.c | 7 +++++++ src/page-map.c | 23 +++++++++++----------- src/page-queue.c | 2 +- src/page.c | 4 ++-- src/stats.c | 4 ++-- test/test-stress.c | 11 ++++++----- 8 files changed, 49 insertions(+), 42 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 119b7b93..d4ec8bb7 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -63,6 +63,7 @@ void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); void _mi_warning_message(const char* fmt, ...); void _mi_verbose_message(const char* fmt, ...); void _mi_trace_message(const char* fmt, ...); +void _mi_output_message(const char* fmt, ...); void _mi_options_init(void); long _mi_option_get_fast(mi_option_t option); void _mi_error_message(int err, const char* fmt, ...); diff --git a/src/arena.c b/src/arena.c index 632c7a2a..424a9c70 100644 --- a/src/arena.c +++ b/src/arena.c @@ -429,15 +429,7 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } -static uint8_t* xmi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { - // todo: record real allocated size instead of trying to recalculate? - size_t page_size; - uint8_t* const pstart = mi_page_area(page, &page_size); - const size_t diff = pstart - (uint8_t*)page; - const size_t size = _mi_align_up(page_size + diff, MI_ARENA_BLOCK_SIZE); - if (psize != NULL) { *psize = size; } - return (uint8_t*)page; -} + /* ----------------------------------------------------------- Arena page allocation @@ -445,6 +437,7 @@ static uint8_t* xmi_arena_page_allocated_area(mi_page_t* page, size_t* psize) { static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { + MI_UNUSED(block_count); const size_t bin = _mi_bin(block_size); mi_assert_internal(bin < MI_BIN_COUNT); @@ -693,7 +686,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } mi_assert_internal(block_idx < arena->block_count); mi_assert_internal(block_idx >= mi_arena_info_blocks()); - if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) { + if (block_idx < mi_arena_info_blocks() || block_idx > arena->block_count) { _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } @@ -926,8 +919,8 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) { - _mi_verbose_message("%s%s:\n", prefix, header); +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap, bool invert) { + _mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { @@ -935,7 +928,11 @@ static 
size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (int j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { if (bit_count < block_count) { - bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + k); + mi_bfield_t bfield = chunk->bfields[j]; + if (invert) bfield = ~bfield; + size_t xcount = mi_debug_show_bfield(bfield, buf + k); + if (invert) xcount = MI_BFIELD_BITS - xcount; + bit_set_count += xcount; k += MI_BFIELD_BITS; buf[k++] = ' '; } @@ -946,9 +943,9 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ bit_count += MI_BFIELD_BITS; } - _mi_verbose_message("%s %s\n", prefix, buf); + _mi_output_message("%s %s\n", prefix, buf); } - _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); + _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count); return bit_set_count; } @@ -963,19 +960,19 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; block_total += arena->block_count; - _mi_verbose_message("arena %zu: %zu blocks (%zu MiB)%s\n", i, arena->block_count, mi_size_of_blocks(arena->block_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_output_message("arena %zu: %zu blocks (%zu MiB)%s\n", i, arena->block_count, mi_size_of_blocks(arena->block_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); + free_total += mi_debug_show_bitmap(" ", "in-use blocks", arena->block_count, &arena->blocks_free, true); } - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed); + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed, false); // todo: abandoned blocks if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge); + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge, false); } } - if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total); + if (show_inuse) _mi_output_message("total inuse blocks : %zu\n", block_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); + if (show_purge) _mi_output_message("total purgeable blocks: %zu\n", purge_total); } diff --git a/src/options.c b/src/options.c index 2eaf29a3..8cb0d216 100644 --- a/src/options.c +++ b/src/options.c @@ -438,6 +438,13 @@ static void mi_vfprintf_thread(mi_output_fun* out, void* arg, const char* prefix } } +void _mi_output_message(const char* fmt, ...) { + va_list args; + va_start(args, fmt); + mi_vfprintf(NULL, NULL, NULL, fmt, args); + va_end(args); +} + void _mi_trace_message(const char* fmt, ...) { if (mi_option_get(mi_option_verbose) <= 1) return; // only with verbose level 2 or higher va_list args; diff --git a/src/page-map.c b/src/page-map.c index e803a367..f52fab10 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -11,7 +11,7 @@ terms of the MIT license. 
A copy of the license can be found in the file mi_decl_cache_align signed char* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; -static size_t mi_page_map_size_per_commit_bit = MI_ARENA_BLOCK_SIZE; +static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_BLOCK_SIZE; static mi_memid_t mi_page_map_memid; static mi_bitmap_t mi_page_map_commit; @@ -22,7 +22,7 @@ static bool mi_page_map_init(void) { // 64 KiB for 4 GiB address space (on 32-bit) const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_BLOCK_SHIFT)); - mi_page_map_size_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); @@ -42,16 +42,16 @@ static bool mi_page_map_init(void) { return true; } -static void mi_page_map_ensure_committed(void* p, size_t idx, size_t block_count) { +static void mi_page_map_ensure_committed(size_t idx, size_t block_count) { // is the page map area that contains the page address committed? if (!mi_page_map_all_committed) { - const size_t commit_bit_count = _mi_divide_up(block_count, mi_page_map_size_per_commit_bit); - const size_t commit_bit_idx = idx / mi_page_map_size_per_commit_bit; - for (size_t i = 0; i < commit_bit_count; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, commit_bit_idx + i, 1)) { + const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; + const size_t commit_bit_idx_hi = (idx + block_count - 1) / mi_page_map_entries_per_commit_bit; + for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, i, 1)) { // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit(_mi_page_map + ((commit_bit_idx + i)*mi_page_map_size_per_commit_bit), mi_page_map_size_per_commit_bit, NULL, NULL); - mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, commit_bit_idx + i, 1, NULL); + _mi_os_commit(_mi_page_map + (i*mi_page_map_entries_per_commit_bit), mi_page_map_entries_per_commit_bit, NULL, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, i, 1, NULL); } } } @@ -71,11 +71,12 @@ void _mi_page_map_register(mi_page_t* page) { if mi_unlikely(_mi_page_map == NULL) { if (!mi_page_map_init()) return; } + mi_assert(_mi_page_map!=NULL); uint8_t* page_start; size_t block_count; const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); - mi_page_map_ensure_committed(page_start, idx, block_count); + mi_page_map_ensure_committed(idx, block_count); // set the offsets for (int i = 0; i < (int)block_count; i++) { @@ -100,7 +101,7 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_size_per_commit_bit, 1)) { + if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { diff --git a/src/page-queue.c b/src/page-queue.c index c6b19985..3fcd700d 100644 --- a/src/page-queue.c +++ 
b/src/page-queue.c @@ -112,7 +112,7 @@ size_t _mi_bin_size(uint8_t bin) { } // Good size for allocation -size_t mi_good_size(size_t size) mi_attr_noexcept { +mi_decl_nodiscard mi_decl_export size_t mi_good_size(size_t size) mi_attr_noexcept { if (size <= MI_LARGE_MAX_OBJ_SIZE) { return _mi_bin_size(mi_bin(size + MI_PADDING_SIZE)); } diff --git a/src/page.c b/src/page.c index b6af4fd0..f8ef641e 100644 --- a/src/page.c +++ b/src/page.c @@ -638,7 +638,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { // calculate the extend count const size_t bsize = mi_page_block_size(page); - size_t extend = page->reserved - page->capacity; + size_t extend = (size_t)page->reserved - page->capacity; mi_assert_internal(extend > 0); size_t max_extend = (bsize >= MI_MAX_EXTEND_SIZE ? MI_MIN_EXTEND : MI_MAX_EXTEND_SIZE/bsize); @@ -672,7 +672,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); mi_page_set_heap(page, heap); size_t page_size; - uint8_t* page_start = mi_page_area(page, &page_size); + uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start); mi_track_mem_noaccess(page_start,page_size); mi_assert_internal(page_size / mi_page_block_size(page) < (1L<<16)); mi_assert_internal(page->reserved > 0); diff --git a/src/stats.c b/src/stats.c index 14489937..9f7a3cf0 100644 --- a/src/stats.c +++ b/src/stats.c @@ -133,7 +133,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { // unit == 0: count as decimal // unit < 0 : count in binary static void mi_printf_amount(int64_t n, int64_t unit, mi_output_fun* out, void* arg, const char* fmt) { - char buf[32]; buf[0] = 0; + char buf[32]; _mi_memzero_var(buf); int len = 32; const char* suffix = (unit <= 0 ? " " : "B"); const int64_t base = (unit == 0 ? 1000 : 1024); @@ -298,7 +298,7 @@ static void mi_cdecl mi_buffered_out(const char* msg, void* arg) { static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_attr_noexcept { // wrap the output function to be line buffered - char buf[256]; + char buf[256]; _mi_memzero_var(buf); buffered_t buffer = { out0, arg0, NULL, 0, 255 }; buffer.buf = buf; mi_output_fun* out = &mi_buffered_out; diff --git a/test/test-stress.c b/test/test-stress.c index e287cfa7..6327e995 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -48,13 +48,13 @@ static int ITER = 20; static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads -#endif +#endif #define STRESS // undefine for leak test -static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) +static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? 
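Since `mi_good_size` is marked `mi_decl_export` above, a short usage note: a caller that manages its own buffer sizes can round a request up to the size mimalloc would hand out anyway, so the allocation carries no internal slack. A minimal example against the public API:

#include <mimalloc.h>

void* alloc_buffer(size_t needed, size_t* actual) {
  const size_t n = mi_good_size(needed);   /* round up to mimalloc's bin size */
  void* p = mi_malloc(n);
  if (p != NULL && actual != NULL) { *actual = n; }  /* the caller may use the full rounded size */
  return p;
}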
static bool main_participates = false; // main thread participates as a worker too @@ -244,7 +244,7 @@ static void test_stress(void) { //mi_debug_show_arenas(); #endif #if !defined(NDEBUG) || defined(MI_TSAN) - if (true) // (n + 1) % 10 == 0) + if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif } @@ -320,7 +320,7 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - mi_debug_show_arenas(true,true,true); + mi_debug_show_arenas(true,true,false); mi_collect(true); #endif // mi_stats_print(NULL); @@ -345,9 +345,10 @@ static void run_os_threads(size_t nthreads, void (*fun)(intptr_t)) { thread_entry_fun = fun; DWORD* tids = (DWORD*)custom_calloc(nthreads,sizeof(DWORD)); HANDLE* thandles = (HANDLE*)custom_calloc(nthreads,sizeof(HANDLE)); + thandles[0] = GetCurrentThread(); // avoid lint warning const size_t start = (main_participates ? 1 : 0); for (size_t i = start; i < nthreads; i++) { - thandles[i] = CreateThread(0, 8*1024, &thread_entry, (void*)(i), 0, &tids[i]); + thandles[i] = CreateThread(0, 8*1024L, &thread_entry, (void*)(i), 0, &tids[i]); } if (main_participates) fun(0); // run the main thread as well for (size_t i = start; i < nthreads; i++) { From 309fc26b4b4d983b86f65fe4a56375c641aa2f09 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 12:00:30 -0800 Subject: [PATCH 011/264] wip: add generic find_and_xset --- ide/vs2022/mimalloc.vcxproj | 2 ++ src/bitmap.c | 62 +++++++++++++++++++++++++++++++++++-- 2 files changed, 61 insertions(+), 3 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 9e8dab78..d03fd281 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -120,6 +120,7 @@ CompileAsCpp false stdcpp20 + AdvancedVectorExtensions2 @@ -180,6 +181,7 @@ CompileAsCpp true stdcpp20 + AdvancedVectorExtensions2 true diff --git a/src/bitmap.c b/src/bitmap.c index 1a1bb031..bb54af6b 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -37,6 +37,13 @@ static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { return mi_rotr(x,r); } +// Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). +// return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_least_to_xset(mi_bit_t set, mi_bfield_t x, size_t* idx) { + return mi_bfield_find_least_bit((set ? ~x : x), idx); +} + // Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); @@ -190,7 +197,8 @@ static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, siz return all_xset; } -// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0), +// Try to atomically set/clear a sequence of `n` bits within a chunk. +// Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); @@ -251,6 +259,54 @@ restore: } +// find least 0/1-bit in a chunk and try to set/clear it atomically +// set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. 
+// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { +#if 0 && defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while (true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 + const size_t chunk_idx = _tzcnt_u32(mask) / 8; + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear + if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } +#else + for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + size_t idx; + if mi_unlikely(mi_bfield_find_least_to_xset(set, chunk->bfields[i], &idx)) { // find least 0-bit + if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx)) { // try to set it atomically + *pidx = (i*MI_BFIELD_BITS + idx); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + } + return false; +#endif +} + +static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { + return mi_bitmap_chunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); +} + +static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { + return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); +} + +/* // find least 1-bit in a chunk and try unset it atomically // set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. // todo: try neon version @@ -288,7 +344,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, return false; #endif } - +*/ // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. @@ -613,7 +669,7 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries if (n == 0 || n > MI_BFIELD_BITS) return false; From d15e83030ea5f613f75bb3c1c1b380fe1e847467 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 12:16:41 -0800 Subject: [PATCH 012/264] wip: rename arena blocks to slices --- include/mimalloc/internal.h | 18 +- include/mimalloc/types.h | 30 ++-- src/arena-abandon.c | 2 +- src/arena-old.c | 22 +-- src/arena.c | 330 ++++++++++++++++++------------------ src/page-map.c | 38 ++--- src/stats.c | 4 +- test/test-stress.c | 2 +- 8 files changed, 222 insertions(+), 224 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index d4ec8bb7..082882bb 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -443,16 +443,16 @@ extern signed char* _mi_page_map; #define MI_PAGE_PTR_INVALID ((mi_page_t*)(1)) static inline mi_page_t* _mi_ptr_page(const void* p) { - const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_BLOCK_SHIFT; + const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; const ptrdiff_t ofs = _mi_page_map[up]; #if MI_DEBUG if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; #endif - return (mi_page_t*)((up + ofs + 1) << MI_ARENA_BLOCK_SHIFT); + return (mi_page_t*)((up + ofs + 1) << MI_ARENA_SLICE_SHIFT); } -// Get the block size of a page +// Get the block size of a page static inline size_t mi_page_block_size(const mi_page_t* page) { mi_assert_internal(page->block_size > 0); return page->block_size; @@ -509,8 +509,8 @@ static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); mi_atomic_store_release(&page->xheap,(uintptr_t)heap); - if (heap != NULL) { - page->heap_tag = heap->tag; + if (heap != NULL) { + page->heap_tag = heap->tag; mi_atomic_store_release(&page->xthread_id, heap->thread_id); } else { @@ -749,13 +749,13 @@ static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, c ----------------------------------------------------------- */ // Blocks needed for a given byte size -static inline size_t mi_block_count_of_size(size_t size) { - return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); +static inline size_t mi_slice_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_SLICE_SIZE); } // Byte size of a number of blocks -static inline size_t mi_size_of_blocks(size_t bcount) { - return (bcount * MI_ARENA_BLOCK_SIZE); +static inline size_t mi_size_of_slices(size_t bcount) { + return (bcount * MI_ARENA_SLICE_SIZE); } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index e3c0786c..ac0a5fc4 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -112,26 +112,26 @@ terms of the MIT license. 
A copy of the license can be found in the file // ------------------------------------------------------ // Sizes are for 64-bit -#ifndef MI_ARENA_BLOCK_SHIFT +#ifndef MI_ARENA_SLICE_SHIFT #ifdef MI_SMALL_PAGE_SHIFT // compatibility -#define MI_ARENA_BLOCK_SHIFT MI_SMALL_PAGE_SHIFT +#define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT #else -#define MI_ARENA_BLOCK_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) +#define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) #endif #endif #ifndef MI_BITMAP_CHUNK_BITS_SHIFT #define MI_BITMAP_CHUNK_BITS_SHIFT 8 // optimized for 256 bits per chunk (avx2) #endif -#define MI_ARENA_BLOCK_SIZE (MI_ZU(1) << MI_ARENA_BLOCK_SHIFT) -#define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) +#define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) +#define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) #define MI_BITMAP_CHUNK_BITS (MI_ZU(1) << MI_BITMAP_CHUNK_BITS_SHIFT) -#define MI_ARENA_MIN_OBJ_BLOCKS (1) +#define MI_ARENA_MIN_OBJ_BLOCKS (1) #define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) -#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_BLOCK_SIZE) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE) +#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE) #define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) @@ -145,7 +145,7 @@ terms of the MIT license. A copy of the license can be found in the file // Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in singleton pages -#define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_BLOCK_ALIGN) +#define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_SLICE_ALIGN) // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX @@ -162,8 +162,8 @@ typedef enum mi_memkind_e { MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) MI_MEM_OS, // allocated from the OS MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) - MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) - MI_MEM_ARENA // allocated from an arena (the usual case) + MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. 
using `mremap`) + MI_MEM_ARENA // allocated from an arena (the usual case) } mi_memkind_t; static inline bool mi_memkind_is_os(mi_memkind_t memkind) { @@ -176,8 +176,8 @@ typedef struct mi_memid_os_info { } mi_memid_os_info_t; typedef struct mi_memid_arena_info { - uint32_t block_index; // base index in the arena - uint32_t block_count; // allocated blocks + uint32_t slice_index; // base index in the arena + uint32_t slice_count; // allocated slices mi_arena_id_t id; // arena id (>= 1) bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; @@ -295,7 +295,7 @@ typedef struct mi_page_s { uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding - size_t block_size; // size available in each block (always `>0`) + size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks #if (MI_ENCODE_FREELIST || MI_PADDING) @@ -340,7 +340,7 @@ typedef enum mi_page_kind_e { MI_PAGE_SMALL, // small blocks go into 64KiB pages MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages MI_PAGE_LARGE, // larger blocks go into 4MiB pages - MI_PAGE_SINGLETON // page containing a single block. + MI_PAGE_SINGLETON // page containing a single block. // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. } mi_page_kind_t; diff --git a/src/arena-abandon.c b/src/arena-abandon.c index 48e37794..14712886 100644 --- a/src/arena-abandon.c +++ b/src/arena-abandon.c @@ -344,7 +344,7 @@ bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool vi _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); return false; } - mi_arena_field_cursor_t current; + mi_arena_field_cursor_t current;0 _mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, ¤t); mi_segment_t* segment; bool ok = true; diff --git a/src/arena-old.c b/src/arena-old.c index 8ca5aaf3..3f41e9c7 100644 --- a/src/arena-old.c +++ b/src/arena-old.c @@ -34,7 +34,7 @@ typedef struct mi_arena_s { mi_arena_id_t id; // arena id; 0 for non-specific mi_memid_t memid; // memid of the memory area _Atomic(uint8_t*)start; // the start of the memory area - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_SLICE_SIZE`) size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) size_t meta_size; // size of the arena structure itself (including its bitmaps) mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) @@ -53,8 +53,8 @@ typedef struct mi_arena_s { } mi_arena_t; -#define MI_ARENA_BLOCK_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_BLOCK_SIZE/2) // 32MiB +#define MI_ARENA_SLICE_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_SLICE_SIZE/2) // 32MiB #define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) // The available arenas @@ -113,11 +113,11 @@ mi_arena_t* mi_arena_from_index(size_t idx) { ----------------------------------------------------------- */ static size_t mi_block_count_of_size(size_t size) { - 
return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); + return _mi_divide_up(size, MI_ARENA_SLICE_SIZE); } static size_t mi_arena_block_size(size_t bcount) { - return (bcount * MI_ARENA_BLOCK_SIZE); + return (bcount * MI_ARENA_SLICE_SIZE); } static size_t mi_arena_size(mi_arena_t* arena) { @@ -363,7 +363,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (!_mi_os_has_virtual_reserve()) { arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); if (arena_count >= 8 && arena_count <= 128) { // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) @@ -429,7 +429,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) { - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); + return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } @@ -774,13 +774,13 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - if (size < MI_ARENA_BLOCK_SIZE) return false; + if (size < MI_ARENA_SLICE_SIZE) return false; if (is_large) { mi_assert_internal(memid.initially_committed && memid.is_pinned); } - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; + const size_t bcount = size / MI_ARENA_SLICE_SIZE; const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); const size_t bitmaps = (memid.is_pinned ? 3 : 5); const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); @@ -836,7 +836,7 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is // Reserve a range of regular OS memory int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one block mi_memid_t memid; void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); if (start == NULL) return ENOMEM; @@ -898,7 +898,7 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; - _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_SLICE_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? 
", pinned" : "")); if (show_inuse) { inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); } diff --git a/src/arena.c b/src/arena.c index 424a9c70..7b5256b6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -36,19 +36,19 @@ typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area mi_arena_id_t id; // arena id; 0 for non-specific - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + size_t slice_count; // size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) int numa_node; // associated NUMA node bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited - _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. mi_subproc_t* subproc; - mi_bitmap_t blocks_free; // is the block free? - mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) - mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free) - mi_bitmap_t blocks_dirty; // is the block potentially non-zero? - mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + mi_bitmap_t slices_free; // is the slice free? + mi_bitmap_t slices_committed; // is the slice committed? (i.e. accessible) + mi_bitmap_t slices_purge; // can the slice be purged? (slice in purge => slice in free) + mi_bitmap_t slices_dirty; // is the slice potentially non-zero? 
+ mi_bitmap_t slices_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages } mi_arena_t; @@ -112,14 +112,14 @@ mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { // Size of an arena static size_t mi_arena_size(mi_arena_t* arena) { - return mi_size_of_blocks(arena->block_count); + return mi_size_of_slices(arena->slice_count); } -static size_t mi_arena_info_blocks(void) { +static size_t mi_arena_info_slices(void) { const size_t os_page_size = _mi_os_page_size(); const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page - const size_t info_blocks = mi_block_count_of_size(info_size); - return info_blocks; + const size_t info_slices = mi_slice_count_of_size(info_size); + return info_slices; } @@ -128,9 +128,9 @@ static uint8_t* mi_arena_start(mi_arena_t* arena) { return ((uint8_t*)arena); } -// Start of a block -uint8_t* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { - return (mi_arena_start(arena) + mi_size_of_blocks(block_index)); +// Start of a slice +uint8_t* mi_arena_slice_start(mi_arena_t* arena, size_t slice_index) { + return (mi_arena_start(arena) + mi_size_of_slices(slice_index)); } // Arena area @@ -140,43 +140,43 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { if (arena_index >= MI_MAX_ARENAS) return NULL; mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + if (size != NULL) { *size = mi_size_of_slices(arena->slice_count); } return mi_arena_start(arena); } // Create an arena memid -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index, size_t block_count) { - mi_assert_internal(block_index < UINT32_MAX); - mi_assert_internal(block_count < UINT32_MAX); +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t slice_index, size_t slice_count) { + mi_assert_internal(slice_index < UINT32_MAX); + mi_assert_internal(slice_count < UINT32_MAX); mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); memid.mem.arena.id = id; - memid.mem.arena.block_index = (uint32_t)block_index; - memid.mem.arena.block_count = (uint32_t)block_count; + memid.mem.arena.slice_index = (uint32_t)slice_index; + memid.mem.arena.slice_count = (uint32_t)slice_count; memid.mem.arena.is_exclusive = is_exclusive; return memid; } // returns if the arena is exclusive -static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index, size_t* block_count) { +static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* slice_index, size_t* slice_count) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); *arena_index = mi_arena_id_index(memid.mem.arena.id); - if (block_index) *block_index = memid.mem.arena.block_index; - if (block_count) *block_count = memid.mem.arena.block_count; + if (slice_index) *slice_index = memid.mem.arena.slice_index; + if (slice_count) *slice_count = memid.mem.arena.slice_count; return memid.mem.arena.is_exclusive; } -// get the arena and block index -static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* block_index, size_t* block_count) { +// get the arena and slice index +static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* slice_index, size_t* slice_count) { size_t arena_index; - mi_arena_memid_indices(memid, &arena_index, block_index, block_count); + 
mi_arena_memid_indices(memid, &arena_index, slice_index, slice_count); return mi_arena_from_index(arena_index); } -static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index, size_t* block_count) { +static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* slice_index, size_t* slice_count) { // todo: maybe store the arena* directly in the page? - return mi_arena_from_memid(page->memid, block_index, block_count); + return mi_arena_from_memid(page->memid, slice_index, slice_count); } @@ -185,19 +185,19 @@ static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* block_index, size_t* b ----------------------------------------------------------- */ static mi_decl_noinline void* mi_arena_try_alloc_at( - mi_arena_t* arena, size_t needed_bcount, bool commit, size_t tseq, mi_memid_t* memid) -{ - size_t block_index; - if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, needed_bcount, tseq, &block_index)) return NULL; + mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) +{ + size_t slice_index; + if (!mi_bitmap_try_find_and_clearN(&arena->slices_free, slice_count, tseq, &slice_index)) return NULL; // claimed it! - void* p = mi_arena_block_start(arena, block_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index, needed_bcount); + void* p = mi_arena_slice_start(arena, slice_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; // set the dirty bits if (arena->memid.initially_zero) { - memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL); + memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count, NULL); } // set commit state @@ -206,10 +206,10 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = true; bool all_already_committed; - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); + mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count, &all_already_committed); if (!all_already_committed) { bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, NULL)) { + if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { memid->initially_committed = false; } else { @@ -219,13 +219,13 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } else { // no need to commit, but check if already fully committed - memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); + memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, needed_bcount)); - if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount)); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount)); - // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, needed_bcount)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); + if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); } + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, 
slice_count)); + // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); return p; } @@ -247,7 +247,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (!_mi_os_has_virtual_reserve()) { arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 8 && arena_count <= 128) { // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) @@ -259,8 +259,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1); - const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE; + const size_t min_reserve = mi_size_of_slices(mi_arena_info_slices() + 1); + const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_SLICE_SIZE; if (arena_reserve < min_reserve) { arena_reserve = min_reserve; } @@ -319,55 +319,55 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are if (var_arena != NULL && mi_arena_is_suitable(var_arena,req_arena_id,subproc,-1 /* todo: numa node */,allow_large)) \ { -#define mi_forall_arenas_end() }}} +#define mi_forall_arenas_end() }}} /* ----------------------------------------------------------- Arena allocation ----------------------------------------------------------- */ -// allocate blocks from the arenas +// allocate slices from the arenas static mi_decl_noinline void* mi_arena_try_find_free( - size_t block_count, size_t alignment, + size_t slice_count, size_t alignment, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - mi_assert_internal(block_count <= mi_block_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); - if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; + mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); + mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); + if (alignment > MI_ARENA_SLICE_ALIGN) return NULL; // search arena's mi_subproc_t* const subproc = tld->subproc; const size_t tseq = tld->tseq; mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) { - void* p = mi_arena_try_alloc_at(arena, block_count, commit, tseq, memid); + void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); if (p != NULL) return p; } mi_forall_arenas_end(); return NULL; } -// Allocate blocks from the arena's -- potentially allocating a fresh arena +// Allocate slices from the arena's -- potentially allocating a fresh arena static mi_decl_noinline void* mi_arena_try_alloc( - size_t block_count, size_t alignment, + size_t slice_count, size_t alignment, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - mi_assert(block_count <= MI_ARENA_MAX_OBJ_BLOCKS); - mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + mi_assert(slice_count <= MI_ARENA_MAX_OBJ_BLOCKS); + mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); - // try to find free blocks in the arena's - void* p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + // try to find free slices in the arena's + void* p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; // 
otherwise, try to first eagerly reserve a new arena if (req_arena_id == _mi_arena_id_none()) { mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(mi_size_of_blocks(block_count), allow_large, req_arena_id, &arena_id)) { + if (mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id)) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_find_free(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; } } @@ -412,10 +412,10 @@ void* _mi_arena_alloc_aligned( if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? req_arena_id == _mi_arena_id_none() && // not a specific arena? size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large - alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) // and good alignment + alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { - const size_t block_count = mi_block_count_of_size(size); - void* p = mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, memid, tld); + const size_t slice_count = mi_slice_count_of_size(size); + void* p = mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; } @@ -426,7 +426,7 @@ void* _mi_arena_alloc_aligned( void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); + return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); } @@ -435,10 +435,10 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { - MI_UNUSED(block_count); - const size_t bin = _mi_bin(block_size); + MI_UNUSED(slice_count); + const size_t bin = _mi_bin(block_size); mi_assert_internal(bin < MI_BIN_COUNT); // any abandoned in our size class? 
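With the block-to-slice rename above, object sizes map onto arena slices by simple division. A standalone sketch of that arithmetic, using the 64-bit defaults from the types.h hunk (64 KiB slices) and with the two helpers mirrored locally so the numbers can be checked in isolation:

#include <stddef.h>
#include <assert.h>

#define ARENA_SLICE_SIZE  (64*1024)   /* MI_ARENA_SLICE_SIZE on 64-bit: 1 << (13+3) */

static size_t slice_count_of_size(size_t size) {   /* mirrors mi_slice_count_of_size */
  return (size + ARENA_SLICE_SIZE - 1) / ARENA_SLICE_SIZE;
}
static size_t size_of_slices(size_t count) {       /* mirrors mi_size_of_slices */
  return count * ARENA_SLICE_SIZE;
}

int main(void) {
  assert(slice_count_of_size( 64*1024) == 1);   /* a small page spans one slice */
  assert(slice_count_of_size(512*1024) == 8);   /* a medium page (8x small) spans 8 slices */
  assert(size_of_slices(8) == 512*1024);
  return 0;
}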
@@ -450,15 +450,15 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t bl size_t tseq = tld->tseq; mi_forall_arenas(req_arena_id, subproc, allow_large, tseq, arena_id, arena) { - size_t block_index; - if (mi_bitmap_try_find_and_clear(&arena->blocks_abandoned[bin], tseq, &block_index)) { + size_t slice_index; + if (mi_bitmap_try_find_and_clear(&arena->slices_abandoned[bin], tseq, &slice_index)) { // found an abandoned page of the right size mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - mi_page_t* page = (mi_page_t*)mi_arena_block_start(arena, block_index); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_free, block_index, block_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, block_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, block_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, block_index, block_count)); + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(!mi_page_is_full(page)); mi_assert_internal(mi_page_is_abandoned(page)); @@ -469,7 +469,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t block_count, size_t bl return NULL; } -static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { const bool allow_large = true; const bool commit = true; @@ -479,20 +479,20 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) { - page = (mi_page_t*)mi_arena_try_alloc(block_count, alignment, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, &memid, tld); } // otherwise fall back to the OS if (page == NULL) { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_blocks(block_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); } if (page == NULL) return NULL; - // claimed free blocks: initialize the page partly + // claimed free slices: initialize the page partly _mi_memzero_aligned(page, sizeof(*page)); mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN)); - const size_t reserved = (mi_size_of_blocks(block_count) - MI_PAGE_INFO_SIZE) / block_size; + const size_t reserved = (mi_size_of_slices(slice_count) - MI_PAGE_INFO_SIZE) / block_size; mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + 
MI_PAGE_INFO_SIZE; @@ -512,22 +512,20 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t block_count, size_t block_siz return page; } -// block_count: arena block count for the page -// block size : page block size -static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t block_count, size_t block_size) { +static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size_t block_size) { const mi_arena_id_t req_arena_id = heap->arena_id; mi_tld_t* const tld = heap->tld; - // 1. look for an abandoned page - mi_page_t* page = mi_arena_page_try_find_abandoned(block_count, block_size, req_arena_id, tld); + // 1. look for an abandoned page + mi_page_t* page = mi_arena_page_try_find_abandoned(slice_count, block_size, req_arena_id, tld); if (page != NULL) { return page; // return as abandoned } // 2. find a free block, potentially allocating a new arena - page = mi_arena_page_alloc_fresh(block_count, block_size, req_arena_id, tld); - if (page != NULL) { - mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.block_count == block_count); + page = mi_arena_page_alloc_fresh(slice_count, block_size, req_arena_id, tld); + if (page != NULL) { + mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); return page; } @@ -550,17 +548,17 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ page = mi_singleton_page_alloc(heap, block_size, page_alignment); } else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); } else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_block_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } else { page = mi_singleton_page_alloc(heap, block_size, page_alignment); - } + } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; } @@ -579,16 +577,16 @@ void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(page->next==NULL); - + if (mi_page_all_free(page)) { _mi_arena_page_free(page, tld); } else if (page->memid.memkind==MI_MEM_ARENA) { // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); - size_t block_index; - mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); - bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_abandoned[bin], block_index, 1, NULL); + size_t slice_index; + mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); + bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_abandoned[bin], slice_index, 1, NULL); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); } @@ -605,8 +603,8 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's if 
mi_likely(memid.memkind == MI_MEM_ARENA) { - size_t block_index; - mi_arena_t* arena = mi_page_arena(page, &block_index, NULL); + size_t slice_index; + mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess // don't reclaim more from a `free` call than half the current segments @@ -616,7 +614,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { // return false; // } const size_t bin = _mi_bin(page->block_size); - if (mi_bitmap_try_xsetN(MI_BIT_CLEAR, &arena->blocks_abandoned[bin], block_index, 1)) { + if (mi_bitmap_try_xsetN(MI_BIT_CLEAR, &arena->slices_abandoned[bin], slice_index, 1)) { // we got it atomically _mi_page_reclaim(heap, page); mi_assert_internal(!mi_page_is_abandoned(page)); @@ -650,7 +648,7 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { /* ----------------------------------------------------------- Arena free ----------------------------------------------------------- */ -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats); +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats); static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { @@ -673,20 +671,20 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } else if (memid.memkind == MI_MEM_ARENA) { // allocated in an arena - size_t block_count; - size_t block_idx; - mi_arena_t* arena = mi_arena_from_memid(memid, &block_idx, &block_count); + size_t slice_count; + size_t slice_index; + mi_arena_t* arena = mi_arena_from_memid(memid, &slice_index, &slice_count); mi_assert_internal(size==1); - mi_assert_internal(mi_arena_block_start(arena,block_idx) <= p); - mi_assert_internal(mi_arena_block_start(arena,block_idx) + mi_size_of_blocks(block_count) > p); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= p); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > p); // checks if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } - mi_assert_internal(block_idx < arena->block_count); - mi_assert_internal(block_idx >= mi_arena_info_blocks()); - if (block_idx < mi_arena_info_blocks() || block_idx > arena->block_count) { + mi_assert_internal(slice_index < arena->slice_count); + mi_assert_internal(slice_index >= mi_arena_info_slices()); + if (slice_index < mi_arena_info_slices() || slice_index > arena->slice_count) { _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } @@ -698,7 +696,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi else { if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, block_idx, block_count, NULL); + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slice_index, slice_count, NULL); mi_track_mem_noaccess(p, size); if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) @@ -710,13 +708,13 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // works (as 
we should never reset decommitted parts). } // (delay) purge the entire range - mi_arena_schedule_purge(arena, block_idx, block_count, stats); + mi_arena_schedule_purge(arena, slice_index, slice_count, stats); } // and make it available to others again - bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, block_count, NULL); + bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_free, slice_index, slice_count, NULL); if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_block_start(arena,block_idx), mi_size_of_blocks(block_count)); + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; }; } @@ -767,7 +765,7 @@ bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { return true; } } @@ -781,7 +779,7 @@ bool _mi_arena_contains(const void* p) { static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { mi_assert_internal(arena != NULL); - mi_assert_internal(arena->block_count > 0); + mi_assert_internal(arena->slice_count > 0); if (arena_id != NULL) { *arena_id = -1; } size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); @@ -799,26 +797,26 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_assert(!is_large || memid.initially_committed && memid.is_pinned); - mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)); + mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); mi_assert(start!=NULL); if (start==NULL) return false; - if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) { - // todo: use alignment in memid to align to blocksize first? - _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start); + if (!_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)) { + // todo: use alignment in memid to align to slice size first? 
+ _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_SLICE_SIZE/MI_KiB, start); return false; } if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } - const size_t info_blocks = mi_arena_info_blocks(); - const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down - if (bcount < info_blocks+1) { - _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB); + const size_t info_slices = mi_arena_info_slices(); + const size_t bcount = size / MI_ARENA_SLICE_SIZE; // divide down + if (bcount < info_slices+1) { + _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_slices(info_slices+1)/MI_KiB); return false; } if (bcount > MI_BITMAP_MAX_BITS) { // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) - _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB); + _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BITS)/MI_MiB); return false; } mi_arena_t* arena = (mi_arena_t*)start; @@ -826,17 +824,17 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // commit & zero if needed bool is_zero = memid.initially_zero; if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main); + _mi_os_commit(arena, mi_size_of_slices(info_slices), &is_zero, &_mi_stats_main); } if (!is_zero) { - _mi_memzero(arena, mi_size_of_blocks(info_blocks)); + _mi_memzero(arena, mi_size_of_slices(info_slices)); } // init arena->id = _mi_arena_id_none(); arena->memid = memid; arena->exclusive = exclusive; - arena->block_count = bcount; + arena->slice_count = bcount; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? 
(now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; @@ -845,25 +843,25 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int if (heap != NULL) { arena->subproc = heap->tld->subproc; } - + // init bitmaps - mi_bitmap_init(&arena->blocks_free,true); - mi_bitmap_init(&arena->blocks_committed,true); - mi_bitmap_init(&arena->blocks_dirty,true); - mi_bitmap_init(&arena->blocks_purge,true); + mi_bitmap_init(&arena->slices_free,true); + mi_bitmap_init(&arena->slices_committed,true); + mi_bitmap_init(&arena->slices_dirty,true); + mi_bitmap_init(&arena->slices_purge,true); for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { - mi_bitmap_init(&arena->blocks_abandoned[i],true); + mi_bitmap_init(&arena->slices_abandoned[i],true); } - // reserve our meta info (and reserve blocks outside the memory area) - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks); + // reserve our meta info (and reserve slices outside the memory area) + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count); + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_committed, 0, arena->slice_count); } else { - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, 0, info_slices, NULL); } - mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL); + mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, 0, info_slices, NULL); return mi_arena_add(arena, arena_id, &_mi_stats_main); } @@ -880,9 +878,9 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is // Reserve a range of regular OS memory int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one slice mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_ARENA_BLOCK_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid, &_mi_stats_main); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
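From the user's point of view this path is reached through the public `mi_reserve_os_memory_ex` API shown above: the size is rounded up to whole 64 KiB slices, slice-aligned memory is obtained from the OS, and `mi_manage_os_memory_ex2` lays the arena header and its bitmaps out in the first `info_slices` of that region. A usage sketch, assuming the usual 0-on-success convention of this API (the 256 MiB size and the heap binding in the last line are only an example):

  #include <mimalloc.h>

  // reserve a dedicated, exclusive arena at startup and bind a heap to it
  static mi_heap_t* make_arena_heap(void) {
    mi_arena_id_t arena_id;
    if (mi_reserve_os_memory_ex(256UL * 1024 * 1024,  // rounded up to whole slices internally
                                true  /* commit */,
                                true  /* allow large OS pages */,
                                true  /* exclusive */,
                                &arena_id) != 0) {
      return NULL;                                    // reservation failed (e.g. ENOMEM)
    }
    return mi_heap_new_in_arena(arena_id);            // allocations from this heap stay in the arena
  }

Since the arena is exclusive, it only serves requests that pass its `arena_id`, which is what makes the heap binding useful for isolating a subsystem's allocations.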
if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { @@ -919,15 +917,15 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap, bool invert) { +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) { _mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; - for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { + for (int i = 0; i < MI_BFIELD_BITS && bit_count < slice_count; i++) { char buf[MI_BITMAP_CHUNK_BITS + 32]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (int j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { - if (bit_count < block_count) { + if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; size_t xcount = mi_debug_show_bfield(bfield, buf + k); @@ -939,10 +937,10 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ else { _mi_memset(buf + k, ' ', MI_BFIELD_BITS); k += MI_BFIELD_BITS; - } - bit_count += MI_BFIELD_BITS; + } + bit_count += MI_BFIELD_BITS; } - + _mi_output_message("%s %s\n", prefix, buf); } _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count); @@ -953,26 +951,26 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) MI_UNUSED(show_abandoned); size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); size_t free_total = 0; - size_t block_total = 0; + size_t slice_total = 0; //size_t abandoned_total = 0; size_t purge_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; - block_total += arena->block_count; - _mi_output_message("arena %zu: %zu blocks (%zu MiB)%s\n", i, arena->block_count, mi_size_of_blocks(arena->block_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); + slice_total += arena->slice_count; + _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? 
", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "in-use blocks", arena->block_count, &arena->blocks_free, true); + free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, &arena->slices_free, true); } - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed, false); - // todo: abandoned blocks + mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, &arena->slices_committed, false); + // todo: abandoned slices if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge, false); + purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, &arena->slices_purge, false); } } - if (show_inuse) _mi_output_message("total inuse blocks : %zu\n", block_total - free_total); - // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_output_message("total purgeable blocks: %zu\n", purge_total); + if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); + // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); + if (show_purge) _mi_output_message("total purgeable slices: %zu\n", purge_total); } @@ -1066,18 +1064,18 @@ static long mi_arena_purge_delay(void) { } // reset or decommit in an arena and update the committed/decommit bitmaps -// assumes we own the area (i.e. blocks_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { +// assumes we own the area (i.e. slices_free is claimed by us) +static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats) { mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_blocks(blocks); - void* const p = mi_arena_block_start(arena, block_idx); + const size_t size = mi_size_of_slices(slices); + void* const p = mi_arena_slice_start(arena, slice_index); bool needs_recommit; - if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) { - // all blocks are committed, we can purge freely + if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slices)) { + // all slices are committed, we can purge freely needs_recommit = _mi_os_purge(p, size, stats); } else { - // some blocks are not committed -- this can happen when a partially committed block is freed + // some slices are not committed -- this can happen when a partially committed slice is freed // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), // and also undo the decommit stats (as it was already adjusted) @@ -1086,25 +1084,25 @@ static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, m if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } } - // clear the purged blocks - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, blocks, block_idx, NULL); + // clear the purged slices + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slices, slice_index, NULL); // update committed bitmap if (needs_recommit) { - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slices, slice_index, NULL); } } // Schedule a purge. 
This is usually delayed to avoid repeated decommit/commit calls. // Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats) { const long delay = mi_arena_purge_delay(); if (delay < 0) return; // is purging allowed at all? if (_mi_preloading() || delay == 0) { // decommit directly - mi_arena_purge(arena, block_idx, blocks, stats); + mi_arena_purge(arena, slice_index, slices, stats); } else { // schedule decommit diff --git a/src/page-map.c b/src/page-map.c index f52fab10..c7d5e8b4 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -11,7 +11,7 @@ terms of the MIT license. A copy of the license can be found in the file mi_decl_cache_align signed char* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; -static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_BLOCK_SIZE; +static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; static mi_bitmap_t mi_page_map_commit; @@ -20,9 +20,9 @@ static bool mi_page_map_init(void) { if (vbits >= 48) vbits = 47; // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) // 64 KiB for 4 GiB address space (on 32-bit) - const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_BLOCK_SHIFT)); - - mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); + + mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); @@ -42,11 +42,11 @@ static bool mi_page_map_init(void) { return true; } -static void mi_page_map_ensure_committed(size_t idx, size_t block_count) { +static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { // is the page map area that contains the page address committed? 
if (!mi_page_map_all_committed) { const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; - const size_t commit_bit_idx_hi = (idx + block_count - 1) / mi_page_map_entries_per_commit_bit; + const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, i, 1)) { // this may race, in which case we do multiple commits (which is ok) @@ -57,12 +57,12 @@ static void mi_page_map_ensure_committed(size_t idx, size_t block_count) { } } -static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* block_count) { +static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) { size_t page_size; *page_start = mi_page_area(page, &page_size); if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE; } // furthest interior pointer - *block_count = mi_block_count_of_size(page_size); - return ((uintptr_t)*page_start >> MI_ARENA_BLOCK_SHIFT); + *slice_count = mi_slice_count_of_size(page_size); + return ((uintptr_t)*page_start >> MI_ARENA_SLICE_SHIFT); } @@ -73,13 +73,13 @@ void _mi_page_map_register(mi_page_t* page) { } mi_assert(_mi_page_map!=NULL); uint8_t* page_start; - size_t block_count; - const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); - - mi_page_map_ensure_committed(idx, block_count); + size_t slice_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count); + + mi_page_map_ensure_committed(idx, slice_count); // set the offsets - for (int i = 0; i < (int)block_count; i++) { + for (int i = 0; i < (int)slice_count; i++) { mi_assert_internal(i < 128); _mi_page_map[idx + i] = (signed char)(-i-1); } @@ -88,19 +88,19 @@ void _mi_page_map_register(mi_page_t* page) { void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); - + // get index and count uint8_t* page_start; - size_t block_count; - const size_t idx = mi_page_map_get_idx(page, &page_start, &block_count); + size_t slice_count; + const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count); // unset the offsets - _mi_memzero(_mi_page_map + idx, block_count); + _mi_memzero(_mi_page_map + idx, slice_count); } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - uintptr_t idx = ((uintptr_t)p >> MI_ARENA_BLOCK_SHIFT); + uintptr_t idx = ((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } diff --git a/src/stats.c b/src/stats.c index 9f7a3cf0..53b18da0 100644 --- a/src/stats.c +++ b/src/stats.c @@ -325,11 +325,11 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print_ex(&stats->committed, "committed", 1, out, arg, ""); mi_stat_peak_print(&stats->reset, "reset", 1, out, arg ); mi_stat_peak_print(&stats->purged, "purged", 1, out, arg ); - mi_stat_print(&stats->page_committed, "touched", 1, out, arg); //mi_stat_print(&stats->segments, "segments", -1, out, arg); //mi_stat_print(&stats->segments_abandoned, "-abandoned", -1, out, arg); //mi_stat_print(&stats->segments_cache, "-cached", -1, out, arg); - mi_stat_print(&stats->pages, "pages", -1, out, arg); + mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, ""); + mi_stat_print_ex(&stats->pages, 
"pages", -1, out, arg, ""); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); diff --git a/test/test-stress.c b/test/test-stress.c index 6327e995..e2133f7d 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -320,8 +320,8 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - mi_debug_show_arenas(true,true,false); mi_collect(true); + mi_debug_show_arenas(true,true,false); #endif // mi_stats_print(NULL); #endif From f8d04dc2bc42efcae8f6012f2ccdef8c3056801c Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 12:41:11 -0800 Subject: [PATCH 013/264] compile with clang and gcc --- CMakeLists.txt | 2 -- include/mimalloc/bits.h | 4 +-- include/mimalloc/internal.h | 2 +- include/mimalloc/types.h | 2 +- src/alloc-aligned.c | 6 ++-- src/arena.c | 12 ++++---- src/bitmap.c | 56 ++++++++++++++++++------------------- src/bitmap.h | 2 +- src/heap.c | 6 ++-- src/init.c | 7 +++-- src/os.c | 2 +- src/page-map.c | 2 +- src/page.c | 10 +++---- src/prim/unix/prim.c | 2 +- src/static.c | 2 -- 15 files changed, 57 insertions(+), 60 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 5cb05840..04b09252 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -57,8 +57,6 @@ set(mi_sources src/page.c src/page-map.c src/random.c - src/segment.c - src/segment-map.c src/stats.c src/prim/prim.c) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index d6695a00..79034c2f 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -291,7 +291,7 @@ static inline size_t mi_rotr(size_t x, size_t r) { // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to // avoid UB when `rshift==0`. See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); - return (x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1))); + return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1)))); #endif } @@ -310,7 +310,7 @@ static inline size_t mi_rotl(size_t x, size_t r) { // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to // avoid UB when `rshift==0`. See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); - return (x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1))) + return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1)))); #endif } diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 082882bb..1c1ec2bc 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -471,7 +471,7 @@ static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { static inline bool mi_page_contains_address(const mi_page_t* page, const void* p) { size_t psize; uint8_t* start = mi_page_area(page, &psize); - return (start <= p && p < start + psize); + return (start <= (uint8_t*)p && (uint8_t*)p < start + psize); } static inline bool mi_page_is_in_arena(const mi_page_t* page) { diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ac0a5fc4..cc8deeb6 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -125,7 +125,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) -#define MI_BITMAP_CHUNK_BITS (MI_ZU(1) << MI_BITMAP_CHUNK_BITS_SHIFT) +#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) #define MI_ARENA_MIN_OBJ_BLOCKS (1) #define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 43dc2d36..84f49ec6 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -59,9 +59,9 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t void* p; size_t oversize; if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { - // use OS allocation for very large alignment and allocate inside a huge page (dedicated segment with 1 page) - // This can support alignments >= MI_SEGMENT_SIZE by ensuring the object can be aligned at a point in the - // first (and single) page such that the segment info is `MI_SEGMENT_SIZE` bytes before it (so it can be found by aligning the pointer down) + // use OS allocation for very large alignment and allocate inside a huge page (not in an arena) + // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned at a point in the + // first (and single) page such that the page info is `MI_ARENA_SLICE_SIZE` bytes before it (and can be found in the _mi_page_map). if mi_unlikely(offset != 0) { // todo: cannot support offset alignment for very large alignments yet #if MI_DEBUG > 0 diff --git a/src/arena.c b/src/arena.c index 7b5256b6..b59f8ad3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -652,7 +652,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size >= 0 && stats != NULL); + mi_assert_internal(size > 0 && stats != NULL); mi_assert_internal(committed_size <= size); if (p==NULL) return; if (size==0) return; @@ -675,8 +675,8 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi size_t slice_index; mi_arena_t* arena = mi_arena_from_memid(memid, &slice_index, &slice_count); mi_assert_internal(size==1); - mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= p); - mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > p); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= (uint8_t*)p); + mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > (uint8_t*)p); // checks if (arena == NULL) { _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); @@ -796,7 +796,7 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_assert(!is_large || memid.initially_committed && memid.is_pinned); + mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); mi_assert(start!=NULL); if (start==NULL) return false; @@ -849,7 +849,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_bitmap_init(&arena->slices_committed,true); 
mi_bitmap_init(&arena->slices_dirty,true); mi_bitmap_init(&arena->slices_purge,true); - for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { + for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { mi_bitmap_init(&arena->slices_abandoned[i],true); } @@ -924,7 +924,7 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ for (int i = 0; i < MI_BFIELD_BITS && bit_count < slice_count; i++) { char buf[MI_BITMAP_CHUNK_BITS + 32]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; - for (int j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; diff --git a/src/bitmap.c b/src/bitmap.c index bb54af6b..fe44bb67 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -22,9 +22,9 @@ static inline size_t mi_bfield_ctz(mi_bfield_t x) { return mi_ctz(x); } -static inline size_t mi_bfield_clz(mi_bfield_t x) { - return mi_clz(x); -} +//static inline size_t mi_bfield_clz(mi_bfield_t x) { +// return mi_clz(x); +//} // find the least significant bit that is set (i.e. count trailing zero's) // return false if `x==0` (with `*idx` undefined) and true otherwise, @@ -124,11 +124,11 @@ static bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, } // Check if a bit is set/clear -static inline bool mi_bfield_atomic_is_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear @@ -302,9 +302,9 @@ static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, return mi_bitmap_chunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); } -static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { - return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); -} +// static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { +// return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); +// } /* // find least 1-bit in a chunk and try unset it atomically @@ -435,19 +435,19 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, // are all bits in a bitmap chunk set? -static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return _mm256_test_all_ones(vec); - #else - // written like this for vectorization - mi_bfield_t x = chunk->bfields[0]; - for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { - x = x & chunk->bfields[i]; - } - return (~x == 0); - #endif -} +// static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { +// #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) +// const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); +// return _mm256_test_all_ones(vec); +// #else +// // written like this for vectorization +// mi_bfield_t x = chunk->bfields[0]; +// for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { +// x = x & chunk->bfields[i]; +// } +// return (~x == 0); +// #endif +// } // are all bits in a bitmap chunk clear? 
static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { @@ -594,11 +594,11 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < MI_BFIELD_BITS); + mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); diff --git a/src/bitmap.h b/src/bitmap.h index fcadc213..1a180924 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -22,7 +22,7 @@ typedef size_t mi_bfield_t; #define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) #define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) #define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) -#define MI_BFIELD_LO_BIT8 ((~(mi_bfield_t(0)))/0xFF) // 0x01010101 .. +#define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. #define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. #define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) diff --git a/src/heap.c b/src/heap.c index 8ee66055..4da3b449 100644 --- a/src/heap.c +++ b/src/heap.c @@ -31,7 +31,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void size_t count = 0; #endif - for (size_t i = 0; i <= MI_BIN_FULL; i++) { + for (int i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_t* page = pq->first; while(page != NULL) { @@ -54,7 +54,7 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ MI_UNUSED(arg1); MI_UNUSED(arg2); MI_UNUSED(pq); - mi_assert_internal(mi_page_heap(page) == heap); + mi_assert_internal(mi_page_heap(page) == heap); mi_assert_expensive(_mi_page_is_valid(page)); return true; } @@ -419,7 +419,7 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { // so threads may do delayed frees in either heap for a while. 
// note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state // so after this only the new heap will get delayed frees - for (size_t i = 0; i <= MI_BIN_FULL; i++) { + for (int i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_queue_t* append = &from->pages[i]; size_t pcount = _mi_page_queue_append(heap, pq, append); diff --git a/src/init.c b/src/init.c index 40bc5c4a..1456cb4a 100644 --- a/src/init.c +++ b/src/init.c @@ -33,7 +33,7 @@ const mi_page_t _mi_page_empty = { MI_ATOMIC_VAR_INIT(0), // xheap MI_ATOMIC_VAR_INIT(0), // xthread_id NULL, NULL, // next, prev - { { NULL, 0}, false, false, false, MI_MEM_NONE } // memid + { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -396,7 +396,8 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = 0; + mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } @@ -433,7 +434,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { _mi_stats_done(&heap->tld->stats); // free if not the main thread - if (heap != &_mi_heap_main) { + if (heap != &_mi_heap_main) { mi_thread_data_free((mi_thread_data_t*)heap); } else { diff --git a/src/os.c b/src/os.c index da41d152..110d7ec6 100644 --- a/src/os.c +++ b/src/os.c @@ -573,7 +573,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { } #endif -// Allocate MI_SEGMENT_SIZE aligned huge pages +// Allocate MI_ARENA_SLICE_ALIGN aligned huge pages void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_msecs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid) { *memid = _mi_memid_none(); if (psize != NULL) *psize = 0; diff --git a/src/page-map.c b/src/page-map.c index c7d5e8b4..07433aa3 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -107,4 +107,4 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att else { return false; } -} \ No newline at end of file +} diff --git a/src/page.c b/src/page.c index f8ef641e..d91b9123 100644 --- a/src/page.c +++ b/src/page.c @@ -250,13 +250,13 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { // called from segments when reclaiming abandoned pages void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_page_set_heap(page, heap); - _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) + _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) _mi_page_free_collect(page, false); // ensure used count is up to date mi_assert_expensive(mi_page_is_valid_init(page)); mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); - + // TODO: push on full queue immediately if it is full? mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); mi_page_queue_push(heap, pq, page); @@ -686,7 +686,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); } #endif - + mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); @@ -928,8 +928,8 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. 
// Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. -// The `huge_alignment` is normally 0 but is set to a multiple of MI_SEGMENT_SIZE for -// very large requested alignments in which case we use a huge segment. +// The `huge_alignment` is normally 0 but is set to a multiple of MI_SLICE_SIZE for +// very large requested alignments in which case we use a huge singleton page. void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept { mi_assert_internal(heap != NULL); diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 59421e52..5a4440c3 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -478,7 +478,7 @@ static long mi_prim_mbind(void* start, unsigned long len, unsigned long mode, co int _mi_prim_alloc_huge_os_pages(void* hint_addr, size_t size, int numa_node, bool* is_zero, void** addr) { bool is_large = true; *is_zero = true; - *addr = unix_mmap(hint_addr, size, MI_SEGMENT_SIZE, PROT_READ | PROT_WRITE, true, true, &is_large); + *addr = unix_mmap(hint_addr, size, MI_ARENA_SLICE_ALIGN, PROT_READ | PROT_WRITE, true, true, &is_large); if (*addr != NULL && numa_node >= 0 && numa_node < 8*MI_INTPTR_SIZE) { // at most 64 nodes unsigned long numa_mask = (1UL << numa_node); // TODO: does `mbind` work correctly for huge OS pages? should we diff --git a/src/static.c b/src/static.c index b34d5d42..0a8fa447 100644 --- a/src/static.c +++ b/src/static.c @@ -33,8 +33,6 @@ terms of the MIT license. A copy of the license can be found in the file #include "page.c" // includes page-queue.c #include "page-map.c" #include "random.c" -#include "segment.c" -#include "segment-map.c" #include "stats.c" #include "prim/prim.c" #if MI_OSX_ZONE From 55b70f1588e0df2778743673f000749fe45f7a00 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 14:00:07 -0800 Subject: [PATCH 014/264] wip --- CMakeLists.txt | 2 +- include/mimalloc/internal.h | 5 +++-- include/mimalloc/types.h | 16 +++++++++------- src/free.c | 7 ++++--- src/heap.c | 4 ++-- src/init.c | 10 +++++----- src/page-map.c | 6 +++--- 7 files changed, 27 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 04b09252..2c04aea8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -360,7 +360,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") else() - list(APPEND mi_cflags -ftls-model=initial-exec) + list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2) endif() endif() if(MI_OVERRIDE) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 1c1ec2bc..39bc23eb 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -438,17 +438,18 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si } -extern signed char* _mi_page_map; +extern uint8_t* _mi_page_map; #define MI_PAGE_PTR_INVALID ((mi_page_t*)(1)) static inline mi_page_t* _mi_ptr_page(const void* p) { const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + // __builtin_prefetch((void*)(up << MI_ARENA_SLICE_SHIFT)); const ptrdiff_t ofs = _mi_page_map[up]; #if MI_DEBUG if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; #endif - return (mi_page_t*)((up + ofs + 1) << MI_ARENA_SLICE_SHIFT); + return (mi_page_t*)((up - ofs + 1) << MI_ARENA_SLICE_SHIFT); } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 
cc8deeb6..f82265fb 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -283,18 +283,21 @@ typedef struct mi_subproc_s mi_subproc_t; // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { + _Atomic(mi_threadid_t)xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) + + mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) + uint16_t used; // number of blocks in use (including blocks in `thread_free`) uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) uint16_t reserved; // number of blocks reserved in memory + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type + mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized uint8_t retire_expire:7; // expiration count for retired blocks - - mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type // padding + + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks @@ -304,7 +307,6 @@ typedef struct mi_page_s { _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads _Atomic(uintptr_t) xheap; // heap this threads belong to. - _Atomic(mi_threadid_t)xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` diff --git a/src/free.c b/src/free.c index 224070fe..5dbea4a4 100644 --- a/src/free.c +++ b/src/free.c @@ -126,10 +126,11 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { + if (p==NULL) return; mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); - if mi_unlikely(page==NULL) return; + // if mi_unlikely(page==NULL) return; + - const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); if mi_likely(is_local) { // thread-local free? if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) @@ -257,7 +258,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block // huge pages are special as they occupy the entire segment // as these are large we reset the memory occupied by the page so it is available to other threads // (as the owning thread needs to actually free the memory later). 
- _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively + _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively } else { #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading diff --git a/src/heap.c b/src/heap.c index 4da3b449..746ba4d0 100644 --- a/src/heap.c +++ b/src/heap.c @@ -31,7 +31,7 @@ static bool mi_heap_visit_pages(mi_heap_t* heap, heap_page_visitor_fun* fn, void size_t count = 0; #endif - for (int i = 0; i <= MI_BIN_FULL; i++) { + for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_t* page = pq->first; while(page != NULL) { @@ -419,7 +419,7 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { // so threads may do delayed frees in either heap for a while. // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state // so after this only the new heap will get delayed frees - for (int i = 0; i <= MI_BIN_FULL; i++) { + for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_queue_t* append = &from->pages[i]; size_t pcount = _mi_page_queue_append(heap, pq, append); diff --git a/src/init.c b/src/init.c index 1456cb4a..16130af7 100644 --- a/src/init.c +++ b/src/init.c @@ -14,16 +14,17 @@ terms of the MIT license. A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { + MI_ATOMIC_VAR_INIT(0), // xthread_id + NULL, // free + 0, // used 0, // capacity 0, // reserved capacity + 0, // block size shift + 0, // heap tag { 0 }, // flags false, // is_zero 0, // retire_expire - NULL, // free NULL, // local_free - 0, // used - 0, // block size shift - 0, // heap tag 0, // block_size NULL, // page_start #if (MI_PADDING || MI_ENCODE_FREELIST) @@ -31,7 +32,6 @@ const mi_page_t _mi_page_empty = { #endif MI_ATOMIC_VAR_INIT(0), // xthread_free MI_ATOMIC_VAR_INIT(0), // xheap - MI_ATOMIC_VAR_INIT(0), // xthread_id NULL, NULL, // next, prev { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid }; diff --git a/src/page-map.c b/src/page-map.c index 07433aa3..624f615c 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,7 +9,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" -mi_decl_cache_align signed char* _mi_page_map = NULL; +mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; @@ -25,7 +25,7 @@ static bool mi_page_map_init(void) { mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems - _mi_page_map = (int8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); + _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); return false; @@ -81,7 +81,7 @@ void _mi_page_map_register(mi_page_t* page) { // set the offsets for (int i = 0; i < (int)slice_count; i++) { mi_assert_internal(i < 128); - _mi_page_map[idx + i] = (signed char)(-i-1); + _mi_page_map[idx + i] = (i+1); } } From 9ebe941ce0cb4705e584c7c638b7345458c6e79c Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 20:21:32 -0800 Subject: [PATCH 015/264] first version that passes the make test --- include/mimalloc/internal.h | 36 +++++++++++++---- include/mimalloc/types.h | 20 +++++----- src/alloc-aligned.c | 40 +++++++++---------- src/arena.c | 78 +++++++++++++++++++++++++------------ src/bitmap.c | 2 +- src/free.c | 13 ++++--- src/page-map.c | 13 ++++--- src/page.c | 26 ++++++++++--- test/test-api.c | 14 +++---- test/test-stress.c | 4 +- 10 files changed, 155 insertions(+), 91 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 39bc23eb..02a62bec 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -440,16 +440,34 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si extern uint8_t* _mi_page_map; -#define MI_PAGE_PTR_INVALID ((mi_page_t*)(1)) +static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) { + #if 1 + const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + const size_t ofs = _mi_page_map[idx]; + if (valid != NULL) *valid = (ofs != 0); + return (mi_page_t*)((idx - ofs + 1) << MI_ARENA_SLICE_SHIFT); + #else + const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + const uintptr_t up = idx << MI_ARENA_SLICE_SHIFT; + __builtin_prefetch((void*)up); + const size_t ofs = _mi_page_map[idx]; + if (valid != NULL) *valid = (ofs != 0); + return (mi_page_t*)(up - ((ofs - 1) << MI_ARENA_SLICE_SHIFT)); + #endif +} + +static inline mi_page_t* _mi_checked_ptr_page(const void* p) { + bool valid; + mi_page_t* const page = _mi_ptr_page_ex(p,&valid); + return (valid ? 
page : NULL); +} static inline mi_page_t* _mi_ptr_page(const void* p) { - const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; - // __builtin_prefetch((void*)(up << MI_ARENA_SLICE_SHIFT)); - const ptrdiff_t ofs = _mi_page_map[up]; #if MI_DEBUG - if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID; + return _mi_checked_ptr_page(p); + #else + return _mi_ptr_page_ex(p,NULL); #endif - return (mi_page_t*)((up - ofs + 1) << MI_ARENA_SLICE_SHIFT); } @@ -509,12 +527,13 @@ static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); - mi_atomic_store_release(&page->xheap,(uintptr_t)heap); if (heap != NULL) { + mi_atomic_store_release(&page->xheap, (uintptr_t)heap); page->heap_tag = heap->tag; mi_atomic_store_release(&page->xthread_id, heap->thread_id); } else { + mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc); mi_atomic_store_release(&page->xthread_id,0); } } @@ -578,11 +597,12 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { } static inline bool mi_page_is_abandoned(const mi_page_t* page) { + // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) return (mi_page_thread_id(page) == 0); } static inline bool mi_page_is_huge(const mi_page_t* page) { - return (page->block_size > MI_LARGE_MAX_OBJ_SIZE); + return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN)); } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f82265fb..271c7efb 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -123,15 +123,16 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_BITMAP_CHUNK_BITS_SHIFT 8 // optimized for 256 bits per chunk (avx2) #endif +#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) #define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) -#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) -#define MI_ARENA_MIN_OBJ_BLOCKS (1) -#define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries +#define MI_ARENA_MIN_OBJ_SLICES (1) +#define MI_ARENA_MAX_OBJ_SLICES (MI_SIZE_BITS) // for now, cannot cross bit field boundaries.. todo: make it at least MI_BITMAP_CHUNK_BITS ? (16 MiB) +// #define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE) -#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE) +#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE) +#define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE) #define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) @@ -144,9 +145,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_BIN_COUNT (MI_BIN_FULL+1) -// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in singleton pages -#define MI_BLOCK_ALIGNMENT_MAX (MI_ARENA_SLICE_ALIGN) - // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX @@ -318,8 +316,10 @@ typedef struct mi_page_s { // Object sizes // ------------------------------------------------------ -#define MI_PAGE_ALIGN (64) -#define MI_PAGE_INFO_SIZE (2*MI_PAGE_ALIGN) // should be > sizeof(mi_page_t) +#define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. +#define MI_PAGE_MIN_BLOCK_ALIGN (32) // minimal block alignment in a page +#define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation +#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) // The max object size are checked to not waste more than 12.5% internally over the page sizes. // (Except for large pages since huge objects are allocated in 4MiB chunks) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 84f49ec6..9673334a 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -16,12 +16,11 @@ terms of the MIT license. A copy of the license can be found in the file // ------------------------------------------------------ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { - // objects up to `MI_PAGE_ALIGN` are allocated aligned to their size + // objects up to `MI_PAGE_MIN_BLOCK_ALIGN` are always allocated aligned to their size mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); if (alignment > size) return false; - if (alignment <= MI_MAX_ALIGN_SIZE) return true; const size_t bsize = mi_good_size(size); - return (bsize <= MI_PAGE_ALIGN && (bsize & (alignment-1)) == 0); + return (bsize <= MI_PAGE_MIN_BLOCK_ALIGN && (bsize & (alignment-1)) == 0); } #if MI_GUARDED @@ -39,9 +38,9 @@ static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, si static void* mi_heap_malloc_zero_no_guarded(mi_heap_t* heap, size_t size, bool zero) { const size_t rate = heap->guarded_sample_rate; - heap->guarded_sample_rate = 0; + if (rate != 0) { heap->guarded_sample_rate = 0; } // don't write to constant heap_empty void* p = _mi_heap_malloc_zero(heap, size, zero); - heap->guarded_sample_rate = rate; + if (rate != 0) { heap->guarded_sample_rate = rate; } return p; } #else @@ -58,21 +57,20 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t void* p; size_t oversize; - if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) { - // use OS allocation for very large alignment and allocate inside a huge page (not in an arena) - // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned at a point in the - // first (and single) page such that the page info is `MI_ARENA_SLICE_SIZE` bytes before it (and can be found in the _mi_page_map). + if mi_unlikely(alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { + // use OS allocation for large alignments and allocate inside a singleton page (not in an arena) + // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned + // in the first (and single) page such that the page info is `MI_PAGE_ALIGN` bytes before it (and can be found in the _mi_page_map). 
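A minimal sketch of the page-map lookup that the comment above refers to, assuming 64 KiB arena slices; the names and constants here (SLICE_SHIFT, page_map, ptr_to_page) are illustrative stand-ins, not the actual mimalloc definitions. Registration stores i+1 in the byte map for the i-th slice of a page, and 0 marks memory not managed by the allocator, so any interior pointer can be mapped back to its page header in a few instructions:

#include <stddef.h>
#include <stdint.h>

#define SLICE_SHIFT 16          /* assumed 64 KiB slices (MI_ARENA_SLICE_SHIFT in mimalloc) */
extern uint8_t* page_map;       /* one byte per address-space slice (like _mi_page_map) */

static void* ptr_to_page(const void* p) {
  const size_t  idx = (size_t)((uintptr_t)p >> SLICE_SHIFT);  /* slice index of the pointer */
  const uint8_t ofs = page_map[idx];                          /* 1-based slice offset within its page */
  if (ofs == 0) return NULL;                                  /* not a pointer into a managed page */
  return (void*)(((uintptr_t)idx - ofs + 1) << SLICE_SHIFT);  /* slice 0 of the page holds its header */
}
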
if mi_unlikely(offset != 0) { // todo: cannot support offset alignment for very large alignments yet -#if MI_DEBUG > 0 - _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset); -#endif + #if MI_DEBUG > 0 + _mi_error_message(EOVERFLOW, "aligned allocation with a large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset); + #endif return NULL; } oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size); // note: no guarded as alignment > 0 - p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment); // the page block size should be large enough to align in the single huge page block - // zero afterwards as only the area from the aligned_p may be committed! + p = _mi_heap_malloc_zero_ex(heap, oversize, zero, alignment); // the page block size should be large enough to align in the single huge page block if (p == NULL) return NULL; } else { @@ -113,13 +111,13 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t #endif // now zero the block if needed - if (alignment > MI_BLOCK_ALIGNMENT_MAX) { - // for the tracker, on huge aligned allocations only from the start of the large block is defined - mi_track_mem_undefined(aligned_p, size); - if (zero) { - _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); - } - } + //if (alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { + // // for the tracker, on huge aligned allocations only from the start of the large block is defined + // mi_track_mem_undefined(aligned_p, size); + // if (zero) { + // _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p)); + // } + //} if (p != aligned_p) { mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p)); diff --git a/src/arena.c b/src/arena.c index b59f8ad3..a2d3f560 100644 --- a/src/arena.c +++ b/src/arena.c @@ -354,9 +354,9 @@ static mi_decl_noinline void* mi_arena_try_alloc( bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) { - mi_assert(slice_count <= MI_ARENA_MAX_OBJ_BLOCKS); + mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); - + // try to find free slices in the arena's void* p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; @@ -469,33 +469,48 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl return NULL; } -static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, size_t block_alignment, + mi_arena_id_t req_arena_id, mi_tld_t* tld) { const bool allow_large = true; const bool commit = true; - const size_t alignment = 1; - + const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); + const size_t page_alignment = MI_ARENA_SLICE_ALIGN; + // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; - if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) { - page = (mi_page_t*)mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, &memid, tld); + if (!_mi_option_get_fast(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? 
+ !os_align && // not large alignment + slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large + { + page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, &memid, tld); } // otherwise fall back to the OS if (page == NULL) { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + if (os_align) { + // note: slice_count already includes the page + mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + } + else { + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + } } if (page == NULL) return NULL; + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // claimed free slices: initialize the page partly - _mi_memzero_aligned(page, sizeof(*page)); - mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN)); - const size_t reserved = (mi_size_of_slices(slice_count) - MI_PAGE_INFO_SIZE) / block_size; + if (!memid.initially_zero) { _mi_memzero_aligned(page, sizeof(*page)); } + mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)); + const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); + const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; - page->page_start = (uint8_t*)page + MI_PAGE_INFO_SIZE; + page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; page->memid = memid; page->free_is_zero = memid.initially_zero; @@ -523,7 +538,7 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size } // 2. find a free block, potentially allocating a new arena - page = mi_arena_page_alloc_fresh(slice_count, block_size, req_arena_id, tld); + page = mi_arena_page_alloc_fresh(slice_count, block_size, 1, req_arena_id, tld); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -534,18 +549,27 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size } -static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { - MI_UNUSED(heap); MI_UNUSED(block_size); MI_UNUSED(page_alignment); - _mi_error_message(EINVAL, "singleton page is not yet implemented\n"); - return NULL; +static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { + const mi_arena_id_t req_arena_id = heap->arena_id; + mi_tld_t* const tld = heap->tld; + const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); + const size_t info_size = (os_align ? 
MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); + const size_t slice_count = mi_slice_count_of_size(info_size + block_size); + + mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld); + if (page == NULL) return NULL; + + mi_assert(page != NULL); + mi_assert(page->reserved == 1); + return page; } -mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) { +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { mi_page_t* page; - if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { - mi_assert_internal(_mi_is_power_of_two(page_alignment)); - page = mi_singleton_page_alloc(heap, block_size, page_alignment); + if mi_unlikely(block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { + mi_assert_internal(_mi_is_power_of_two(block_alignment)); + page = mi_singleton_page_alloc(heap, block_size, block_alignment); } else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); @@ -557,7 +581,7 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_ page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } else { - page = mi_singleton_page_alloc(heap, block_size, page_alignment); + page = mi_singleton_page_alloc(heap, block_size, block_alignment); } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); return page; @@ -598,7 +622,10 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_is_abandoned(page)); } - if (!mi_page_is_abandoned(page)) return false; // it is not abandoned + // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned (anymore) + + // note: we can access the page even it is in the meantime reclaimed by another thread since + // we only call this when on free (and thus there is still an object alive in the page) mi_memid_t memid = page->memid; if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's @@ -623,11 +650,12 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { } else { // A page in OS or external memory + if (mi_atomic_load_acquire(&page->xheap) != (uintptr_t)heap->tld->subproc) return false; + // we use the thread_id to atomically grab ownership - // TODO: respect the subproc -- do we need to add this to the page? mi_threadid_t abandoned_thread_id = 0; if (mi_atomic_cas_strong_acq_rel(&page->xthread_id, &abandoned_thread_id, heap->thread_id)) { - // we unabandoned partly + // we got it atomically _mi_page_reclaim(heap, page); mi_assert_internal(!mi_page_is_abandoned(page)); return true; diff --git a/src/bitmap.c b/src/bitmap.c index fe44bb67..dd1afe75 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -263,7 +263,7 @@ restore: // set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. // todo: try neon version static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { -#if 0 && defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) +#if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 
0xFF : 0) diff --git a/src/free.c b/src/free.c index 5dbea4a4..c7d92292 100644 --- a/src/free.c +++ b/src/free.c @@ -115,7 +115,7 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) #endif mi_page_t* const page = _mi_ptr_page(p); #if MI_DEBUG - if (page == MI_PAGE_PTR_INVALID) { + if (page == NULL && p != NULL) { _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); } #endif @@ -126,11 +126,9 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { - if (p==NULL) return; mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); - // if mi_unlikely(page==NULL) return; - - + if mi_unlikely(page==NULL) return; + const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); if mi_likely(is_local) { // thread-local free? if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) @@ -235,11 +233,14 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block { // the page is abandoned, try to reclaim it into our heap if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); + mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); mi_free(block); // recursively free as now it will be a local free in our heap return; } + else { + mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages + } } // The padding check may access the non-thread-owned page for the key values. 
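The reclaim-on-free path above claims an abandoned OS/external page by atomically swapping its thread id from 0 to the freeing thread's id. A minimal sketch of that claim step using plain C11 atomics (mimalloc uses its own atomic wrappers; threadid_t and try_claim_abandoned are illustrative names):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef uintptr_t threadid_t;

/* An abandoned page carries thread id 0; the first freeing thread to CAS its
   own id in wins ownership and may then reclaim the page into its heap. */
static bool try_claim_abandoned(_Atomic(threadid_t)* page_thread_id, threadid_t my_id) {
  threadid_t expected = 0;
  return atomic_compare_exchange_strong_explicit(page_thread_id, &expected, my_id,
                                                 memory_order_acq_rel, memory_order_acquire);
}
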
diff --git a/src/page-map.c b/src/page-map.c index 624f615c..a3e9a649 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -37,7 +37,8 @@ static bool mi_page_map_init(void) { // commit the first part so NULL pointers get resolved without an access violation if (!mi_page_map_all_committed) { _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL); - _mi_page_map[0] = -1; // so _mi_ptr_page(NULL) == NULL + _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL + mi_assert_internal(_mi_ptr_page(NULL)==NULL); } return true; } @@ -60,9 +61,9 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) { size_t page_size; *page_start = mi_page_area(page, &page_size); - if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE; } // furthest interior pointer - *slice_count = mi_slice_count_of_size(page_size); - return ((uintptr_t)*page_start >> MI_ARENA_SLICE_SHIFT); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer + *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks + return ((uintptr_t)page >> MI_ARENA_SLICE_SHIFT); } @@ -79,9 +80,9 @@ void _mi_page_map_register(mi_page_t* page) { mi_page_map_ensure_committed(idx, slice_count); // set the offsets - for (int i = 0; i < (int)slice_count; i++) { + for (size_t i = 0; i < slice_count; i++) { mi_assert_internal(i < 128); - _mi_page_map[idx + i] = (i+1); + _mi_page_map[idx + i] = (uint8_t)(i+1); } } diff --git a/src/page.c b/src/page.c index d91b9123..af55b3b3 100644 --- a/src/page.c +++ b/src/page.c @@ -41,9 +41,10 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page); #if (MI_DEBUG>=3) static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { + mi_assert_internal(_mi_ptr_page(page) == page); size_t count = 0; while (head != NULL) { - mi_assert_internal(page == _mi_ptr_page(head)); + mi_assert_internal((uint8_t*)head - (uint8_t*)page > MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); count++; head = mi_block_next(page, head); } @@ -123,7 +124,7 @@ bool _mi_page_is_valid(mi_page_t* page) { { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); - mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_MAX_OBJ_SIZE || mi_page_is_in_full(page)); + mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page)); mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); } } @@ -258,7 +259,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE); // TODO: push on full queue immediately if it is full? - mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page)); + mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); mi_page_queue_push(heap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); } @@ -279,6 +280,15 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size } if (mi_page_is_abandoned(page)) { _mi_page_reclaim(heap, page); + if (!mi_page_immediate_available(page)) { + if (mi_page_is_expandable(page)) { + mi_page_extend_free(heap, page); + } + else { + mi_assert(false); // should not happen? 
+ return NULL; + } + } } else if (pq != NULL) { mi_page_queue_push(heap, pq, page); @@ -295,7 +305,7 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); if (page==NULL) return NULL; mi_assert_internal(pq->block_size==mi_page_block_size(page)); - mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page))); + mi_assert_internal(pq==mi_heap_page_queue_of(heap, page)); return page; } @@ -713,7 +723,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { -------------------------------------------------------------*/ // search for a best next page to use for at most N pages (often cut short if immediate blocks are available) -#define MI_MAX_CANDIDATE_SEARCH (0) +#define MI_MAX_CANDIDATE_SEARCH (8) // Find a page with free blocks of `page->block_size`. @@ -723,7 +733,9 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p #if MI_STAT size_t count = 0; #endif + #if MI_MAX_CANDIDATE_SEARCH > 1 size_t candidate_count = 0; // we reset this on the first candidate to limit the search + #endif mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -793,17 +805,21 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p mi_assert_internal(mi_page_is_expandable(page)); mi_page_extend_free(heap, page); } + mi_assert_internal(mi_page_immediate_available(page)); } if (page == NULL) { _mi_heap_collect_retired(heap, false); // perhaps make a page available page = mi_page_fresh(heap, pq); + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); if (page == NULL && first_try) { // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again page = mi_page_queue_find_free_ex(heap, pq, false); + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); } } else { + mi_assert_internal(page == NULL || mi_page_immediate_available(page)); // move the page to the front of the queue mi_page_queue_move_to_front(heap, pq, page); page->retire_expire = 0; diff --git a/test/test-api.c b/test/test-api.c index 15484544..ee7c56bb 100644 --- a/test/test-api.c +++ b/test/test-api.c @@ -34,7 +34,7 @@ we therefore test the API over various inputs. 
Please add more tests :-) #include "mimalloc.h" // #include "mimalloc/internal.h" -#include "mimalloc/types.h" // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX +#include "mimalloc/types.h" // for MI_DEBUG and MI_PAGE_MAX_OVERALLOC_ALIGN #include "testhelper.h" @@ -169,7 +169,7 @@ int main(void) { /* CHECK_BODY("malloc-aligned6") { bool ok = true; - for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) { + for (size_t align = 1; align <= MI_PAGE_MAX_OVERALLOC_ALIGN && ok; align *= 2) { void* ps[8]; for (int i = 0; i < 8 && ok; i++) { ps[i] = mi_malloc_aligned(align*13 // size @@ -186,16 +186,16 @@ int main(void) { }; */ CHECK_BODY("malloc-aligned7") { - void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX); + void* p = mi_malloc_aligned(1024,MI_PAGE_MAX_OVERALLOC_ALIGN); mi_free(p); - result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0; + result = ((uintptr_t)p % MI_PAGE_MAX_OVERALLOC_ALIGN) == 0; }; CHECK_BODY("malloc-aligned8") { bool ok = true; for (int i = 0; i < 5 && ok; i++) { int n = (1 << i); - void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX); - ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0; + void* p = mi_malloc_aligned(1024, n * MI_PAGE_MAX_OVERALLOC_ALIGN); + ok = ((uintptr_t)p % (n*MI_PAGE_MAX_OVERALLOC_ALIGN)) == 0; mi_free(p); } result = ok; @@ -203,7 +203,7 @@ int main(void) { CHECK_BODY("malloc-aligned9") { // test large alignments bool ok = true; void* p[8]; - size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 }; + size_t sizes[8] = { 8, 512, 1024 * 1024, MI_PAGE_MAX_OVERALLOC_ALIGN, MI_PAGE_MAX_OVERALLOC_ALIGN + 1, 2 * MI_PAGE_MAX_OVERALLOC_ALIGN, 8 * MI_PAGE_MAX_OVERALLOC_ALIGN, 0 }; for (int i = 0; i < 28 && ok; i++) { int align = (1 << i); for (int j = 0; j < 8 && ok; j++) { diff --git a/test/test-stress.c b/test/test-stress.c index e2133f7d..76dfe877 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -42,7 +42,7 @@ static int SCALE = 10; static int ITER = 10; #elif 0 static int THREADS = 4; -static int SCALE = 20; +static int SCALE = 100; static int ITER = 20; #else static int THREADS = 32; // more repeatable if THREADS <= #processors @@ -54,7 +54,7 @@ static int ITER = 50; // N full iterations destructing and re-creating a #define STRESS // undefine for leak test -static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) +static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? 
static bool main_participates = false; // main thread participates as a worker too From 8f2a5864b8c88913ce6d68f8bd7c40f1aae230f2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 30 Nov 2024 22:54:57 -0800 Subject: [PATCH 016/264] pass all debug tests --- include/mimalloc/internal.h | 2 +- src/alloc.c | 6 +++++- src/arena.c | 20 +++++++++++++++++++- src/os.c | 16 +++++++++++----- src/page-map.c | 12 ++++++++++-- src/page.c | 4 ++++ 6 files changed, 50 insertions(+), 10 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 02a62bec..01b7076b 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -463,7 +463,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { } static inline mi_page_t* _mi_ptr_page(const void* p) { - #if MI_DEBUG + #if 1 // MI_DEBUG return _mi_checked_ptr_page(p); #else return _mi_ptr_page_ex(p,NULL); diff --git a/src/alloc.c b/src/alloc.c index 00f6d1a4..840d34fe 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -30,7 +30,11 @@ terms of the MIT license. A copy of the license can be found in the file // Note: in release mode the (inlined) routine is about 7 instructions with a single test. extern inline void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept { - mi_assert_internal(page->block_size == 0 /* empty heap */ || mi_page_block_size(page) >= size); + if (page->block_size != 0) { // not the empty heap + mi_assert_internal(mi_page_block_size(page) >= size); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + } // check the free list mi_block_t* const block = page->free; diff --git a/src/arena.c b/src/arena.c index a2d3f560..66f83d4f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -462,6 +462,9 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(!mi_page_is_full(page)); mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); return page; } } @@ -521,6 +524,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz page->block_size_shift = 0; } _mi_page_map_register(page); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); @@ -561,6 +566,9 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si mi_assert(page != NULL); mi_assert(page->reserved == 1); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + return page; } @@ -584,6 +592,11 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block page = mi_singleton_page_alloc(heap, block_size, block_alignment); } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(block_alignment <= MI_PAGE_MAX_OVERALLOC_ALIGN || _mi_is_aligned(mi_page_start(page), block_alignment)); + return page; } @@ -601,11 +614,14 @@ void _mi_arena_page_free(mi_page_t* page, mi_tld_t* tld) { 
void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(page->next==NULL); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + if (mi_page_all_free(page)) { _mi_arena_page_free(page, tld); } - else if (page->memid.memkind==MI_MEM_ARENA) { + else if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); size_t slice_index; @@ -622,6 +638,8 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_is_abandoned(page)); } + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned (anymore) // note: we can access the page even it is in the meantime reclaimed by another thread since diff --git a/src/os.c b/src/os.c index 110d7ec6..931abc7f 100644 --- a/src/os.c +++ b/src/os.c @@ -219,20 +219,26 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); + const bool use_overalloc = (alignment > mi_os_mem_config.alloc_granularity && alignment <= size/8); + // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) - void* p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); - if (p == NULL) return NULL; + void* p = NULL; + if (!use_overalloc) { + p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); + } // aligned already? 
- if (((uintptr_t)p % alignment) == 0) { + if (p != NULL && ((uintptr_t)p % alignment) == 0) { *base = p; } else { // if not aligned, free it, overallocate, and unmap around it #if !MI_TRACK_ASAN - _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); + if (!use_overalloc) { + _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); + } #endif - mi_os_prim_free(p, size, commit, stats); + if (p != NULL) { mi_os_prim_free(p, size, commit, stats); } if (size >= (SIZE_MAX - alignment)) return NULL; // overflow const size_t over_size = size + alignment; diff --git a/src/page-map.c b/src/page-map.c index a3e9a649..15578301 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -36,7 +36,9 @@ static bool mi_page_map_init(void) { } // commit the first part so NULL pointers get resolved without an access violation if (!mi_page_map_all_committed) { - _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL); + bool is_zero; + _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero, NULL); + if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(_mi_page_map, _mi_os_page_size()); } _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL mi_assert_internal(_mi_ptr_page(NULL)==NULL); } @@ -51,7 +53,11 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, i, 1)) { // this may race, in which case we do multiple commits (which is ok) - _mi_os_commit(_mi_page_map + (i*mi_page_map_entries_per_commit_bit), mi_page_map_entries_per_commit_bit, NULL, NULL); + bool is_zero; + uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); + const size_t size = mi_page_map_entries_per_commit_bit; + _mi_os_commit(start, size, &is_zero, NULL); + if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, i, 1, NULL); } } @@ -69,6 +75,8 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* void _mi_page_map_register(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(page,MI_PAGE_ALIGN)); if mi_unlikely(_mi_page_map == NULL) { if (!mi_page_map_init()) return; } diff --git a/src/page.c b/src/page.c index af55b3b3..243d9bf7 100644 --- a/src/page.c +++ b/src/page.c @@ -745,7 +745,9 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p #if MI_STAT count++; #endif + #if MI_MAX_CANDIDATE_SEARCH > 1 candidate_count++; + #endif // collect freed blocks by us and other threads _mi_page_free_collect(page, false); @@ -978,6 +980,8 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_page_immediate_available(page)); mi_assert_internal(mi_page_block_size(page) >= size); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); // and try again, this time succeeding! (i.e. 
this should never recurse through _mi_page_malloc) if mi_unlikely(zero && mi_page_is_huge(page)) { From 1d7a9f62a517e6667b50175bce4766b1c5d0f495 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 1 Dec 2024 12:54:16 -0800 Subject: [PATCH 017/264] bug fixes --- include/mimalloc/internal.h | 2 +- src/arena.c | 17 +++++++---------- src/free.c | 35 ++++++++++++++++++++--------------- src/init.c | 3 +-- src/os.c | 7 ++++--- test/test-stress.c | 2 +- 6 files changed, 34 insertions(+), 32 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 01b7076b..ec106047 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -598,7 +598,7 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_page_thread_id(page) == 0); + return (mi_atomic_load_acquire(&page->xthread_id) == 0); } static inline bool mi_page_is_huge(const mi_page_t* page) { diff --git a/src/arena.c b/src/arena.c index 66f83d4f..a713a110 100644 --- a/src/arena.c +++ b/src/arena.c @@ -646,11 +646,12 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { // we only call this when on free (and thus there is still an object alive in the page) mi_memid_t memid = page->memid; if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's + if (mi_atomic_load_acquire(&page->xheap) != (uintptr_t)heap->tld->subproc) return false; if mi_likely(memid.memkind == MI_MEM_ARENA) { size_t slice_index; mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); - if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess + //if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess // don't reclaim more from a `free` call than half the current segments // this is to prevent a pure free-ing thread to start owning too many segments @@ -665,6 +666,11 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(!mi_page_is_abandoned(page)); return true; } + else { + if (mi_page_is_abandoned(page)) { + mi_assert(false); + } + } } else { // A page in OS or external memory @@ -1089,15 +1095,6 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv -/* ----------------------------------------------------------- - Abandoned pages ------------------------------------------------------------ */ - -void mi_arena_page_abandon(mi_page_t* page) { - mi_assert_internal(mi_page_is_abandoned(page)); - if (mi_page_is_full(page)) {} -} - /* ----------------------------------------------------------- diff --git a/src/free.c b/src/free.c index c7d92292..f0ce8c22 100644 --- a/src/free.c +++ b/src/free.c @@ -225,24 +225,29 @@ static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { // first see if the page was abandoned and if we can reclaim it into our thread - if (mi_page_is_abandoned(page) && - (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || - mi_page_is_singleton(page) // only one block, and we are free-ing it - ) && - mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) - { - // the page is 
abandoned, try to reclaim it into our heap - if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); - // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - else { - mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages + if (mi_page_is_abandoned(page)) { + if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || + mi_page_is_singleton(page)) { // only one block, and we are free-ing it + if (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) + { + // the page is abandoned, try to reclaim it into our heap + if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue + mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); + // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); + mi_free(block); // recursively free as now it will be a local free in our heap + return; + } + else { + if (mi_page_is_abandoned(page)) { + mi_assert(false); + } + mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages + } + } } } + // The padding check may access the non-thread-owned page for the key values. // that is safe as these are constant and the page won't be freed (as the block is not freed yet). mi_check_padding(page, block); diff --git a/src/init.c b/src/init.c index 16130af7..2378b3c8 100644 --- a/src/init.c +++ b/src/init.c @@ -396,8 +396,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = 0; - mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } diff --git a/src/os.c b/src/os.c index 931abc7f..0aa0a681 100644 --- a/src/os.c +++ b/src/os.c @@ -219,11 +219,12 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); - const bool use_overalloc = (alignment > mi_os_mem_config.alloc_granularity && alignment <= size/8); + // try a direct allocation if the alignment is below the default, or if larger than 1/64 fraction of the size (to avoid waste). 
+ const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/64); // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) void* p = NULL; - if (!use_overalloc) { + if (try_direct_alloc) { p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); } @@ -234,7 +235,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit else { // if not aligned, free it, overallocate, and unmap around it #if !MI_TRACK_ASAN - if (!use_overalloc) { + if (try_direct_alloc) { _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); } #endif diff --git a/test/test-stress.c b/test/test-stress.c index 76dfe877..9a89744e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -36,7 +36,7 @@ static int ITER = 400; static int THREADS = 8; static int SCALE = 25; static int ITER = 20; -#elif defined(xMI_GUARDED) // with debug guard pages reduce parameters to stay within the azure pipeline limits +#elif defined(MI_GUARDED) // with debug guard pages reduce parameters to stay within the azure pipeline limits static int THREADS = 8; static int SCALE = 10; static int ITER = 10; From 2f789aae9a1ed271e3feb22e4ead04db809e4e2e Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 1 Dec 2024 16:26:59 -0800 Subject: [PATCH 018/264] wip: cannot compile --- include/mimalloc/internal.h | 84 +++++++++++++++++++------------------ include/mimalloc/types.h | 20 +++++---- src/bitmap.c | 45 ++++++++++++++++++++ src/bitmap.h | 28 ++++++++++++- src/free.c | 81 +++++++++++++++++++++++------------ 5 files changed, 181 insertions(+), 77 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index ec106047..84244c21 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -92,11 +92,13 @@ bool _mi_preloading(void); // true while the C runtime is not in void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); + mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; -size_t _mi_thread_seq_id(void) mi_attr_noexcept; +size_t _mi_thread_seq_id(void) mi_attr_noexcept; + mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); -void _mi_heap_guarded_init(mi_heap_t* heap); +void _mi_heap_guarded_init(mi_heap_t* heap); // os.c void _mi_os_init(void); // called from process init @@ -180,8 +182,6 @@ void _mi_heap_delayed_free_all(mi_heap_t* heap); bool _mi_heap_delayed_free_partial(mi_heap_t* heap); void _mi_heap_collect_retired(mi_heap_t* heap, bool force); -void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); -bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never); size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); void _mi_deferred_free(mi_heap_t* heap, bool force); @@ -426,6 +426,10 @@ static inline uintptr_t _mi_ptr_cookie(const void* p) { return ((uintptr_t)p ^ _mi_heap_main.cookie); } +static inline mi_tld_t* _mi_tld(void) { + return mi_heap_get_default()->tld; +} + /* ----------------------------------------------------------- Pages ----------------------------------------------------------- */ @@ -507,53 +511,53 @@ static inline size_t 
mi_page_usable_block_size(const mi_page_t* page) { return mi_page_block_size(page) - MI_PADDING_SIZE; } +//static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { +// mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); +// if (heap != NULL) { +// mi_atomic_store_release(&page->xheap, (uintptr_t)heap); +// page->heap_tag = heap->tag; +// mi_atomic_store_release(&page->xthread_id, heap->thread_id); +// } +// else { +// mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc); +// mi_atomic_store_release(&page->xthread_id,0); +// } +//} + +// Thread free flag helpers +static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { + return (mi_block_t*)(tf & ~1); +} +static inline bool mi_tf_is_owned(mi_thread_free_t tf) { + return ((tf & 1) == 0); +} +static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { + return (mi_thread_free_t)((uintptr_t)block | (owned ? 0 : 1)); +} + + // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { - return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3); + return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); } -static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) { - return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3); -} - -// Heap access -static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { - return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap)); +// Owned? +static inline bool mi_page_is_owned(const mi_page_t* page) { + return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); } +// Thread id of thread that owns this page static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { return mi_atomic_load_relaxed(&page->xthread_id); } -static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { - mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); - if (heap != NULL) { - mi_atomic_store_release(&page->xheap, (uintptr_t)heap); - page->heap_tag = heap->tag; - mi_atomic_store_release(&page->xthread_id, heap->thread_id); - } - else { - mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc); - mi_atomic_store_release(&page->xthread_id,0); - } -} -// Thread free flag helpers -static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { - return (mi_block_t*)(tf & ~0x03); -} -static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) { - return (mi_delayed_t)(tf & 0x03); -} -static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) { - return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed); -} -static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { - return mi_tf_make(mi_tf_block(tf),delayed); -} -static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { - return mi_tf_make(block, mi_tf_delayed(tf)); -} +//static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { +// return mi_tf_make(mi_tf_block(tf),delayed); +//} +//static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { +// return mi_tf_make(block, mi_tf_delayed(tf)); +//} // are all blocks in a page freed? // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`. 
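A minimal sketch of the tagged thread-free list that the mi_tf_* helpers above encode, assuming blocks are at least pointer-aligned so the low bit is free; block_t, thread_free_t and thread_free_push are illustrative names, not the mimalloc API. The low bit marks an abandoned (unowned) page, so pushing a freed block with the bit cleared both enqueues the block and, if the previous head was tagged, transfers ownership of the page to the pushing thread:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct block_s { struct block_s* next; } block_t;
typedef uintptr_t thread_free_t;   /* tagged pointer: low bit set = page is abandoned */

/* Push a freed block on the page's atomic thread-free list; returns true if
   this push also took ownership of a previously abandoned page. */
static bool thread_free_push(_Atomic(thread_free_t)* xthread_free, block_t* block) {
  thread_free_t old_tf = atomic_load_explicit(xthread_free, memory_order_relaxed);
  thread_free_t new_tf;
  do {
    block->next = (block_t*)(old_tf & ~(thread_free_t)1);   /* link to current head, tag bit stripped */
    new_tf = (thread_free_t)(uintptr_t)block;               /* bit 0 clear: owned after this push */
  } while (!atomic_compare_exchange_weak_explicit(xthread_free, &old_tf, new_tf,
                                                  memory_order_release, memory_order_relaxed));
  return (old_tf & 1) != 0;   /* previous head was tagged abandoned: this thread reclaimed it */
}
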
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 271c7efb..7329cb86 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -216,13 +216,14 @@ typedef struct mi_block_s { #endif -// The delayed flags are used for efficient multi-threaded free-ing -typedef enum mi_delayed_e { - MI_USE_DELAYED_FREE = 0, // push on the owning heap thread delayed list - MI_DELAYED_FREEING = 1, // temporary: another thread is accessing the owning heap - MI_NO_DELAYED_FREE = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list - MI_NEVER_DELAYED_FREE = 3 // sticky: used for abondoned pages without a owning heap; this only resets on page reclaim -} mi_delayed_t; +// The owned flags are used for efficient multi-threaded free-ing +// When we push on the page thread free queue of an abandoned page, +// we also atomically get to own it. This is needed to atomically +// abandon a page (while other threads could concurrently free blocks in it). +typedef enum mi_owned_e { + MI_OWNED = 0, // some heap owns this page + MI_ABANDONED = 1, // the page is abandoned +} mi_owned_t; // The `in_full` and `has_aligned` page flags are put in a union to efficiently @@ -247,7 +248,7 @@ typedef union mi_page_flags_s { #endif // Thread free list. -// We use the bottom 2 bits of the pointer for mi_delayed_t flags +// We use the bottom bit of the pointer for `mi_owned_t` flags typedef uintptr_t mi_thread_free_t; // Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) @@ -304,10 +305,11 @@ typedef struct mi_page_s { #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - _Atomic(uintptr_t) xheap; // heap this threads belong to. + // _Atomic(uintptr_t) xheap; // heap this threads belong to. 
struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + mi_subproc_t* subproc; // sub-process of this heap mi_memid_t memid; // provenance of the page memory } mi_page_t; diff --git a/src/bitmap.c b/src/bitmap.c index dd1afe75..5cce6bfa 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -693,3 +693,48 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t mi_bitmap_forall_set_chunks_end(); return false; } + + + +mi_decl_nodiscard bool mi_pairmap_xset(mi_pair_t set, mi_bitmap_t* bitmap, size_t idx); +mi_decl_nodiscard bool mi_pairmap_xset_while_not_busy(mi_pair_t set, mi_bitmap_t* bitmap, size_t idx); + +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) { + size_t set_idx; + size_t start = tseq % MI_BFIELD_BITS; + size_t epoch = mi_atomic_load_acquire(&pairmap->epoch); + mi_bfield_t any_set = mi_bfield_rotate_right(mi_atomic_load_relaxed(&pairmap->any_set), start); + while (mi_bfield_find_least_bit(any_set, &set_idx)) { + size_t chunk_idx = 2*((set_idx + start) % MI_BFIELD_BITS); + { + // look at chunk_idx and chunck_idx+1 + mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx]; + mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx+1]; + size_t cidx; + if (mi_pairmap_chunk_find_and_set_busy(chunk1, &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx < MI_PAIRMAP_MAX_BITS); + return true; + } + else { + if (mi_pairmap_chunk_find_and_set_busy(chunk2, &cidx)) { + *pidx = ((chunk_idx+1) * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx < MI_PAIRMAP_MAX_BITS); + return true; + } + else if (mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk2)) { + + mi_bfield_atomic_xset(MI_BIT_CLEAR, &pairmap->any_set, chunk_idx/2); + } + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR, &bitmap->any_set, chunk_idx); + } + } + } + start += set_idx+1; /* so chunk_idx stays valid */ + any_set >>= set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ + any_set >>= 1; + } +} diff --git a/src/bitmap.h b/src/bitmap.h index 1a180924..2b4bfc25 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -41,7 +41,7 @@ typedef mi_decl_align(32) struct mi_bitmap_s { #define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit /* -------------------------------------------------------------------------------- - Bitmap + Atomic bitmap -------------------------------------------------------------------------------- */ typedef bool mi_bit_t; @@ -89,4 +89,30 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); + +/* -------------------------------------------------------------------------------- + Atomic bitmap for a pair of bits +-------------------------------------------------------------------------------- */ + +typedef mi_bfield_t mi_pair_t; + +#define MI_PAIR_CLEAR (0) +#define MI_PAIR_BUSY (1) +#define MI_PAIR_BUSYX (2) +#define MI_PAIR_SET (3) + +typedef mi_decl_align(32) struct mi_pairmap_s { + mi_bitmap_chunk_t chunks[2*MI_BFIELD_BITS]; + _Atomic(mi_bfield_t) any_set; + _Atomic(size_t) epoch; +} mi_pairmap_t; + +#define MI_PAIRMAP_MAX_PAIRS (MI_BITMAP_MAX_BITS) // 16k pairs on 64bit, 8k pairs on 32bit +#define MI_PAIRMAP_MAX_BITS (2*MI_PAIRMAP_MAX_PAIRS) + +mi_decl_nodiscard bool mi_pairmap_xset(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx); +mi_decl_nodiscard bool mi_pairmap_xset_while_not_busy(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx); +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t n, size_t tseq, size_t* pidx); + + #endif // MI_XBITMAP_H diff --git a/src/free.c b/src/free.c index f0ce8c22..42fcd07e 100644 --- a/src/free.c +++ b/src/free.c @@ -147,39 +147,66 @@ void mi_free(void* p) mi_attr_noexcept } } -// return true if successful -bool _mi_free_delayed_block(mi_block_t* block) { - // get segment and page - mi_assert_internal(block!=NULL); - mi_page_t* const page = mi_checked_ptr_page(block,"_mi_free_delayed_block"); - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); - // Clear the no-delayed flag so delayed freeing is used again for this page. - // This must be done before collecting the free lists on this page -- otherwise - // some blocks may end up in the page `thread_free` list with no blocks in the - // heap `thread_delayed_free` list which may cause the page to be never freed! - // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`) - if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) { - return false; - } - - // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count - _mi_page_free_collect(page, false); - - // and free the block (possibly freeing the page as well since `used` is updated) - mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */); - return true; -} // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ -// Push a block that is owned by another thread on its page-local thread free -// list or it's heap delayed free list. Such blocks are later collected by -// the owning thread in `_mi_free_delayed_block`. -static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block ) +static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_thread_id(page)==0); + + // we own the page now.. 
+ // first remove it from the abandoned pages in the arena + mi_heap_t* const heap = mi_heap_get_default(); + _mi_arena_page_unabandon(page,heap->tld); + + // collect the thread atomic free list + _mi_page_free_collect(page, false); // update `used` count + if (mi_page_is_singleton(page)) mi_assert_internal(mi_page_all_free(page)); + + if (mi_page_all_free(page)) { + // we can free the page directly + _mi_arena_page_free(page, heap->tld); + } + else { + // the page has still some blocks in use + // reclaim in our heap if compatible, or otherwise abandon again + if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && + (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) && // we did not already terminate our thread (can this happen? yes, due to thread-local destructors for example (issue #944)) + (page->subproc == heap->tld->subproc) && // don't reclaim across sub-processes + mi_arena_page_try_reclaim(page) // and we can reclaim it from the arena + ) + { + // make it part of our heap + _mi_heap_page_reclaim(heap, page); + } + else { + // abandon again + _mi_arena_page_abandon(page, heap->tld); + } + } +} + +// Push a block that is owned by another thread on its page-local thread free list. +static void mi_decl_noinline mi_free_block_delayed_mt(mi_page_t* page, mi_block_t* block) { + // push atomically on the page thread free list + mi_thread_free_t tf_new; + mi_thread_free_t tf; + do { + tf = mi_atomic_load_relaxed(&page->xthread_free); + mi_block_set_next(page, block, mi_tf_block(tf)); + tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf, tf_new)); + + // and atomically reclaim the page if it was abandoned + bool reclaimed = !mi_tf_is_owned(tf); + if (reclaimed) mi_free_try_reclaim_mt(page); +} + + /* // Try to put the block on either the page-local thread free list, // or the heap delayed free list (if this is the first non-local free in that page) mi_thread_free_t tfreex; @@ -276,7 +303,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block // thread_delayed free list (or heap delayed free list) mi_free_block_delayed_mt(page,block); } - +*/ // ------------------------------------------------------ // Usable size From 69ac69abac87b513674f79d1217aab00e2b6ccb8 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 00:31:08 -0800 Subject: [PATCH 019/264] wip: use epoch with 512bit chunks --- include/mimalloc/bits.h | 119 +++++--- include/mimalloc/types.h | 2 +- src/arena.c | 61 +++-- src/bitmap.c | 567 +++++++++++++++++++++++++++------------ src/bitmap.h | 65 ++++- src/free.c | 4 +- src/libc.c | 10 +- src/options.c | 2 +- 8 files changed, 574 insertions(+), 256 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 79034c2f..90d56b4f 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -145,20 +145,13 @@ typedef int32_t mi_ssize_t; size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); +uint32_t _mi_ctz_generic32(uint32_t x); static inline size_t mi_ctz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 uint64_t r; __asm volatile ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); return r; - #elif defined(__GNUC__) && MI_ARCH_ARM64 - uint64_t r; - __asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc"); - return r; - #elif defined(__GNUC__) && MI_ARCH_RISCV 
- size_t r; - __asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : ); - return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_tzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -168,6 +161,17 @@ static inline size_t mi_ctz(size_t x) { #else return (_BitScanForward64(&idx, x) ? (size_t)idx : 64); #endif + /* + // for arm64 and riscv, the builtin_ctz is defined for 0 as well + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + */ #elif mi_has_builtin_size(ctz) return (x!=0 ? (size_t)mi_builtin_size(ctz)(x) : MI_SIZE_BITS); #else @@ -177,18 +181,10 @@ static inline size_t mi_ctz(size_t x) { } static inline size_t mi_clz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 uint64_t r; __asm volatile ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); return r; - #elif defined(__GNUC__) && MI_ARCH_ARM64 - uint64_t r; - __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc"); - return r; - #elif defined(__GNUC__) && MI_ARCH_RISCV - size_t r; - __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : ); - return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -198,6 +194,17 @@ static inline size_t mi_clz(size_t x) { #else return (_BitScanReverse64(&idx, x) ? 63 - (size_t)idx : 64); #endif + /* + // for arm64 and riscv, the builtin_clz is defined for 0 as well + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + */ #elif mi_has_builtin_size(clz) return (x!=0 ? (size_t)mi_builtin_size(clz)(x) : MI_SIZE_BITS); #else @@ -206,6 +213,26 @@ static inline size_t mi_clz(size_t x) { #endif } +static inline uint32_t mi_ctz32(uint32_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 + uint32_t r; + __asm volatile ("tzcntl\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif MI_ARCH_X64 && defined(__BMI1__) + return (uint32_t)_tzcnt_u32(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + return (_BitScanForward(&idx, x) ? (uint32_t)idx : 32); + #elif mi_has_builtin(ctz) && (INT_MAX == INT32_MAX) + return (x!=0 ? (uint32_t)mi_builtin(ctz)(x) : 32); + #elif mi_has_builtin(ctzl) && (LONG_MAX == INT32_MAX) + return (x!=0 ? (uint32_t)mi_builtin(ctzl)(x) : 32); + #else + #define MI_HAS_FAST_BITSCAN 0 + return _mi_ctz_generic32(x); + #endif +} + #ifndef MI_HAS_FAST_BITSCAN #define MI_HAS_FAST_BITSCAN 1 #endif @@ -229,6 +256,22 @@ static inline bool mi_bsf(size_t x, size_t* idx) { #endif } +// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). 
+static inline bool mi_bsf32(uint32_t x, uint32_t* idx) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + // on x64 the carry flag is set on zero which gives better codegen + bool is_zero; + __asm ("tzcntl\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); + return !is_zero; + #else + *idx = mi_ctz32(x); + return (x!=0); + #endif +} + + // Bit scan reverse: find the most significant bit that is set // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -248,29 +291,6 @@ static inline bool mi_bsr(size_t x, size_t* idx) { } -/* -------------------------------------------------------------------------------- - find least/most significant bit position --------------------------------------------------------------------------------- */ - -// Find most significant bit index, or MI_SIZE_BITS if 0 -static inline size_t mi_find_msb(size_t x) { - #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - unsigned long i; - #if MI_SIZE_BITS==32 - return (_BitScanReverse(&i, x) ? i : 32); - #else - return (_BitScanReverse64(&i, x) ? i : 64); - #endif - #else - return (x==0 ? MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x)); - #endif -} - -// Find least significant bit index, or MI_SIZE_BITS if 0 (this equals `mi_ctz`, count trailing zero's) -static inline size_t mi_find_lsb(size_t x) { - return mi_ctz(x); -} - /* -------------------------------------------------------------------------------- rotate @@ -288,13 +308,26 @@ static inline size_t mi_rotr(size_t x, size_t r) { return _rotr64(x,(int)r); #endif #else - // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to // avoid UB when `rshift==0`. See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); return ((x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1)))); #endif } +static inline uint32_t mi_rotr32(uint32_t x, uint32_t r) { + #if mi_has_builtin(rotateright32) + return mi_builtin(rotateright32)(x, r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return _lrotr(x, (int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & 31; + return ((x >> rshift) | (x << ((-rshift) & 31))); + #endif +} + static inline size_t mi_rotl(size_t x, size_t r) { #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64) return mi_builtin(rotateleft64)(x,r); @@ -307,7 +340,7 @@ static inline size_t mi_rotl(size_t x, size_t r) { return _rotl64(x,(int)r); #endif #else - // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to // avoid UB when `rshift==0`. See const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); return ((x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1)))); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 271c7efb..fe7e8227 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -120,7 +120,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #endif #endif #ifndef MI_BITMAP_CHUNK_BITS_SHIFT -#define MI_BITMAP_CHUNK_BITS_SHIFT 8 // optimized for 256 bits per chunk (avx2) +#define MI_BITMAP_CHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) #endif #define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) diff --git a/src/arena.c b/src/arena.c index a713a110..cc2fe7b8 100644 --- a/src/arena.c +++ b/src/arena.c @@ -197,7 +197,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // set the dirty bits if (arena->memid.initially_zero) { - memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count, NULL); + memid->initially_zero = mi_bitmap_setN(&arena->slices_dirty, slice_index, slice_count, NULL); } // set commit state @@ -206,7 +206,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = true; bool all_already_committed; - mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count, &all_already_committed); + mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &all_already_committed); if (!all_already_committed) { bool commit_zero = false; if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { @@ -219,13 +219,13 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } else { // no need to commit, but check if already fully committed - memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count); + memid->initially_committed = mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); - if (commit) { mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); } - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count)); - // mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); + if (commit) { mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); } + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + // mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); return p; } @@ -455,10 +455,10 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // found an abandoned page of the right size mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); 
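/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the commit-on-
   demand pattern used by `mi_arena_try_alloc_at` above, shrunk to a single
   64-bit "committed" mask with one bit per slice. The bits are set first; only
   when some bit was not yet set does the caller have to do the (expensive) OS
   commit for the range. `setN_were_all_set` is local to this sketch and
   assumes `idx + n <= 64`.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdbool.h>
#include <stdio.h>

static bool setN_were_all_set(uint64_t* mask, unsigned idx, unsigned n) {
  const uint64_t m = (n >= 64 ? ~UINT64_C(0) : ((UINT64_C(1) << n) - 1) << idx);
  const bool all_already_set = ((*mask & m) == m);
  *mask |= m;
  return all_already_set;
}

int main(void) {
  uint64_t committed = 0;
  if (!setN_were_all_set(&committed, 4, 8)) {
    printf("slices 4..11 were not all committed yet -> commit the range\n");
  }
  if (setN_were_all_set(&committed, 6, 4)) {
    printf("slices 6..9 were already committed -> skip the OS call\n");
  }
  return 0;
}
/* ---------------------------------------------------------------------------- */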
mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(!mi_page_is_full(page)); mi_assert_internal(mi_page_is_abandoned(page)); @@ -626,7 +626,7 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) { size_t bin = _mi_bin(mi_page_block_size(page)); size_t slice_index; mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); - bool were_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_abandoned[bin], slice_index, 1, NULL); + bool were_zero = mi_bitmap_setN(&arena->slices_abandoned[bin], slice_index, 1, NULL); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&tld->subproc->abandoned_count[bin]); } @@ -660,7 +660,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { // return false; // } const size_t bin = _mi_bin(page->block_size); - if (mi_bitmap_try_xsetN(MI_BIT_CLEAR, &arena->slices_abandoned[bin], slice_index, 1)) { + if (mi_bitmap_try_clear(&arena->slices_abandoned[bin], slice_index)) { // we got it atomically _mi_page_reclaim(heap, page); mi_assert_internal(!mi_page_is_abandoned(page)); @@ -668,7 +668,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { } else { if (mi_page_is_abandoned(page)) { - mi_assert(false); + // mi_assert(false); } } } @@ -748,7 +748,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi else { if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slice_index, slice_count, NULL); + mi_bitmap_clearN(&arena->slices_committed, slice_index, slice_count); mi_track_mem_noaccess(p, size); if (committed_size > 0) { // if partially committed, adjust the committed stats (is it will be recommitted when re-using) @@ -764,7 +764,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } // and make it available to others again - bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_free, slice_index, slice_count, NULL); + bool all_inuse = mi_bitmap_setN(&arena->slices_free, slice_index, slice_count, NULL); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; @@ -906,14 +906,14 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int } // reserve our meta info (and reserve slices outside the memory area) - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); + mi_bitmap_unsafe_setN(&arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { - mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->slices_committed, 0, arena->slice_count); + mi_bitmap_unsafe_setN(&arena->slices_committed, 0, arena->slice_count); } else { - mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_committed, 0, info_slices, NULL); + mi_bitmap_setN(&arena->slices_committed, 0, info_slices, NULL); } - mi_bitmap_xsetN(MI_BIT_SET, &arena->slices_dirty, 0, info_slices, NULL); + mi_bitmap_setN(&arena->slices_dirty, 0, info_slices, NULL); return mi_arena_add(arena, arena_id, &_mi_stats_main); } @@ -973,10 +973,16 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ _mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; - for (int i = 0; i < MI_BFIELD_BITS && bit_count < slice_count; i++) { - char 
buf[MI_BITMAP_CHUNK_BITS + 32]; _mi_memzero(buf, sizeof(buf)); + for (int i = 0; i < MI_BITMAP_CHUNK_COUNT && bit_count < slice_count; i++) { + char buf[MI_BITMAP_CHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + if (j > 0 && (j % 4) == 0) { + buf[k++] = '\n'; + _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix); + buf[k++] = ' '; + buf[k++] = ' '; + } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; @@ -987,12 +993,11 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ buf[k++] = ' '; } else { - _mi_memset(buf + k, ' ', MI_BFIELD_BITS); + _mi_memset(buf + k, 'o', MI_BFIELD_BITS); k += MI_BFIELD_BITS; } - bit_count += MI_BFIELD_BITS; + bit_count += MI_BFIELD_BITS; } - _mi_output_message("%s %s\n", prefix, buf); } _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count); @@ -1113,7 +1118,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, const size_t size = mi_size_of_slices(slices); void* const p = mi_arena_slice_start(arena, slice_index); bool needs_recommit; - if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->slices_committed, slice_index, slices)) { + if (mi_bitmap_is_setN(&arena->slices_committed, slice_index, slices)) { // all slices are committed, we can purge freely needs_recommit = _mi_os_purge(p, size, stats); } @@ -1128,11 +1133,11 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, } // clear the purged slices - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_purge, slices, slice_index, NULL); + mi_bitmap_clearN(&arena->slices_purge, slices, slice_index); // update committed bitmap if (needs_recommit) { - mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->slices_committed, slices, slice_index, NULL); + mi_bitmap_clearN(&arena->slices_committed, slices, slice_index); } } diff --git a/src/bitmap.c b/src/bitmap.c index dd1afe75..d24a89be 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -44,85 +44,168 @@ static inline bool mi_bfield_find_least_to_xset(mi_bit_t set, mi_bfield_t x, siz return mi_bfield_find_least_bit((set ? ~x : x), idx); } -// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). -static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { +// Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 +static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[i], idx); -} - -static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { - mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); -} - -// Set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0) -static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { +// Set/clear a sequence of `n` bits within a chunk. +// Returns true if all bits transitioned from 0 to 1 (or 1 to 0). 
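/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the per-field
   mask construction that the chunk `xsetN`/`try_xsetN` code below performs when
   a run of `n` bits starting at `cidx` spans several 64-bit bfields (the `~0`
   special case avoids an undefined 64-bit shift when a field is covered
   completely). Non-atomic and local to this sketch.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stddef.h>

static void set_bit_range(uint64_t* fields, size_t cidx, size_t n) {
  size_t field = cidx / 64;
  size_t idx   = cidx % 64;
  while (n > 0) {
    size_t m = 64 - idx;                   // bits that still fit in this field
    if (m > n) { m = n; }
    const uint64_t mask = (m == 64 ? ~UINT64_C(0) : ((UINT64_C(1) << m) - 1) << idx);
    fields[field] |= mask;                 // the patch does this atomically, per field
    field++; idx = 0; n -= m;
  }
}
/* ---------------------------------------------------------------------------- */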
+static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* pall_already_xset) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; @@ -164,17 +234,28 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); + all_transition = all_transition && mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset ); all_already_xset = all_already_xset && already_xset; // next field field++; idx = 0; n -= m; } - *palready_xset = all_already_xset; + if (pall_already_xset!=NULL) { *pall_already_xset = all_already_xset; } return all_transition; } + +static inline bool mi_bitmap_chunk_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_allready_set) { + return mi_bitmap_chunk_xsetN(MI_BIT_SET, chunk, cidx, n, all_allready_set); +} + +static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_allready_clear) { + return mi_bitmap_chunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, all_allready_clear); +} + + + // Check if a sequence of `n` bits within a chunk are all set/cleared. static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); @@ -197,6 +278,38 @@ static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, siz return all_xset; } + + +static inline bool mi_bitmap_chunk_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx); +} + +static inline bool mi_bitmap_chunk_try_set(mi_bitmap_chunk_t* chunk, size_t cidx) { + return mi_bitmap_chunk_try_xset(MI_BIT_SET, chunk, cidx); +} + +static inline bool mi_bitmap_chunk_try_clear(mi_bitmap_chunk_t* chunk, size_t cidx) { + return mi_bitmap_chunk_try_xset(MI_BIT_CLEAR, chunk, cidx); +} + +static inline bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx) { + mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_xset8(set, &chunk->bfields[i], ibyte_idx); +} + +static inline bool mi_bitmap_chunk_try_set8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { + return mi_bitmap_chunk_try_xset8(MI_BIT_SET, chunk, byte_idx); +} + +static inline bool mi_bitmap_chunk_try_clear8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { + return mi_bitmap_chunk_try_xset8(MI_BIT_CLEAR, chunk, byte_idx); +} + // Try to atomically set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. @@ -252,12 +365,19 @@ restore: while( field > start_field) { field--; const size_t mask = (field == start_field ? mask_start : (field == end_field ? 
mask_end : mask_mid)); - bool already_xset; - mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset); + mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, NULL); } return false; } +static inline bool mi_bitmap_chunk_try_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + return mi_bitmap_chunk_try_xsetN(MI_BIT_SET, chunk, cidx, n); +} + +static inline bool mi_bitmap_chunk_try_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + return mi_bitmap_chunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n); +} + // find least 0/1-bit in a chunk and try to set/clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. @@ -265,8 +385,8 @@ restore: static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while (true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) if (mask==0) return false; @@ -283,6 +403,46 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chu } // try again } +#elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) + while (true) { + size_t chunk_idx = 0; + #if 1 + __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + if ((set ? _mm256_test_all_ones(vec) : _mm256_testz_si256(vec,vec))) { + chunk_idx += 4; + vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1); + } + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 + chunk_idx += _tzcnt_u32(mask) / 8; + #else + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + const __m256i cmpv = (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256()); + const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == ~0 / 0 ? 
0xFF : 0) + const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. + const size_t chunk_idx = _tzcnt_u64(mask) / 8; + #endif + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear + if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } #else for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { size_t idx; @@ -302,49 +462,10 @@ static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, return mi_bitmap_chunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); } -// static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { -// return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); -// } - -/* -// find least 1-bit in a chunk and try unset it atomically -// set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. -// todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - while(true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ? - const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? -1 : 0) - const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) - mi_assert_internal(mask != 0); - const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - size_t cidx; - if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - // try again - } - #else - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit - if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically - *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - return true; - } - } - } - return false; - #endif +static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { + return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); } -*/ + // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. 
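/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the AVX2
   compare/movemask trick used by the chunk search code above to locate a
   64-bit lane that still has a zero bit. Each lane equal to ~0 contributes
   eight set bytes to the movemask; inverting the mask and dividing the
   trailing-zero count by 8 yields the lane index. Requires AVX2 and BMI1
   (e.g. `-mavx2 -mbmi`); names are local to this sketch.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdio.h>
#include <immintrin.h>

// Returns 0..3 for the first lane of v[0..3] that is not all-ones, or 4 if none.
static int first_lane_with_zero_bit(const uint64_t v[4]) {
  const __m256i vec  = _mm256_loadu_si256((const __m256i*)v);
  const __m256i eq   = _mm256_cmpeq_epi64(vec, _mm256_set1_epi64x(-1));  // lane == ~0 ? 0xFF.. : 0
  const uint32_t mask = ~(uint32_t)_mm256_movemask_epi8(eq);             // 8 mask bits per lane
  if (mask == 0) return 4;                                               // every lane is full
  return (int)(_tzcnt_u32(mask) / 8);                                    // tzcnt is a multiple of 8
}

int main(void) {
  uint64_t v[4] = { ~UINT64_C(0), ~UINT64_C(0), UINT64_C(0x00FFFFFFFFFFFFFF), ~UINT64_C(0) };
  printf("first lane with room: %d\n", first_lane_with_zero_bit(v));    // prints 2
  return 0;
}
/* ---------------------------------------------------------------------------- */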
@@ -392,7 +513,8 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, } -// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically +// find a sequence of `n` bits in a chunk with all `n` (`< MI_BFIELD_BITS`!) bits set, +// and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. // todo: try avx2 and neon version // todo: allow spanning across bfield boundaries? @@ -410,7 +532,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, if ((b&mask) == mask) { // found a match mi_assert_internal( ((mask << bshift) >> bshift) == mask ); - if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i],mask<bfields); return _mm256_testz_si256( vec, vec ); + #elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + if (!_mm256_testz_si256(vec1, vec1)) return false; + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (_mm256_testz_si256(vec2, vec2)); #else - // written like this for vectorization - mi_bfield_t x = chunk->bfields[0]; - for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { - x = x | chunk->bfields[i]; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + if (chunk->bfields[i] != 0) return false; } - return (x == 0); + return true; #endif } +/* -------------------------------------------------------------------------------- + epochset (for now for 32-bit sets only) +-------------------------------------------------------------------------------- */ + +static void mi_epochset_split(mi_epochset_t es, uint32_t* bset, size_t* epoch) { + *bset = (uint32_t)es; + *epoch = (size_t)(es >> 32); +} + +static mi_epochset_t mi_epochset_join(uint32_t bset, size_t epoch) { + return ((uint64_t)epoch << 32) | bset; +} + +// setting a bit increases the epoch +static void mi_epochset_set(_Atomic(mi_epochset_t)*es, size_t idx) { + mi_assert(idx < 32); + size_t epoch; + uint32_t bset; + mi_epochset_t es_new; + mi_epochset_t es_old = mi_atomic_load_relaxed(es); + do { + mi_epochset_split(es_old, &bset, &epoch); + es_new = mi_epochset_join(bset | (MI_ZU(1)<any_set, chunk_idx); +} + +static bool mi_bitmap_anyset_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx, size_t epoch) { + mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); + return mi_epochset_try_clear(&bitmap->any_set, chunk_idx, epoch); +} + +static uint32_t mi_bitmap_anyset(mi_bitmap_t* bitmap, size_t* epoch) { + uint32_t bset; + mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, epoch); + return bset; +} + +static size_t mi_bitmap_epoch(mi_bitmap_t* bitmap) { + size_t epoch; + uint32_t bset; + mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, &epoch); + return epoch; +} + /* -------------------------------------------------------------------------------- bitmap -------------------------------------------------------------------------------- */ -static void mi_bitmap_update_anyset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - if (set) { - mi_bfield_atomic_xset(MI_BIT_SET, &bitmap->any_set, idx); - } - else { // clear - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR, &bitmap->any_set, idx); - } - } -} // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { @@ -485,8 +664,8 @@ void 
mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { } } -// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. -void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); @@ -495,19 +674,18 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; size_t m = MI_BITMAP_CHUNK_BITS - cidx; if (m > n) { m = n; } - bool already_xset; - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); - mi_bitmap_update_anyset(set, bitmap, chunk_idx); + mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); + mi_bitmap_anyset_set(bitmap, chunk_idx); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); + _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); const size_t end_chunk = chunk_idx + mid_chunks; while (chunk_idx < end_chunk) { - mi_bitmap_update_anyset(set, bitmap, chunk_idx); + mi_bitmap_anyset_set(bitmap, chunk_idx); chunk_idx++; } n -= (mid_chunks * MI_BITMAP_CHUNK_BITS); @@ -517,8 +695,8 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ if (n > 0) { mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); - mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); - mi_bitmap_update_anyset(set, bitmap, chunk_idx); + mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); + mi_bitmap_anyset_set(bitmap, chunk_idx); } } @@ -528,12 +706,26 @@ void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_ bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < MI_BITMAP_MAX_BITS); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - bool ok = mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); - if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } - return ok; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_try_set(&bitmap->chunks[chunk_idx], cidx); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_try_clear(&bitmap->chunks[chunk_idx], cidx); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + + + // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. 
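/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the epoch idea
   behind `any_set` as used above -- a 64-bit word packs a 32-bit membership
   mask (low half) with a 32-bit epoch (high half). Setting a bit bumps the
   epoch; clearing only succeeds when the epoch still equals the value the
   caller observed before deciding the bit could be cleared, so a concurrent
   set can never be lost. C11 atomics; names are local to this sketch and the
   patch's `mi_epochset_*` helpers differ in detail.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>

typedef _Atomic(uint64_t) epochset_t;

static void epochset_set(epochset_t* es, unsigned idx) {        // idx < 32
  uint64_t old = atomic_load_explicit(es, memory_order_relaxed);
  uint64_t desired;
  do {
    const uint32_t mask  = (uint32_t)old;
    const uint32_t epoch = (uint32_t)(old >> 32);
    desired = ((uint64_t)(epoch + 1) << 32) | (mask | (UINT32_C(1) << idx));
  } while (!atomic_compare_exchange_weak_explicit(es, &old, desired,
             memory_order_acq_rel, memory_order_relaxed));
}

static bool epochset_try_clear(epochset_t* es, unsigned idx, uint32_t expected_epoch) {
  uint64_t old = atomic_load_explicit(es, memory_order_relaxed);
  while ((uint32_t)(old >> 32) == expected_epoch) {
    const uint64_t desired = old & ~((uint64_t)(UINT32_C(1) << idx));   // keep the epoch, clear the bit
    if (atomic_compare_exchange_weak_explicit(es, &old, desired,
          memory_order_acq_rel, memory_order_relaxed)) return true;
  }
  return false;   // a set happened in between: leave the bit alone
}
/* ---------------------------------------------------------------------------- */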
bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { @@ -541,11 +733,23 @@ bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx%8 == 0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; - bool ok = mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); - if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } - return ok; + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! @@ -561,22 +765,32 @@ bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - - bool ok = mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); - if (ok) { mi_bitmap_update_anyset(set, bitmap, chunk_idx); } - return ok; + + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
-bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset ) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - bool local_already_xset; - if (already_xset==NULL) { already_xset = &local_already_xset; } - // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } - // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + + //if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } + //if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; @@ -584,11 +798,23 @@ bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bo mi_assert_internal(chunk_idx < MI_BFIELD_BITS); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - const bool allx = mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); - mi_bitmap_update_anyset(set, bitmap, chunk_idx); - return allx; + if (set) { + // first set the anyset since it is a conservative approximation (increases epoch) + mi_bitmap_anyset_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + } + else { + const size_t epoch = mi_bitmap_epoch(bitmap); + bool cleared = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; + } } + // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); @@ -605,16 +831,18 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) } -#define mi_bitmap_forall_set_chunks(bitmap,tseq,decl_chunk_idx) \ - { size_t _set_idx; \ - size_t _start = tseq % MI_BFIELD_BITS; \ - mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ - while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ - decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; +#define mi_bitmap_forall_set_chunks(bitmap,tseq,name_epoch,name_chunk_idx) \ + { uint32_t _bit_idx; \ + uint32_t _start = (uint32_t)(tseq % MI_EPOCHSET_BITS); \ + size_t name_epoch; \ + uint32_t _any_set = mi_bitmap_anyset(bitmap,&name_epoch); \ + _any_set = mi_rotr32(_any_set, _start); \ + while (mi_bsf32(_any_set,&_bit_idx)) { \ + size_t name_chunk_idx = (_bit_idx + _start) % MI_BFIELD_BITS; #define mi_bitmap_forall_set_chunks_end() \ - _start += _set_idx+1; /* so chunk_idx stays valid */ \ - _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ + _start += _bit_idx+1; /* so chunk_idx stays valid */ \ + _any_set >>= _bit_idx; /* skip scanned bits (and avoid UB with (_bit_idx+1)) */ \ _any_set >>= 1; \ } \ } @@ -623,8 +851,8 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). 
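/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: the scan pattern
   behind `mi_bitmap_forall_set_chunks` above. The 32-bit "any set" mask is
   rotated right by a per-thread offset so threads start searching at different
   chunks, then the least significant set bit is taken repeatedly. Uses the
   GCC/Clang `__builtin_ctz`; names are local to this sketch.
---------------------------------------------------------------------------- */
#include <stdint.h>

static inline uint32_t rotr32(uint32_t x, uint32_t r) {
  const uint32_t s = r & 31;
  return (x >> s) | (x << ((-s) & 31));          // (-s)&31 avoids UB when s==0
}

static void visit_set_chunks(uint32_t any_set, uint32_t tseq,
                             void (*visit)(uint32_t chunk_idx)) {
  const uint32_t start = tseq % 32;
  uint32_t rot = rotr32(any_set, start);
  uint32_t off = start;
  while (rot != 0) {
    const uint32_t bit = (uint32_t)__builtin_ctz(rot);  // least significant set bit
    visit((bit + off) % 32);                            // undo the rotation
    off += bit + 1;
    rot >>= bit;                                        // two shifts: avoid UB when
    rot >>= 1;                                          // bit+1 == 32
  }
}
/* ---------------------------------------------------------------------------- */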
-bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_set_chunks(bitmap, tseq, epoch, chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { @@ -635,8 +863,8 @@ bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx else { // we may find that all are unset only on a second iteration but that is ok as // _any_set is a conservative approximation. - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } } } @@ -647,8 +875,8 @@ bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,tseq, epoch, chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { @@ -658,8 +886,10 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid return true; } else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. + if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } } } @@ -672,11 +902,8 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger // TODO: allow spanning across chunk boundaries - if (n == 0 || n > MI_BFIELD_BITS) return false; - if (n == 1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); - if (n == 8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); - - mi_bitmap_forall_set_chunks(bitmap,tseq,size_t chunk_idx) + if (n == 0 || n > MI_BFIELD_BITS) return false; + mi_bitmap_forall_set_chunks(bitmap,tseq,epoch,chunk_idx) { size_t cidx; if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { @@ -685,8 +912,10 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t return true; } else { - if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. 
+ if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } } } diff --git a/src/bitmap.h b/src/bitmap.h index 1a180924..38137b0f 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -25,20 +25,26 @@ typedef size_t mi_bfield_t; #define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. #define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. +#define MI_BITMAP_CHUNK_SIZE (MI_BITMAP_CHUNK_BITS / 8) #define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) #define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) -typedef mi_decl_align(32) struct mi_bitmap_chunk_s { +// 512 bits on 64_bit +typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; } mi_bitmap_chunk_t; +// for now 32 (note: with ABA instructions we can make this 64) +#define MI_EPOCHSET_BITS (32) +#define MI_BITMAP_CHUNK_COUNT MI_EPOCHSET_BITS +typedef uint64_t mi_epochset_t; -typedef mi_decl_align(32) struct mi_bitmap_s { - mi_bitmap_chunk_t chunks[MI_BFIELD_BITS]; - _Atomic(mi_bfield_t)any_set; +typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { + mi_bitmap_chunk_t chunks[MI_BITMAP_CHUNK_COUNT]; + _Atomic(mi_epochset_t) any_set; } mi_bitmap_t; -#define MI_BITMAP_MAX_BITS (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit +#define MI_BITMAP_MAX_BITS (MI_BITMAP_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16k bits on 64bit, 8k bits on 32bit /* -------------------------------------------------------------------------------- Bitmap @@ -52,29 +58,73 @@ typedef bool mi_bit_t; void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); // Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. -void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! // If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset); +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset); + +static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_set) { + return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, all_already_set); +} + +static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_xsetN(MI_BIT_CLEAR, bitmap, idx, n, NULL); +} + // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); +} + +static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n); +} + + // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) // and false otherwise leaving the bitmask as is. 
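/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: how a global bit
   index is decomposed with the constants above, assuming the 64-bit values
   (MI_BFIELD_BITS = 64, MI_BITMAP_CHUNK_BITS = 512, MI_BITMAP_CHUNK_COUNT = 32,
   hence MI_BITMAP_MAX_BITS = 32 * 512 = 16384; on 32-bit the chunk is 256 bits
   and the total is 8192).
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t bfield_bits = 64, chunk_bits = 512, chunk_count = 32;
  const uint32_t idx = 7000;                        // some bit in the bitmap
  const uint32_t chunk_idx = idx / chunk_bits;      // which 512-bit chunk    -> 13
  const uint32_t cidx      = idx % chunk_bits;      // bit inside the chunk   -> 344
  const uint32_t field     = cidx / bfield_bits;    // which 64-bit bfield    -> 5
  const uint32_t bit       = cidx % bfield_bits;    // bit inside the bfield  -> 24
  printf("bit %u: chunk %u, bfield %u, bit %u (of %u total bits)\n",
         idx, chunk_idx, field, bit, chunk_bits * chunk_count);
  return 0;
}
/* ---------------------------------------------------------------------------- */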
mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); +} + +static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); +} + + // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); +} + +static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); +} + // Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_SET, bitmap, idx, n); +} + +static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); +} + + // Find a set bit in a bitmap and atomically unset it. Returns true on success, // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search @@ -89,4 +139,5 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); + #endif // MI_XBITMAP_H diff --git a/src/free.c b/src/free.c index f0ce8c22..1e9fe478 100644 --- a/src/free.c +++ b/src/free.c @@ -239,9 +239,9 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block } else { if (mi_page_is_abandoned(page)) { - mi_assert(false); + // mi_assert(false); } - mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages + // mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages } } } diff --git a/src/libc.c b/src/libc.c index 05ed7b02..20e9e38b 100644 --- a/src/libc.c +++ b/src/libc.c @@ -280,7 +280,7 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) 
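/* ----------------------------------------------------------------------------
   [editor's note] Illustrative sketch, not part of the patch: why the de Bruijn
   multiplication in `_mi_ctz_generic32` in the hunk below works. `x & -x`
   isolates the lowest set bit, a power of two 2^k; multiplying the de Bruijn
   constant by 2^k is a left shift by k, and the top 5 bits of the product are
   different for every k, so a 32-entry table maps them back to k. This check
   verifies that all 32 windows are distinct.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const uint32_t db = UINT32_C(0x077CB531);
  int seen[32] = {0};
  for (int k = 0; k < 32; k++) {
    const uint32_t window = (uint32_t)(db << k) >> 27;   // top 5 bits after shifting by k
    if (seen[window]) { printf("collision at k=%d\n", k); return 1; }
    seen[window] = 1;
  }
  printf("all 32 windows distinct -> the table lookup recovers k\n");
  return 0;
}
/* ---------------------------------------------------------------------------- */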
{ // generic trailing and leading zero count // -------------------------------------------------------- -static inline size_t mi_ctz_generic32(uint32_t x) { +uint32_t _mi_ctz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, @@ -290,7 +290,7 @@ static inline size_t mi_ctz_generic32(uint32_t x) { return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; } -static inline size_t mi_clz_generic32(uint32_t x) { +static size_t mi_clz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, @@ -319,10 +319,10 @@ size_t _mi_clz_generic(size_t x) { size_t _mi_ctz_generic(size_t x) { if (x==0) return MI_SIZE_BITS; #if (MI_SIZE_BITS <= 32) - return mi_ctz_generic32((uint32_t)x); + return _mi_ctz_generic32((uint32_t)x); #else - const size_t count = mi_ctz_generic32((uint32_t)x); + const size_t count = _mi_ctz_generic32((uint32_t)x); if (count < 32) return count; - return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + return (32 + _mi_ctz_generic32((uint32_t)(x>>32))); #endif } diff --git a/src/options.c b/src/options.c index 8cb0d216..1e64c08e 100644 --- a/src/options.c +++ b/src/options.c @@ -412,7 +412,7 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me // Define our own limited `fprintf` that avoids memory allocation. // We do this using `_mi_vsnprintf` with a limited buffer. static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { - char buf[512]; + char buf[768]; if (fmt==NULL) return; if (!mi_recurse_enter()) return; _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args); From c9abfe82533fc1e863375cbb17a1d642107fda46 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 16:24:40 -0800 Subject: [PATCH 020/264] wip: can run mstress --- include/mimalloc/types.h | 5 +++++ src/arena.c | 3 ++- src/free.c | 19 +++++++++---------- src/page.c | 5 +++-- 4 files changed, 19 insertions(+), 13 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 9b772db6..6f2f9c5f 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -321,7 +321,12 @@ typedef struct mi_page_s { #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. #define MI_PAGE_MIN_BLOCK_ALIGN (32) // minimal block alignment in a page #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation + +#if MI_DEBUG && MI_SIZE_SIZE == 8 +#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) +#else #define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) +#endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) diff --git a/src/arena.c b/src/arena.c index 8a5c8f5d..ad919a6e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -538,8 +538,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_page_try_claim_ownership(page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_page_is_owned(page)); return page; } @@ -627,7 +629,6 @@ void _mi_arena_page_free(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(!mi_page_is_singleton(page)); mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); diff --git a/src/free.c b/src/free.c index b6f75c4a..03f93cf3 100644 --- a/src/free.c +++ b/src/free.c @@ -158,13 +158,15 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_thread_id(page)==0); // we own the page now.. + + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); // this must be before collect + // collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count - if (mi_page_is_singleton(page)) mi_assert_internal(mi_page_all_free(page)); + if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } if (mi_page_all_free(page)) { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); // we can free the page directly _mi_arena_page_free(page); return; @@ -186,17 +188,14 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); - // and make it part of our heap + // make it part of our heap _mi_heap_page_reclaim(tagheap, page); return; } } - - // give up ownership as we cannot reclaim this page - // note: we don't need to re-abandon as we did not yet unabandon - _mi_page_unown(page); + + // we cannot reclaim this page.. 
abandon it again + _mi_arena_page_abandon(page); } } diff --git a/src/page.c b/src/page.c index 2d87d80b..fdbcfff1 100644 --- a/src/page.c +++ b/src/page.c @@ -120,7 +120,7 @@ bool _mi_page_is_valid(mi_page_t* page) { mi_assert_internal(page->keys[0] != 0); #endif if (!mi_page_is_abandoned(page)) { - mi_assert_internal(!_mi_process_is_initialized); + //mi_assert_internal(!_mi_process_is_initialized); { mi_page_queue_t* pq = mi_page_queue_of(page); mi_assert_internal(mi_page_queue_contains(pq, page)); @@ -734,7 +734,8 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(page->capacity == 0); mi_assert_internal(page->free == NULL); mi_assert_internal(page->used == 0); - mi_assert_internal(page->xthread_free == 0); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(page->xthread_free == 1); mi_assert_internal(page->next == NULL); mi_assert_internal(page->prev == NULL); mi_assert_internal(page->retire_expire == 0); From 5e95ebc7a015b7ced0f89485d5050bad4d255077 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 17:46:41 -0800 Subject: [PATCH 021/264] fix free stats --- include/mimalloc/internal.h | 1 + src/arena.c | 11 ++++++++--- src/free.c | 21 ++++++++++++++++----- src/options.c | 2 +- src/page.c | 3 ++- test/test-stress.c | 16 ++++++++++++---- 6 files changed, 40 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 8669fa80..afdfe822 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -223,6 +223,7 @@ mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); bool _mi_free_delayed_block(mi_block_t* block); // void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); +void _mi_stat_free(const mi_page_t* page, const mi_block_t* block); // "libc.c" #include diff --git a/src/arena.c b/src/arena.c index ad919a6e..194854a2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -622,6 +622,10 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_page_all_free(page)); mi_assert_internal(page->next==NULL); + #if MI_STAT > 1 + _mi_page_free_collect(page, true); + #endif + #if MI_DEBUG>1 if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { size_t bin = _mi_bin(mi_page_block_size(page)); @@ -665,7 +669,6 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); // mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); - _mi_page_unown(page); bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&subproc->abandoned_count[bin]); @@ -673,8 +676,9 @@ void _mi_arena_page_abandon(mi_page_t* page) { else { // page is full (or a singleton), page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page - _mi_page_unown(page); - } + } + _mi_page_unown(page); + mi_stat_increase(_mi_stats_main.pages_abandoned, 1); } // called from `mi_free` if trying to unabandon an abandoned page @@ -704,6 +708,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { // nothing to do // TODO: maintain count of these as well? 
} + mi_stat_decrease(_mi_stats_main.pages_abandoned, 1); } /* diff --git a/src/free.c b/src/free.c index 03f93cf3..4ba6d6cc 100644 --- a/src/free.c +++ b/src/free.c @@ -16,7 +16,7 @@ terms of the MIT license. A copy of the license can be found in the file static void mi_check_padding(const mi_page_t* page, const mi_block_t* block); static bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block); static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); +// static void _mi_stat_free(const mi_page_t* page, const mi_block_t* block); // ------------------------------------------------------ @@ -33,7 +33,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool // checks if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); - if (track_stats) { mi_stat_free(page, block); } + if (track_stats) { _mi_stat_free(page, block); } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN && !MI_GUARDED memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif @@ -199,9 +199,20 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { } } -// Push a block that is owned by another thread on its page-local thread free list. +// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { + // adjust stats (after padding check and potentially recursive `mi_free` above) + _mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page, block)); + + // _mi_padding_shrink(page, block, sizeof(mi_block_t)); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + size_t dbgsize = mi_usable_size(block); + if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } + _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); + #endif + // push atomically on the page thread free list mi_thread_free_t tf_new; mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); @@ -532,7 +543,7 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { // only maintain stats for smaller objects if requested #if (MI_STAT>0) -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { #if (MI_STAT < 2) MI_UNUSED(block); #endif @@ -554,7 +565,7 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { } } #else -static void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); } #endif diff --git a/src/options.c b/src/options.c index 759d096d..b69058cc 100644 --- a/src/options.c +++ b/src/options.c @@ -158,7 +158,7 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. 
- { 1, UNINIT, MI_OPTION(eager_abandon) }, + { 0, UNINIT, MI_OPTION(eager_abandon) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index fdbcfff1..8cdfd6be 100644 --- a/src/page.c +++ b/src/page.c @@ -189,10 +189,11 @@ static void _mi_page_thread_free_collect(mi_page_t* page) size_t count = 1; mi_block_t* tail = head; mi_block_t* next; - while ((next = mi_block_next(page,tail)) != NULL && count <= max_count) { + while( (next = mi_block_next(page,tail)) != NULL && count <= max_count) { count++; tail = next; } + // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free) if (count > max_count) { _mi_error_message(EFAULT, "corrupted thread-free list\n"); diff --git a/test/test-stress.c b/test/test-stress.c index 9a89744e..61d1424a 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,10 +40,10 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 0 +#elif 1 static int THREADS = 4; static int SCALE = 100; -static int ITER = 20; +static int ITER = 50; #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor @@ -227,7 +227,7 @@ static void test_stress(void) { run_os_threads(THREADS, &stress); #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) // switch between arena and OS allocation for testing - mi_option_set_enabled(mi_option_disallow_arena_alloc, (n%2)==1); + // mi_option_set_enabled(mi_option_disallow_arena_alloc, (n%2)==1); #endif #ifdef HEAP_WALK size_t total = 0; @@ -248,7 +248,14 @@ static void test_stress(void) { { printf("- iterations left: %3d\n", ITER - (n + 1)); } #endif } -} + // clean up + for (int i = 0; i < TRANSFERS; i++) { + void* p = atomic_exchange_ptr(&transfer[i], NULL); + if (p != NULL) { + free_items(p); + } + } +} #ifndef STRESS static void leak(intptr_t tid) { @@ -320,6 +327,7 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG + // mi_debug_show_arenas(true, true, false); mi_collect(true); mi_debug_show_arenas(true,true,false); #endif From fe5a3141142d27f1a0a54f95e8cb397b21ae19f3 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 19:31:36 -0800 Subject: [PATCH 022/264] add base and size to OS memid --- include/mimalloc/bits.h | 24 ++++++++++++ include/mimalloc/internal.h | 4 +- include/mimalloc/types.h | 1 + src/arena.c | 14 +++---- src/bitmap.c | 73 +++++++++++++++++++++++++++---------- src/bitmap.h | 6 +-- src/options.c | 2 +- src/os.c | 18 +++++---- test/test-stress.c | 2 +- 9 files changed, 104 insertions(+), 40 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 90d56b4f..f3bbe3bc 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -237,6 +237,30 @@ static inline uint32_t mi_ctz32(uint32_t x) { #define MI_HAS_FAST_BITSCAN 1 #endif + + +static inline size_t mi_popcount(size_t x) { +#if mi_has_builtin_size(popcount) + return mi_builtin_size(popcount)(x); +#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if MI_SIZE_BITS==32 + return __popcnt(x); + #else + return __popcnt64(x); + #endif +#elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_mm_popcnt_u64(x); +#else + #define MI_HAS_FAST_POPCOUNT 0 + error define generic popcount +#endif +} + +#ifndef MI_HAS_FAST_POPCOUNT +#define MI_HAS_FAST_POPCOUNT 1 +#endif + + /* -------------------------------------------------------------------------------- find trailing/leading zero (bit 
scan forward/reverse) -------------------------------------------------------------------------------- */ diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index afdfe822..7d263d47 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -844,8 +844,10 @@ static inline mi_memid_t _mi_memid_none(void) { return _mi_memid_create(MI_MEM_NONE); } -static inline mi_memid_t _mi_memid_create_os(bool committed, bool is_zero, bool is_large) { +static inline mi_memid_t _mi_memid_create_os(void* base, size_t size, bool committed, bool is_zero, bool is_large) { mi_memid_t memid = _mi_memid_create(MI_MEM_OS); + memid.mem.os.base = base; + memid.mem.os.size = size; memid.initially_committed = committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6f2f9c5f..dafd25f1 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -171,6 +171,7 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) { typedef struct mi_memid_os_info { void* base; // actual base address of the block (used for offset aligned allocations) size_t alignment; // alignment at allocation + size_t size; // allocated full size } mi_memid_os_info_t; typedef struct mi_memid_arena_info { diff --git a/src/arena.c b/src/arena.c index 194854a2..08b6c98d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -204,17 +204,19 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // commit requested, but the range may not be committed as a whole: ensure it is committed now memid->initially_committed = true; - bool all_already_committed; - mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &all_already_committed); - if (!all_already_committed) { + size_t already_committed_count = 0; + mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &already_committed_count); + if (already_committed_count < slice_count) { + // recommit the full range bool commit_zero = false; + mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { memid->initially_committed = false; } else { if (commit_zero) { memid->initially_zero = true; } } - } + } } else { // no need to commit, but check if already fully committed @@ -622,10 +624,6 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_page_all_free(page)); mi_assert_internal(page->next==NULL); - #if MI_STAT > 1 - _mi_page_free_collect(page, true); - #endif - #if MI_DEBUG>1 if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { size_t bin = _mi_bin(mi_page_block_size(page)); diff --git a/src/bitmap.c b/src/bitmap.c index ed991441..c7c78dec 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -22,6 +22,11 @@ static inline size_t mi_bfield_ctz(mi_bfield_t x) { return mi_ctz(x); } + +static inline size_t mi_bfield_popcount(mi_bfield_t x) { + return mi_popcount(x); +} + //static inline size_t mi_bfield_clz(mi_bfield_t x) { // return mi_clz(x); //} @@ -70,26 +75,57 @@ static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, s } } +// Set a pair of bits atomically, and return true of the mask bits transitioned from all 0's to 1's. 
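// ----------------------------------------------------------------------------
// Aside: the mi_bfield_atomic_set2/clear2/set_mask helpers below all use the
// same compare-and-swap retry pattern: load the bitfield, OR/AND in the mask,
// and retry the CAS until it succeeds, returning whether every masked bit
// really transitioned. A minimal standalone C11 sketch of that pattern; the
// `sketch_` name and the plain `_Atomic uint64_t` field are illustrative only
// and not part of the mimalloc API.
// ----------------------------------------------------------------------------
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool sketch_atomic_set_mask(_Atomic uint64_t* b, uint64_t mask) {
  uint64_t old = atomic_load_explicit(b, memory_order_relaxed);
  while (!atomic_compare_exchange_weak_explicit(b, &old, old | mask,
                                                memory_order_acq_rel, memory_order_acquire)) {
    // `old` was refreshed by the failed CAS; retry with the new value
  }
  return ((old & mask) == 0);  // true iff none of the mask bits were set before
}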
+static inline bool mi_bfield_atomic_set2(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_already_set) { + mi_assert_internal(idx < MI_BFIELD_BITS-1); + const size_t mask = (mi_bfield_t)0x03 << idx; + mi_bfield_t old = mi_atomic_load_relaxed(b); + while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits until success + if (all_already_set!=NULL) { *all_already_set = ((old&mask)==mask); } + return ((old&mask) == 0); +} + +// Clear a pair of bits atomically, and return true of the mask bits transitioned from all 1's to 0's +static inline bool mi_bfield_atomic_clear2(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_already_clear) { + mi_assert_internal(idx < MI_BFIELD_BITS-1); + const size_t mask = (mi_bfield_t)0x03 << idx; + mi_bfield_t old = mi_atomic_load_relaxed(b); + while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits until success + if (all_already_clear!=NULL) { *all_already_clear = ((old&mask) == 0); } + return ((old&mask) == mask); +} + +// Set/clear a pair of bits atomically, and return true of the mask bits transitioned from all 0's to 1's (or all 1's to 0's) +static inline bool mi_bfield_atomic_xset2(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx, bool* already_xset) { + if (set) { + return mi_bfield_atomic_set2(b, idx, already_xset); + } + else { + return mi_bfield_atomic_clear2(b, idx, already_xset); + } +} + + // Set a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's. -static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* already_set) { +static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits until success - if (already_set!=NULL) { *already_set = ((old&mask)==mask); } + if (already_set!=NULL) { *already_set = mi_bfield_popcount(old&mask); } return ((old&mask) == 0); } // Clear a mask set of bits atomically, and return true of the mask bits transitioned from all 1's to 0's -static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* already_clear) { +static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_clear) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits until success - if (already_clear!=NULL) { *already_clear = ((old&mask)==0); } + if (already_clear!=NULL) { *already_clear = mi_bfield_popcount(~(old&mask)); } return ((old&mask) == mask); } // Set/clear a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's (or all 1's to 0's) -static inline bool mi_bfield_atomic_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* already_xset) { +static inline bool mi_bfield_atomic_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_xset) { mi_assert_internal(mask != 0); if (set) { return mi_bfield_atomic_set_mask(b, mask, already_xset); @@ -225,9 +261,8 @@ static inline bool mi_bitmap_chunk_xset2(mi_bit_t set, mi_bitmap_chunk_t* chunk, const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; mi_assert_internal(idx < MI_BFIELD_BITS-1); - mi_assert_internal((idx%2)==0); - const 
size_t mask = (mi_bfield_t)0x03 << idx; - return mi_bfield_atomic_xset_mask(set, &chunk->bfields[i], mask, all_already_xset); + mi_assert_internal((idx%2)==0); + return mi_bfield_atomic_xset2(set, &chunk->bfields[i], idx, all_already_xset); } static inline bool mi_bitmap_chunk_set2(mi_bitmap_chunk_t* chunk, size_t cidx, bool* all_already_set) { @@ -241,11 +276,11 @@ static inline bool mi_bitmap_chunk_clear2(mi_bitmap_chunk_t* chunk, size_t cidx, // Set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). -static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* pall_already_xset) { +static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* pall_already_xset) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; - bool all_already_xset = true; + size_t all_already_xset = 0; size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; while (n > 0) { @@ -254,9 +289,9 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset ); - all_already_xset = all_already_xset && already_xset; + all_already_xset += already_xset; // next field field++; idx = 0; @@ -267,12 +302,12 @@ static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t } -static inline bool mi_bitmap_chunk_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_allready_set) { - return mi_bitmap_chunk_xsetN(MI_BIT_SET, chunk, cidx, n, all_allready_set); +static inline bool mi_bitmap_chunk_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { + return mi_bitmap_chunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); } -static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* all_allready_clear) { - return mi_bitmap_chunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, all_allready_clear); +static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* already_clear) { + return mi_bitmap_chunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, already_clear); } @@ -829,7 +864,7 @@ bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
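// ----------------------------------------------------------------------------
// Aside: with this change the xsetN helpers report how many of the requested
// bits were already set, computed with a population count over `old & mask`.
// bits.h above still leaves the generic (non-intrinsic) popcount path
// unimplemented; a portable SWAR fallback could look like the sketch below
// (the `sketch_popcount64` name is illustrative, not the mimalloc API).
// ----------------------------------------------------------------------------
#include <stdint.h>

static inline uint64_t sketch_popcount64(uint64_t x) {
  x = x - ((x >> 1) & 0x5555555555555555ULL);                           // 2-bit partial sums
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL); // 4-bit partial sums
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                           // 8-bit partial sums
  return (x * 0x0101010101010101ULL) >> 56;                             // sum the byte counts
}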
-bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset ) { +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); @@ -846,11 +881,11 @@ bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bo // first set the anyset since it is a conservative approximation (increases epoch) mi_bitmap_anyset_set(bitmap, chunk_idx); // then actually try to set it atomically - return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); } else { const size_t epoch = mi_bitmap_epoch(bitmap); - bool cleared = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, all_already_xset); + bool cleared = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } diff --git a/src/bitmap.h b/src/bitmap.h index 62aab7a7..8c961fe1 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -65,10 +65,10 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! // If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_xset); +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); -static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, bool* all_already_set) { - return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, all_already_set); +static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { + return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, already_set); } static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { diff --git a/src/options.c b/src/options.c index b69058cc..759d096d 100644 --- a/src/options.c +++ b/src/options.c @@ -158,7 +158,7 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. 
- { 0, UNINIT, MI_OPTION(eager_abandon) }, + { 1, UNINIT, MI_OPTION(eager_abandon) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/os.c b/src/os.c index 0aa0a681..bac59437 100644 --- a/src/os.c +++ b/src/os.c @@ -128,21 +128,24 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_st if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } - if (still_committed) { _mi_stat_decrease(&stats->committed, size); } + if (still_committed) { + _mi_stat_decrease(&stats->committed, size); + } _mi_stat_decrease(&stats->reserved, size); } void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats) { if (stats == NULL) stats = &_mi_stats_main; if (mi_memkind_is_os(memid.memkind)) { - size_t csize = _mi_os_good_alloc_size(size); + size_t csize = memid.mem.os.size; + if (csize==0) { _mi_os_good_alloc_size(size); } void* base = addr; // different base? (due to alignment) - if (memid.mem.os.base != NULL) { + if (memid.mem.os.base != base) { mi_assert(memid.mem.os.base <= addr); mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); base = memid.mem.os.base; - csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); + if (memid.mem.os.size==0) { csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); } } // free it if (memid.memkind == MI_MEM_OS_HUGE) { @@ -296,7 +299,7 @@ void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { bool os_is_zero = false; void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); if (p != NULL) { - *memid = _mi_memid_create_os(true, os_is_zero, os_is_large); + *memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large); } return p; } @@ -315,9 +318,10 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo void* os_base = NULL; void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, stats ); if (p != NULL) { - *memid = _mi_memid_create_os(commit, os_is_zero, os_is_large); + *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; memid->mem.os.alignment = alignment; + memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned } return p; } @@ -642,7 +646,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse if (psize != NULL) { *psize = page * MI_HUGE_OS_PAGE_SIZE; } if (page != 0) { mi_assert(start != NULL); - *memid = _mi_memid_create_os(true /* is committed */, all_zero, true /* is_large */); + *memid = _mi_memid_create_os(start, *psize, true /* is committed */, all_zero, true /* is_large */); memid->memkind = MI_MEM_OS_HUGE; mi_assert(memid->is_pinned); #ifdef MI_TRACK_ASAN diff --git a/test/test-stress.c b/test/test-stress.c index 61d1424a..487f7215 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,7 +40,7 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 1 +#elif 0 static int THREADS = 4; static int SCALE = 100; static int ITER = 50; From bd5f7de3f416bb8a90d97d0ef1ae6b69ecebbe37 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 20:21:35 -0800 Subject: [PATCH 023/264] can run basic test --- src/arena.c | 4 ++-- src/bitmap.c | 30 ++++++++++++++++++++++-------- src/init.c | 2 +- src/page-queue.c | 30 +++++++++++++++++++++++++++++- src/page.c | 9 
++++++--- test/test-stress.c | 4 ++-- 6 files changed, 62 insertions(+), 17 deletions(-) diff --git a/src/arena.c b/src/arena.c index 08b6c98d..317a7e48 100644 --- a/src/arena.c +++ b/src/arena.c @@ -676,7 +676,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { // leave as is; it will be reclaimed when an object is free'd in the page } _mi_page_unown(page); - mi_stat_increase(_mi_stats_main.pages_abandoned, 1); + _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1); } // called from `mi_free` if trying to unabandon an abandoned page @@ -706,7 +706,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { // nothing to do // TODO: maintain count of these as well? } - mi_stat_decrease(_mi_stats_main.pages_abandoned, 1); + _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); } /* diff --git a/src/bitmap.c b/src/bitmap.c index c7c78dec..eb5da086 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -453,6 +453,20 @@ static inline bool mi_bitmap_chunk_try_clearN(mi_bitmap_chunk_t* chunk, size_t c return mi_bitmap_chunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n); } +#if defined(__AVX2__) +static inline __m256i mi_mm256_zero(void) { + return _mm256_setzero_si256(); +} +static inline __m256i mi_mm256_ones(void) { + return _mm256_set1_epi64x(~0); +} +static inline bool mi_mm256_is_ones(__m256i vec) { + return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); +} +static inline bool mi_mm256_is_zero( __m256i vec) { + return _mm256_testz_si256(vec,vec); +} +#endif // find least 0/1-bit in a chunk and try to set/clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. @@ -461,7 +475,7 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chu #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) if (mask==0) return false; @@ -483,11 +497,11 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chu size_t chunk_idx = 0; #if 1 __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - if ((set ? _mm256_test_all_ones(vec) : _mm256_testz_si256(vec,vec))) { + if ((set ? mi_mm256_is_ones(vec) : mi_mm256_is_zero(vec))) { chunk_idx += 4; vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1); } - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 
0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) if (mask==0) return false; @@ -496,7 +510,7 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chu #else const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - const __m256i cmpv = (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256()); + const __m256i cmpv = (set ? mi_mm256_ones() : mi_mm256_zero()); const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) @@ -549,7 +563,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while(true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? -1 : 0) + const __m256i vcmp = _mm256_cmpeq_epi8(vec, mi_mm256_ones()); // (byte == ~0 ? -1 : 0) const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte if (mask == 0) return false; const size_t i = _tzcnt_u32(mask); @@ -650,12 +664,12 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, static inline bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return _mm256_testz_si256( vec, vec ); + return mi_mm256_is_zero(vec); #elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); - if (!_mm256_testz_si256(vec1, vec1)) return false; + if (!mi_mm256_is_zero(vec1)) return false; const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - return (_mm256_testz_si256(vec2, vec2)); + return (mi_mm256_is_zero(vec2)); #else for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { if (chunk->bfields[i] != 0) return false; diff --git a/src/init.c b/src/init.c index 05ce54b4..d1670d02 100644 --- a/src/init.c +++ b/src/init.c @@ -34,7 +34,7 @@ const mi_page_t _mi_page_empty = { NULL, // xheap NULL, NULL, // next, prev NULL, // subproc - { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid + { {{ NULL, 0, 0}}, false, false, false, MI_MEM_NONE } // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) diff --git a/src/page-queue.c b/src/page-queue.c index 552e12c3..ad616b1d 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -260,6 +260,34 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ heap->page_count++; } +static void mi_page_queue_push_at_end(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { + mi_assert_internal(mi_page_heap(page) == heap); + mi_assert_internal(!mi_page_queue_contains(queue, page)); + + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || + (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); + + mi_page_set_in_full(page, 
mi_page_queue_is_full(queue)); + + page->prev = queue->last; + page->next = NULL; + if (queue->last != NULL) { + mi_assert_internal(queue->last->next == NULL); + queue->last->next = page; + queue->last = page; + } + else { + queue->first = queue->last = page; + } + + // update direct + if (queue->first == page) { + mi_heap_queue_first_update(heap, queue); + } + heap->page_count++; +} + static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(mi_page_heap(page) == heap); mi_assert_internal(mi_page_queue_contains(queue, page)); @@ -344,7 +372,7 @@ static void mi_page_queue_enqueue_from(mi_page_queue_t* to, mi_page_queue_t* fro static void mi_page_queue_enqueue_from_full(mi_page_queue_t* to, mi_page_queue_t* from, mi_page_t* page) { // note: we could insert at the front to increase reuse, but it slows down certain benchmarks (like `alloc-test`) - mi_page_queue_enqueue_from_ex(to, from, false /* enqueue at the end of the `to` queue? */, page); + mi_page_queue_enqueue_from_ex(to, from, true /* enqueue at the end of the `to` queue? */, page); } // Only called from `mi_heap_absorb`. diff --git a/src/page.c b/src/page.c index 8cdfd6be..4d26dbad 100644 --- a/src/page.c +++ b/src/page.c @@ -274,7 +274,7 @@ void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page) mi_page_set_heap(page,heap); _mi_page_free_collect(page, false); // ensure used count is up to date mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); - mi_page_queue_push(heap, pq, page); + mi_page_queue_push_at_end(heap, pq, page); mi_assert_expensive(_mi_page_is_valid(page)); } @@ -807,8 +807,11 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p page_candidate = page; candidate_count = 0; } - else if (/* !mi_page_is_expandable(page) && */ page->used >= page_candidate->used) { - if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); } + else if (mi_page_all_free(page_candidate)) { + _mi_page_free(page_candidate, pq); + page_candidate = page; + } + else if (page->used >= page_candidate->used) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate diff --git a/test/test-stress.c b/test/test-stress.c index 487f7215..ffeb5dea 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -46,7 +46,7 @@ static int SCALE = 100; static int ITER = 50; #else static int THREADS = 32; // more repeatable if THREADS <= #processors -static int SCALE = 25; // scaling factor +static int SCALE = 50; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads #endif @@ -54,7 +54,7 @@ static int ITER = 50; // N full iterations destructing and re-creating a #define STRESS // undefine for leak test -static bool allow_large_objects = true; // allow very large objects? (set to `true` if SCALE>100) +static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? 
static bool main_participates = false; // main thread participates as a worker too From 833b091ff9a54f42e110093031e1bc9fa204cc52 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 2 Dec 2024 20:25:44 -0800 Subject: [PATCH 024/264] can run the full test suite --- include/mimalloc/internal.h | 5 ++--- src/free.c | 10 +++++----- 2 files changed, 7 insertions(+), 8 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 7d263d47..cee88684 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -220,10 +220,9 @@ void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_att void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); -bool _mi_free_delayed_block(mi_block_t* block); -// void _mi_free_generic(mi_segment_t* segment, mi_page_t* page, bool is_local, void* p) mi_attr_noexcept; // for runtime integration void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); -void _mi_stat_free(const mi_page_t* page, const mi_block_t* block); +// bool _mi_free_delayed_block(mi_block_t* block); + // "libc.c" #include diff --git a/src/free.c b/src/free.c index 4ba6d6cc..4bce6886 100644 --- a/src/free.c +++ b/src/free.c @@ -16,7 +16,7 @@ terms of the MIT license. A copy of the license can be found in the file static void mi_check_padding(const mi_page_t* page, const mi_block_t* block); static bool mi_check_is_double_free(const mi_page_t* page, const mi_block_t* block); static size_t mi_page_usable_size_of(const mi_page_t* page, const mi_block_t* block); -// static void _mi_stat_free(const mi_page_t* page, const mi_block_t* block); +static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); // ------------------------------------------------------ @@ -33,7 +33,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool // checks if mi_unlikely(mi_check_is_double_free(page, block)) return; mi_check_padding(page, block); - if (track_stats) { _mi_stat_free(page, block); } + if (track_stats) { mi_stat_free(page, block); } #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN && !MI_GUARDED memset(block, MI_DEBUG_FREED, mi_page_block_size(page)); #endif @@ -203,7 +203,7 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { // adjust stats (after padding check and potentially recursive `mi_free` above) - _mi_stat_free(page, block); // stat_free may access the padding + mi_stat_free(page, block); // stat_free may access the padding mi_track_free_size(block, mi_page_usable_size_of(page, block)); // _mi_padding_shrink(page, block, sizeof(mi_block_t)); @@ -543,7 +543,7 @@ static void mi_check_padding(const mi_page_t* page, const mi_block_t* block) { // only maintain stats for smaller objects if requested #if (MI_STAT>0) -void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +void mi_stat_free(const mi_page_t* page, const mi_block_t* block) { #if (MI_STAT < 2) MI_UNUSED(block); #endif @@ -565,7 +565,7 @@ void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { } } #else -void _mi_stat_free(const mi_page_t* page, const mi_block_t* block) { +void mi_stat_free(const mi_page_t* page, const 
mi_block_t* block) { MI_UNUSED(page); MI_UNUSED(block); } #endif From 666c089fc85b67c0773e502856f5b9fb179164cd Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 10:51:13 -0800 Subject: [PATCH 025/264] revise free reclaim; ensure unown cannot race with a free --- include/mimalloc/internal.h | 88 ++++++++++++++------ include/mimalloc/types.h | 4 + src/arena.c | 71 +++++++++++----- src/bitmap.c | 16 ++-- src/free.c | 156 +++++++++++++++++++++++++++++++----- src/init.c | 2 +- src/options.c | 2 +- src/page.c | 2 +- src/stats.c | 4 + test/test-stress.c | 15 +++- 10 files changed, 281 insertions(+), 79 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index cee88684..56172bcd 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -143,7 +143,8 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); void _mi_arena_page_free(mi_page_t* page); void _mi_arena_page_abandon(mi_page_t* page); -void _mi_arena_page_unabandon(mi_page_t* page); +void _mi_arena_page_unabandon(mi_page_t* page); +bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page); void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap); @@ -572,29 +573,6 @@ static inline bool mi_page_is_owned(const mi_page_t* page) { return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); } -// Unown a page that is currently owned -static inline void _mi_page_unown(mi_page_t* page) { - mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(mi_page_thread_id(page)==0); - const uintptr_t old = mi_atomic_and_acq_rel(&page->xthread_free, ~((uintptr_t)1)); - mi_assert_internal((old&1)==1); MI_UNUSED(old); - /* - mi_thread_free_t tf_new; - mi_thread_free_t tf_old; - do { - tf_old = mi_atomic_load_relaxed(&page->xthread_free); - mi_assert_internal(mi_tf_is_owned(tf_old)); - tf_new = mi_tf_create(mi_tf_block(tf_old), false); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); - */ -} - -// get ownership if it is not yet owned -static inline bool mi_page_try_claim_ownership(mi_page_t* page) { - const uintptr_t old = mi_atomic_or_acq_rel(&page->xthread_free, 1); - return ((old&1)==0); -} - //static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { // return mi_tf_make(mi_tf_block(tf),delayed); @@ -638,7 +616,7 @@ static inline bool mi_page_is_full(mi_page_t* page) { } // is more than 7/8th of a page in use? 
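// ----------------------------------------------------------------------------
// Aside: the renamed mi_page_is_mostly_used below keeps the same 7/8th
// heuristic: a page counts as "mostly used" once fewer than reserved/8 blocks
// remain free. A standalone sketch with a worked example (illustrative name):
// ----------------------------------------------------------------------------
#include <stdbool.h>
#include <stdint.h>

static bool sketch_is_mostly_used(uint16_t reserved, uint16_t used) {
  uint16_t frac = reserved / 8;       // e.g. reserved = 128  ->  frac = 16
  return (reserved - used <= frac);   // e.g. used >= 112     ->  mostly used
}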
-static inline bool mi_page_mostly_used(const mi_page_t* page) { +static inline bool mi_page_is_mostly_used(const mi_page_t* page) { if (page==NULL) return true; uint16_t frac = page->reserved / 8U; return (page->reserved - page->used <= frac); @@ -646,9 +624,22 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_atomic_load_acquire(&page->xthread_id) == 0); + return (mi_atomic_load_acquire(&page->xthread_id) <= 1); } +static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { + return (mi_atomic_load_acquire(&page->xthread_id) == 1); +} + +static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { + mi_atomic_or_acq_rel(&page->xthread_id, (uintptr_t)1); +} + +static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { + mi_atomic_and_acq_rel(&page->xthread_id, ~(uintptr_t)1); +} + + static inline bool mi_page_is_huge(const mi_page_t* page) { return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN)); } @@ -659,6 +650,51 @@ static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) } +// Unown a page that is currently owned +static inline void _mi_page_unown_unconditional(mi_page_t* page) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_thread_id(page)==0); + const uintptr_t old = mi_atomic_and_acq_rel(&page->xthread_free, ~((uintptr_t)1)); + mi_assert_internal((old&1)==1); MI_UNUSED(old); + /* + mi_thread_free_t tf_new; + mi_thread_free_t tf_old; + do { + tf_old = mi_atomic_load_relaxed(&page->xthread_free); + mi_assert_internal(mi_tf_is_owned(tf_old)); + tf_new = mi_tf_create(mi_tf_block(tf_old), false); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); + */ +} + + +// get ownership if it is not yet owned +static inline bool mi_page_try_claim_ownership(mi_page_t* page) { + const uintptr_t old = mi_atomic_or_acq_rel(&page->xthread_free, 1); + return ((old&1)==0); +} + +static inline void _mi_page_unown(mi_page_t* page) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_page_thread_id(page)==0); + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_assert_internal(mi_tf_is_owned(tf_old)); + while mi_unlikely(mi_tf_block(tf_old) != NULL) { + _mi_page_free_collect(page, false); // update used + if (mi_page_all_free(page)) { // it may become free just before unowning it + _mi_arena_page_unabandon(page); + _mi_arena_page_free(page); + return; + } + tf_old = mi_atomic_load_relaxed(&page->xthread_free); + } + mi_assert_internal(mi_tf_block(tf_old)==NULL); + tf_new = mi_tf_create(NULL, false); + } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); +} //----------------------------------------------------------- // Page flags diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index dafd25f1..4430cd6c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -505,6 +505,10 @@ typedef struct mi_stats_s { mi_stat_count_t giant; mi_stat_count_t malloc; mi_stat_counter_t pages_extended; + mi_stat_counter_t pages_reclaim_on_alloc; + mi_stat_counter_t pages_reclaim_on_free; + mi_stat_counter_t pages_reabandon_full; + mi_stat_counter_t 
pages_unabandon_busy_wait; mi_stat_counter_t mmap_calls; mi_stat_counter_t commit_calls; mi_stat_counter_t reset_calls; diff --git a/src/arena.c b/src/arena.c index 317a7e48..a2343674 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,7 +42,7 @@ typedef struct mi_arena_s { bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. - + mi_bitmap_t slices_free; // is the slice free? mi_bitmap_t slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t slices_purge; // can the slice be purged? (slice in purge => slice in free) @@ -216,7 +216,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( else { if (commit_zero) { memid->initially_zero = true; } } - } + } } else { // no need to commit, but check if already fully committed @@ -355,7 +355,7 @@ static mi_decl_noinline void* mi_arena_try_alloc( { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); - + // try to find free slices in the arena's void* p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; @@ -457,7 +457,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // try to claim ownership atomically mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); if (!mi_page_try_claim_ownership(page)) { - // a concurrent free already grabbed the page. + // a concurrent free already grabbed the page. // Restore the abandoned_map to make it available again (unblocking busy waiters) mi_pairmap_set(pairmap, slice_index); } @@ -465,6 +465,9 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // we got ownership, clear the abandoned entry (unblocking busy waiters) mi_pairmap_clear(pairmap, slice_index); mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); + _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); + _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); @@ -472,7 +475,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); - mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_page_is_owned(page)); @@ -492,11 +495,11 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz const bool commit = true; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t page_alignment = MI_ARENA_SLICE_ALIGN; - + // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; - if (!_mi_option_get_fast(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? 
+ if (!_mi_option_get_fast(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { @@ -575,16 +578,16 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); const size_t slice_count = mi_slice_count_of_size(info_size + block_size); - + mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld); if (page == NULL) return NULL; - + mi_assert(page != NULL); - mi_assert(page->reserved == 1); + mi_assert(page->reserved == 1); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); - return page; + return page; } @@ -646,17 +649,17 @@ void _mi_arena_page_free(mi_page_t* page) { Arena abandon ----------------------------------------------------------- */ -void _mi_arena_page_abandon(mi_page_t* page) { +static void mi_arena_page_abandon_no_stat(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(!mi_page_all_free(page)); mi_assert_internal(page->next==NULL); - + mi_subproc_t* subproc = page->subproc; if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { - // make available for allocations + // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); size_t slice_index; size_t slice_count; @@ -667,6 +670,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); // mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + mi_page_set_abandoned_mapped(page); bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); MI_UNUSED(were_zero); mi_assert_internal(were_zero); mi_atomic_increment_relaxed(&subproc->abandoned_count[bin]); @@ -676,34 +680,59 @@ void _mi_arena_page_abandon(mi_page_t* page) { // leave as is; it will be reclaimed when an object is free'd in the page } _mi_page_unown(page); +} + +void _mi_arena_page_abandon(mi_page_t* page) { + mi_arena_page_abandon_no_stat(page); _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1); } +bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(!mi_page_is_abandoned_mapped(page)); + mi_assert_internal(!mi_page_is_full(page)); + mi_assert_internal(!mi_page_all_free(page)); + mi_assert_internal(!mi_page_is_singleton(page)); + if (mi_page_is_full(page) || mi_page_is_abandoned_mapped(page) || page->memid.memkind != MI_MEM_ARENA) { + return false; + } + else { + _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1); + mi_arena_page_abandon_no_stat(page); + return true; + } +} + // called from `mi_free` if trying to unabandon an abandoned page void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); - - if 
(page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { + + if (mi_page_is_abandoned_mapped(page)) { + mi_assert_internal(page->memid.memkind==MI_MEM_ARENA); // remove from the abandoned map size_t bin = _mi_bin(mi_page_block_size(page)); size_t slice_index; size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - + mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_pairmap_clear_while_not_busy(&arena->pages_abandoned[bin], slice_index); + mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } else { // page is full (or a singleton), page is OS/externally allocated - // nothing to do + // nothing to do // TODO: maintain count of these as well? } _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); @@ -715,7 +744,7 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned (anymore) - + // note: we can access the page even it is in the meantime reclaimed by another thread since // we only call this when on free (and thus there is still an object alive in the page) mi_memid_t memid = page->memid; @@ -967,7 +996,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->is_large = is_large; arena->purge_expire = 0; mi_lock_init(&arena->abandoned_visit_lock); - + // init bitmaps mi_bitmap_init(&arena->slices_free,true); mi_bitmap_init(&arena->slices_committed,true); @@ -1068,7 +1097,7 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ _mi_memset(buf + k, 'o', MI_BFIELD_BITS); k += MI_BFIELD_BITS; } - bit_count += MI_BFIELD_BITS; + bit_count += MI_BFIELD_BITS; } _mi_output_message("%s %s\n", prefix, buf); } diff --git a/src/bitmap.c b/src/bitmap.c index eb5da086..df25e028 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -80,7 +80,7 @@ static inline bool mi_bfield_atomic_set2(_Atomic(mi_bfield_t)*b, size_t idx, boo mi_assert_internal(idx < MI_BFIELD_BITS-1); const size_t mask = (mi_bfield_t)0x03 << idx; mi_bfield_t old = mi_atomic_load_relaxed(b); - while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits until success + while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)) { }; // try to atomically set the mask bits until success if (all_already_set!=NULL) { *all_already_set = ((old&mask)==mask); } return ((old&mask) == 0); } @@ -90,7 +90,7 @@ static inline bool mi_bfield_atomic_clear2(_Atomic(mi_bfield_t)*b, size_t idx, b mi_assert_internal(idx < MI_BFIELD_BITS-1); const size_t mask = (mi_bfield_t)0x03 << idx; mi_bfield_t old = mi_atomic_load_relaxed(b); - while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits until success + while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)) { }; // try to atomically clear the mask bits until success if (all_already_clear!=NULL) { *all_already_clear = ((old&mask) == 0); } return ((old&mask) == mask); } @@ -110,7 +110,7 @@ static inline bool mi_bfield_atomic_xset2(mi_bit_t set, _Atomic(mi_bfield_t)*b, static inline bool 
mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); - while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits until success + while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)) { }; // try to atomically set the mask bits until success if (already_set!=NULL) { *already_set = mi_bfield_popcount(old&mask); } return ((old&mask) == 0); } @@ -119,7 +119,7 @@ static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_clear) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); - while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits until success + while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)) { }; // try to atomically clear the mask bits until success if (already_clear!=NULL) { *already_clear = mi_bfield_popcount(~(old&mask)); } return ((old&mask) == mask); } @@ -1115,16 +1115,18 @@ static inline bool mi_bfield_atomic_clear_while_not_busy(_Atomic(mi_bfield_t)*b, mi_assert_internal(idx < MI_BFIELD_BITS-1); const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); - mi_bfield_t old; mi_bfield_t bnew; + mi_bfield_t old = mi_atomic_load_relaxed(b); do { - old = mi_atomic_load_relaxed(b); if mi_unlikely((old&mask)==mask_busy) { old = mi_atomic_load_acquire(b); + if ((old&mask)==mask_busy) { + _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); + } while ((old&mask)==mask_busy) { // busy wait mi_atomic_yield(); old = mi_atomic_load_acquire(b); - } + } } bnew = (old & ~mask); // clear } while (!mi_atomic_cas_weak_acq_rel(b, &old, bnew)); diff --git a/src/free.c b/src/free.c index 4bce6886..6e8514c6 100644 --- a/src/free.c +++ b/src/free.c @@ -128,7 +128,7 @@ void mi_free(void* p) mi_attr_noexcept { mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); if mi_unlikely(page==NULL) return; - + const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); if mi_likely(is_local) { // thread-local free? if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) @@ -156,50 +156,164 @@ void mi_free(void* p) mi_attr_noexcept static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_thread_id(page)==0); - +#if 1 // we own the page now.. - - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); // this must be before collect - - // collect the thread atomic free list + // safe to collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count + #if MI_DEBUG > 1 if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } + #endif - if (mi_page_all_free(page)) { + // 1. free if the page is free now + if (mi_page_all_free(page)) + { + // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) + _mi_arena_page_unabandon(page); // we can free the page directly _mi_arena_page_free(page); return; } - else { - // the page has still some blocks in use + // 2. 
if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations + else if (!mi_page_is_mostly_used(page) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && + _mi_arena_page_try_reabandon_to_mapped(page)) + { + return; + } + // 3. if the page is not too full, we can try to reclaim it for ourselves + else if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 && + !mi_page_is_mostly_used(page)) + { + // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again // todo: optimize this check further? // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) mi_heap_t* const heap = mi_prim_get_default_heap(); - - if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && // only if reclaim on free is allowed - (heap != (mi_heap_t*)&_mi_heap_empty)) // we did not already terminate our thread (can this happen? + if (heap != (mi_heap_t*)&_mi_heap_empty) // we did not already terminate our thread (can this happen? { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types + if ((tagheap != NULL) && // don't reclaim across heap object types (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) { - // make it part of our heap + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); _mi_heap_page_reclaim(tagheap, page); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); return; - } + } + } + } + + // not reclaimed or free'd, unown again + _mi_page_unown(page); + +#else + if (!mi_page_is_abandoned_mapped(page)) { + // singleton or OS allocated + if (mi_page_is_singleton(page)) { + // free singleton pages + #if MI_DEBUG>1 + _mi_page_free_collect(page, false); // update `used` count + mi_assert_internal(mi_page_all_free(page)); + #endif + // we can free the page directly + _mi_arena_page_free(page); + return; + } + else { + const bool was_full = mi_page_is_full(page); + _mi_page_free_collect(page,false); // update used + if (mi_page_all_free(page)) { + // no need to unabandon as it is unmapped + _mi_arena_page_free(page); + return; + } + else if (was_full && _mi_arena_page_reabandon_full(page)) { + return; + } + else if (!mi_page_is_mostly_used(page) && _mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) { + // the page has still some blocks in use (but not too many) + // reclaim in our heap if compatible, or otherwise abandon again + // todo: optimize this check further? + // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should + // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) + mi_heap_t* const heap = mi_prim_get_default_heap(); + if (heap != (mi_heap_t*)&_mi_heap_empty) { // we did not already terminate our thread (can this happen? + mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); + if ((tagheap != NULL) && // don't reclaim across heap object types + (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) + (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + ) + { + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); + // make it part of our heap (no need to unabandon as is unmapped) + _mi_heap_page_reclaim(tagheap, page); + return; + } + } + } + } + } + else { + // don't reclaim pages that can be found for fresh page allocations + } + + // not reclaimed or free'd, unown again + _mi_page_unown(page); +#endif +} + +/* +// we own the page now.. +// safe to collect the thread atomic free list +_mi_page_free_collect(page, false); // update `used` count +if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } + +if (mi_page_all_free(page)) { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); // this must be before free'ing + // we can free the page directly + _mi_arena_page_free(page); + return; +} +else if (!mi_page_is_mostly_used(page)) { + // the page has still some blocks in use (but not too many) + // reclaim in our heap if compatible, or otherwise abandon again + // todo: optimize this check further? + // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should + // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) + mi_heap_t* const heap = mi_prim_get_default_heap(); + + if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && // only if reclaim on free is allowed + (heap != (mi_heap_t*)&_mi_heap_empty)) // we did not already terminate our thread (can this happen? + { + mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); + if ((tagheap != NULL) && // don't reclaim across heap object types + (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) + (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + ) + { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); + // make it part of our heap + _mi_heap_page_reclaim(tagheap, page); + return; } - - // we cannot reclaim this page.. abandon it again - _mi_arena_page_abandon(page); } } -// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. +// we cannot reclaim this page.. leave it abandoned +// todo: should re-abandon or otherwise a partly used page could never be re-used if the +// objects in it are not freed explicitly. +_mi_page_unown(page); +*/ + + +// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. 
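The free path above repeatedly talks about owning a page and, when the page is neither freed nor reclaimed, handing ownership back with `_mi_page_unown` while other threads may still push blocks onto the page-local thread free list. Below is a minimal sketch of that general pattern only, assuming a simplified layout where bit 0 of a single atomic word is the ownership flag and the remaining bits hold a list head; the names (`tf_word`, `tf_try_claim`, `tf_release_ownership`) are illustrative and not taken from the patch.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// Hypothetical tagged word: bit 0 = "owned" flag, upper bits = thread-free list head.
typedef _Atomic(uintptr_t) tf_word;

static inline bool tf_try_claim(tf_word* w) {
  // claim ownership by setting bit 0; we got it iff the bit was previously clear
  const uintptr_t old = atomic_fetch_or_explicit(w, (uintptr_t)1, memory_order_acq_rel);
  return ((old & 1) == 0);
}

static inline void tf_release_ownership(tf_word* w) {
  // clear only the ownership bit, preserving any list head pushed concurrently
  uintptr_t old = atomic_load_explicit(w, memory_order_relaxed);
  uintptr_t desired;
  do {
    desired = old & ~(uintptr_t)1;
  } while (!atomic_compare_exchange_weak_explicit(w, &old, desired,
                                                  memory_order_acq_rel, memory_order_acquire));
}

A fetch-and would suffice for this simplified release; the CAS form is shown because it leaves room to act on the previous value inside the loop, which is how the patch's `_mi_page_unown` loop is structured.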
static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) { // adjust stats (after padding check and potentially recursive `mi_free` above) diff --git a/src/init.c b/src/init.c index d1670d02..01beb222 100644 --- a/src/init.c +++ b/src/init.c @@ -83,7 +83,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 } \ MI_STAT_COUNT_END_NULL() // -------------------------------------------------------- diff --git a/src/options.c b/src/options.c index 759d096d..1b326cc3 100644 --- a/src/options.c +++ b/src/options.c @@ -143,7 +143,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. #if defined(MI_VISIT_ABANDONED) diff --git a/src/page.c b/src/page.c index 4d26dbad..9ea7a979 100644 --- a/src/page.c +++ b/src/page.c @@ -811,7 +811,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p _mi_page_free(page_candidate, pq); page_candidate = page; } - else if (page->used >= page_candidate->used) { + else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate diff --git a/src/stats.c b/src/stats.c index 53b18da0..2a793b59 100644 --- a/src/stats.c +++ b/src/stats.c @@ -331,6 +331,10 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print_ex(&stats->page_committed, "touched", 1, out, arg, ""); mi_stat_print_ex(&stats->pages, "pages", -1, out, arg, ""); mi_stat_print(&stats->pages_abandoned, "-abandoned", -1, out, arg); + mi_stat_counter_print(&stats->pages_reclaim_on_alloc, "-reclaima", out, arg); + mi_stat_counter_print(&stats->pages_reclaim_on_free, "-reclaimf", out, arg); + mi_stat_counter_print(&stats->pages_reabandon_full, "-reabandon", out, arg); + mi_stat_counter_print(&stats->pages_unabandon_busy_wait, "-waits", out, arg); mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); diff --git a/test/test-stress.c b/test/test-stress.c index ffeb5dea..4c2719aa 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -43,7 +43,13 @@ static int ITER = 10; #elif 0 static int THREADS = 4; static int SCALE = 100; +static int ITER = 10; +#define ALLOW_LARGE false +#elif 1 +static int THREADS = 32; +static int SCALE = 50; static int ITER = 50; +#define ALLOW_LARGE false #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 50; // scaling factor @@ -54,7 +60,12 @@ static int 
ITER = 50; // N full iterations destructing and re-creating a #define STRESS // undefine for leak test -static bool allow_large_objects = false; // allow very large objects? (set to `true` if SCALE>100) +#ifndef ALLOW_LARGE +#define ALLOW_LARGE true +#endif + +static bool allow_large_objects = ALLOW_LARGE; // allow very large objects? (set to `true` if SCALE>100) + static size_t use_one_size = 0; // use single object size of `N * sizeof(uintptr_t)`? static bool main_participates = false; // main thread participates as a worker too @@ -332,6 +343,8 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,true,false); #endif // mi_stats_print(NULL); +#else + mi_stats_print(NULL); // so we see rss/commit/elapsed #endif //bench_end_program(); return 0; From 3fc2c8e279bc7d0ba18378ec1f525adff8672a87 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 11:06:07 -0800 Subject: [PATCH 026/264] fix assertions --- include/mimalloc/internal.h | 3 +-- src/bitmap.c | 4 +--- src/free.c | 2 +- src/init.c | 3 ++- src/page.c | 2 +- test/test-stress.c | 4 ++-- 6 files changed, 8 insertions(+), 10 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 56172bcd..9fa27f31 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -676,8 +676,7 @@ static inline bool mi_page_try_claim_ownership(mi_page_t* page) { static inline void _mi_page_unown(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(mi_page_is_abandoned(page)); - mi_assert_internal(mi_page_thread_id(page)==0); + mi_assert_internal(mi_page_is_abandoned(page)); mi_thread_free_t tf_new; mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); do { diff --git a/src/bitmap.c b/src/bitmap.c index df25e028..4eadce80 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1120,9 +1120,7 @@ static inline bool mi_bfield_atomic_clear_while_not_busy(_Atomic(mi_bfield_t)*b, do { if mi_unlikely((old&mask)==mask_busy) { old = mi_atomic_load_acquire(b); - if ((old&mask)==mask_busy) { - _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); - } + if ((old&mask)==mask_busy) { _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); } while ((old&mask)==mask_busy) { // busy wait mi_atomic_yield(); old = mi_atomic_load_acquire(b); diff --git a/src/free.c b/src/free.c index 6e8514c6..70ef5d8a 100644 --- a/src/free.c +++ b/src/free.c @@ -155,7 +155,7 @@ void mi_free(void* p) mi_attr_noexcept static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(mi_page_thread_id(page)==0); + mi_assert_internal(mi_page_is_abandoned(page)); #if 1 // we own the page now.. 
// safe to collect the thread atomic free list diff --git a/src/init.c b/src/init.c index 01beb222..99a5ea39 100644 --- a/src/init.c +++ b/src/init.c @@ -83,7 +83,8 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ + { 0, 0 } \ MI_STAT_COUNT_END_NULL() // -------------------------------------------------------- diff --git a/src/page.c b/src/page.c index 9ea7a979..e5e3f972 100644 --- a/src/page.c +++ b/src/page.c @@ -811,7 +811,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p _mi_page_free(page_candidate, pq); page_candidate = page; } - else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { + else if (page->used >= page_candidate->used) { // && !mi_page_is_mostly_used(page)) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate diff --git a/test/test-stress.c b/test/test-stress.c index 4c2719aa..9e53e920 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -45,14 +45,14 @@ static int THREADS = 4; static int SCALE = 100; static int ITER = 10; #define ALLOW_LARGE false -#elif 1 +#elif 0 static int THREADS = 32; static int SCALE = 50; static int ITER = 50; #define ALLOW_LARGE false #else static int THREADS = 32; // more repeatable if THREADS <= #processors -static int SCALE = 50; // scaling factor +static int SCALE = 25; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads #endif From 8d9c725482537a811b4eb9c982bfbfdf7680cbc1 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 17:27:43 -0800 Subject: [PATCH 027/264] increase MAX_OBJ_SLICES to a full chunk (32MiB) --- include/mimalloc/internal.h | 15 +++ include/mimalloc/types.h | 3 +- src/arena.c | 65 ++++++++++--- src/bitmap.c | 185 +++++++++++++++++++++++++++--------- src/bitmap.h | 47 ++++----- src/os.c | 15 --- src/page-map.c | 2 +- 7 files changed, 230 insertions(+), 102 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 9fa27f31..34dbab07 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -339,6 +339,21 @@ static inline uint8_t* _mi_align_up_ptr(void* p, size_t alignment) { } +static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { + mi_assert_internal(alignment != 0); + uintptr_t mask = alignment - 1; + if ((alignment & mask) == 0) { // power of two? + return (sz & ~mask); + } + else { + return ((sz / alignment) * alignment); + } +} + +static inline void* mi_align_down_ptr(void* p, size_t alignment) { + return (void*)_mi_align_down((uintptr_t)p, alignment); +} + // Divide upwards: `s <= _mi_divide_up(s,d)*d < s+d`. static inline uintptr_t _mi_divide_up(uintptr_t size, size_t divider) { mi_assert_internal(divider != 0); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 4430cd6c..3d83e27a 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -128,8 +128,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) #define MI_ARENA_MIN_OBJ_SLICES (1) -#define MI_ARENA_MAX_OBJ_SLICES (MI_SIZE_BITS) // for now, cannot cross bit field boundaries.. todo: make it at least MI_BITMAP_CHUNK_BITS ? 
(16 MiB) -// #define MI_ARENA_MAX_OBJ_BLOCKS (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries +#define MI_ARENA_MAX_OBJ_SLICES (MI_BITMAP_CHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE) #define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE) diff --git a/src/arena.c b/src/arena.c index a2343674..1b891377 100644 --- a/src/arena.c +++ b/src/arena.c @@ -193,30 +193,55 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( void* p = mi_arena_slice_start(arena, slice_index); *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; - + // set the dirty bits if (arena->memid.initially_zero) { + // size_t dirty_count = 0; memid->initially_zero = mi_bitmap_setN(&arena->slices_dirty, slice_index, slice_count, NULL); + //if (dirty_count>0) { + // if (memid->initially_zero) { + // _mi_error_message(EFAULT, "ouch1\n"); + // } + // // memid->initially_zero = false; + //} + //else { + // if (!memid->initially_zero) { + // _mi_error_message(EFAULT, "ouch2\n"); + // } + // // memid->initially_zero = true; + //} } // set commit state if (commit) { - // commit requested, but the range may not be committed as a whole: ensure it is committed now memid->initially_committed = true; - size_t already_committed_count = 0; - mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &already_committed_count); - if (already_committed_count < slice_count) { - // recommit the full range + // commit requested, but the range may not be committed as a whole: ensure it is committed now + if (!mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)) { + // not fully committed: commit the full range and set the commit bits + // (this may race and we may double-commit which is fine) bool commit_zero = false; - mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { memid->initially_committed = false; } else { if (commit_zero) { memid->initially_zero = true; } + #if MI_DEBUG > 1 + if (memid->initially_zero) { + if (!mi_mem_is_zero(p, mi_size_of_slices(slice_count))) { + _mi_error_message(EFAULT, "arena allocation was not zero-initialized!\n"); + memid->initially_zero = false; + } + } + #endif + size_t already_committed_count = 0; + mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &already_committed_count); + if (already_committed_count < slice_count) { + // todo: also decrease total + mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + } } - } + } } else { // no need to commit, but check if already fully committed @@ -523,7 +548,18 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // claimed free slices: initialize the page partly - if (!memid.initially_zero) { _mi_memzero_aligned(page, sizeof(*page)); } + if (!memid.initially_zero) { + _mi_memzero_aligned(page, sizeof(*page)); + } + #if MI_DEBUG > 1 + else { + if (!mi_mem_is_zero(page, mi_size_of_slices(slice_count))) { + _mi_error_message(EFAULT, "page memory was not zero initialized!\n"); + memid.initially_zero = false; + _mi_memzero_aligned(page, sizeof(*page)); + } + } + #endif mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), 
MI_PAGE_MIN_BLOCK_ALIGN)); const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); @@ -668,7 +704,7 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); - // mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); @@ -851,6 +887,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi mi_assert_internal(all_committed); } else { + /* if (!all_committed) { // mark the entire range as no longer committed (so we recommit the full range when re-using) mi_bitmap_clearN(&arena->slices_committed, slice_index, slice_count); @@ -864,6 +901,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // that contains already decommitted parts. Since purge consistently uses reset or decommit that // works (as we should never reset decommitted parts). } + */ // (delay) purge the entire range mi_arena_schedule_purge(arena, slice_index, slice_count, stats); } @@ -1014,7 +1052,12 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int else { mi_bitmap_setN(&arena->slices_committed, 0, info_slices, NULL); } - mi_bitmap_setN(&arena->slices_dirty, 0, info_slices, NULL); + if (!memid.initially_zero) { + mi_bitmap_unsafe_setN(&arena->slices_dirty, 0, arena->slice_count); + } + else { + mi_bitmap_setN(&arena->slices_dirty, 0, info_slices, NULL); + } return mi_arena_add(arena, arena_id, &_mi_stats_main); } diff --git a/src/bitmap.c b/src/bitmap.c index 4eadce80..a6c9e879 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -42,6 +42,25 @@ static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { return mi_rotr(x,r); } +static inline mi_bfield_t mi_bfield_zero(void) { + return 0; +} + +static inline mi_bfield_t mi_bfield_one(void) { + return 1; +} + +static inline mi_bfield_t mi_bfield_all_set(void) { + return ~((mi_bfield_t)0); +} + +static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { + mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS); + const mi_bfield_t mask0 = (bit_count < MI_BFIELD_BITS ? (mi_bfield_one() << bit_count)-1 : mi_bfield_all_set()); + return (mask0 << shiftl); +} + + // Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). // return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -52,7 +71,7 @@ static inline bool mi_bfield_find_least_to_xset(mi_bit_t set, mi_bfield_t x, siz // Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = ((mi_bfield_t)1)< n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask = (m == MI_BFIELD_BITS ? 
~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset ); + const bool transition = mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset); + if (already_xset > 0 && transition) { + _mi_error_message(EFAULT, "ouch\n"); + } + all_transition = all_transition && transition; all_already_xset += already_xset; // next field field++; @@ -335,7 +372,6 @@ static inline bool mi_bitmap_chunk_is_clear2(mi_bitmap_chunk_t* chunk, size_t ci static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); - bool all_xset = true; size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; while (n > 0) { @@ -343,14 +379,16 @@ static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, siz if (m > n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask); + const size_t mask = mi_bfield_mask(m, idx); + if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mask)) { + return false; + } // next field field++; idx = 0; n -= m; } - return all_xset; + return true; } @@ -389,14 +427,14 @@ static inline bool mi_bitmap_chunk_try_clear8(mi_bitmap_chunk_t* chunk, size_t b // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; size_t start_idx = cidx % MI_BFIELD_BITS; size_t start_field = cidx / MI_BFIELD_BITS; size_t end_field = MI_BITMAP_CHUNK_FIELDS; - size_t mask_mid = 0; - size_t mask_end = 0; + mi_bfield_t mask_mid = 0; + mi_bfield_t mask_end = 0; // first field size_t field = start_field; @@ -404,7 +442,7 @@ static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, si if (m > n) { m = n; } mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); - const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask_start)) return false; // done? @@ -417,7 +455,7 @@ static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, si while (n >= MI_BFIELD_BITS) { field++; mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); - mask_mid = ~MI_ZU(0); + mask_mid = mi_bfield_all_set(); if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore; n -= MI_BFIELD_BITS; } @@ -428,7 +466,7 @@ static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, si field++; mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); end_field = field; - mask_end = (MI_ZU(1)<bfields[field], mask_end)) goto restore; } @@ -602,14 +640,12 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, } -// find a sequence of `n` bits in a chunk with all `n` (`< MI_BFIELD_BITS`!) bits set, -// and try unset it atomically +// find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, +// and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. -// todo: try avx2 and neon version -// todo: allow spanning across bfield boundaries? 
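The new `mi_bfield_mask` helper above replaces the repeated `(m == MI_BFIELD_BITS ? ~MI_ZU(0) : ...)` ternaries, and the guard it keeps is not just cosmetic: shifting a value by its full bit width is undefined behavior in C. A small self-contained equivalent for 64-bit fields (the name `make_mask` is mine, not from the patch):

#include <assert.h>
#include <stdint.h>
#include <stdio.h>

// Build a mask of `count` one-bits starting at bit `shift` (count + shift <= 64).
// Guarding `count < 64` matters: `(uint64_t)1 << 64` is undefined behavior in C.
static inline uint64_t make_mask(size_t count, size_t shift) {
  assert(count + shift <= 64);
  const uint64_t ones = (count < 64 ? (((uint64_t)1 << count) - 1) : ~(uint64_t)0);
  return (ones << shift);
}

int main(void) {
  printf("%016llx\n", (unsigned long long)make_mask(4, 8));   // 0000000000000f00
  printf("%016llx\n", (unsigned long long)make_mask(64, 0));  // ffffffffffffffff
  return 0;
}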
-static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { - if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger? - const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1); +static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BFIELD_BITS) return false; + const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { mi_bfield_t b = chunk->bfields[i]; size_t bshift = 0; @@ -636,8 +672,48 @@ static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, // advance const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) mi_assert_internal(ones>0); - bshift += ones; b >>= ones; + bshift += ones; + } + } + } + return false; +} + +// find a sequence of `n` bits in a chunk with `n < MI_BITMAP_CHUNK_BITS` with all bits set, +// and try to clear them atomically. +// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. +static bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); + + // we align an a field, and require `field_count` fields to be all clear. + // n >= MI_BFIELD_BITS; find a first field that is 0 + const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields + for (size_t i = 0; i <= MI_BITMAP_CHUNK_FIELDS - field_count; i++) + { + // first pre-scan for a range of fields that are all set + bool allset = true; + size_t j = 0; + do { + mi_assert_internal(i + j < MI_BITMAP_CHUNK_FIELDS); + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); + if (~b != 0) { + allset = false; + i += j; // no need to look again at the previous fields + break; + } + } while (++j < field_count); + + // if all set, we can try to atomically clear them + if (allset) { + const size_t cidx = i*MI_BFIELD_BITS; + if (mi_bitmap_chunk_try_clearN(chunk, cidx, n)) { + // we cleared all atomically + *pidx = cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx + n <= MI_BITMAP_CHUNK_BITS); + return true; } } } @@ -796,7 +872,7 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving the bitmask as is. -bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < MI_BITMAP_MAX_BITS); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; @@ -816,12 +892,9 @@ bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { } } - - - // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. 
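The `mi_bitmap_chunk_find_and_try_clearNX` routine above walks a bit field by testing an n-bit mask and, when the mask does not fit, skipping the whole run it just examined via a count-trailing-zeros on the complement. The following stand-alone sketch shows that scanning idea for a single 64-bit word; it is an independent, sequential illustration (no atomics), `find_run_of_ones` is not a function from the patch, and it assumes GCC/Clang's `__builtin_ctzll`.

#include <stdbool.h>
#include <stdint.h>

// Find the lowest run of `n` consecutive 1-bits in `x` (1 <= n <= 64).
// On success, store the start index in *idx and return true.
static bool find_run_of_ones(uint64_t x, unsigned n, unsigned* idx) {
  if (n == 0 || n > 64) return false;
  unsigned shift = 0;
  while (x != 0) {
    const unsigned zeros = (unsigned)__builtin_ctzll(x);   // skip the 0s below the next run of 1s
    x >>= zeros;
    shift += zeros;
    // length of the run of 1s now at the bottom (ctz of the complement)
    const unsigned ones = (x == ~(uint64_t)0 ? 64u : (unsigned)__builtin_ctzll(~x));
    if (ones >= n) { *idx = shift; return true; }
    x >>= ones;                                            // run too short: skip it entirely
    shift += ones;
  }
  return false;
}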
-bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < MI_BITMAP_MAX_BITS); mi_assert_internal(idx%8 == 0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; @@ -846,13 +919,12 @@ bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +static bool mi_bitmap_try_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); } - if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); } - + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + if (n==0 || idx + n > MI_BITMAP_MAX_BITS) return false; + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) @@ -875,13 +947,21 @@ bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } } +bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); + if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); + if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); + return mi_bitmap_try_xsetN_(set, bitmap, idx, n); +} + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { +static bool mi_bitmap_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + //TODO: specialize? //if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } //if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } @@ -899,14 +979,26 @@ bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, si } else { const size_t epoch = mi_bitmap_epoch(bitmap); - bool cleared = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); - if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + size_t already_clear = 0; + const bool allset = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear); + if (already_xset != NULL) { *already_xset = already_clear; } + if (already_clear < n && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); } - return cleared; + return allset; } } +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { + mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); + //TODO: specialize? 
+ //if (n==1) return mi_bitmap_xset(set, bitmap, idx); + //if (n==8) return mi_bitmap_xset8(set, bitmap, idx); + return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); +} + // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { @@ -949,7 +1041,7 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) // and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. // The low `MI_BFIELD_BITS` of start are used to set the start point of the search // (to reduce thread contention). -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +mi_decl_nodiscard static bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { mi_bitmap_forall_set_chunks(bitmap, tseq, epoch, chunk_idx) { size_t cidx; @@ -973,7 +1065,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t // Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { +mi_decl_nodiscard static bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { mi_bitmap_forall_set_chunks(bitmap,tseq, epoch, chunk_idx) { size_t cidx; @@ -997,10 +1089,9 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { - // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger - // TODO: allow spanning across chunk boundaries - if (n == 0 || n > MI_BFIELD_BITS) return false; +mi_decl_nodiscard static bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { + // TODO: allow spanning across chunk boundaries? + if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; mi_bitmap_forall_set_chunks(bitmap,tseq,epoch,chunk_idx) { size_t cidx; @@ -1021,6 +1112,12 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t return false; } +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { + if (n == 1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); + if (n == 8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); + return mi_bitmap_try_find_and_clearN_(bitmap, n, tseq, pidx); +} + /* -------------------------------------------------------------------------------- pairmap epochset diff --git a/src/bitmap.h b/src/bitmap.h index 8c961fe1..948bd1e3 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -90,28 +90,28 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) // and false otherwise leaving the bitmask as is. 
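Several of the routines above consult a per-chunk summary (`mi_bitmap_anyset_try_clear`, `mi_bitmap_forall_set_chunks`) so that a search can skip chunks with no set bits at all, clearing the summary bit once a chunk becomes empty. The sketch below shows just that two-level structure in a sequential setting; the epoch handling and atomics of the real code are deliberately left out, and all names (`two_level_bitmap_t`, `bitmap_find_and_clear`, ...) are illustrative only.

#include <stdbool.h>
#include <stdint.h>

#define CHUNKS 64   // 64 chunks of 64 bits = 4096 bits total

typedef struct two_level_bitmap_s {
  uint64_t any_set;            // bit i is set iff chunks[i] != 0
  uint64_t chunks[CHUNKS];
} two_level_bitmap_t;

static void bitmap_set(two_level_bitmap_t* bm, unsigned idx) {
  const unsigned ci = idx / 64, bi = idx % 64;
  bm->chunks[ci] |= ((uint64_t)1 << bi);
  bm->any_set    |= ((uint64_t)1 << ci);
}

static void bitmap_clear(two_level_bitmap_t* bm, unsigned idx) {
  const unsigned ci = idx / 64, bi = idx % 64;
  bm->chunks[ci] &= ~((uint64_t)1 << bi);
  if (bm->chunks[ci] == 0) { bm->any_set &= ~((uint64_t)1 << ci); }  // chunk empty: clear summary
}

// Find and clear any set bit, skipping empty chunks via the summary word.
static bool bitmap_find_and_clear(two_level_bitmap_t* bm, unsigned* idx) {
  uint64_t any = bm->any_set;
  while (any != 0) {
    const unsigned ci = (unsigned)__builtin_ctzll(any);
    if (bm->chunks[ci] != 0) {
      const unsigned bi = (unsigned)__builtin_ctzll(bm->chunks[ci]);
      *idx = ci*64 + bi;
      bitmap_clear(bm, *idx);
      return true;
    }
    any &= ~((uint64_t)1 << ci);  // defensive skip (cannot happen single-threaded; mirrors the concurrent design)
  }
  return false;
}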
-mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); - -static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); -} - -static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); -} +//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +// +//static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { +// return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); +//} +// +//static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { +// return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); +//} // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. -mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); - -static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); -} - -static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); -} +//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +// +//static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { +// return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); +//} +// +//static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { +// return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); +//} // Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. @@ -126,17 +126,6 @@ static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); } - -// Find a set bit in a bitmap and atomically unset it. Returns true on success, -// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. -// The low `MI_BFIELD_BITS` of start are used to set the start point of the search -// (to reduce thread contention). -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); - -// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ); - // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); diff --git a/src/os.c b/src/os.c index bac59437..c7f464c0 100644 --- a/src/os.c +++ b/src/os.c @@ -92,21 +92,6 @@ void _mi_os_init(void) { bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); -static inline uintptr_t _mi_align_down(uintptr_t sz, size_t alignment) { - mi_assert_internal(alignment != 0); - uintptr_t mask = alignment - 1; - if ((alignment & mask) == 0) { // power of two? 
- return (sz & ~mask); - } - else { - return ((sz / alignment) * alignment); - } -} - -static void* mi_align_down_ptr(void* p, size_t alignment) { - return (void*)_mi_align_down((uintptr_t)p, alignment); -} - void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { MI_UNUSED(try_alignment); MI_UNUSED(size); return NULL; diff --git a/src/page-map.c b/src/page-map.c index 15578301..0e99a890 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -24,7 +24,7 @@ static bool mi_page_map_init(void) { mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); - mi_page_map_all_committed = _mi_os_has_overcommit(); // commit on-access on Linux systems + mi_page_map_all_committed = false; // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); From e5fdd6e110471b6665ee388366c7aa493c2a7557 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 22:43:14 -0800 Subject: [PATCH 028/264] wip: initial large bitmaps --- src/arena.c | 162 ++++++----- src/bitmap.c | 666 +++++++++++++++++++++++---------------------- src/bitmap.h | 108 +++++--- src/page-map.c | 3 +- test/test-stress.c | 2 +- 5 files changed, 501 insertions(+), 440 deletions(-) diff --git a/src/arena.c b/src/arena.c index 1b891377..f8b6fca1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -37,18 +37,20 @@ typedef struct mi_arena_s { mi_arena_id_t id; // arena id; 0 for non-specific size_t slice_count; // size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) + size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node bool exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. - mi_bitmap_t slices_free; // is the slice free? - mi_bitmap_t slices_committed; // is the slice committed? (i.e. accessible) - mi_bitmap_t slices_purge; // can the slice be purged? (slice in purge => slice in free) - mi_bitmap_t slices_dirty; // is the slice potentially non-zero? + mi_bitmap_t* slices_free; // is the slice free? + mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) + mi_bitmap_t* slices_purge; // can the slice be purged? (slice in purge => slice in free) + mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? 
mi_pairmap_t pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages + // followed by the bitmaps (whose size depends on the arena size) } mi_arena_t; #define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) @@ -58,6 +60,7 @@ static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 + /* ----------------------------------------------------------- Arena id's id = arena_index + 1 @@ -103,6 +106,11 @@ mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { return mi_arena_from_index(mi_arena_id_index(id)); } +static size_t mi_arena_info_slices(mi_arena_t* arena) { + return arena->info_slices; +} + + /* ----------------------------------------------------------- Util @@ -114,14 +122,6 @@ static size_t mi_arena_size(mi_arena_t* arena) { return mi_size_of_slices(arena->slice_count); } -static size_t mi_arena_info_slices(void) { - const size_t os_page_size = _mi_os_page_size(); - const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page - const size_t info_slices = mi_slice_count_of_size(info_size); - return info_slices; -} - - // Start of the arena memory area static uint8_t* mi_arena_start(mi_arena_t* arena) { return ((uint8_t*)arena); @@ -187,7 +187,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) { size_t slice_index; - if (!mi_bitmap_try_find_and_clearN(&arena->slices_free, slice_count, tseq, &slice_index)) return NULL; + if (!mi_bitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; // claimed it! 
void* p = mi_arena_slice_start(arena, slice_index); @@ -197,7 +197,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // set the dirty bits if (arena->memid.initially_zero) { // size_t dirty_count = 0; - memid->initially_zero = mi_bitmap_setN(&arena->slices_dirty, slice_index, slice_count, NULL); + memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL); //if (dirty_count>0) { // if (memid->initially_zero) { // _mi_error_message(EFAULT, "ouch1\n"); @@ -217,7 +217,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = true; // commit requested, but the range may not be committed as a whole: ensure it is committed now - if (!mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)) { + if (!mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)) { // not fully committed: commit the full range and set the commit bits // (this may race and we may double-commit which is fine) bool commit_zero = false; @@ -235,7 +235,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } #endif size_t already_committed_count = 0; - mi_bitmap_setN(&arena->slices_committed, slice_index, slice_count, &already_committed_count); + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); if (already_committed_count < slice_count) { // todo: also decrease total mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); @@ -245,13 +245,13 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } else { // no need to commit, but check if already fully committed - memid->initially_committed = mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count); + memid->initially_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - if (commit) { mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); } - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); - // mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + if (commit) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); } + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + // mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); return p; } @@ -285,8 +285,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = mi_size_of_slices(mi_arena_info_slices() + 1); - const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_SLICE_SIZE; + const size_t min_reserve = 8; // hope that fits minimal bitmaps? 
+ const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; } @@ -494,10 +494,10 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); @@ -670,9 +670,9 @@ void _mi_arena_page_free(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_pairmap_is_clear(&arena->pages_abandoned[bin], slice_index)); } #endif @@ -701,10 +701,10 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(!mi_page_is_singleton(page)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); @@ -757,9 +757,9 @@ void _mi_arena_page_unabandon(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(&arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(&arena->slices_purge, slice_index, slice_count)); + 
mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_pairmap_clear_while_not_busy(&arena->pages_abandoned[bin], slice_index); @@ -876,8 +876,8 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi return; } mi_assert_internal(slice_index < arena->slice_count); - mi_assert_internal(slice_index >= mi_arena_info_slices()); - if (slice_index < mi_arena_info_slices() || slice_index > arena->slice_count) { + mi_assert_internal(slice_index >= mi_arena_info_slices(arena)); + if (slice_index < mi_arena_info_slices(arena) || slice_index > arena->slice_count) { _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); return; } @@ -907,7 +907,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } // and make it available to others again - bool all_inuse = mi_bitmap_setN(&arena->slices_free, slice_index, slice_count, NULL); + bool all_inuse = mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; @@ -989,6 +989,29 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* return true; } +static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_base) { + if (slice_count == 0) slice_count = MI_BITMAP_CHUNK_BITS; + mi_assert_internal((slice_count % MI_BITMAP_CHUNK_BITS) == 0); + const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BITMAP_CHUNK_SIZE); + const size_t bitmaps_size = 4 * mi_bitmap_size(slice_count,NULL); + const size_t pairmaps_size = MI_BIN_COUNT * 2 * mi_bitmap_size(slice_count,NULL); + const size_t size = base_size + bitmaps_size + pairmaps_size; + + const size_t os_page_size = _mi_os_page_size(); + const size_t info_size = _mi_align_up(size, os_page_size) + os_page_size; // + guard page + const size_t info_slices = mi_slice_count_of_size(info_size); + + if (bitmap_base != NULL) *bitmap_base = base_size; + return info_slices; +} + +static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { + mi_bitmap_t* bitmap = (mi_bitmap_t*)(*base); + *base = (*base) + mi_bitmap_init(bitmap, slice_count, true /* already zero */); + return bitmap; +} + + static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); @@ -1003,23 +1026,25 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } - const size_t info_slices = mi_arena_info_slices(); - const size_t bcount = size / MI_ARENA_SLICE_SIZE; // divide down - if (bcount < info_slices+1) { + const size_t slice_count = _mi_align_down(size / MI_ARENA_SLICE_SIZE, MI_BITMAP_CHUNK_BITS); + if (slice_count > MI_BITMAP_MAX_BIT_COUNT) { // 16 GiB for now + // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) + _mi_warning_message("cannot use OS memory since it is too large (size 
%zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB); + return false; + } + size_t bitmap_base; + const size_t info_slices = mi_arena_info_slices_needed(slice_count, &bitmap_base); + if (slice_count < info_slices+1) { _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_slices(info_slices+1)/MI_KiB); return false; } - if (bcount > MI_BITMAP_MAX_BITS) { - // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) - _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BITS)/MI_MiB); - return false; - } + mi_arena_t* arena = (mi_arena_t*)start; // commit & zero if needed bool is_zero = memid.initially_zero; if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_slices(info_slices), &is_zero, &_mi_stats_main); + _mi_os_commit(arena, mi_size_of_slices(info_slices), NULL, &_mi_stats_main); } if (!is_zero) { _mi_memzero(arena, mi_size_of_slices(info_slices)); @@ -1029,34 +1054,37 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->id = _mi_arena_id_none(); arena->memid = memid; arena->exclusive = exclusive; - arena->slice_count = bcount; + arena->slice_count = slice_count; + arena->info_slices = info_slices; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps - mi_bitmap_init(&arena->slices_free,true); - mi_bitmap_init(&arena->slices_committed,true); - mi_bitmap_init(&arena->slices_dirty,true); - mi_bitmap_init(&arena->slices_purge,true); + uint8_t* base = mi_arena_start(arena) + bitmap_base; + arena->slices_free = mi_arena_bitmap_init(slice_count,&base); + arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); + arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); + arena->slices_purge = mi_arena_bitmap_init(slice_count,&base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { - mi_pairmap_init(&arena->pages_abandoned[i],true); + mi_pairmap_init(&arena->pages_abandoned[i], mi_arena_bitmap_init(slice_count, &base), mi_arena_bitmap_init(slice_count, &base)); } + mi_assert_internal(mi_size_of_slices(info_slices) >= (size_t)(base - mi_arena_start(arena))); // reserve our meta info (and reserve slices outside the memory area) - mi_bitmap_unsafe_setN(&arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); + mi_bitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { - mi_bitmap_unsafe_setN(&arena->slices_committed, 0, arena->slice_count); + mi_bitmap_unsafe_setN(arena->slices_committed, 0, arena->slice_count); } else { - mi_bitmap_setN(&arena->slices_committed, 0, info_slices, NULL); + mi_bitmap_setN(arena->slices_committed, 0, info_slices, NULL); } if (!memid.initially_zero) { - mi_bitmap_unsafe_setN(&arena->slices_dirty, 0, arena->slice_count); + mi_bitmap_unsafe_setN(arena->slices_dirty, 0, arena->slice_count); } else { - mi_bitmap_setN(&arena->slices_dirty, 0, info_slices, NULL); + mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } return mi_arena_add(arena, arena_id, &_mi_stats_main); @@ -1117,7 +1145,7 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ 
_mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; - for (int i = 0; i < MI_BITMAP_CHUNK_COUNT && bit_count < slice_count; i++) { + for (int i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { char buf[MI_BITMAP_CHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { @@ -1161,12 +1189,12 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) slice_total += arena->slice_count; _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, &arena->slices_free, true); + free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, arena->slices_free, true); } - mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, &arena->slices_committed, false); + mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, arena->slices_committed, false); // todo: abandoned slices if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, &arena->slices_purge, false); + purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, arena->slices_purge, false); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); @@ -1262,7 +1290,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, const size_t size = mi_size_of_slices(slices); void* const p = mi_arena_slice_start(arena, slice_index); bool needs_recommit; - if (mi_bitmap_is_setN(&arena->slices_committed, slice_index, slices)) { + if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slices)) { // all slices are committed, we can purge freely needs_recommit = _mi_os_purge(p, size, stats); } @@ -1277,11 +1305,11 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, } // clear the purged slices - mi_bitmap_clearN(&arena->slices_purge, slices, slice_index); + mi_bitmap_clearN(arena->slices_purge, slices, slice_index); // update committed bitmap if (needs_recommit) { - mi_bitmap_clearN(&arena->slices_committed, slices, slice_index); + mi_bitmap_clearN(arena->slices_committed, slices, slice_index); } } diff --git a/src/bitmap.c b/src/bitmap.c index a6c9e879..4156cfd1 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -64,7 +64,7 @@ static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { // Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). // return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -static inline bool mi_bfield_find_least_to_xset(mi_bit_t set, mi_bfield_t x, size_t* idx) { +static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) { return mi_bfield_find_least_bit((set ? ~x : x), idx); } @@ -85,7 +85,7 @@ static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx) { } // Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). 
-static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { +static inline bool mi_bfield_atomic_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { if (set) { return mi_bfield_atomic_set(b, idx); } @@ -115,7 +115,7 @@ static inline bool mi_bfield_atomic_clear2(_Atomic(mi_bfield_t)*b, size_t idx, b } // Set/clear a pair of bits atomically, and return true of the mask bits transitioned from all 0's to 1's (or all 1's to 0's) -static inline bool mi_bfield_atomic_xset2(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx, bool* already_xset) { +static inline bool mi_bfield_atomic_xset2(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx, bool* already_xset) { if (set) { return mi_bfield_atomic_set2(b, idx, already_xset); } @@ -143,7 +143,7 @@ static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield } // Set/clear a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's (or all 1's to 0's) -static inline bool mi_bfield_atomic_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_xset) { +static inline bool mi_bfield_atomic_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_xset) { mi_assert_internal(mask != 0); if (set) { return mi_bfield_atomic_set_mask(b, mask, already_xset); @@ -169,7 +169,7 @@ static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx } // Tries to set/clear a bit atomically, and returns true if the bit atomically transitioned from 0 to 1 (or 1 to 0) -static inline bool mi_bfield_atomic_try_xset( mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { +static inline bool mi_bfield_atomic_try_xset( mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); // for a single bit, we can always just set/clear and test afterwards if it was actually us that changed it first return mi_bfield_atomic_xset(set, b, idx); @@ -201,7 +201,7 @@ static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bf // Tries to (un)set a mask atomically, and returns true if the mask bits atomically transitioned from 0 to mask (or mask to 0) // and false otherwise (leaving the bit field as is). -static inline bool mi_bfield_atomic_try_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask ) { +static inline bool mi_bfield_atomic_try_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask ) { mi_assert_internal(mask != 0); if (set) { return mi_bfield_atomic_try_set_mask(b, mask); @@ -228,7 +228,7 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t by // Tries to set/clear a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise (leaving the bit field as is). -static inline bool mi_bfield_atomic_try_xset8(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t byte_idx) { +static inline bool mi_bfield_atomic_try_xset8(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t byte_idx) { mi_assert_internal(byte_idx < MI_BFIELD_SIZE); const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); return mi_bfield_atomic_try_xset_mask(set, b, mask); @@ -264,7 +264,7 @@ static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfi // Check if all bits corresponding to a mask are set/cleared. 
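The renamed mask helpers above follow a common pattern that a small standalone sketch makes explicit. This is not the patch's code: the word type, the memory orders, and the "count of bits already in the target state" out-parameter semantics are assumptions inferred from how the callers use these helpers.

#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>
#include <stddef.h>

typedef uint64_t bfield_t;   // assumed: one machine word per bit field

// Returns true iff every bit in `mask` transitioned 0 -> 1; optionally reports how
// many masked bits were already set before the operation.
static bool bfield_atomic_set_mask(_Atomic(bfield_t)* b, bfield_t mask, size_t* already_set) {
  bfield_t old = atomic_fetch_or_explicit(b, mask, memory_order_acq_rel);
  if (already_set != NULL) { *already_set = (size_t)__builtin_popcountll(old & mask); }  // GCC/Clang builtin
  return ((old & mask) == 0);
}

// Symmetric clear: true iff every bit in `mask` transitioned 1 -> 0.
static bool bfield_atomic_clear_mask(_Atomic(bfield_t)* b, bfield_t mask, size_t* already_clear) {
  bfield_t old = atomic_fetch_and_explicit(b, ~mask, memory_order_acq_rel);
  if (already_clear != NULL) { *already_clear = (size_t)__builtin_popcountll(~old & mask); }
  return ((old & mask) == mask);
}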
-static inline bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask) { mi_assert_internal(mask != 0); if (set) { return mi_bfield_atomic_is_set_mask(b, mask); @@ -276,7 +276,7 @@ static inline bool mi_bfield_atomic_is_xset_mask(mi_bit_t set, _Atomic(mi_bfield // Check if a bit is set/clear -// static inline bool mi_bfield_atomic_is_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { +// static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { // mi_assert_internal(idx < MI_BFIELD_BITS); // const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx, all_already_xset); } @@ -309,7 +309,7 @@ static inline bool mi_bitmap_chunk_clear2(mi_bitmap_chunk_t* chunk, size_t cidx, // Set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). -static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* pall_already_xset) { +static bool mi_bitmap_chunk_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* pall_already_xset) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; @@ -349,7 +349,7 @@ static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, // check if a pair of bits is set/clear -static inline bool mi_bitmap_chunk_is_xset2(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_is_xset2(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; @@ -369,7 +369,7 @@ static inline bool mi_bitmap_chunk_is_clear2(mi_bitmap_chunk_t* chunk, size_t ci // Check if a sequence of `n` bits within a chunk are all set/cleared. -static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { +static bool mi_bitmap_chunk_is_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); size_t idx = cidx % MI_BFIELD_BITS; @@ -393,7 +393,7 @@ static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, siz -static inline bool mi_bitmap_chunk_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_try_xset(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; @@ -408,7 +408,7 @@ static inline bool mi_bitmap_chunk_try_clear(mi_bitmap_chunk_t* chunk, size_t ci return mi_bitmap_chunk_try_xset(MI_BIT_CLEAR, chunk, cidx); } -static inline bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx) { +static inline bool mi_bitmap_chunk_try_xset8(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx) { mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); const size_t i = byte_idx / MI_BFIELD_SIZE; const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; @@ -426,7 +426,7 @@ static inline bool mi_bitmap_chunk_try_clear8(mi_bitmap_chunk_t* chunk, size_t b // Try to atomically set/clear a sequence of `n` bits within a chunk. 
// Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. -static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { +static bool mi_bitmap_chunk_try_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; @@ -442,7 +442,7 @@ static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, si if (m > n) { m = n; } mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); - const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); + const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start)) return false; // done? @@ -509,7 +509,7 @@ static inline bool mi_mm256_is_zero( __m256i vec) { // find least 0/1-bit in a chunk and try to set/clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. // todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { +static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -644,7 +644,7 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { - if (n == 0 || n > MI_BFIELD_BITS) return false; + if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { mi_bfield_t b = chunk->bfields[i]; @@ -683,14 +683,14 @@ static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_ // find a sequence of `n` bits in a chunk with `n < MI_BITMAP_CHUNK_BITS` with all bits set, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. -static bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { +static bool mi_bitmap_chunk_find_and_try_clearN_(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); + // if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); // we align an a field, and require `field_count` fields to be all clear. 
// n >= MI_BFIELD_BITS; find a first field that is 0 const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields - for (size_t i = 0; i <= MI_BITMAP_CHUNK_FIELDS - field_count; i++) + for (size_t i = 0; i <= MI_BITMAP_CHUNK_FIELDS - field_count; i++) { // first pre-scan for a range of fields that are all set bool allset = true; @@ -721,6 +721,14 @@ static bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t } +static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n==1) return mi_bitmap_chunk_find_and_try_clear(chunk, pidx); + if (n==8) return mi_bitmap_chunk_find_and_try_clear8(chunk, pidx); + if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); + return mi_bitmap_chunk_find_and_try_clearN_(chunk, n, pidx); +} + // are all bits in a bitmap chunk set? // static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { // #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) @@ -755,70 +763,76 @@ static inline bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { } /* -------------------------------------------------------------------------------- - epochset (for now for 32-bit sets only) + chunkmap (for now for 32-bit sets only) -------------------------------------------------------------------------------- */ -static void mi_epochset_split(mi_epochset_t es, uint32_t* bset, size_t* epoch) { - *bset = (uint32_t)es; - *epoch = (size_t)(es >> 32); +static void mi_chunkmap_split(mi_chunkmap_t es, mi_cmap_t* cmap, mi_epoch_t* epoch) { + *cmap = (mi_cmap_t)es; + *epoch = (mi_epoch_t)(es >> 32); } -static mi_epochset_t mi_epochset_join(uint32_t bset, size_t epoch) { - return ((uint64_t)epoch << 32) | bset; +static mi_chunkmap_t mi_chunkmap_join(mi_cmap_t cmap, mi_epoch_t epoch) { + return ((mi_chunkmap_t)epoch << MI_CHUNKMAP_BITS) | cmap; } // setting a bit increases the epoch -static void mi_epochset_set(_Atomic(mi_epochset_t)*es, size_t idx) { - mi_assert(idx < 32); - size_t epoch; - uint32_t bset; - mi_epochset_t es_new; - mi_epochset_t es_old = mi_atomic_load_relaxed(es); +static void mi_chunkmap_set(_Atomic(mi_chunkmap_t)* cm, size_t idx) { + mi_assert(idx < MI_CHUNKMAP_BITS); + mi_epoch_t epoch; + mi_cmap_t cmap; + mi_chunkmap_t cm_new; + mi_chunkmap_t cm_old = mi_atomic_load_relaxed(cm); do { - mi_epochset_split(es_old, &bset, &epoch); - es_new = mi_epochset_join(bset | (MI_ZU(1)<any_set, chunk_idx); +static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); + const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; + const size_t idx = chunk_idx % MI_CHUNKMAP_BITS; + mi_chunkmap_set(&bitmap->chunk_maps[cmidx], idx); } -static bool mi_bitmap_anyset_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx, size_t epoch) { - mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); - return mi_epochset_try_clear(&bitmap->any_set, chunk_idx, epoch); +static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx, mi_epoch_t epoch) { + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); + const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; + const size_t idx = chunk_idx % MI_CHUNKMAP_BITS; + return mi_chunkmap_try_clear(&bitmap->chunk_maps[cmidx], idx, epoch); } -static uint32_t mi_bitmap_anyset(mi_bitmap_t* bitmap, size_t* epoch) { - uint32_t bset; - 
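A minimal sketch of the chunkmap word introduced here, assuming (consistently with the typedefs added to bitmap.h later in this patch) that the low 32 bits hold the per-chunk "may contain set bits" map and the high 32 bits hold an epoch that every set advances. The retry policy of mi_chunkmap_try_clear is not visible in this hunk, so the single compare-and-swap below is only illustrative.

#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>

typedef uint64_t chunkmap_t;
typedef uint32_t cmap_t;
typedef uint32_t epoch_t;

static void cm_split(chunkmap_t cm, cmap_t* map, epoch_t* epoch) {
  *map   = (cmap_t)cm;
  *epoch = (epoch_t)(cm >> 32);
}
static chunkmap_t cm_join(cmap_t map, epoch_t epoch) {
  return ((chunkmap_t)epoch << 32) | map;
}

// Setting a bit advances the epoch so that a concurrent try-clear can detect it.
static void cm_set(_Atomic(chunkmap_t)* cm, unsigned idx) {
  chunkmap_t old = atomic_load_explicit(cm, memory_order_relaxed);
  chunkmap_t desired;
  cmap_t map; epoch_t epoch;
  do {
    cm_split(old, &map, &epoch);
    desired = cm_join(map | ((cmap_t)1 << idx), epoch + 1);
  } while (!atomic_compare_exchange_weak(cm, &old, desired));
}

// Clearing only succeeds if the epoch still equals the one observed by the caller.
static bool cm_try_clear(_Atomic(chunkmap_t)* cm, unsigned idx, epoch_t expected_epoch) {
  chunkmap_t old = atomic_load_explicit(cm, memory_order_relaxed);
  cmap_t map; epoch_t epoch;
  cm_split(old, &map, &epoch);
  if (epoch != expected_epoch) return false;
  return atomic_compare_exchange_strong(cm, &old, cm_join(map & ~((cmap_t)1 << idx), epoch));
}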
mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, epoch); - return bset; +static mi_cmap_t mi_bitmap_chunkmap(mi_bitmap_t* bitmap, size_t chunk_idx, mi_epoch_t* epoch) { + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); + const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; + mi_assert_internal(cmidx < bitmap->chunk_map_count); + mi_cmap_t cmap; + mi_chunkmap_split(mi_atomic_load_relaxed(&bitmap->chunk_maps[cmidx]), &cmap, epoch); + return cmap; } -static size_t mi_bitmap_epoch(mi_bitmap_t* bitmap) { - size_t epoch; - uint32_t bset; - mi_epochset_split(mi_atomic_load_relaxed(&bitmap->any_set), &bset, &epoch); +static mi_epoch_t mi_bitmap_chunkmap_epoch(mi_bitmap_t* bitmap, size_t chunk_idx) { + mi_epoch_t epoch; + mi_bitmap_chunkmap(bitmap, chunk_idx, &epoch); return epoch; } @@ -826,17 +840,38 @@ static size_t mi_bitmap_epoch(mi_bitmap_t* bitmap) { bitmap -------------------------------------------------------------------------------- */ +size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { + mi_assert_internal((bit_count % MI_BITMAP_CHUNK_BITS) == 0); + bit_count = _mi_align_up(bit_count, MI_BITMAP_CHUNK_BITS); + mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); + mi_assert_internal(bit_count > 0); + const size_t chunk_count = bit_count / MI_BITMAP_CHUNK_BITS; + mi_assert_internal(chunk_count >= 1); + const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BITMAP_CHUNK_SIZE); + mi_assert_internal( (size%MI_BITMAP_CHUNK_SIZE) == 0 ); + if (pchunk_count != NULL) { *pchunk_count = chunk_count; } + return size; +} + // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { +// returns the size of the bitmap +size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) { + size_t chunk_count; + const size_t size = mi_bitmap_size(bit_count, &chunk_count); if (!already_zero) { - _mi_memzero_aligned(bitmap, sizeof(*bitmap)); + _mi_memzero_aligned(bitmap, size); } + bitmap->chunk_map_count = _mi_divide_up(chunk_count, MI_CHUNKMAP_BITS); + mi_assert_internal(bitmap->chunk_map_count <= MI_BITMAP_MAX_CHUNKMAPS); + bitmap->chunk_count = chunk_count; + mi_assert_internal(bitmap->chunk_map_count <= MI_BITMAP_MAX_CHUNK_COUNT); + return size; } // Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
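To make the new variable-size layout concrete, here is a simplified stand-in: the 512-bit chunk, the 16 chunkmap words and the 64-byte alignment mirror the constants in bitmap.h below, while the flexible array member replaces the patch's fixed-size chunks declaration for illustration only.

#include <stdio.h>
#include <stddef.h>
#include <stdint.h>
#include <stdalign.h>

typedef struct { alignas(64) uint64_t bfields[8]; } chunk_t;   // 512-bit chunk, 64-byte aligned (assumed)
typedef struct {
  size_t   chunk_map_count;
  size_t   chunk_count;
  uint64_t chunk_maps[16];      // up to 16 chunkmap words, as in bitmap.h below
  chunk_t  chunks[];            // simplified: chunk_count chunks follow the header
} bitmap_t;

int main(void) {
  const size_t slice_count = 16384;                 // a 1 GiB arena at 64 KiB slices (assumed)
  const size_t chunk_count = slice_count / 512;     // 32 chunks
  const size_t size = offsetof(bitmap_t, chunks) + chunk_count * sizeof(chunk_t);
  printf("bitmap for %zu bits: %zu chunks, %zu bytes\n", slice_count, chunk_count, size);
  return 0;
}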
void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); // first chunk size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; @@ -844,17 +879,17 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { size_t m = MI_BITMAP_CHUNK_BITS - cidx; if (m > n) { m = n; } mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); - mi_bitmap_anyset_set(bitmap, chunk_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * (MI_BITMAP_CHUNK_BITS/8)); + _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * MI_BITMAP_CHUNK_SIZE); const size_t end_chunk = chunk_idx + mid_chunks; while (chunk_idx < end_chunk) { - mi_bitmap_anyset_set(bitmap, chunk_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); chunk_idx++; } n -= (mid_chunks * MI_BITMAP_CHUNK_BITS); @@ -865,28 +900,29 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); - mi_bitmap_anyset_set(bitmap, chunk_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); } } // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving the bitmask as is. -static bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < MI_BITMAP_MAX_BITS); +static bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (set) { - // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_anyset_set(bitmap, chunk_idx); + // first set the chunkmap since it is a conservative approximation (increases epoch) + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // then actually try to set it atomically return mi_bitmap_chunk_try_set(&bitmap->chunks[chunk_idx], cidx); } else { - const size_t epoch = mi_bitmap_epoch(bitmap); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); bool cleared = mi_bitmap_chunk_try_clear(&bitmap->chunks[chunk_idx], cidx); - if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); } return cleared; } @@ -894,22 +930,24 @@ static bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. 
-static bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < MI_BITMAP_MAX_BITS); +static bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); mi_assert_internal(idx%8 == 0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (set) { // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_anyset_set(bitmap, chunk_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // then actually try to set it atomically return mi_bitmap_chunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); } else { - const size_t epoch = mi_bitmap_epoch(bitmap); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); bool cleared = mi_bitmap_chunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx); - if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); } return cleared; } @@ -919,71 +957,63 @@ static bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -static bool mi_bitmap_try_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +static bool mi_bitmap_try_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); - if (n==0 || idx + n > MI_BITMAP_MAX_BITS) return false; + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); + if (n==0 || idx + n > mi_bitmap_max_bits(bitmap)) return false; const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < MI_BFIELD_BITS); + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia if (set) { - // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_anyset_set(bitmap, chunk_idx); + // first set the chunkmap since it is a conservative approximation (increases epoch) + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // then actually try to set it atomically return mi_bitmap_chunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); } else { - const size_t epoch = mi_bitmap_epoch(bitmap); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); bool cleared = mi_bitmap_chunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n); - if (cleared && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + 
mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); } return cleared; } } -bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); + // todo: add 32/64 for large pages return mi_bitmap_try_xsetN_(set, bitmap, idx, n); } - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// Set/clear a sequence of 2 bits that were on an even `idx` in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -static bool mi_bitmap_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - - //TODO: specialize? - //if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } - //if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } - +static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal((idx%2)==0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < MI_BFIELD_BITS); - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - + mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (set) { - // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_anyset_set(bitmap, chunk_idx); + // first set the chunkmap since it is a conservative approximation (increases epoch) + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // then actually try to set it atomically - return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); + return mi_bitmap_chunk_set2(&bitmap->chunks[chunk_idx], cidx, NULL); } else { - const size_t epoch = mi_bitmap_epoch(bitmap); - size_t already_clear = 0; - const bool allset = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear); - if (already_xset != NULL) { *already_xset = already_clear; } - if (already_clear < n && epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); + bool already_clear = false; + const bool allset = mi_bitmap_chunk_clear2(&bitmap->chunks[chunk_idx], cidx, &already_clear); + if (!already_clear && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); } return allset; } @@ -991,25 +1021,67 @@ static bool mi_bitmap_xsetN_(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
-bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { +static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + + if (set) { + // first set the chunkmap since it is a conservative approximation (increases epoch) + mi_bitmap_chunkmap_set(bitmap, chunk_idx); + // then actually try to set it atomically + return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); + } + else { + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); + size_t already_clear = 0; + const bool allset = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear); + if (already_xset != NULL) { *already_xset = already_clear; } + if (already_clear < n && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + return allset; + } +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); //TODO: specialize? //if (n==1) return mi_bitmap_xset(set, bitmap, idx); + //if (n==2) return mi_bitmap_xset(set, bitmap, idx); //if (n==8) return mi_bitmap_xset8(set, bitmap, idx); return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); } +// Is a sequence of 2 bits already all set/cleared? +static inline bool mi_bitmap_is_xset2(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx + 2 <= mi_bitmap_max_bits(bitmap)); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + return mi_bitmap_chunk_is_xset2(set, &bitmap->chunks[chunk_idx], cidx); +} + + // Is a sequence of n bits already all set/cleared? 
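The same ordering discipline recurs in every set/clear path in this file. The sketch below is written as if it lived inside this bitmap.c, since it reuses the patch's static helpers, and spells out the invariant: on set, the chunkmap bit is raised before the chunk bits so the chunkmap stays a conservative over-approximation; on clear, the chunkmap bit is only dropped when the epoch is unchanged and the whole chunk is observed clear.

static bool bitmap_set_n_sketch(mi_bitmap_t* bitmap, size_t idx, size_t n) {
  const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
  const size_t cidx      = idx % MI_BITMAP_CHUNK_BITS;
  mi_bitmap_chunkmap_set(bitmap, chunk_idx);                              // 1. publish "chunk may have set bits"
  return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, NULL); // 2. then set the bits themselves
}

static bool bitmap_clear_n_sketch(mi_bitmap_t* bitmap, size_t idx, size_t n) {
  const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS;
  const size_t cidx      = idx % MI_BITMAP_CHUNK_BITS;
  const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx);   // snapshot before clearing
  size_t already_clear   = 0;
  const bool all_were_set = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear);
  if (already_clear < n &&                                                // we actually cleared something
      epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) &&             // no concurrent set advanced the epoch
      mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) {
    mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch);               // best effort; may fail harmlessly
  }
  return all_were_set;
}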
-bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); - mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < MI_BFIELD_BITS); + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); @@ -1020,185 +1092,121 @@ bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) bitmap try_find_and_clear -------------------------------------------------------------------------------- */ +typedef bool (mi_bitmap_find_fun_t)(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx); -#define mi_bitmap_forall_set_chunks(bitmap,tseq,name_epoch,name_chunk_idx) \ - { uint32_t _bit_idx; \ - uint32_t _start = (uint32_t)(tseq % MI_EPOCHSET_BITS); \ - size_t name_epoch; \ - uint32_t _any_set = mi_bitmap_anyset(bitmap,&name_epoch); \ - _any_set = mi_rotr32(_any_set, _start); \ - while (mi_bsf32(_any_set,&_bit_idx)) { \ - size_t name_chunk_idx = (_bit_idx + _start) % MI_BFIELD_BITS; +static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx, mi_bitmap_find_fun_t* find_fun) +{ + if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; + + // start chunk index -- todo: can depend on the tseq to decrease contention between threads + MI_UNUSED(tseq); + const size_t chunk_start = 0; + const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; + const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; -#define mi_bitmap_forall_set_chunks_end() \ - _start += _bit_idx+1; /* so chunk_idx calculation stays valid */ \ - _any_set >>= _bit_idx; /* skip scanned bits (and avoid UB with (_bit_idx+1)) */ \ - _any_set >>= 1; \ - } \ - } - -// Find a set bit in a bitmap and atomically unset it. Returns true on success, -// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. -// The low `MI_BFIELD_BITS` of start are used to set the start point of the search -// (to reduce thread contention). -mi_decl_nodiscard static bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_set_chunks(bitmap, tseq, epoch, chunk_idx) + // for each chunkmap entry `i` + for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS); - return true; - } - else { - // we may find that all are unset only on a second iteration but that is ok as - // _any_set is a conservative approximation. 
- if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); + size_t i = (_i + chunk_map_start); + if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; // adjust for the start position + + const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; + mi_epoch_t epoch; + mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &epoch); + if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } // rotate right for the start position (on the first iteration) + + uint32_t cmap_idx; // one bit set of each chunk that may have bits set + size_t cmap_idx_shift = 0; // shift through the cmap + while (mi_bsf32(cmap, &cmap_idx)) { // find least bit that is set + // adjust for the start position + if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } + // set the chunk idx + const size_t chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; + + // try to find and clear N bits in that chunk + if (chunk_idx < mi_bitmap_chunk_count(bitmap)) { // we can have less chunks than in the chunkmap.. + if ((*find_fun)(bitmap, n, chunk_idx, epoch, pidx)) { + return true; + } } + + // skip to the next bit + cmap_idx_shift += cmap_idx+1; + cmap >>= cmap_idx; // skip scanned bits (and avoid UB for `cmap_idx+1`) + cmap >>= 1; } } - mi_bitmap_forall_set_chunks_end(); + return false; } - -// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. -mi_decl_nodiscard static bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx ) { - mi_bitmap_forall_set_chunks(bitmap,tseq, epoch, chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8); - mi_assert_internal((*pidx % 8) == 0); - return true; - } - else { - // we may find that all are unset only on a second iteration but that is ok as - // _any_set is a conservative approximation. - if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); - } - } +static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + return true; + } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + return false; } - mi_bitmap_forall_set_chunks_end(); - return false; } // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -mi_decl_nodiscard static bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ) { - // TODO: allow spanning across chunk boundaries? 
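A hypothetical caller-side sketch of the new search entry point; ensure_committed and the surrounding control flow are illustrative only, not from this patch (the arena allocation path in xarena.c is the actual consumer).

// claim `slice_count` contiguous free slices from an arena, rolling back on failure
size_t slice_index;
if (mi_bitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) {
  // bits [slice_index, slice_index+slice_count) are now exclusively owned by this thread
  if (!ensure_committed(arena, slice_index, slice_count)) {              // hypothetical helper
    mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL);  // hand the slices back
  }
}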
- if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; - mi_bitmap_forall_set_chunks(bitmap,tseq,epoch,chunk_idx) - { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n); - return true; - } - else { - // we may find that all are unset only on a second iteration but that is ok as - // _any_set is a conservative approximation. - if (epoch == mi_bitmap_epoch(bitmap) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_anyset_try_clear(bitmap, chunk_idx, epoch); - } - } - } - mi_bitmap_forall_set_chunks_end(); - return false; -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - if (n == 1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); - if (n == 8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); - return mi_bitmap_try_find_and_clearN_(bitmap, n, tseq, pidx); -} - - -/* -------------------------------------------------------------------------------- - pairmap epochset --------------------------------------------------------------------------------- */ - -static void mi_pairmap_anyset_set(mi_pairmap_t* pairmap, size_t chunk_idx) { - mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); - mi_epochset_set(&pairmap->any_set, chunk_idx); -} - -static bool mi_pairmap_anyset_try_clear(mi_pairmap_t* pairmap, size_t chunk_idx, size_t epoch) { - mi_assert(chunk_idx < MI_BITMAP_CHUNK_COUNT); - return mi_epochset_try_clear(&pairmap->any_set, chunk_idx, epoch); -} - -static uint32_t mi_pairmap_anyset(mi_pairmap_t* pairmap, size_t* epoch) { - uint32_t bset; - mi_epochset_split(mi_atomic_load_relaxed(&pairmap->any_set), &bset, epoch); - return bset; -} - -static size_t mi_pairmap_epoch(mi_pairmap_t* pairmap) { - size_t epoch; - uint32_t bset; - mi_epochset_split(mi_atomic_load_relaxed(&pairmap->any_set), &bset, &epoch); - return epoch; +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) +{ + return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); } /* -------------------------------------------------------------------------------- pairmap -------------------------------------------------------------------------------- */ -// initialize a pairmap to all clear; avoid a mem_zero if `already_zero` is true -void mi_pairmap_init(mi_pairmap_t* pairmap, bool already_zero) { - if (!already_zero) { - _mi_memzero_aligned(pairmap, sizeof(*pairmap)); +void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2) { + mi_assert_internal(mi_bitmap_chunk_count(bm1)==mi_bitmap_chunk_count(bm2)); + pairmap->bitmap1 = bm1; + pairmap->bitmap2 = bm2; +} + +static void mi_pairmap_from_pair_idx(mi_pairmap_t* pairmap, size_t pair_idx, mi_bitmap_t** bitmap, size_t* pidx) { + const size_t idx = 2*pair_idx; + const size_t maxbits = mi_bitmap_max_bits(pairmap->bitmap1); + mi_assert_internal(pair_idx < maxbits); + if (idx < maxbits) { + *bitmap = pairmap->bitmap1; + *pidx = idx; + } + else { + *bitmap = pairmap->bitmap2; + *pidx = idx - maxbits; } } -/* -------------------------------------------------------------------------------- - pairmap set/clear unconditionally --------------------------------------------------------------------------------- */ - -// is a pairmap entry clear? 
-bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { - const size_t idx = 2*pair_idx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - return mi_bitmap_chunk_is_clear2(&pairmap->chunks[chunk_idx], cidx); -} - -// A reader can set from busy, or a new abandoned page can set from clear bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx) { - const size_t idx = 2*pair_idx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - // first set the anyset since it is a conservative approximation(increases epoch) - mi_pairmap_anyset_set(pairmap, chunk_idx/2); - return mi_bitmap_chunk_set2(&pairmap->chunks[chunk_idx], cidx, NULL); + mi_bitmap_t* bitmap; + size_t idx; + mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); + return mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); } -// A busy reader can clear unconditionally -void mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx) { - const size_t idx = 2*pair_idx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - const size_t epoch = mi_pairmap_epoch(pairmap); - bool both_already_clear = false; - mi_bitmap_chunk_clear2(&pairmap->chunks[chunk_idx], cidx, &both_already_clear); - mi_assert_internal(!both_already_clear); // in our use cases this should not happen - if (!both_already_clear && epoch == mi_pairmap_epoch(pairmap)) { - const size_t chunk_idx1 = 2*(chunk_idx/2); // round down to even - mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx1]; - mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx1 + 1]; - if (mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk2)) { - mi_pairmap_anyset_try_clear(pairmap, chunk_idx1/2, epoch); - } - } +bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx) { + mi_bitmap_t* bitmap; + size_t idx; + mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); + return mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); +} + +bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { + mi_bitmap_t* bitmap; + size_t idx; + mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); + return mi_bitmap_is_xset2(MI_BIT_CLEAR, bitmap, idx); } @@ -1207,8 +1215,8 @@ void mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx) { pairmap clear while not busy -------------------------------------------------------------------------------- */ -static inline bool mi_bfield_atomic_clear_while_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). +static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). 
mi_assert_internal(idx < MI_BFIELD_BITS-1); const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); @@ -1221,41 +1229,44 @@ static inline bool mi_bfield_atomic_clear_while_not_busy(_Atomic(mi_bfield_t)*b, while ((old&mask)==mask_busy) { // busy wait mi_atomic_yield(); old = mi_atomic_load_acquire(b); - } + } } bnew = (old & ~mask); // clear } while (!mi_atomic_cas_weak_acq_rel(b, &old, bnew)); mi_assert_internal((old&mask) != mask_busy); // we should never clear a busy page mi_assert_internal((old&mask) == mask); // in our case: we should only go from set to clear (when reclaiming an abandoned page from a free) - return true; + return ((old&mask) == mask); } -static void mi_pairmap_chunk_clear_while_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_clear2_while_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - mi_bfield_atomic_clear_while_not_busy(&chunk->bfields[i], idx); + return mi_bfield_atomic_clear2_while_not_busy(&chunk->bfields[i], idx); } -// Used for a page about to be freed to clear itself from the abandoned map; it has to wait -// for all readers to finish reading the page -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { - const size_t idx = 2*pair_idx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); +static bool mi_bitmap_clear2_while_not_busy(mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal((idx%2)==0); + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - const size_t epoch = mi_pairmap_epoch(pairmap); - mi_pairmap_chunk_clear_while_not_busy(&pairmap->chunks[chunk_idx], cidx); - if (epoch == mi_pairmap_epoch(pairmap)) { - const size_t chunk_idx1 = 2*(chunk_idx/2); // round down to even - mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx1]; - mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx1 + 1]; - if (mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk2)) { - mi_pairmap_anyset_try_clear(pairmap, chunk_idx1/2, epoch); - } - } + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); + bool cleared = mi_bitmap_chunk_clear2_while_not_busy(&bitmap->chunks[chunk_idx], cidx); + if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + return cleared; } +void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { + mi_bitmap_t* bitmap; + size_t idx; + mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); + mi_bitmap_clear2_while_not_busy(bitmap, idx); +} + + /* -------------------------------------------------------------------------------- pairmap try and set busy @@ -1263,7 +1274,7 @@ void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { // Atomically go from set to busy, or return false otherwise and leave the bit field as-is. static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). 
mi_assert_internal(idx < MI_BFIELD_BITS-1); const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); @@ -1277,11 +1288,11 @@ static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t return true; } -static inline bool mi_pairmap_chunk_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { +static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { size_t idx; if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i], &idx)) { // find least 1-bit, it may be set or busy - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); @@ -1292,41 +1303,36 @@ static inline bool mi_pairmap_chunk_find_and_set_busy(mi_bitmap_chunk_t* chunk, return false; } +static bool mi_bitmap_try_find_and_set_busy_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { + MI_UNUSED(epoch); + mi_assert_internal(n==2); + size_t cidx; + if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + return true; + } + else { + return false; + } +} + +static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { + return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_set_busy_at); +} + // Used to find an abandoned page, and transition from set to busy. 
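The pair-state protocol that these busy helpers implement can be summarized in a standalone sketch (states: 00 = clear, 01 = busy, 11 = set/abandoned). The word type and the spin policy are assumptions, and the real code additionally maintains the chunk and chunkmap bookkeeping shown earlier.

#include <stdint.h>
#include <stdbool.h>
#include <stdatomic.h>

typedef uint64_t bfield_t;
enum { PAIR_CLEAR = 0, PAIR_BUSY = 1, PAIR_SET = 3 };

// A reclaiming reader CASes the pair from "set" (11) to "busy" (01), or fails.
static bool pair_try_set_busy(_Atomic(bfield_t)* b, unsigned idx) {   // idx is the even bit of the pair
  const bfield_t mask      = (bfield_t)3 << idx;
  const bfield_t mask_busy = (bfield_t)PAIR_BUSY << idx;
  bfield_t old = atomic_load_explicit(b, memory_order_relaxed);
  do {
    if ((old & mask) != mask) return false;                  // no longer in the "set" state
  } while (!atomic_compare_exchange_weak(b, &old, (old & ~mask) | mask_busy));
  return true;
}

// The owner that frees the page must wait while a reader holds the pair busy,
// then clears it to 00.
static void pair_clear_while_not_busy(_Atomic(bfield_t)* b, unsigned idx) {
  const bfield_t mask      = (bfield_t)3 << idx;
  const bfield_t mask_busy = (bfield_t)PAIR_BUSY << idx;
  bfield_t old = atomic_load_explicit(b, memory_order_acquire);
  for (;;) {
    while ((old & mask) == mask_busy) {                      // spin (the real code yields here)
      old = atomic_load_explicit(b, memory_order_acquire);
    }
    if (atomic_compare_exchange_weak(b, &old, old & ~mask)) return;
  }
}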
mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) { - uint32_t bit_idx; - uint32_t start = (uint32_t)(tseq % MI_EPOCHSET_BITS); - size_t epoch; - uint32_t any_set = mi_pairmap_anyset(pairmap,&epoch); - any_set = mi_rotr32(any_set, start); - while (mi_bsf32(any_set,&bit_idx)) { \ - size_t chunk_idx = 2*((bit_idx + start) % MI_BFIELD_BITS); - { - // look at chunk_idx and chunck_idx+1 - mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx]; - mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx+1]; - size_t cidx; - if (mi_pairmap_chunk_find_and_set_busy(chunk1, &cidx)) { - const size_t idx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - mi_assert_internal((idx%2)==0); - *pidx = idx/2; - return true; - } - else if (mi_pairmap_chunk_find_and_set_busy(chunk2, &cidx)) { - const size_t idx = ((chunk_idx+1) * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(idx < MI_PAIRMAP_MAX_BITS); - mi_assert_internal((idx%2)==0); - *pidx = idx/2; - return true; - } - else if (epoch == mi_pairmap_epoch(pairmap) && mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk1)) { - mi_pairmap_anyset_try_clear(pairmap, chunk_idx/2, epoch); - } + size_t idx = 0; + if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, &idx)) { + if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, &idx)) { + return false; + } + else { + idx += mi_bitmap_max_bits(pairmap->bitmap1); } - start += bit_idx+1; /* so chunk_idx computation stays valid */ - any_set >>= bit_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ - any_set >>= 1; } - return false; + mi_assert_internal((idx%2)==0); + *pidx = idx/2; + return true; } diff --git a/src/bitmap.h b/src/bitmap.h index 948bd1e3..9b931c95 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -34,30 +34,56 @@ typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; } mi_bitmap_chunk_t; -// for now 32 (note: with ABA instructions we can make this 64) -#define MI_EPOCHSET_BITS (32) -#define MI_BITMAP_CHUNK_COUNT MI_EPOCHSET_BITS -typedef uint64_t mi_epochset_t; +// for now 32-bit epoch + 32-bit bit-set (note: with ABA instructions we can double this) +typedef uint64_t mi_chunkmap_t; +typedef uint32_t mi_epoch_t; +typedef uint32_t mi_cmap_t; + +#define MI_CHUNKMAP_BITS (32) // 1 chunkmap tracks 32 chunks + +#define MI_BITMAP_MAX_CHUNKMAPS (16) +#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BITMAP_MAX_CHUNKMAPS * MI_CHUNKMAP_BITS) +#define MI_BITMAP_MIN_CHUNK_COUNT (1 * MI_CHUNKMAP_BITS) // 1 GiB arena + +#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16 GiB arena +#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 1 GiB arena typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - mi_bitmap_chunk_t chunks[MI_BITMAP_CHUNK_COUNT]; - _Atomic(mi_epochset_t) any_set; + _Atomic(size_t) chunk_map_count; + _Atomic(size_t) chunk_count; + _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; + // padding + mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; -// 16k bits on 64bit, 8k bits on 32bit -// with 64KiB slices, this can address a 1GiB arena -#define MI_BITMAP_MAX_BITS (MI_BITMAP_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) +static inline size_t mi_bitmap_chunk_map_count(const mi_bitmap_t* bitmap) { + return mi_atomic_load_relaxed(&bitmap->chunk_map_count); 
+} + +static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) { + return mi_atomic_load_relaxed(&bitmap->chunk_count); +} + +static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { + return (mi_bitmap_chunk_count(bitmap) * MI_BITMAP_CHUNK_BITS); +} + + /* -------------------------------------------------------------------------------- Atomic bitmap -------------------------------------------------------------------------------- */ -typedef bool mi_bit_t; +typedef bool mi_xset_t; #define MI_BIT_SET (true) #define MI_BIT_CLEAR (false) + +size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count); + // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero); +// returns the size of the bitmap. +size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); // Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); @@ -65,7 +91,7 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! // If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. -bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); +bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, already_set); @@ -77,7 +103,7 @@ static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // Is a sequence of n bits already all set/cleared? -bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); @@ -88,9 +114,29 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n } +// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_SET, bitmap, idx, n); +} + +static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); +} + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); + + + + // Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) // and false otherwise leaving the bitmask as is. 
-//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); // //static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { // return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); @@ -103,7 +149,7 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n // Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) // and false otherwise leaving the bitmask as is. -//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx); +//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); // //static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { // return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); @@ -113,48 +159,28 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n // return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); //} -// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_try_xsetN(MI_BIT_SET, bitmap, idx, n); -} - -static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); -} - -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx ); - /* -------------------------------------------------------------------------------- Atomic bitmap for a pair of bits -------------------------------------------------------------------------------- */ -typedef mi_bfield_t mi_pair_t; - #define MI_PAIR_CLEAR (0) #define MI_PAIR_BUSY (1) #define MI_PAIR_UNUSED (2) // should never occur #define MI_PAIR_SET (3) -typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_pairmap_s { - mi_bitmap_chunk_t chunks[2*MI_BITMAP_CHUNK_COUNT]; - _Atomic(mi_epochset_t) any_set; +typedef struct mi_pairmap_s { + mi_bitmap_t* bitmap1; + mi_bitmap_t* bitmap2; } mi_pairmap_t; -#define MI_PAIRMAP_MAX_PAIRS (MI_BITMAP_MAX_BITS) // 16k pairs on 64bit, 8k pairs on 32bit -#define MI_PAIRMAP_MAX_BITS (2*MI_PAIRMAP_MAX_PAIRS) + // initialize a pairmap to all unset; avoid a mem_zero if `already_zero` is true -void mi_pairmap_init(mi_pairmap_t* pairmap, bool already_zero); +void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2); bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx); +bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx); -void mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx); diff --git a/src/page-map.c b/src/page-map.c index 0e99a890..35a22d8d 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -22,7 +22,8 @@ static bool mi_page_map_init(void) { // 64 KiB for 4 GiB address space (on 32-bit) const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); - mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size,MI_BITMAP_MAX_BITS); + mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_MIN_BIT_COUNT); + mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); mi_page_map_all_committed = false; // _mi_os_has_overcommit(); // commit on-access on Linux systems? 
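For a sense of the commit granularity chosen above, a small worked example; the numbers are assumptions for illustration (48-bit virtual address space, 64 KiB arena slices, one page-map byte per slice), all of which vary by platform and configuration:

#include <stdio.h>
#include <stddef.h>

int main(void) {
  const size_t vbits           = 48;                                  // assumed virtual address bits
  const size_t slice_shift     = 16;                                  // 64 KiB slices
  const size_t page_map_size   = (size_t)1 << (vbits - slice_shift);  // 2^32 one-byte entries (needs 64-bit size_t)
  const size_t commit_bits     = 32 * 512;                            // MI_BITMAP_MIN_BIT_COUNT
  const size_t entries_per_bit = (page_map_size + commit_bits - 1) / commit_bits;
  printf("%zu page-map entries, %zu entries per commit bit\n", page_map_size, entries_per_bit);
  return 0;
}

With these assumptions each commit bit covers 262144 page-map entries.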
_mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); diff --git a/test/test-stress.c b/test/test-stress.c index 9e53e920..e49fde00 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -41,7 +41,7 @@ static int THREADS = 8; static int SCALE = 10; static int ITER = 10; #elif 0 -static int THREADS = 4; +static int THREADS = 1; static int SCALE = 100; static int ITER = 10; #define ALLOW_LARGE false From bc7fe399b159e548c7b42cb4fbd287e0d12bffd0 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 3 Dec 2024 23:35:33 -0800 Subject: [PATCH 029/264] large bitmaps working; lock on arena_reserve --- include/mimalloc/internal.h | 1 + src/arena.c | 42 ++++++++++++++++++++++++++----------- src/bitmap.c | 4 ++-- src/init.c | 1 + test/test-stress.c | 5 +++++ 5 files changed, 39 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 34dbab07..c92375c5 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -132,6 +132,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t ma // arena.c mi_arena_id_t _mi_arena_id_none(void); +void _mi_arena_init(void); void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); diff --git a/src/arena.c b/src/arena.c index f8b6fca1..bc885ef8 100644 --- a/src/arena.c +++ b/src/arena.c @@ -53,13 +53,19 @@ typedef struct mi_arena_s { // followed by the bitmaps (whose size depends on the arena size) } mi_arena_t; -#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) +#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) + // 160 arenas is enough for ~2 TiB memory // The available arenas static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 +static mi_lock_t mi_arena_reserve_lock; + +void _mi_arena_init(void) { + mi_lock_init(&mi_arena_reserve_lock); +} /* ----------------------------------------------------------- Arena id's @@ -275,9 +281,9 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); - if (arena_count >= 8 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + if (arena_count >= 1 && arena_count <= 128) { + // scale up the arena sizes exponentially every 8 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -285,7 +291,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = 8; // hope that fits minimal bitmaps? + const size_t min_reserve = 8 * MI_ARENA_SLICE_SIZE; // hope that fits minimal bitmaps? 
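A worked example of the exponential scaling a few lines above; the 1 GiB base reserve is assumed here purely for illustration (the real base comes from the arena reserve option):

#include <stdio.h>
#include <stddef.h>

int main(void) {
  const size_t GiB  = (size_t)1 << 30;
  const size_t base = 1 * GiB;                                      // assumed arena reserve
  for (size_t arena_count = 0; arena_count <= 40; arena_count += 8) {
    size_t shift = arena_count/8; if (shift > 16) { shift = 16; }   // _mi_clamp(arena_count/8, 0, 16)
    size_t reserve = base << shift;                                 // doubles every 8 arenas
    if (reserve > 16 * GiB) { reserve = 16 * GiB; }                 // capped by max_reserve (16 GiB)
    printf("arena_count=%2zu -> reserve %2zu GiB\n", arena_count, reserve / GiB);
  }
  return 0;
}

This prints 1, 2, 4, 8, 16, 16 GiB for arena counts 0, 8, 16, 24, 32, 40.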
const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; @@ -380,21 +386,32 @@ static mi_decl_noinline void* mi_arena_try_alloc( { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); - + void* p; +again: // try to find free slices in the arena's - void* p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); + p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { + // did we need a specific arena? + if (req_arena_id != _mi_arena_id_none()) return NULL; + + // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) + if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id)) { + bool ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + mi_lock_release(&mi_arena_reserve_lock); + if (ok) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); if (p != NULL) return p; } } + else { + // if we are racing with another thread wait until the new arena is reserved (todo: a better yield?) + mi_atomic_yield(); + goto again; + } return NULL; } @@ -524,7 +541,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; - if (!_mi_option_get_fast(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? 
!os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { @@ -982,6 +999,7 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } + _mi_stat_counter_increase(&stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); @@ -1145,7 +1163,7 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ _mi_output_message("%s%s:\n", prefix, header); size_t bit_count = 0; size_t bit_set_count = 0; - for (int i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { + for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { char buf[MI_BITMAP_CHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { diff --git a/src/bitmap.c b/src/bitmap.c index 4156cfd1..2dbba52d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -985,7 +985,7 @@ static bool mi_bitmap_try_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, } } -bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { +mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); @@ -1304,7 +1304,7 @@ static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chun } static bool mi_bitmap_try_find_and_set_busy_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - MI_UNUSED(epoch); + MI_UNUSED(epoch); MI_UNUSED(n); mi_assert_internal(n==2); size_t cidx; if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { diff --git a/src/init.c b/src/init.c index 99a5ea39..3dcb68e3 100644 --- a/src/init.c +++ b/src/init.c @@ -619,6 +619,7 @@ void mi_process_init(void) mi_attr_noexcept { mi_detect_cpu_features(); _mi_os_init(); + _mi_arena_init(); mi_heap_main_init(); #if MI_DEBUG _mi_verbose_message("debug level : %d\n", MI_DEBUG); diff --git a/test/test-stress.c b/test/test-stress.c index e49fde00..904b1acc 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -50,6 +50,11 @@ static int THREADS = 32; static int SCALE = 50; static int ITER = 50; #define ALLOW_LARGE false +#elif 0 +static int THREADS = 64; +static int SCALE = 400; +static int ITER = 10; +#define ALLOW_LARGE true #else static int THREADS = 32; // more repeatable if THREADS <= #processors static int SCALE = 25; // scaling factor From 45f7fb559ace2ba1c463d0ca48dbeff62e46d117 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 4 Dec 2024 00:14:56 -0800 Subject: [PATCH 030/264] small fixes --- include/mimalloc/internal.h | 11 +++++++++-- src/bitmap.h | 3 ++- test/test-stress.c | 2 +- 3 files changed, 12 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c92375c5..cb689877 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -487,7 +487,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { } static inline mi_page_t* _mi_ptr_page(const void* p) { - #if 1 // MI_DEBUG + #if MI_DEBUG return _mi_checked_ptr_page(p); #else return _mi_ptr_page_ex(p,NULL); @@ -638,6 +638,13 @@ static inline bool mi_page_is_mostly_used(const mi_page_t* page) { return 
(page->reserved - page->used <= frac); } +// is less than 1/n'th of a page free? +static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { + if (page==NULL) return true; + uint16_t frac = page->reserved / n; + return (page->reserved - page->used <= frac); +} + static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) return (mi_atomic_load_acquire(&page->xthread_id) <= 1); @@ -692,7 +699,7 @@ static inline bool mi_page_try_claim_ownership(mi_page_t* page) { static inline void _mi_page_unown(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); mi_thread_free_t tf_new; mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); do { diff --git a/src/bitmap.h b/src/bitmap.h index 9b931c95..d73ee98a 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -51,8 +51,9 @@ typedef uint32_t mi_cmap_t; typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { _Atomic(size_t) chunk_map_count; _Atomic(size_t) chunk_count; + size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; - // padding + mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; diff --git a/test/test-stress.c b/test/test-stress.c index 904b1acc..0b1b6c8d 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -45,7 +45,7 @@ static int THREADS = 1; static int SCALE = 100; static int ITER = 10; #define ALLOW_LARGE false -#elif 0 +#elif 1 static int THREADS = 32; static int SCALE = 50; static int ITER = 50; From afe90891529058605f9bd910953304322e291aeb Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 4 Dec 2024 19:15:55 -0800 Subject: [PATCH 031/264] more documentation; better pairmap find_and_set_to_busy, busy flag is now 0x10 --- src/arena.c | 88 ++++++++--------- src/bitmap.c | 212 ++++++++++++++++++++++++++--------------- src/bitmap.h | 125 +++++++++++++++--------- src/free.c | 262 ++++++--------------------------------------------- src/page.c | 2 +- 5 files changed, 296 insertions(+), 393 deletions(-) diff --git a/src/arena.c b/src/arena.c index bc885ef8..19815616 100644 --- a/src/arena.c +++ b/src/arena.c @@ -199,7 +199,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( void* p = mi_arena_slice_start(arena, slice_index); *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; - + // set the dirty bits if (arena->memid.initially_zero) { // size_t dirty_count = 0; @@ -239,7 +239,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_zero = false; } } - #endif + #endif size_t already_committed_count = 0; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); if (already_committed_count < slice_count) { @@ -247,7 +247,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); } } - } + } } else { // no need to commit, but check if already fully committed @@ -282,8 +282,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 1 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries - 
const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + // scale up the arena sizes exponentially every 8 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -399,7 +399,7 @@ again: if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; bool ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); - mi_lock_release(&mi_arena_reserve_lock); + mi_lock_release(&mi_arena_reserve_lock); if (ok) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); @@ -476,6 +476,19 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ +static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2) { + mi_arena_t* arena = (mi_arena_t*)arg1; + mi_subproc_t* subproc = (mi_subproc_t*)arg2; + + // found an abandoned page of the right size + // it is set busy for now so we can read safely even with concurrent mi_free reclaiming + // try to claim ownership atomically + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + if (subproc != page->subproc) return false; + if (!mi_page_try_claim_ownership(page)) return false; + return true; +} + static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) { MI_UNUSED(slice_count); @@ -493,38 +506,29 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl { size_t slice_index; mi_pairmap_t* const pairmap = &arena->pages_abandoned[bin]; - while (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index)) { // todo: don't restart from scratch if we fail for some entry? - // found an abandoned page of the right size - // it is set busy for now so we can read safely even with concurrent mi_free reclaiming - // try to claim ownership atomically - mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); - if (!mi_page_try_claim_ownership(page)) { - // a concurrent free already grabbed the page. 
- // Restore the abandoned_map to make it available again (unblocking busy waiters) - mi_pairmap_set(pairmap, slice_index); - } - else { - // we got ownership, clear the abandoned entry (unblocking busy waiters) - mi_pairmap_clear(pairmap, slice_index); - mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); - _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_ptr_page(page)==page); - mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); - mi_assert_internal(mi_page_block_size(page) == block_size); - mi_assert_internal(mi_page_is_abandoned(page)); - mi_assert_internal(mi_page_is_owned(page)); - mi_assert_internal(!mi_page_is_full(page)); - return page; - } - } + if (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { + // found an abandoned page of the right size + // and claimed ownership. + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); + _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); + + _mi_page_free_collect(page, false); // update `used` count + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_ptr_page(page)==page); + mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + mi_assert_internal(mi_page_block_size(page) == block_size); + mi_assert_internal(!mi_page_is_full(page)); + return page; + } } mi_forall_arenas_end(); return NULL; @@ -565,8 +569,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // claimed free slices: initialize the page partly - if (!memid.initially_zero) { - _mi_memzero_aligned(page, sizeof(*page)); + if (!memid.initially_zero) { + _mi_memzero_aligned(page, sizeof(*page)); } #if MI_DEBUG > 1 else { @@ -779,7 +783,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done - mi_pairmap_clear_while_not_busy(&arena->pages_abandoned[bin], slice_index); + mi_pairmap_clear_once_not_busy(&arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } @@ -999,7 +1003,7 @@ static bool mi_arena_add(mi_arena_t* 
arena, mi_arena_id_t* arena_id, mi_stats_t* mi_atomic_decrement_acq_rel(&mi_arena_count); return false; } - + _mi_stat_counter_increase(&stats->arena_count,1); arena->id = mi_arena_id_create(i); mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); @@ -1049,7 +1053,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB); return false; - } + } size_t bitmap_base; const size_t info_slices = mi_arena_info_slices_needed(slice_count, &bitmap_base); if (slice_count < info_slices+1) { diff --git a/src/bitmap.c b/src/bitmap.c index 2dbba52d..1aa0a822 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -995,13 +995,13 @@ mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, s // Set/clear a sequence of 2 bits that were on an even `idx` in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal((idx%2)==0); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - + if (set) { // first set the chunkmap since it is a conservative approximation (increases epoch) mi_bitmap_chunkmap_set(bitmap, chunk_idx); @@ -1066,7 +1066,7 @@ static inline bool mi_bitmap_is_xset2(mi_xset_t set, mi_bitmap_t* bitmap, size_t mi_assert_internal(idx + 2 <= mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); return mi_bitmap_chunk_is_xset2(set, &bitmap->chunks[chunk_idx], cidx); } @@ -1091,13 +1091,13 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* -------------------------------------------------------------------------------- bitmap try_find_and_clear -------------------------------------------------------------------------------- */ - +/* typedef bool (mi_bitmap_find_fun_t)(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx); static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx, mi_bitmap_find_fun_t* find_fun) { if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; - + // start chunk index -- todo: can depend on the tseq to decrease contention between threads MI_UNUSED(tseq); const size_t chunk_start = 0; @@ -1105,7 +1105,7 @@ static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; // for each chunkmap entry `i` - for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) + for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { size_t i = (_i + chunk_map_start); if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; // adjust for the start position @@ -1122,50 +1122,106 @@ static inline bool 
mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } // set the chunk idx const size_t chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; - + // try to find and clear N bits in that chunk if (chunk_idx < mi_bitmap_chunk_count(bitmap)) { // we can have less chunks than in the chunkmap.. if ((*find_fun)(bitmap, n, chunk_idx, epoch, pidx)) { return true; } } - + // skip to the next bit cmap_idx_shift += cmap_idx+1; cmap >>= cmap_idx; // skip scanned bits (and avoid UB for `cmap_idx+1`) cmap >>= 1; } } - + return false; } +*/ -static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); - return true; - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return false; - } -} +#define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ + { \ + /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ + MI_UNUSED(tseq); \ + const size_t chunk_start = 0; \ + const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; \ + const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; \ + /* for each chunkmap entry `i` */ \ + for (size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { \ + size_t i = (_i + chunk_map_start); \ + if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; /* adjust for the start position */ \ + \ + const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; \ + mi_epoch_t name_epoch; \ + mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &name_epoch); \ + if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } /* rotate right for the start position (on the first iteration) */ \ + \ + uint32_t cmap_idx; /* one bit set of each chunk that may have bits set */ \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ + while (mi_bsf32(cmap, &cmap_idx)) { /* find least bit that is set */ \ + /* adjust for the start position again */ \ + if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } \ + /* set the chunk idx */ \ + const size_t name_chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; \ + /* try to find and clear N bits in that chunk */ \ + if (name_chunk_idx < mi_bitmap_chunk_count(bitmap)) { /* we can have less chunks than in the chunkmap.. 
*/ + +#define mi_bitmap_forall_chunks_end() \ + } \ + /* skip to the next bit */ \ + cmap_idx_shift += cmap_idx+1; \ + cmap >>= cmap_idx; /* skip scanned bits (and avoid UB for `cmap_idx+1`) */ \ + cmap >>= 1; \ + } \ + }} + +//static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { +// size_t cidx; +// if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { +// *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; +// mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); +// return true; +// } +// else { +// // we may find that all are cleared only on a second iteration but that is ok as +// // the chunkmap is a conservative approximation. +// if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { +// mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); +// } +// return false; +// } +//} // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); + // return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); + mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + return true; + } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); + } + // continue + } + } + mi_bitmap_forall_chunks_end(); + return false; } /* -------------------------------------------------------------------------------- - pairmap + pairmap -------------------------------------------------------------------------------- */ void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2) { @@ -1215,10 +1271,10 @@ bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { pairmap clear while not busy -------------------------------------------------------------------------------- */ -static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). +static inline bool mi_bfield_atomic_clear2_once_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). 
mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); + const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); mi_bfield_t bnew; mi_bfield_t old = mi_atomic_load_relaxed(b); @@ -1238,32 +1294,32 @@ static inline bool mi_bfield_atomic_clear2_while_not_busy(_Atomic(mi_bfield_t)*b return ((old&mask) == mask); } -static inline bool mi_bitmap_chunk_clear2_while_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { +static inline bool mi_bitmap_chunk_clear2_once_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_clear2_while_not_busy(&chunk->bfields[i], idx); + return mi_bfield_atomic_clear2_once_not_busy(&chunk->bfields[i], idx); } -static bool mi_bitmap_clear2_while_not_busy(mi_bitmap_t* bitmap, size_t idx) { +static bool mi_bitmap_clear2_once_not_busy(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal((idx%2)==0); mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool cleared = mi_bitmap_chunk_clear2_while_not_busy(&bitmap->chunks[chunk_idx], cidx); + bool cleared = mi_bitmap_chunk_clear2_once_not_busy(&bitmap->chunks[chunk_idx], cidx); if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } + } return cleared; } -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { +void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { mi_bitmap_t* bitmap; size_t idx; mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - mi_bitmap_clear2_while_not_busy(bitmap, idx); + mi_bitmap_clear2_once_not_busy(bitmap, idx); } @@ -1274,9 +1330,9 @@ void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { // Atomically go from set to busy, or return false otherwise and leave the bit field as-is. static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). + mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)0x03 << idx); + const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); mi_bfield_t old; mi_bfield_t bnew; @@ -1290,49 +1346,57 @@ static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i], &idx)) { // find least 1-bit, it may be set or busy - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 01 (busy), and 11 (set). 
- if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); - return true; + while (true) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]) & MI_BFIELD_LO_BIT2; // only keep MI_PAIR_SET bits + size_t idx; + if (!mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + break; // not found: continue with the next field + } + else { + mi_assert_internal((idx%2)==0); + if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); + return true; + } + // else: try this word once again } } } return false; } -static bool mi_bitmap_try_find_and_set_busy_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { - MI_UNUSED(epoch); MI_UNUSED(n); - mi_assert_internal(n==2); - size_t cidx; - if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); - return true; - } - else { - return false; - } -} -static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_set_busy_at); +static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t idx_offset, size_t* ppair_idx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2) +{ + mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + { + MI_UNUSED(epoch); MI_UNUSED(n); + mi_assert_internal(n==2); + size_t cidx; + if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { + const size_t idx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal((idx%2)==0); + const size_t pair_idx = (idx + idx_offset)/2; + if (claim(pair_idx, arg1, arg2)) { // while busy, the claim function can read from the page + mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); // claimed, clear the entry + *ppair_idx = pair_idx; + return true; + } + else { + mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); // not claimed, reset the entry + // and continue + } + } + } + mi_bitmap_forall_chunks_end(); + return false; } // Used to find an abandoned page, and transition from set to busy. 
-mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) {
-  size_t idx = 0;
-  if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, &idx)) {
-    if (!mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, &idx)) {
-      return false;
-    }
-    else {
-      idx += mi_bitmap_max_bits(pairmap->bitmap1);
-    }
-  }
-  mi_assert_internal((idx%2)==0);
-  *pidx = idx/2;
-  return true;
+mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pair_idx,
+                                                        mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2 ) {
+  if (mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, 0, pair_idx, claim, arg1, arg2)) return true;
+  return mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, mi_bitmap_max_bits(pairmap->bitmap1), pair_idx, claim, arg1, arg2);
 }
diff --git a/src/bitmap.h b/src/bitmap.h
index d73ee98a..ca62735b 100644
--- a/src/bitmap.h
+++ b/src/bitmap.h
@@ -13,9 +13,47 @@ Concurrent bitmap that can set/reset sequences of bits atomically
 #define MI_BITMAP_H
 
 /* --------------------------------------------------------------------------------
-  Definitions
--------------------------------------------------------------------------------- */
+  Atomic bitmaps:
+
+  `mi_bfield_t`: a single machine word that can efficiently be bit counted (usually `size_t`);
+    each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB).
+    We need 16K bits to represent a 1GiB arena.
+
+  `mi_bitmap_chunk_t`: a chunk of bfields, for a total of MI_BITMAP_CHUNK_BITS (= 512) bits;
+    allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number
+    of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB).
+    These chunks are cache-aligned and we can use AVX2/AVX512/SVE/SVE2/etc. instructions
+    to scan for bits (perhaps) more efficiently.
+
+  `mi_chunkmap_t`: for each chunk we track if it has (potentially) any bit set.
+    The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set.
+    This is used to avoid scanning every chunk (and is thus strictly an optimization).
+    It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out
+    to have no bits set.
+
+    When we (potentially) set a bit in a chunk, we first update the chunkmap.
+    However, when we clear a bit in a chunk, and the chunk is indeed all clear, we
+    cannot safely clear the bit corresponding to the chunk in the chunkmap since it
+    may race with another thread setting a bit in the same chunk (and we may clear the
+    bit even though a bit is set in the chunk which is not allowed).
+
+    To fix this, the chunkmap contains 32 bits for the chunks, and a 32-bit "epoch"
+    counter that is increased every time a bit is set.  We only clear a bit if the epoch
+    stayed the same over our clear operation (so we know no other thread in the
+    meantime set a bit in any of the chunks corresponding to the chunkmap).
+    Since increasing the epoch and setting a bit must be atomic, we use only half-word
+    bits (32); we could use 128-bit atomics if needed since modern hardware supports these.
+
+  `mi_bitmap_t`: a bitmap with N chunks. A bitmap always has MI_BITMAP_MAX_CHUNKMAPS (=16)
+    chunkmap entries and can support arenas from a few chunks up to 16 chunkmaps = 16 * 32 chunks = 16 GiB.
+    The `chunk_count` can be anything from 1 to the max supported by the chunkmaps, but
+    each chunk is always complete (512 bits, so 512 * 64KiB = 32MiB memory areas).
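A minimal standalone model of this epoch scheme; the type and function names and the exact 32/32 packing shown here are illustrative only (the patch itself goes through `mi_chunkmap_split`/`mi_chunkmap_join` and the `mi_bitmap_chunkmap_*` helpers):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef uint64_t chunkmap_t;   // assumed packing: high 32 bits = epoch, low 32 bits = per-chunk bits

static inline uint32_t   cm_epoch(chunkmap_t cm) { return (uint32_t)(cm >> 32); }
static inline uint32_t   cm_bits (chunkmap_t cm) { return (uint32_t)cm; }
static inline chunkmap_t cm_join (uint32_t epoch, uint32_t bits) { return ((chunkmap_t)epoch << 32) | bits; }

// Setting a chunk bit (chunk_idx < 32) always bumps the epoch as well.
static void cm_set(_Atomic(chunkmap_t)* cm, unsigned chunk_idx) {
  chunkmap_t old = atomic_load(cm), desired;
  do {
    desired = cm_join(cm_epoch(old) + 1, cm_bits(old) | (UINT32_C(1) << chunk_idx));
  } while (!atomic_compare_exchange_weak(cm, &old, desired));
}

// Clear a chunk bit only if the epoch is still the one observed when the chunk was
// found all-clear; otherwise another thread may have set a bit in the meantime.
static bool cm_try_clear(_Atomic(chunkmap_t)* cm, unsigned chunk_idx, uint32_t observed_epoch) {
  chunkmap_t old = atomic_load(cm);
  if (cm_epoch(old) != observed_epoch) return false;
  const chunkmap_t desired = cm_join(observed_epoch, cm_bits(old) & ~(UINT32_C(1) << chunk_idx));
  return atomic_compare_exchange_strong(cm, &old, desired);
}

Since every set bumps the epoch, a clear can only succeed when no set happened anywhere in the 32 tracked chunks between observing the chunk as empty and clearing its chunkmap bit.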
+ + For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count + and pop-count (but we think it can be adapted work reasonably well on older hardware too) +--------------------------------------------------------------------------------------------- */ + +// A word-size bit field. typedef size_t mi_bfield_t; #define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) @@ -29,16 +67,18 @@ typedef size_t mi_bfield_t; #define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) #define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) -// 512 bits on 64_bit +// A bitmap chunk contains 512 bits of bfields on 64_bit (256 on 32-bit) typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; } mi_bitmap_chunk_t; + // for now 32-bit epoch + 32-bit bit-set (note: with ABA instructions we can double this) typedef uint64_t mi_chunkmap_t; typedef uint32_t mi_epoch_t; typedef uint32_t mi_cmap_t; + #define MI_CHUNKMAP_BITS (32) // 1 chunkmap tracks 32 chunks #define MI_BITMAP_MAX_CHUNKMAPS (16) @@ -48,15 +88,18 @@ typedef uint32_t mi_cmap_t; #define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16 GiB arena #define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 1 GiB arena + +// An atomic bitmap typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_map_count; - _Atomic(size_t) chunk_count; + _Atomic(size_t) chunk_map_count; // valid chunk_map's + _Atomic(size_t) chunk_count; // total count of chunks size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; - + mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; + static inline size_t mi_bitmap_chunk_map_count(const mi_bitmap_t* bitmap) { return mi_atomic_load_relaxed(&bitmap->chunk_map_count); } @@ -72,17 +115,19 @@ static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { /* -------------------------------------------------------------------------------- - Atomic bitmap + Atomic bitmap operations -------------------------------------------------------------------------------- */ +// Many operations are generic over setting or clearing the bit sequence: we use `mi_xset_t` for this (true if setting, false if clearing) typedef bool mi_xset_t; #define MI_BIT_SET (true) #define MI_BIT_CLEAR (false) +// Required size of a bitmap to represent `bit_count` bits. size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count); -// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true // returns the size of the bitmap. size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); @@ -134,56 +179,46 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t - -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -// and false otherwise leaving the bitmask as is. 
-//mi_decl_nodiscard bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); -// -//static inline bool mi_bitmap_try_set(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset(MI_BIT_SET, bitmap, idx); -//} -// -//static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset(MI_BIT_CLEAR, bitmap, idx); -//} - - -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -//mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); -// -//static inline bool mi_bitmap_try_set8(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset8(MI_BIT_SET, bitmap, idx); -//} -// -//static inline bool mi_bitmap_try_clear8(mi_bitmap_t* bitmap, size_t idx) { -// return mi_bitmap_try_xset8(MI_BIT_CLEAR, bitmap, idx); -//} - - /* -------------------------------------------------------------------------------- - Atomic bitmap for a pair of bits + Atomic bitmap for a pair of bits. + + The valid pairs are CLEAR (0), SET (3), or BUSY (2). + + These bit pairs are used in the abandoned pages maps: when set, the entry has + an available page. When we scan for an available abandoned page and find an entry SET, + we first set it to BUSY, and try to claim the page atomically (since it can race + with a concurrent `mi_free` which also tries to claim the page). However, unlike `mi_free`, + we cannot be sure that a concurrent `mi_free` also didn't free (and decommit) the page + just when we got the entry. Therefore, a page can only be freed after `mi_arena_unabandon` + which (busy) waits until the BUSY flag is cleared to ensure all readers are done. + (and pair-bit operations must therefore be release_acquire). 
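That per-pair protocol can be sketched standalone on a single 64-bit bfield; the names below are illustrative and this is not the mimalloc code (the real implementation works across the two bitmaps of a pairmap and their chunks). One routine finds a SET pair, flips it to BUSY, runs a claim callback, and then clears or restores it; the other is the busy-wait used when un-abandoning:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

#define PAIR_BUSY  UINT64_C(2)   // 0b10
#define PAIR_SET   UINT64_C(3)   // 0b11

typedef bool (claim_fun_t)(size_t pair_idx, void* arg);

// Find the first SET pair, flip it to BUSY, run `claim` while it is BUSY, then
// CLEAR it on success or restore it to SET on failure.
static bool word_try_claim_pair(_Atomic(uint64_t)* w, claim_fun_t* claim, void* arg, size_t* pair_idx) {
  for (;;) {
    // the low bit of a pair is 1 only for SET pairs (BUSY is 0b10), so this mask
    // leaves exactly one 1-bit per SET pair
    const uint64_t sets = atomic_load(w) & UINT64_C(0x5555555555555555);
    if (sets == 0) return false;                             // no SET pair in this word
    const unsigned idx  = (unsigned)__builtin_ctzll(sets);   // even bit index of that pair (gcc/clang builtin)
    const uint64_t mask = PAIR_SET << idx;
    uint64_t old = atomic_load(w);
    if ((old & mask) != mask) continue;                      // the pair changed underneath us: rescan
    if (!atomic_compare_exchange_weak(w, &old, (old & ~mask) | (PAIR_BUSY << idx))) continue;
    *pair_idx = idx/2;
    if (claim(*pair_idx, arg)) {                             // e.g. check the subproc and take page ownership
      atomic_fetch_and(w, ~mask);                            // claimed: BUSY -> CLEAR
      return true;
    }
    atomic_fetch_or(w, mask);                                // not claimed: BUSY -> SET again
    return false;  // (a fuller version would keep scanning the remaining pairs instead)
  }
}

// Wait until the pair is no longer BUSY, then clear it (the un-abandon side).
static void word_clear_pair_once_not_busy(_Atomic(uint64_t)* w, size_t pair_idx) {
  const unsigned idx       = (unsigned)(2*pair_idx);
  const uint64_t mask      = PAIR_SET  << idx;               // both bits of the pair
  const uint64_t busy_mask = PAIR_BUSY << idx;
  uint64_t old = atomic_load(w);
  for (;;) {
    if ((old & mask) == busy_mask) { old = atomic_load(w); } // a reader still holds it BUSY: spin (yield in real code)
    else if (atomic_compare_exchange_weak(w, &old, old & ~mask)) { return; }
  }
}

Because the claim callback runs while the pair is BUSY, and the clearing side waits for BUSY to end, a concurrent reader is never left looking at a page that has already been freed, which is the guarantee described above.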
-------------------------------------------------------------------------------- */ #define MI_PAIR_CLEAR (0) -#define MI_PAIR_BUSY (1) -#define MI_PAIR_UNUSED (2) // should never occur +#define MI_PAIR_UNUSED (1) // should never occur +#define MI_PAIR_BUSY (2) #define MI_PAIR_SET (3) +// 0b....0101010101010101 +#define MI_BFIELD_LO_BIT2 ((MI_BFIELD_LO_BIT8 << 6)|(MI_BFIELD_LO_BIT8 << 4)|(MI_BFIELD_LO_BIT8 << 2)|MI_BFIELD_LO_BIT8) + +// A pairmap manipulates pairs of bits (and consists of 2 bitmaps) typedef struct mi_pairmap_s { mi_bitmap_t* bitmap1; - mi_bitmap_t* bitmap2; + mi_bitmap_t* bitmap2; } mi_pairmap_t; - - -// initialize a pairmap to all unset; avoid a mem_zero if `already_zero` is true +// initialize a pairmap to all clear; avoid a mem_zero if `already_zero` is true void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2); bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx); -void mi_pairmap_clear_while_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); -mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx); +void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); + +typedef bool (mi_bitmap_claim_while_busy_fun_t)(size_t pair_index, void* arg1, void* arg2); +mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx, + mi_bitmap_claim_while_busy_fun_t* claim, void* arg1 ,void* arg2 + ); -#endif // MI_XBITMAP_H +#endif // MI_BITMAP_H diff --git a/src/free.c b/src/free.c index 70ef5d8a..1e07dbd2 100644 --- a/src/free.c +++ b/src/free.c @@ -148,15 +148,44 @@ void mi_free(void* p) mi_attr_noexcept } - // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ +static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page); + +// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. +static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) +{ + // adjust stats (after padding check and potentially recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page, block)); + + // _mi_padding_shrink(page, block, sizeof(mi_block_t)); + #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + size_t dbgsize = mi_usable_size(block); + if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } + _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); + #endif + + // push atomically on the page thread free list + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_block_set_next(page, block, mi_tf_block(tf_old)); + tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); + + // and atomically reclaim the page if it was abandoned + bool reclaimed = !mi_tf_is_owned(tf_old); + if (reclaimed) { + mi_free_try_reclaim_mt(page); + } +} + static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); -#if 1 // we own the page now.. 
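A standalone sketch of the push-and-maybe-claim step in `mi_free_block_mt` above, assuming the thread-free word packs the list head together with an 'owned' flag in one machine word; the type and helper names here are illustrative, not mimalloc's own:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct block_s { struct block_s* next; } block_t;
typedef uintptr_t tfree_t;   // assumed packing: block pointer | owned flag in bit 0

static inline block_t* tf_block(tfree_t tf)    { return (block_t*)(tf & ~(uintptr_t)1); }
static inline bool     tf_is_owned(tfree_t tf) { return (tf & 1) != 0; }
static inline tfree_t  tf_make(block_t* b, bool owned) { return (uintptr_t)b | (owned ? 1 : 0); }

// Push `block` on the page-local thread-free list; returns true if this push also
// took ownership of a previously abandoned page (the old owned flag was 0).
static bool thread_free_push(_Atomic(tfree_t)* xthread_free, block_t* block) {
  tfree_t old = atomic_load_explicit(xthread_free, memory_order_relaxed);
  tfree_t desired;
  do {
    block->next = tf_block(old);              // link to the current head
    desired = tf_make(block, true);           // always leave the list marked as owned
  } while (!atomic_compare_exchange_weak_explicit(xthread_free, &old, desired,
                                                  memory_order_acq_rel, memory_order_relaxed));
  return !tf_is_owned(old);                   // this thread observed the unowned-to-owned transition
}

Because the push always leaves the owned flag set, exactly one freeing thread observes the transition from unowned to owned and becomes responsible for reclaiming or re-abandoning the page.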
// safe to collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count @@ -209,237 +238,8 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { // not reclaimed or free'd, unown again _mi_page_unown(page); - -#else - if (!mi_page_is_abandoned_mapped(page)) { - // singleton or OS allocated - if (mi_page_is_singleton(page)) { - // free singleton pages - #if MI_DEBUG>1 - _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_page_all_free(page)); - #endif - // we can free the page directly - _mi_arena_page_free(page); - return; - } - else { - const bool was_full = mi_page_is_full(page); - _mi_page_free_collect(page,false); // update used - if (mi_page_all_free(page)) { - // no need to unabandon as it is unmapped - _mi_arena_page_free(page); - return; - } - else if (was_full && _mi_arena_page_reabandon_full(page)) { - return; - } - else if (!mi_page_is_mostly_used(page) && _mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - if (heap != (mi_heap_t*)&_mi_heap_empty) { // we did not already terminate our thread (can this happen? - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - // make it part of our heap (no need to unabandon as is unmapped) - _mi_heap_page_reclaim(tagheap, page); - return; - } - } - } - } - } - else { - // don't reclaim pages that can be found for fresh page allocations - } - - // not reclaimed or free'd, unown again - _mi_page_unown(page); -#endif } -/* -// we own the page now.. -// safe to collect the thread atomic free list -_mi_page_free_collect(page, false); // update `used` count -if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } - -if (mi_page_all_free(page)) { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); // this must be before free'ing - // we can free the page directly - _mi_arena_page_free(page); - return; -} -else if (!mi_page_is_mostly_used(page)) { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - - if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) && // only if reclaim on free is allowed - (heap != (mi_heap_t*)&_mi_heap_empty)) // we did not already terminate our thread (can this happen? - { - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - // make it part of our heap - _mi_heap_page_reclaim(tagheap, page); - return; - } - } -} - -// we cannot reclaim this page.. leave it abandoned -// todo: should re-abandon or otherwise a partly used page could never be re-used if the -// objects in it are not freed explicitly. -_mi_page_unown(page); -*/ - - -// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page, block)); - - // _mi_padding_shrink(page, block, sizeof(mi_block_t)); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - size_t dbgsize = mi_usable_size(block); - if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } - _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); - #endif - - // push atomically on the page thread free list - mi_thread_free_t tf_new; - mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); - do { - mi_block_set_next(page, block, mi_tf_block(tf_old)); - tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); - - // and atomically reclaim the page if it was abandoned - bool reclaimed = !mi_tf_is_owned(tf_old); - if (reclaimed) { - mi_free_try_reclaim_mt(page); - } -} - - /* - // Try to put the block on either the page-local thread free list, - // or the heap delayed free list (if this is the first non-local free in that page) - mi_thread_free_t tfreex; - bool use_delayed; - mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - use_delayed = (mi_tf_delayed(tfree) == MI_USE_DELAYED_FREE); - if mi_unlikely(use_delayed) { - // unlikely: this only happens on the first concurrent free in a page that is in the full list - tfreex = mi_tf_set_delayed(tfree,MI_DELAYED_FREEING); - } - else { - // usual: directly add to page thread_free list - mi_block_set_next(page, block, mi_tf_block(tfree)); - tfreex = mi_tf_set_block(tfree,block); - } - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - // If this was the first non-local free, we need to push it on the heap delayed free list instead - if mi_unlikely(use_delayed) { - // racy read on `heap`, but ok because MI_DELAYED_FREEING is set (see 
`mi_heap_delete` and `mi_heap_collect_abandon`) - mi_heap_t* const heap = (mi_heap_t*)(mi_atomic_load_acquire(&page->xheap)); //mi_page_heap(page); - mi_assert_internal(heap != NULL); - if (heap != NULL) { - // add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity) - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap,block,dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - - // and reset the MI_DELAYED_FREEING flag - tfree = mi_atomic_load_relaxed(&page->xthread_free); - do { - tfreex = tfree; - mi_assert_internal(mi_tf_delayed(tfree) == MI_DELAYED_FREEING); - tfreex = mi_tf_set_delayed(tfree,MI_NO_DELAYED_FREE); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - } -} - -// Multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // first see if the page was abandoned and if we can reclaim it into our thread - if (mi_page_is_abandoned(page)) { - if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 || - mi_page_is_singleton(page)) { // only one block, and we are free-ing it - if (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) // and we did not already exit this thread (without this check, a fresh heap will be initalized (issue #944)) - { - // the page is abandoned, try to reclaim it into our heap - if (_mi_arena_try_reclaim(mi_heap_get_default(), page)) { // TODO: avoid putting it in the full free queue - mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page)); - // mi_assert_internal(mi_heap_get_default()->tld->subproc == page->subproc); - mi_free(block); // recursively free as now it will be a local free in our heap - return; - } - else { - if (mi_page_is_abandoned(page)) { - // mi_assert(false); - } - // mi_assert_internal(!mi_page_is_singleton(page)); // we should have succeeded on singleton pages - } - } - } - } - - - // The padding check may access the non-thread-owned page for the key values. - // that is safe as these are constant and the page won't be freed (as the block is not freed yet). - mi_check_padding(page, block); - - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page,block)); - - // for small size, ensure we can fit the delayed thread pointers without triggering overflow detection - _mi_padding_shrink(page, block, sizeof(mi_block_t)); - - if (mi_page_is_huge(page)) { - mi_assert_internal(mi_page_is_singleton(page)); - // huge pages are special as they occupy the entire segment - // as these are large we reset the memory occupied by the page so it is available to other threads - // (as the owning thread needs to actually free the memory later). 
- _mi_os_reset(mi_page_start(page), mi_page_block_size(page), NULL); // resets conservatively - } - else { - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - memset(block, MI_DEBUG_FREED, mi_usable_size(block)); - #endif - } - - // and finally free the actual block by pushing it on the owning heap - // thread_delayed free list (or heap delayed free list) - mi_free_block_delayed_mt(page,block); -} -*/ // ------------------------------------------------------ // Usable size diff --git a/src/page.c b/src/page.c index e5e3f972..faef2f48 100644 --- a/src/page.c +++ b/src/page.c @@ -44,7 +44,7 @@ static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) { mi_assert_internal(_mi_ptr_page(page) == page); size_t count = 0; while (head != NULL) { - mi_assert_internal((uint8_t*)head - (uint8_t*)page > MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); + mi_assert_internal((uint8_t*)head - (uint8_t*)page > (ptrdiff_t)MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head)); count++; head = mi_block_next(page, head); } From bc67be4d79ff03ef824efcecd0aae1066b068b16 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 4 Dec 2024 21:40:57 -0800 Subject: [PATCH 032/264] small adjustments --- include/mimalloc/bits.h | 13 ++++++ src/arena.c | 58 +----------------------- src/bitmap.c | 98 +++++++++++------------------------------ src/bitmap.h | 2 +- src/init.c | 2 +- test/test-stress.c | 4 +- 6 files changed, 43 insertions(+), 134 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index f3bbe3bc..e1951cf7 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -314,6 +314,19 @@ static inline bool mi_bsr(size_t x, size_t* idx) { #endif } +// Bit scan reverse: find the most significant bit that is set +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bsr32(uint32_t x, uint32_t* idx) { +#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + return (_BitScanReverse(&i, x) ? (*idx = i, true) : false); +#else + const size_t r = mi_clz((size_t)x); + *idx = (~r & (MI_SIZE_BITS - 1)) - (MI_SIZE_SIZE - sizeof(uint32_t)); + return (x!=0); +#endif +} /* -------------------------------------------------------------------------------- diff --git a/src/arena.c b/src/arena.c index 19815616..79a52c4d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -335,7 +335,7 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are size_t _start; \ if (req_arena_id == _mi_arena_id_none()) { \ _max_arena = mi_atomic_load_relaxed(&mi_arena_count); \ - _start = (_max_arena <= 1 ? 0 : (tseq / MI_THREADS_PER_ARENA) % _max_arena); \ + _start = (_max_arena <= 2 ? 
0 : (tseq % (_max_arena-1))); \ } \ else { \ _max_arena = 1; \ @@ -795,62 +795,6 @@ void _mi_arena_page_unabandon(mi_page_t* page) { _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); } -/* -bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) { - if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_is_abandoned(page)); } - mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_ptr_page(page)==page); - // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned (anymore) - - // note: we can access the page even it is in the meantime reclaimed by another thread since - // we only call this when on free (and thus there is still an object alive in the page) - mi_memid_t memid = page->memid; - if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false; // don't reclaim between exclusive and non-exclusive arena's - if (mi_atomic_load_acquire(&page->xheap) != (uintptr_t)heap->tld->subproc) return false; - - if mi_likely(memid.memkind == MI_MEM_ARENA) { - size_t slice_index; - mi_arena_t* arena = mi_page_arena(page, &slice_index, NULL); - //if (arena->subproc != heap->tld->subproc) return false; // only reclaim within the same subprocess - - // don't reclaim more from a `free` call than half the current segments - // this is to prevent a pure free-ing thread to start owning too many segments - // (but not for out-of-arena segments as that is the main way to be reclaimed for those) - // if (segment->memid.memkind == MI_MEM_ARENA && heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) { - // return false; - // } - const size_t bin = _mi_bin(page->block_size); - if (mi_bitmap_try_clear(&arena->slices_abandoned[bin], slice_index)) { - // we got it atomically - _mi_page_reclaim(heap, page); - mi_assert_internal(!mi_page_is_abandoned(page)); - return true; - } - else { - if (mi_page_is_abandoned(page)) { - // mi_assert(false); - } - } - } - else { - // A page in OS or external memory - if (mi_atomic_load_acquire(&page->xheap) != (uintptr_t)heap->tld->subproc) return false; - - // we use the thread_id to atomically grab ownership - mi_threadid_t abandoned_thread_id = 0; - if (mi_atomic_cas_strong_acq_rel(&page->xthread_id, &abandoned_thread_id, heap->thread_id)) { - // we got it atomically - _mi_page_reclaim(heap, page); - mi_assert_internal(!mi_page_is_abandoned(page)); - return true; - } - } - - - return false; -} -*/ - void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { MI_UNUSED(heap); // TODO: implement this diff --git a/src/bitmap.c b/src/bitmap.c index 1aa0a822..d5578cfb 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -768,7 +768,7 @@ static inline bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { static void mi_chunkmap_split(mi_chunkmap_t es, mi_cmap_t* cmap, mi_epoch_t* epoch) { *cmap = (mi_cmap_t)es; - *epoch = (mi_epoch_t)(es >> 32); + if (epoch!=NULL) { *epoch = (mi_epoch_t)(es >> 32); } } static mi_chunkmap_t mi_chunkmap_join(mi_cmap_t cmap, mi_epoch_t epoch) { @@ -1091,80 +1091,50 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* -------------------------------------------------------------------------------- bitmap try_find_and_clear -------------------------------------------------------------------------------- */ -/* -typedef bool (mi_bitmap_find_fun_t)(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx); - -static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx, 
mi_bitmap_find_fun_t* find_fun) -{ - if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; - - // start chunk index -- todo: can depend on the tseq to decrease contention between threads - MI_UNUSED(tseq); - const size_t chunk_start = 0; - const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; - const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; - - // for each chunkmap entry `i` - for( size_t _i = 0; _i < bitmap->chunk_map_count; _i++) - { - size_t i = (_i + chunk_map_start); - if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; // adjust for the start position - - const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; - mi_epoch_t epoch; - mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &epoch); - if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } // rotate right for the start position (on the first iteration) - - uint32_t cmap_idx; // one bit set of each chunk that may have bits set - size_t cmap_idx_shift = 0; // shift through the cmap - while (mi_bsf32(cmap, &cmap_idx)) { // find least bit that is set - // adjust for the start position - if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } - // set the chunk idx - const size_t chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; - - // try to find and clear N bits in that chunk - if (chunk_idx < mi_bitmap_chunk_count(bitmap)) { // we can have less chunks than in the chunkmap.. - if ((*find_fun)(bitmap, n, chunk_idx, epoch, pidx)) { - return true; - } - } - - // skip to the next bit - cmap_idx_shift += cmap_idx+1; - cmap >>= cmap_idx; // skip scanned bits (and avoid UB for `cmap_idx+1`) - cmap >>= 1; +static inline size_t mi_bitmap_find_hi_chunk(mi_bitmap_t* bitmap) { + size_t hi_chunk_map_idx = 0; + mi_cmap_t hi_cmap = 0; + for (size_t i = 1; i < mi_bitmap_chunk_map_count(bitmap); i++) { + mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, i, NULL); + if (cmap != 0) { + hi_chunk_map_idx = i; + hi_cmap = cmap; } } - - return false; + uint32_t cmap_idx; + if (mi_bsr32(hi_cmap, &cmap_idx)) { + const size_t hi = (hi_chunk_map_idx * MI_CHUNKMAP_BITS) + cmap_idx; + mi_assert_internal(hi < mi_bitmap_chunk_count(bitmap)); + return hi; + } + else { + return 0; + } } -*/ #define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_start = 0; \ + const size_t chunk_start = 0; /* tseq % (1 + mi_bitmap_find_hi_chunk(bitmap)); */ \ const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; \ - const size_t chunk_map_start_idx = chunk_start % MI_CHUNKMAP_BITS; \ + const uint32_t chunk_map_start_idx = (uint32_t)(chunk_start % MI_CHUNKMAP_BITS); \ /* for each chunkmap entry `i` */ \ for (size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { \ size_t i = (_i + chunk_map_start); \ - if (i > bitmap->chunk_map_count) i -= bitmap->chunk_map_count; /* adjust for the start position */ \ + if (i >= bitmap->chunk_map_count) { i -= bitmap->chunk_map_count; } /* adjust for the start position */ \ \ const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; \ mi_epoch_t name_epoch; \ mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &name_epoch); \ - if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); } /* rotate right for the start position (on the first iteration) */ \ + uint32_t cmap_idx_shift = 0; /* shift through the cmap */ \ + if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); cmap_idx_shift = chunk_map_start_idx; } /* rotate 
right for the start position (on the first iteration) */ \ \ uint32_t cmap_idx; /* one bit set of each chunk that may have bits set */ \ - size_t cmap_idx_shift = 0; /* shift through the cmap */ \ while (mi_bsf32(cmap, &cmap_idx)) { /* find least bit that is set */ \ - /* adjust for the start position again */ \ - if (_i == 0) { cmap_idx = (cmap_idx + chunk_map_start_idx) % MI_CHUNKMAP_BITS; } \ /* set the chunk idx */ \ - const size_t name_chunk_idx = chunk_idx0 + cmap_idx + cmap_idx_shift; \ + size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_CHUNKMAP_BITS); \ + if (name_chunk_idx >= mi_bitmap_chunk_count(bitmap)) { name_chunk_idx -= mi_bitmap_chunk_count(bitmap); } \ /* try to find and clear N bits in that chunk */ \ if (name_chunk_idx < mi_bitmap_chunk_count(bitmap)) { /* we can have less chunks than in the chunkmap.. */ @@ -1177,28 +1147,10 @@ static inline bool mi_bitmap_try_find(mi_bitmap_t* bitmap, size_t n, size_t tseq } \ }} -//static bool mi_bitmap_try_find_and_clearN_at(mi_bitmap_t* bitmap, size_t n, size_t chunk_idx, mi_epoch_t epoch, size_t* pidx) { -// size_t cidx; -// if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { -// *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; -// mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); -// return true; -// } -// else { -// // we may find that all are cleared only on a second iteration but that is ok as -// // the chunkmap is a conservative approximation. -// if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { -// mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); -// } -// return false; -// } -//} - // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - // return mi_bitmap_try_find(bitmap, n, tseq, pidx, &mi_bitmap_try_find_and_clearN_at); mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) { size_t cidx; diff --git a/src/bitmap.h b/src/bitmap.h index ca62735b..78ee5380 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -91,7 +91,7 @@ typedef uint32_t mi_cmap_t; // An atomic bitmap typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_map_count; // valid chunk_map's + _Atomic(size_t) chunk_map_count; // valid chunk_maps entries _Atomic(size_t) chunk_count; // total count of chunks size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; diff --git a/src/init.c b/src/init.c index 3dcb68e3..353b0ce4 100644 --- a/src/init.c +++ b/src/init.c @@ -400,7 +400,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } diff --git a/test/test-stress.c b/test/test-stress.c index 0b1b6c8d..61891269 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -343,9 +343,9 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - // mi_debug_show_arenas(true, true, false); + mi_debug_show_arenas(true, true, false); mi_collect(true); - mi_debug_show_arenas(true,true,false); + // mi_debug_show_arenas(true,true,false); #endif // mi_stats_print(NULL); #else From 0616ee151e75329b425dd999104c2c84e2e1c3ae Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 5 Dec 2024 11:29:25 -0800 Subject: [PATCH 033/264] change to full_page_retain --- include/mimalloc.h | 2 +- include/mimalloc/types.h | 4 ++-- src/heap.c | 2 +- src/options.c | 4 ++-- src/page.c | 25 +++++++++++++++---------- 5 files changed, 21 insertions(+), 16 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index b87e8db2..ba426488 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -379,7 +379,7 @@ typedef enum mi_option_e { mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) mi_option_target_segments_per_thread, // experimental (=0) - mi_option_eager_abandon, // eagerly abandon pages from the heap if suitable (to reduce memory footprint in multi-threaded code) + mi_option_full_page_retain, // retain N full pages per size class (=4, lower it to reduce memory footprint in multi-thread applications) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 3d83e27a..348e2aa9 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -305,7 +305,7 @@ typedef struct mi_page_s { #endif _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - + mi_heap_t* heap; // heap this threads belong to. struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` @@ -417,7 +417,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. 
mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages - bool eager_abandon; // `true` if this heap can abandon pages to reduce memory footprint + bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types #if MI_GUARDED size_t guarded_size_min; // minimal size for guarded objects diff --git a/src/heap.c b/src/heap.c index 96342907..833af278 100644 --- a/src/heap.c +++ b/src/heap.c @@ -206,7 +206,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->no_reclaim = noreclaim; - heap->eager_abandon = (!noreclaim && mi_option_is_enabled(mi_option_eager_abandon)); + heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->tag = tag; if (heap == tld->heap_backing) { _mi_random_init(&heap->random); diff --git a/src/options.c b/src/options.c index 1b326cc3..a6d42c58 100644 --- a/src/options.c +++ b/src/options.c @@ -143,7 +143,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 0, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. #if defined(MI_VISIT_ABANDONED) @@ -158,7 +158,7 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. 
- { 1, UNINIT, MI_OPTION(eager_abandon) }, + { 2, UNINIT, MI_OPTION(full_page_retain) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index faef2f48..9b35a4db 100644 --- a/src/page.c +++ b/src/page.c @@ -212,7 +212,7 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(page!=NULL); // collect the thread free list - _mi_page_thread_free_collect(page); + _mi_page_thread_free_collect(page); // and the local free list if (page->local_free != NULL) { @@ -264,7 +264,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) { */ // called from `mi_free` on a reclaim, and fresh_alloc if we get an abandoned page -void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page) +void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); @@ -381,7 +381,7 @@ void _mi_page_unfull(mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(_mi_page_is_valid(page)); mi_assert_internal(mi_page_is_in_full(page)); - mi_assert_internal(!mi_page_heap(page)->eager_abandon); + mi_assert_internal(!mi_page_heap(page)->allow_page_abandon); if (!mi_page_is_in_full(page)) return; mi_heap_t* heap = mi_page_heap(page); @@ -398,7 +398,7 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { mi_assert_internal(!mi_page_is_in_full(page)); mi_heap_t* heap = mi_page_heap(page); - if (heap->eager_abandon) { + if (heap->allow_page_abandon) { // abandon full pages _mi_page_abandon(page, pq); } @@ -761,9 +761,10 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { // search for a best next page to use for at most N pages (often cut short if immediate blocks are available) #define MI_MAX_CANDIDATE_SEARCH (8) +#define MI_MAX_FULL_PAGES_PER_QUEUE (4) // Find a page with free blocks of `page->block_size`. -static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) +static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) { // search through the pages in "next fit" order #if MI_STAT @@ -772,6 +773,7 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p #if MI_MAX_CANDIDATE_SEARCH > 1 size_t candidate_count = 0; // we reset this on the first candidate to limit the search #endif + size_t full_page_count = 0; mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -797,8 +799,11 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p // if the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often. 
if (!immediate_available && !mi_page_is_expandable(page)) { - mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); - mi_page_to_full(page, pq); + full_page_count++; + if (full_page_count > MI_MAX_FULL_PAGES_PER_QUEUE) { + mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); + mi_page_to_full(page, pq); + } } else { // the page has free space, make it a candidate @@ -807,8 +812,8 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p page_candidate = page; candidate_count = 0; } - else if (mi_page_all_free(page_candidate)) { - _mi_page_free(page_candidate, pq); + else if (mi_page_all_free(page_candidate)) { + _mi_page_free(page_candidate, pq); page_candidate = page; } else if (page->used >= page_candidate->used) { // && !mi_page_is_mostly_used(page)) { @@ -1000,7 +1005,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_heap_is_initialized(heap)); // call potential deferred free routines - _mi_deferred_free(heap, false); + // _mi_deferred_free(heap, false); // free delayed frees from other threads (but skip contended ones) // _mi_heap_delayed_free_partial(heap); From 7443ee317e189937118c93157eb7b70125ad60a3 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 5 Dec 2024 17:00:23 -0800 Subject: [PATCH 034/264] tune free-ing and abandoning --- include/mimalloc.h | 7 +- include/mimalloc/internal.h | 5 +- include/mimalloc/types.h | 6 +- src/bitmap.c | 8 +-- src/free.c | 124 +++++++++++++++++++++--------------- src/heap.c | 14 ++++ src/init.c | 3 +- src/options.c | 5 +- src/page-map.c | 15 +++-- src/page.c | 31 ++++----- 10 files changed, 125 insertions(+), 93 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index ba426488..907ffadb 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 188 // major + 2 digits minor +#define MI_MALLOC_VERSION 300 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes @@ -369,7 +369,6 @@ typedef enum mi_option_e { mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`) mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10) mi_option_purge_extend_delay, - mi_option_abandoned_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
(only on windows) mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0) @@ -379,7 +378,9 @@ typedef enum mi_option_e { mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) mi_option_target_segments_per_thread, // experimental (=0) - mi_option_full_page_retain, // retain N full pages per size class (=4, lower it to reduce memory footprint in multi-thread applications) + mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) + mi_option_full_page_retain, // retain N full pages per size class (=2) + mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index cb689877..3a8b272e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -27,6 +27,8 @@ terms of the MIT license. A copy of the license can be found in the file #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning +#pragma warning(disable:28159) // don't use GetVersion +#pragma warning(disable:4996) // don't use GetVersion #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_align(a) __declspec(align(a)) @@ -169,6 +171,7 @@ void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current); */ // "page-map.c" +bool _mi_page_map_init(void); void _mi_page_map_register(mi_page_t* page); void _mi_page_map_unregister(mi_page_t* page); @@ -638,7 +641,7 @@ static inline bool mi_page_is_mostly_used(const mi_page_t* page) { return (page->reserved - page->used <= frac); } -// is less than 1/n'th of a page free? +// is more than (n-1)/n'th of a page in use? static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { if (page==NULL) return true; uint16_t frac = page->reserved / n; diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 348e2aa9..d4c37c37 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -12,10 +12,8 @@ terms of the MIT license. A copy of the license can be found in the file // This file contains the main type definitions for mimalloc: // mi_heap_t : all data for a thread-local heap, contains // lists of all managed heap pages. -// mi_segment_t : a larger chunk of memory (32GiB) from where pages -// are allocated. // mi_page_t : a mimalloc page (usually 64KiB or 512KiB) from -// where objects are allocated. +// where objects of a single size are allocated. // Note: we write "OS page" for OS memory pages while // using plain "page" for mimalloc pages (`mi_page_t`). // -------------------------------------------------------------------------- @@ -417,7 +415,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. 
mi_heap_t* next; // list of heaps per thread bool no_reclaim; // `true` if this heap should not reclaim abandoned pages - bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint + bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types #if MI_GUARDED size_t guarded_size_min; // minimal size for guarded objects diff --git a/src/bitmap.c b/src/bitmap.c index d5578cfb..f25c91ac 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -861,10 +861,10 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) if (!already_zero) { _mi_memzero_aligned(bitmap, size); } - bitmap->chunk_map_count = _mi_divide_up(chunk_count, MI_CHUNKMAP_BITS); - mi_assert_internal(bitmap->chunk_map_count <= MI_BITMAP_MAX_CHUNKMAPS); - bitmap->chunk_count = chunk_count; - mi_assert_internal(bitmap->chunk_map_count <= MI_BITMAP_MAX_CHUNK_COUNT); + mi_atomic_store_release(&bitmap->chunk_map_count, _mi_divide_up(chunk_count, MI_CHUNKMAP_BITS)); + mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_map_count) <= MI_BITMAP_MAX_CHUNKMAPS); + mi_atomic_store_release(&bitmap->chunk_count, chunk_count); + mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); return size; } diff --git a/src/free.c b/src/free.c index 1e07dbd2..0ff4bf60 100644 --- a/src/free.c +++ b/src/free.c @@ -23,9 +23,6 @@ static void mi_stat_free(const mi_page_t* page, const mi_block_t* block); // Free // ------------------------------------------------------ -// forward declaration of multi-threaded free (`_mt`) (or free in huge block if compiled with MI_HUGE_PAGE_ABANDON) -static mi_decl_noinline void mi_free_block_mt(mi_page_t* page, mi_block_t* block); - // regular free of a (thread local) block pointer // fast path written carefully to prevent spilling on the stack static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool track_stats, bool check_full) @@ -50,6 +47,40 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } } +// Forward declaration for multi-threaded collect +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page); + +// Free a block multi-threaded +static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) +{ + // adjust stats (after padding check and potentially recursive `mi_free` above) + mi_stat_free(page, block); // stat_free may access the padding + mi_track_free_size(block, mi_page_usable_size_of(page, block)); + + // _mi_padding_shrink(page, block, sizeof(mi_block_t)); +#if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading + size_t dbgsize = mi_usable_size(block); + if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } + _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); +#endif + + // push atomically on the page thread free list + mi_thread_free_t tf_new; + mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); + do { + mi_block_set_next(page, block, mi_tf_block(tf_old)); + tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); + + // and atomically try to collect the page if it was abandoned + const bool is_owned_now = !mi_tf_is_owned(tf_old); + if (is_owned_now) { + mi_assert_internal(mi_page_is_abandoned(page)); + mi_free_try_collect_mt(page); + } +} 
+ + // Adjust a block that was allocated aligned, to the actual start of the block in the page. // note: this can be called from `mi_free_generic_mt` where a non-owning thread accesses the // `page_start` and `block_size` fields; however these are constant and the page won't be @@ -81,6 +112,7 @@ static inline void mi_block_check_unguard(mi_page_t* page, mi_block_t* block, vo } #endif + // free a local pointer (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_attr_noexcept { mi_block_t* const block = (mi_page_has_aligned(page) ? _mi_page_ptr_unalign(page, p) : (mi_block_t*)p); @@ -101,6 +133,7 @@ void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) else mi_free_generic_mt(page,p); } + // Get the segment data belonging to a pointer // This is just a single `and` in release mode but does further checks in debug mode // (and secure mode) to see if this was a valid pointer. @@ -142,8 +175,16 @@ void mi_free(void* p) mi_attr_noexcept } } else { - // not thread-local; use generic path - mi_free_generic_mt(page, p); + // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) + if mi_likely(page->flags.full_aligned == 0) { + // blocks are aligned (and not a full page) + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_mt(page,block); + } + else { + // page is full or contains (inner) aligned blocks; use generic multi-thread path + mi_free_generic_mt(page, p); + } } } @@ -152,40 +193,11 @@ void mi_free(void* p) mi_attr_noexcept // Multi-threaded Free (`_mt`) // ------------------------------------------------------ -static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page); -// Push a block that is owned by another thread (or abandoned) on its page-local thread free list. -static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block) -{ - // adjust stats (after padding check and potentially recursive `mi_free` above) - mi_stat_free(page, block); // stat_free may access the padding - mi_track_free_size(block, mi_page_usable_size_of(page, block)); - - // _mi_padding_shrink(page, block, sizeof(mi_block_t)); - #if (MI_DEBUG>0) && !MI_TRACK_ENABLED && !MI_TSAN // note: when tracking, cannot use mi_usable_size with multi-threading - size_t dbgsize = mi_usable_size(block); - if (dbgsize > MI_MiB) { dbgsize = MI_MiB; } - _mi_memset_aligned(block, MI_DEBUG_FREED, dbgsize); - #endif - - // push atomically on the page thread free list - mi_thread_free_t tf_new; - mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); - do { - mi_block_set_next(page, block, mi_tf_block(tf_old)); - tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); - - // and atomically reclaim the page if it was abandoned - bool reclaimed = !mi_tf_is_owned(tf_old); - if (reclaimed) { - mi_free_try_reclaim_mt(page); - } -} - -static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); + // we own the page now.. // safe to collect the thread atomic free list _mi_page_free_collect(page, false); // update `used` count @@ -202,16 +214,10 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { _mi_arena_page_free(page); return; } - // 2. 
if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - else if (!mi_page_is_mostly_used(page) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page - !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && - _mi_arena_page_try_reabandon_to_mapped(page)) - { - return; - } - // 3. if the page is not too full, we can try to reclaim it for ourselves - else if (_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0 && - !mi_page_is_mostly_used(page)) + + // 2. if the page is not too full, we can try to reclaim it for ourselves + if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && + !mi_page_is_used_at_frac(page,8)) { // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again @@ -222,20 +228,32 @@ static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) { if (heap != (mi_heap_t*)&_mi_heap_empty) // we did not already terminate our thread (can this happen? { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types + if ((tagheap != NULL) && // don't reclaim across heap object types + (!tagheap->no_reclaim) && // we are allowed to reclaim abandoned pages (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); - _mi_heap_page_reclaim(tagheap, page); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); - return; + { + if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arena_page_unabandon(page); + _mi_heap_page_reclaim(tagheap, page); + _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); + return; + } } } } + // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations + if (!mi_page_is_used_at_frac(page, 4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && + _mi_arena_page_try_reabandon_to_mapped(page)) + { + return; + } + + // not reclaimed or free'd, unown again _mi_page_unown(page); } diff --git a/src/heap.c b/src/heap.c index 833af278..2ff40930 100644 --- a/src/heap.c +++ b/src/heap.c @@ -208,6 +208,20 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool heap->no_reclaim = noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->tag = tag; + + #if defined(WIN32) && (MI_ARCH_X64 || MI_ARCH_X86) + // disallow reclaim for threads running in the windows threadpool + const DWORD winVersion = GetVersion(); + const DWORD winMajorVersion = (DWORD)(LOBYTE(LOWORD(winVersion))); + if (winMajorVersion >= 6) { + _TEB* const teb = NtCurrentTeb(); + void* const poolData = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 
0x0F90 : 0x1778))); + if (poolData != NULL) { + heap->no_reclaim = true; + } + } + #endif + if (heap == tld->heap_backing) { _mi_random_init(&heap->random); } diff --git a/src/init.c b/src/init.c index 353b0ce4..64b31e1b 100644 --- a/src/init.c +++ b/src/init.c @@ -400,7 +400,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } @@ -619,6 +619,7 @@ void mi_process_init(void) mi_attr_noexcept { mi_detect_cpu_features(); _mi_os_init(); + _mi_page_map_init(); _mi_arena_init(); mi_heap_main_init(); #if MI_DEBUG diff --git a/src/options.c b/src/options.c index a6d42c58..f2e9297f 100644 --- a/src/options.c +++ b/src/options.c @@ -143,7 +143,6 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, - { 1, UNINIT, MI_OPTION(abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. #if defined(MI_VISIT_ABANDONED) @@ -158,7 +157,9 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. + { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(full_page_retain) }, + { 4, UNINIT, MI_OPTION(max_page_candidates) }, }; static void mi_option_init(mi_option_desc_t* desc); @@ -189,7 +190,7 @@ void _mi_options_init(void) { } } _mi_verbose_message("guarded build: %s\n", mi_option_get(mi_option_guarded_sample_rate) != 0 ? 
"enabled" : "disabled"); - #endif + #endif } long _mi_option_get_fast(mi_option_t option) { diff --git a/src/page-map.c b/src/page-map.c index 35a22d8d..25693064 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -13,9 +13,9 @@ mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; -static mi_bitmap_t mi_page_map_commit; +static mi_bitmap_t mi_page_map_commit = { 1, MI_BITMAP_MIN_CHUNK_COUNT }; -static bool mi_page_map_init(void) { +bool _mi_page_map_init(void) { size_t vbits = _mi_os_virtual_address_bits(); if (vbits >= 48) vbits = 47; // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) @@ -23,7 +23,7 @@ static bool mi_page_map_init(void) { const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_MIN_BIT_COUNT); - mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); + // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); mi_page_map_all_committed = false; // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); @@ -57,11 +57,15 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { bool is_zero; uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); const size_t size = mi_page_map_entries_per_commit_bit; - _mi_os_commit(start, size, &is_zero, NULL); + _mi_os_commit(start, size, &is_zero, NULL); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, i, 1, NULL); } } + #if MI_DEBUG > 0 + _mi_page_map[idx] = 0; + _mi_page_map[idx+slice_count-1] = 0; + #endif } } @@ -78,8 +82,9 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* void _mi_page_map_register(mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_internal(_mi_is_aligned(page,MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! if mi_unlikely(_mi_page_map == NULL) { - if (!mi_page_map_init()) return; + if (!_mi_page_map_init()) return; } mi_assert(_mi_page_map!=NULL); uint8_t* page_start; diff --git a/src/page.c b/src/page.c index 9b35a4db..056c9506 100644 --- a/src/page.c +++ b/src/page.c @@ -758,11 +758,6 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { Find pages with free blocks -------------------------------------------------------------*/ -// search for a best next page to use for at most N pages (often cut short if immediate blocks are available) -#define MI_MAX_CANDIDATE_SEARCH (8) - -#define MI_MAX_FULL_PAGES_PER_QUEUE (4) - // Find a page with free blocks of `page->block_size`. 
static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* pq, bool first_try) { @@ -770,10 +765,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m #if MI_STAT size_t count = 0; #endif - #if MI_MAX_CANDIDATE_SEARCH > 1 - size_t candidate_count = 0; // we reset this on the first candidate to limit the search - #endif - size_t full_page_count = 0; + long candidate_limit = 0; // we reset this on the first candidate to limit the search + long full_page_retain = _mi_option_get_fast(mi_option_full_page_retain); mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -783,14 +776,11 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m #if MI_STAT count++; #endif - #if MI_MAX_CANDIDATE_SEARCH > 1 - candidate_count++; - #endif - + candidate_limit--; + // collect freed blocks by us and other threads _mi_page_free_collect(page, false); - #if MI_MAX_CANDIDATE_SEARCH > 1 // search up to N pages for a best candidate // is the local free list non-empty? @@ -799,8 +789,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // if the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often. if (!immediate_available && !mi_page_is_expandable(page)) { - full_page_count++; - if (full_page_count > MI_MAX_FULL_PAGES_PER_QUEUE) { + full_page_retain--; + if (full_page_retain < 0) { mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); mi_page_to_full(page, pq); } @@ -810,7 +800,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages) if (page_candidate == NULL) { page_candidate = page; - candidate_count = 0; + candidate_limit = _mi_option_get_fast(mi_option_max_page_candidates); } else if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); @@ -820,13 +810,14 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate - if (immediate_available || candidate_count > MI_MAX_CANDIDATE_SEARCH) { + if (immediate_available || candidate_limit <= 0) { mi_assert_internal(page_candidate!=NULL); break; } } - #else - // first-fit algorithm + + #if 0 + // first-fit algorithm without candidates // If the page contains free blocks, we are done if (mi_page_immediate_available(page) || mi_page_is_expandable(page)) { break; // pick this one From ec9c61c066d46ad998028d83e984ff33a5fb5470 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 14:53:24 -0800 Subject: [PATCH 035/264] initial no more pairmap --- include/mimalloc/internal.h | 7 +- include/mimalloc/types.h | 8 +- src/arena.c | 66 +-- src/bitmap.c | 937 ++++++++++++++---------------------- src/bitmap.h | 158 +++--- 5 files changed, 465 insertions(+), 711 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3a8b272e..d9c2cd6e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -700,7 +700,9 @@ static inline bool mi_page_try_claim_ownership(mi_page_t* page) { return ((old&1)==0); } -static inline void _mi_page_unown(mi_page_t* page) { +// release ownership of a page. 
This may free the page if all blocks were concurrently +// freed in the meantime. Returns true if the page was freed. +static inline bool _mi_page_unown(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); mi_thread_free_t tf_new; @@ -712,13 +714,14 @@ static inline void _mi_page_unown(mi_page_t* page) { if (mi_page_all_free(page)) { // it may become free just before unowning it _mi_arena_page_unabandon(page); _mi_arena_page_free(page); - return; + return true; } tf_old = mi_atomic_load_relaxed(&page->xthread_free); } mi_assert_internal(mi_tf_block(tf_old)==NULL); tf_new = mi_tf_create(NULL, false); } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); + return false; } //----------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d4c37c37..d78dbc59 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -117,16 +117,16 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) #endif #endif -#ifndef MI_BITMAP_CHUNK_BITS_SHIFT -#define MI_BITMAP_CHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) +#ifndef MI_BCHUNK_BITS_SHIFT +#define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) #endif -#define MI_BITMAP_CHUNK_BITS (1 << MI_BITMAP_CHUNK_BITS_SHIFT) +#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) #define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) #define MI_ARENA_MIN_OBJ_SLICES (1) -#define MI_ARENA_MAX_OBJ_SLICES (MI_BITMAP_CHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) +#define MI_ARENA_MAX_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE) #define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE) diff --git a/src/arena.c b/src/arena.c index 79a52c4d..fd609fe0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -48,7 +48,7 @@ typedef struct mi_arena_s { mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t* slices_purge; // can the slice be purged? (slice in purge => slice in free) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? 
- mi_pairmap_t pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages // followed by the bitmaps (whose size depends on the arena size) } mi_arena_t; @@ -476,16 +476,24 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2) { - mi_arena_t* arena = (mi_arena_t*)arg1; - mi_subproc_t* subproc = (mi_subproc_t*)arg2; - +static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2, bool* keep_abandoned) { // found an abandoned page of the right size - // it is set busy for now so we can read safely even with concurrent mi_free reclaiming - // try to claim ownership atomically - mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); - if (subproc != page->subproc) return false; - if (!mi_page_try_claim_ownership(page)) return false; + mi_arena_t* const arena = (mi_arena_t*)arg1; + mi_subproc_t* const subproc = (mi_subproc_t*)arg2; + mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + // can we claim ownership? + if (!mi_page_try_claim_ownership(page)) { + *keep_abandoned = true; + return false; + } + if (subproc != page->subproc) { + // wrong sub-process.. we need to unown again, and perhaps not keep it abandoned + const bool freed = _mi_page_unown(page); + *keep_abandoned = !freed; + return false; + } + // yes, we can reclaim it + *keep_abandoned = false; return true; } @@ -505,10 +513,10 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_forall_arenas(req_arena_id, allow_large, tseq, arena_id, arena) { size_t slice_index; - mi_pairmap_t* const pairmap = &arena->pages_abandoned[bin]; + mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; - if (mi_pairmap_try_find_and_set_busy(pairmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { - // found an abandoned page of the right size + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { + // found an abandoned page of the right size // and claimed ownership. 
mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); mi_assert_internal(mi_page_is_owned(page)); @@ -528,7 +536,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(!mi_page_is_full(page)); return page; - } + } } mi_forall_arenas_end(); return NULL; @@ -694,7 +702,7 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - mi_assert_internal(mi_pairmap_is_clear(&arena->pages_abandoned[bin], slice_index)); + mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); } #endif @@ -728,8 +736,8 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); - bool were_zero = mi_pairmap_set(&arena->pages_abandoned[bin], slice_index); - MI_UNUSED(were_zero); mi_assert_internal(were_zero); + const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); + MI_UNUSED(wasclear); mi_assert_internal(wasclear); mi_atomic_increment_relaxed(&subproc->abandoned_count[bin]); } else { @@ -783,7 +791,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done - mi_pairmap_clear_once_not_busy(&arena->pages_abandoned[bin], slice_index); + mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } @@ -956,12 +964,12 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* } static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_base) { - if (slice_count == 0) slice_count = MI_BITMAP_CHUNK_BITS; - mi_assert_internal((slice_count % MI_BITMAP_CHUNK_BITS) == 0); - const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BITMAP_CHUNK_SIZE); - const size_t bitmaps_size = 4 * mi_bitmap_size(slice_count,NULL); - const size_t pairmaps_size = MI_BIN_COUNT * 2 * mi_bitmap_size(slice_count,NULL); - const size_t size = base_size + bitmaps_size + pairmaps_size; + if (slice_count == 0) slice_count = MI_BCHUNK_BITS; + mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0); + const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BCHUNK_SIZE); + const size_t bitmaps_count = 4 + MI_BIN_COUNT; // free, commit, dirty, purge, and abandonded + const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count,NULL); + const size_t size = base_size + bitmaps_size; const size_t os_page_size = _mi_os_page_size(); const size_t info_size = _mi_align_up(size, os_page_size) + os_page_size; // + guard page @@ -992,7 +1000,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } - const size_t slice_count = _mi_align_down(size / MI_ARENA_SLICE_SIZE, MI_BITMAP_CHUNK_BITS); + const size_t slice_count = _mi_align_down(size / MI_ARENA_SLICE_SIZE, MI_BCHUNK_BITS); if (slice_count > MI_BITMAP_MAX_BIT_COUNT) { // 16 GiB for now // todo: allow larger areas (either by splitting it up in 
arena's or having larger arena's) _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_slices(MI_BITMAP_MAX_BIT_COUNT)/MI_MiB); @@ -1034,7 +1042,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); arena->slices_purge = mi_arena_bitmap_init(slice_count,&base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { - mi_pairmap_init(&arena->pages_abandoned[i], mi_arena_bitmap_init(slice_count, &base), mi_arena_bitmap_init(slice_count, &base)); + arena->pages_abandoned[i] = mi_arena_bitmap_init(slice_count,&base); } mi_assert_internal(mi_size_of_slices(info_slices) >= (size_t)(base - mi_arena_start(arena))); @@ -1112,9 +1120,9 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { - char buf[MI_BITMAP_CHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); - mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; - for (size_t j = 0, k = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); + mi_bchunk_t* chunk = &bitmap->chunks[i]; + for (size_t j = 0, k = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { buf[k++] = '\n'; _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix); diff --git a/src/bitmap.c b/src/bitmap.c index f25c91ac..7df46070 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -77,50 +77,41 @@ static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { } // Clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0. -static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx) { +// `all_clear` is set if the new bfield is zero. +static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx); +//} + +static inline bool mi_bchunk_set(mi_bchunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - mi_assert_internal(idx < MI_BFIELD_BITS-1); - mi_assert_internal((idx%2)==0); - return mi_bfield_atomic_xset2(set, &chunk->bfields[i], idx, all_already_xset); + return mi_bfield_atomic_set(&chunk->bfields[i], idx); } -static inline bool mi_bitmap_chunk_set2(mi_bitmap_chunk_t* chunk, size_t cidx, bool* all_already_set) { - return mi_bitmap_chunk_xset2(MI_BIT_SET, chunk, cidx, all_already_set); -} - -static inline bool mi_bitmap_chunk_clear2(mi_bitmap_chunk_t* chunk, size_t cidx, bool* all_already_clear) { - return mi_bitmap_chunk_xset2(MI_BIT_CLEAR, chunk, cidx, all_already_clear); +static inline bool mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_clear(&chunk->bfields[i], idx, maybe_all_clear); } // Set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). 
-static bool mi_bitmap_chunk_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* pall_already_xset) { - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); +static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_xset) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; - size_t all_already_xset = 0; + size_t total_already_xset = 0; size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; while (n > 0) { size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field if (m > n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(field < MI_BCHUNK_FIELDS); const mi_bfield_t mask = mi_bfield_mask(m, idx); size_t already_xset = 0; const bool transition = mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset); - if (already_xset > 0 && transition) { - _mi_error_message(EFAULT, "ouch\n"); - } + mi_assert_internal((transition && already_xset == m) || (!transition && already_xset > 0)); all_transition = all_transition && transition; - all_already_xset += already_xset; + total_already_xset += already_xset; // next field field++; idx = 0; n -= m; } - if (pall_already_xset!=NULL) { *pall_already_xset = all_already_xset; } + if (palready_xset!=NULL) { *palready_xset = total_already_xset; } return all_transition; } -static inline bool mi_bitmap_chunk_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { - return mi_bitmap_chunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); +static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { + return mi_bchunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); } -static inline bool mi_bitmap_chunk_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, size_t* already_clear) { - return mi_bitmap_chunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, already_clear); +static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_clear) { + return mi_bchunk_xsetN(MI_BIT_CLEAR, chunk, cidx, n, already_clear); } -// check if a pair of bits is set/clear -static inline bool mi_bitmap_chunk_is_xset2(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - mi_assert_internal(idx < MI_BFIELD_BITS-1); - mi_assert_internal((idx%2)==0); - const size_t mask = (mi_bfield_t)0x03 << idx; - return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mask); -} - -static inline bool mi_bitmap_chunk_is_set2(mi_bitmap_chunk_t* chunk, size_t cidx) { - return mi_bitmap_chunk_is_xset2(MI_BIT_SET, chunk, cidx); -} - -static inline bool mi_bitmap_chunk_is_clear2(mi_bitmap_chunk_t* chunk, size_t cidx) { - return mi_bitmap_chunk_is_xset2(MI_BIT_CLEAR, chunk, cidx); -} +// ------ is_xset -------- // Check if a sequence of `n` bits within a chunk are all set/cleared. 
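The per-field walk that `mi_bchunk_xsetN` performs (and that the range checks below repeat) can be looked at in isolation: a run of `n` bits starting at `cidx` is split into one mask per bit field. A simplified standalone sketch; the field width and names are assumptions, not the patch's definitions:

#include <stdio.h>
#include <stddef.h>

#define FIELD_BITS  (8 * sizeof(size_t))

// a mask of `m` bits starting at `idx` (0 < m, idx + m <= FIELD_BITS)
static size_t mask_of(size_t m, size_t idx) {
  return (m >= FIELD_BITS ? ~(size_t)0 : (((size_t)1 << m) - 1)) << idx;
}

int main(void) {
  size_t cidx = 60, n = 100;                  // example run that crosses field boundaries
  size_t idx = cidx % FIELD_BITS, field = cidx / FIELD_BITS;
  while (n > 0) {
    size_t m = FIELD_BITS - idx;              // bits handled in this field
    if (m > n) { m = n; }
    printf("field %zu: mask 0x%zx\n", field, mask_of(m, idx));
    field++; idx = 0; n -= m;                 // continue in the next field
  }
  return 0;
}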
-static bool mi_bitmap_chunk_is_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); +static bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; @@ -378,7 +363,7 @@ static bool mi_bitmap_chunk_is_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, si size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field if (m > n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(field < MI_BCHUNK_FIELDS); const size_t mask = mi_bfield_mask(m, idx); if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mask)) { return false; @@ -392,71 +377,91 @@ static bool mi_bitmap_chunk_is_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, si } +// ------ try_xset -------- -static inline bool mi_bitmap_chunk_try_xset(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); +static inline bool mi_bchunk_try_xset(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; return mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx); } -static inline bool mi_bitmap_chunk_try_set(mi_bitmap_chunk_t* chunk, size_t cidx) { - return mi_bitmap_chunk_try_xset(MI_BIT_SET, chunk, cidx); +static inline bool mi_bchunk_try_set(mi_bchunk_t* chunk, size_t cidx) { + return mi_bchunk_try_xset(MI_BIT_SET, chunk, cidx); } -static inline bool mi_bitmap_chunk_try_clear(mi_bitmap_chunk_t* chunk, size_t cidx) { - return mi_bitmap_chunk_try_xset(MI_BIT_CLEAR, chunk, cidx); +static inline bool mi_bchunk_try_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + return mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, maybe_all_clear); } -static inline bool mi_bitmap_chunk_try_xset8(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx) { - mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + +//static inline bool mi_bchunk_try_xset8(mi_xset_t set, mi_bchunk_t* chunk, size_t byte_idx) { +// mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); +// const size_t i = byte_idx / MI_BFIELD_SIZE; +// const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; +// return mi_bfield_atomic_try_xset8(set, &chunk->bfields[i], ibyte_idx); +//} + +static inline bool mi_bchunk_try_set8(mi_bchunk_t* chunk, size_t byte_idx) { + mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); const size_t i = byte_idx / MI_BFIELD_SIZE; const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_xset8(set, &chunk->bfields[i], ibyte_idx); + return mi_bfield_atomic_try_set8(&chunk->bfields[i], ibyte_idx); } -static inline bool mi_bitmap_chunk_try_set8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { - return mi_bitmap_chunk_try_xset8(MI_BIT_SET, chunk, byte_idx); +static inline bool mi_bchunk_try_clear8(mi_bchunk_t* chunk, size_t byte_idx, bool* maybe_all_clear) { + mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_clear8(&chunk->bfields[i], ibyte_idx, maybe_all_clear); } -static 
inline bool mi_bitmap_chunk_try_clear8(mi_bitmap_chunk_t* chunk, size_t byte_idx) { - return mi_bitmap_chunk_try_xset8(MI_BIT_CLEAR, chunk, byte_idx); -} // Try to atomically set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. -static bool mi_bitmap_chunk_try_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); +static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; size_t start_idx = cidx % MI_BFIELD_BITS; size_t start_field = cidx / MI_BFIELD_BITS; - size_t end_field = MI_BITMAP_CHUNK_FIELDS; + size_t end_field = MI_BCHUNK_FIELDS; mi_bfield_t mask_mid = 0; mi_bfield_t mask_end = 0; + bool field_is_clear; + bool maybe_all_clear = true; + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = false; } // first field size_t field = start_field; size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field if (m > n) { m = n; } mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); - mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(start_field < MI_BCHUNK_FIELDS); const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start)) return false; + if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start, &field_is_clear)) return false; + maybe_all_clear = maybe_all_clear && field_is_clear; // done? n -= m; - if (n==0) return true; + if (n==0) { + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; } + return true; + } // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields // mid fields while (n >= MI_BFIELD_BITS) { field++; - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(field < MI_BCHUNK_FIELDS); mask_mid = mi_bfield_all_set(); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore; + if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid, &field_is_clear)) goto restore; + maybe_all_clear = maybe_all_clear && field_is_clear; n -= MI_BFIELD_BITS; } @@ -464,12 +469,14 @@ static bool mi_bitmap_chunk_try_xsetN(mi_xset_t set, mi_bitmap_chunk_t* chunk, s if (n > 0) { mi_assert_internal(n < MI_BFIELD_BITS); field++; - mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(field < MI_BCHUNK_FIELDS); end_field = field; mask_end = mi_bfield_mask(n, 0); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end)) goto restore; + if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end, &field_is_clear)) goto restore; + maybe_all_clear = maybe_all_clear && field_is_clear; } + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; } return true; restore: @@ -483,14 +490,23 @@ restore: return false; } -static inline bool mi_bitmap_chunk_try_setN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - return mi_bitmap_chunk_try_xsetN(MI_BIT_SET, chunk, cidx, n); +static inline bool mi_bchunk_try_setN(mi_bchunk_t* chunk, size_t cidx, size_t n) { + return mi_bchunk_try_xsetN(MI_BIT_SET, chunk, cidx, n, NULL); } -static inline bool mi_bitmap_chunk_try_clearN(mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { - return 
mi_bitmap_chunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n); +static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { + return mi_bchunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n, maybe_all_clear); } +static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); +} + +// ------ find_and_try_xset -------- + #if defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { return _mm256_setzero_si256(); @@ -507,10 +523,10 @@ static inline bool mi_mm256_is_zero( __m256i vec) { #endif // find least 0/1-bit in a chunk and try to set/clear it atomically -// set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) { -#if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) +static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk, size_t* pidx) { +#if defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0) @@ -519,18 +535,18 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_ch if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); size_t cidx; if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } } // try again } -#elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) +#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; #if 1 @@ -559,24 +575,24 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_ch mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
const size_t chunk_idx = _tzcnt_u64(mask) / 8; #endif - mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); size_t cidx; if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } } // try again } #else - for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { size_t idx; if mi_unlikely(mi_bfield_find_least_to_xset(set, chunk->bfields[i], &idx)) { // find least 0-bit if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx)) { // try to set it atomically *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } } @@ -585,38 +601,38 @@ static inline bool mi_bitmap_chunk_find_and_try_xset(mi_xset_t set, mi_bitmap_ch #endif } -static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { - return mi_bitmap_chunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); +static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) { + return mi_bchunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); } -static inline bool mi_bitmap_chunk_find_and_try_set(mi_bitmap_chunk_t* chunk, size_t* pidx) { - return mi_bitmap_chunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); +static inline bool mi_bchunk_find_and_try_set(mi_bchunk_t* chunk, size_t* pidx) { + return mi_bchunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); } // find least byte in a chunk with all bits set, and try unset it atomically -// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // todo: try neon version -static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) +static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) while(true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi8(vec, mi_mm256_ones()); // (byte == ~0 ? 
-1 : 0) const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte if (mask == 0) return false; const size_t i = _tzcnt_u32(mask); - mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(8*i < MI_BCHUNK_BITS); const size_t chunk_idx = i / MI_BFIELD_SIZE; const size_t byte_idx = i % MI_BFIELD_SIZE; if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } // try again } #else - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { const mi_bfield_t x = chunk->bfields[i]; // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F @@ -627,9 +643,9 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); mi_assert_internal((idx%8)==0); const size_t byte_idx = idx/8; - if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[i],byte_idx,NULL)) { // unset the byte atomically *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; } // else continue @@ -642,11 +658,11 @@ static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, // and try to clear them atomically. -// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. -static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { +// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { mi_bfield_t b = chunk->bfields[i]; size_t bshift = 0; size_t idx; @@ -657,10 +673,10 @@ static bool mi_bitmap_chunk_find_and_try_clearNX(mi_bitmap_chunk_t* chunk, size_ if ((b&mask) == mask) { // found a match mi_assert_internal( ((mask << bshift) >> bshift) == mask ); - if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i],mask<bfields[i],mask< MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk - // if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); +// set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + // if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); // we align an a field, and require `field_count` fields to be all clear. 
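The `has_set8` expression in the scalar path above is a SWAR variant of the classic has-zero-byte trick applied to `~x`, so it flags bytes of `x` that equal 0xFF; only the lowest flagged byte is guaranteed exact, which suffices because only the least one is taken. The full expression is elided by the hunk context here, so the standalone illustration below shows one equivalent formulation (assuming 64-bit fields), not necessarily the patch's exact one:

#include <assert.h>
#include <stdint.h>

#define LO_BIT8  (~(uint64_t)0 / 0xFF)    // 0x0101010101010101
#define HI_BIT8  (LO_BIT8 << 7)           // 0x8080808080808080

// low bit of each byte set iff the corresponding byte of x is 0xFF
// (flags above the lowest match may be spurious; callers only use the least one)
static uint64_t has_set8(uint64_t x) {
  return ((~x - LO_BIT8) & x & HI_BIT8) >> 7;
}

int main(void) {
  assert(has_set8(0x00FF0000000000FFu) == 0x0001000000000001u);
  assert(has_set8(0x7F7F7F7F7F7F7F7Fu) == 0);
  assert(has_set8(~(uint64_t)0) == LO_BIT8);   // every byte is 0xFF
  return 0;
}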
// n >= MI_BFIELD_BITS; find a first field that is 0 const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields - for (size_t i = 0; i <= MI_BITMAP_CHUNK_FIELDS - field_count; i++) + for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++) { // first pre-scan for a range of fields that are all set bool allset = true; size_t j = 0; do { - mi_assert_internal(i + j < MI_BITMAP_CHUNK_FIELDS); + mi_assert_internal(i + j < MI_BCHUNK_FIELDS); mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); if (~b != 0) { allset = false; @@ -708,11 +724,11 @@ static bool mi_bitmap_chunk_find_and_try_clearN_(mi_bitmap_chunk_t* chunk, size_ // if all set, we can try to atomically clear them if (allset) { const size_t cidx = i*MI_BFIELD_BITS; - if (mi_bitmap_chunk_try_clearN(chunk, cidx, n)) { + if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) { // we cleared all atomically *pidx = cidx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(*pidx + n <= MI_BITMAP_CHUNK_BITS); + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); return true; } } @@ -721,87 +737,43 @@ static bool mi_bitmap_chunk_find_and_try_clearN_(mi_bitmap_chunk_t* chunk, size_ } -static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bitmap_chunk_find_and_try_clear(chunk, pidx); - if (n==8) return mi_bitmap_chunk_find_and_try_clear8(chunk, pidx); - if (n == 0 || n > MI_BITMAP_CHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bitmap_chunk_find_and_try_clearNX(chunk, n, pidx); - return mi_bitmap_chunk_find_and_try_clearN_(chunk, n, pidx); +static inline bool mi_bchunk_find_and_try_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); + if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); + return mi_bchunk_find_and_try_clearN_(chunk, n, pidx); } -// are all bits in a bitmap chunk set? -// static inline bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { -// #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) -// const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); -// return _mm256_test_all_ones(vec); -// #else -// // written like this for vectorization -// mi_bfield_t x = chunk->bfields[0]; -// for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { -// x = x & chunk->bfields[i]; -// } -// return (~x == 0); -// #endif -// } -// are all bits in a bitmap chunk clear? -static inline bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { - #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - return mi_mm256_is_zero(vec); - #elif defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==512) - const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); - if (!mi_mm256_is_zero(vec1)) return false; - const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - return (mi_mm256_is_zero(vec2)); - #else - for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - if (chunk->bfields[i] != 0) return false; +// are all bits in a bitmap chunk clear? 
(this uses guaranteed atomic reads) +static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { + for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; } return true; +} + +// are all bits in a bitmap chunk clear? +static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { + #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return mi_mm256_is_zero(vec); + #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + // a 64b cache-line contains the entire chunk anyway so load both at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (mi_mm256_is_zero(_mm256_or_epi64(vec1,vec2))); + #else + return mi_bchunk_all_are_clear(chunk); #endif } + /* -------------------------------------------------------------------------------- - chunkmap (for now for 32-bit sets only) + chunkmap -------------------------------------------------------------------------------- */ -static void mi_chunkmap_split(mi_chunkmap_t es, mi_cmap_t* cmap, mi_epoch_t* epoch) { - *cmap = (mi_cmap_t)es; - if (epoch!=NULL) { *epoch = (mi_epoch_t)(es >> 32); } -} - -static mi_chunkmap_t mi_chunkmap_join(mi_cmap_t cmap, mi_epoch_t epoch) { - return ((mi_chunkmap_t)epoch << MI_CHUNKMAP_BITS) | cmap; -} - -// setting a bit increases the epoch -static void mi_chunkmap_set(_Atomic(mi_chunkmap_t)* cm, size_t idx) { - mi_assert(idx < MI_CHUNKMAP_BITS); - mi_epoch_t epoch; - mi_cmap_t cmap; - mi_chunkmap_t cm_new; - mi_chunkmap_t cm_old = mi_atomic_load_relaxed(cm); - do { - mi_chunkmap_split(cm_old, &cmap, &epoch); - cm_new = mi_chunkmap_join(cmap | (((mi_cmap_t)1)<chunk_maps[cmidx], idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx); } -static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx, mi_epoch_t epoch) { +static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; - const size_t idx = chunk_idx % MI_CHUNKMAP_BITS; - return mi_chunkmap_try_clear(&bitmap->chunk_maps[cmidx], idx, epoch); -} - -static mi_cmap_t mi_bitmap_chunkmap(mi_bitmap_t* bitmap, size_t chunk_idx, mi_epoch_t* epoch) { - mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const size_t cmidx = chunk_idx / MI_CHUNKMAP_BITS; - mi_assert_internal(cmidx < bitmap->chunk_map_count); - mi_cmap_t cmap; - mi_chunkmap_split(mi_atomic_load_relaxed(&bitmap->chunk_maps[cmidx]), &cmap, epoch); - return cmap; -} - -static mi_epoch_t mi_bitmap_chunkmap_epoch(mi_bitmap_t* bitmap, size_t chunk_idx) { - mi_epoch_t epoch; - mi_bitmap_chunkmap(bitmap, chunk_idx, &epoch); - return epoch; + // check if the corresponding chunk is all clear + if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) return false; + // clear the chunkmap bit + mi_bchunk_clear(&bitmap->chunkmap, chunk_idx, NULL); + // .. but a concurrent set may have happened in between our all-clear test and the clearing of the + // bit in the mask. We check again to catch this situation. 
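The ordering relied on here reduces to a small standalone model: setters set bits in the word first and publish in the summary afterwards, while clearers tentatively clear the summary bit and then re-test the word so that a racing set is never lost. A sketch with plain C11 atomics and hypothetical names (a single-level summary over plain words, not the chunk/chunkmap types used in this file):

#include <stdatomic.h>
#include <stddef.h>

static _Atomic(unsigned long long) words[64];
static _Atomic(unsigned long long) summary;     // bit i set => words[i] may have bits set

static void set_bits(size_t i, unsigned long long mask) {
  atomic_fetch_or(&words[i], mask);             // 1. set in the word itself
  atomic_fetch_or(&summary, 1ull << i);         // 2. only then publish in the summary
}

static void summary_try_clear(size_t i) {
  if (atomic_load(&words[i]) != 0) return;      // word is not all-clear
  atomic_fetch_and(&summary, ~(1ull << i));     // tentatively clear the summary bit
  if (atomic_load(&words[i]) != 0) {            // a concurrent set may have raced in between
    atomic_fetch_or(&summary, 1ull << i);       // restore conservatively
  }
}

int main(void) {
  set_bits(3, 0xF0);
  summary_try_clear(3);                         // word 3 is not clear, so the summary bit stays set
  return ((atomic_load(&summary) >> 3) & 1) ? 0 : 1;
}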
+ if (!mi_bchunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + return false; + } + return true; } /* -------------------------------------------------------------------------------- @@ -841,14 +804,14 @@ static mi_epoch_t mi_bitmap_chunkmap_epoch(mi_bitmap_t* bitmap, size_t chunk_idx -------------------------------------------------------------------------------- */ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { - mi_assert_internal((bit_count % MI_BITMAP_CHUNK_BITS) == 0); - bit_count = _mi_align_up(bit_count, MI_BITMAP_CHUNK_BITS); + mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0); + bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS); mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); mi_assert_internal(bit_count > 0); - const size_t chunk_count = bit_count / MI_BITMAP_CHUNK_BITS; + const size_t chunk_count = bit_count / MI_BCHUNK_BITS; mi_assert_internal(chunk_count >= 1); - const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BITMAP_CHUNK_SIZE); - mi_assert_internal( (size%MI_BITMAP_CHUNK_SIZE) == 0 ); + const size_t size = sizeof(mi_bitmap_t) + ((chunk_count - 1) * MI_BCHUNK_SIZE); + mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); if (pchunk_count != NULL) { *pchunk_count = chunk_count; } return size; } @@ -861,8 +824,6 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) if (!already_zero) { _mi_memzero_aligned(bitmap, size); } - mi_atomic_store_release(&bitmap->chunk_map_count, _mi_divide_up(chunk_count, MI_CHUNKMAP_BITS)); - mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_map_count) <= MI_BITMAP_MAX_CHUNKMAPS); mi_atomic_store_release(&bitmap->chunk_count, chunk_count); mi_assert_internal(mi_atomic_load_relaxed(&bitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); return size; @@ -874,32 +835,39 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); // first chunk - size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - size_t m = MI_BITMAP_CHUNK_BITS - cidx; + size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + size_t m = MI_BCHUNK_BITS - cidx; if (m > n) { m = n; } - mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); + mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; - const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; + const size_t mid_chunks = n / MI_BCHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * MI_BITMAP_CHUNK_SIZE); + _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); const size_t end_chunk = chunk_idx + mid_chunks; while (chunk_idx < end_chunk) { - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - chunk_idx++; + if ((chunk_idx % MI_BFIELD_BITS) == 0 && (chunk_idx + MI_BFIELD_BITS <= end_chunk)) { + // optimize: we can set a full bfield in the chunkmap + mi_atomic_store_relaxed( &bitmap->chunkmap.bfields[chunk_idx/MI_BFIELD_BITS], mi_bfield_all_set()); + chunk_idx += MI_BFIELD_BITS; + } + else { + mi_bitmap_chunkmap_set(bitmap, chunk_idx); + chunk_idx++; + } } - n -= (mid_chunks * MI_BITMAP_CHUNK_BITS); + n -= (mid_chunks * MI_BCHUNK_BITS); } // last chunk if (n > 0) { - mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); - mi_assert_internal(chunk_idx < 
MI_BITMAP_CHUNK_FIELDS); - mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); + mi_assert_internal(n < MI_BCHUNK_BITS); + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); } } @@ -909,22 +877,19 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // and false otherwise leaving the bitmask as is. static bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (set) { - // first set the chunkmap since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_try_set(&bitmap->chunks[chunk_idx], cidx); + const bool ok = mi_bchunk_try_set(&bitmap->chunks[chunk_idx], cidx); + if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards + return ok; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool cleared = mi_bitmap_chunk_try_clear(&bitmap->chunks[chunk_idx], cidx); - if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return cleared; + bool maybe_all_clear; + const bool ok = mi_bchunk_try_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return ok; } } @@ -933,126 +898,107 @@ static bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { static bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); mi_assert_internal(idx%8 == 0); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t byte_idx = (idx % MI_BCHUNK_BITS)/8; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - // first set the anyset since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); + const bool ok = mi_bchunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); + if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards + return ok; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); - bool cleared = mi_bitmap_chunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx); - if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return cleared; + bool maybe_all_clear; + const bool ok = mi_bchunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return ok; } } - // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask 
as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! static bool mi_bitmap_try_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n<=MI_BCHUNK_BITS); mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); if (n==0 || idx + n > mi_bitmap_max_bits(bitmap)) return false; - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia - + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia if (set) { - // first set the chunkmap since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); + const bool ok = mi_bchunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); + if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards + return ok; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); - bool cleared = mi_bitmap_chunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n); - if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return cleared; + bool maybe_all_clear; + const bool ok = mi_bchunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return ok; } } mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); - // todo: add 32/64 for large pages + // todo: add 32/64 for large pages ? return mi_bitmap_try_xsetN_(set, bitmap, idx, n); } -// Set/clear a sequence of 2 bits that were on an even `idx` in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
-static bool mi_bitmap_xset_pair(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal((idx%2)==0); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); +// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (set) { - // first set the chunkmap since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_set2(&bitmap->chunks[chunk_idx], cidx, NULL); + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return wasclear; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool already_clear = false; - const bool allset = mi_bitmap_chunk_clear2(&bitmap->chunks[chunk_idx], cidx, &already_clear); - if (!already_clear && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return allset; + bool maybe_all_clear; + const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return wasset; } } // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! 
static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n<=MI_BCHUNK_BITS); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia if (set) { - // first set the chunkmap since it is a conservative approximation (increases epoch) - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - // then actually try to set it atomically - return mi_bitmap_chunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); + const bool allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); + mi_bitmap_chunkmap_set(bitmap,chunk_idx); // set afterwards + return allclear; } else { - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap,chunk_idx); size_t already_clear = 0; - const bool allset = mi_bitmap_chunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear); + const bool allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear ); if (already_xset != NULL) { *already_xset = already_clear; } - if (already_clear < n && epoch == mi_bitmap_chunkmap_epoch(bitmap,chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } + if (already_clear < n) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } return allset; } } // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { - mi_assert_internal(n>0 && n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); //TODO: specialize? //if (n==1) return mi_bitmap_xset(set, bitmap, idx); //if (n==2) return mi_bitmap_xset(set, bitmap, idx); @@ -1061,82 +1007,52 @@ bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, s } -// Is a sequence of 2 bits already all set/cleared? -static inline bool mi_bitmap_is_xset2(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx + 2 <= mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + 2 <= MI_BITMAP_CHUNK_BITS); - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - return mi_bitmap_chunk_is_xset2(set, &bitmap->chunks[chunk_idx], cidx); -} - - // Is a sequence of n bits already all set/cleared? 
bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n<=MI_BCHUNK_BITS); mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia - return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); + return mi_bchunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); } /* -------------------------------------------------------------------------------- bitmap try_find_and_clear -------------------------------------------------------------------------------- */ -static inline size_t mi_bitmap_find_hi_chunk(mi_bitmap_t* bitmap) { - size_t hi_chunk_map_idx = 0; - mi_cmap_t hi_cmap = 0; - for (size_t i = 1; i < mi_bitmap_chunk_map_count(bitmap); i++) { - mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, i, NULL); - if (cmap != 0) { - hi_chunk_map_idx = i; - hi_cmap = cmap; - } - } - uint32_t cmap_idx; - if (mi_bsr32(hi_cmap, &cmap_idx)) { - const size_t hi = (hi_chunk_map_idx * MI_CHUNKMAP_BITS) + cmap_idx; - mi_assert_internal(hi < mi_bitmap_chunk_count(bitmap)); - return hi; - } - else { - return 0; - } -} + #define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ const size_t chunk_start = 0; /* tseq % (1 + mi_bitmap_find_hi_chunk(bitmap)); */ \ - const size_t chunk_map_start = chunk_start / MI_CHUNKMAP_BITS; \ - const uint32_t chunk_map_start_idx = (uint32_t)(chunk_start % MI_CHUNKMAP_BITS); \ + const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ + const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ + const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ - for (size_t _i = 0; _i < bitmap->chunk_map_count; _i++) { \ - size_t i = (_i + chunk_map_start); \ - if (i >= bitmap->chunk_map_count) { i -= bitmap->chunk_map_count; } /* adjust for the start position */ \ + for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ + size_t i = (_i + chunkmap_start); \ + if (i >= chunkmap_max_bfield) { i -= chunkmap_max_bfield; } /* adjust for the start position */ \ \ - const size_t chunk_idx0 = i*MI_CHUNKMAP_BITS; \ - mi_epoch_t name_epoch; \ - mi_cmap_t cmap = mi_bitmap_chunkmap(bitmap, chunk_idx0, &name_epoch); \ - uint32_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0) { cmap = mi_rotr32(cmap, chunk_map_start_idx); cmap_idx_shift = chunk_map_start_idx; } /* rotate right for the start position (on the first iteration) */ \ + const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ + mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ + if (_i == 0) { cmap = mi_rotr(cmap, chunkmap_start_idx); cmap_idx_shift = chunkmap_start_idx; } /* rotate right for the start 
position (on the first iteration) */ \ \ - uint32_t cmap_idx; /* one bit set of each chunk that may have bits set */ \ - while (mi_bsf32(cmap, &cmap_idx)) { /* find least bit that is set */ \ + size_t cmap_idx; \ + while (mi_bsf(cmap, &cmap_idx)) { /* find least bit that is set */ \ /* set the chunk idx */ \ - size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_CHUNKMAP_BITS); \ - if (name_chunk_idx >= mi_bitmap_chunk_count(bitmap)) { name_chunk_idx -= mi_bitmap_chunk_count(bitmap); } \ + size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ + mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \ /* try to find and clear N bits in that chunk */ \ - if (name_chunk_idx < mi_bitmap_chunk_count(bitmap)) { /* we can have less chunks than in the chunkmap.. */ + { #define mi_bitmap_forall_chunks_end() \ } \ @@ -1146,7 +1062,7 @@ static inline size_t mi_bitmap_find_hi_chunk(mi_bitmap_t* bitmap) { cmap >>= 1; \ } \ }} - + // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) @@ -1154,17 +1070,15 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) { size_t cidx; - if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + if mi_likely(mi_bchunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); return true; } else { // we may find that all are cleared only on a second iteration but that is ok as // the chunkmap is a conservative approximation. 
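The scan in the `mi_bitmap_forall_chunks` macro above visits the set bits of a chunkmap word starting from a chosen position and wrapping around (the start is currently fixed at 0, see the todo). The wrap-around pattern in isolation, as a standalone sketch using a GCC/Clang builtin rather than the macro's helpers:

#include <stdint.h>
#include <stdio.h>

// visit every set bit of `map`, starting at `start` and wrapping around
static void forall_set_bits_from(uint64_t map, unsigned start) {
  uint64_t rot = (map >> start) | (map << ((64 - start) % 64));   // rotr(map, start)
  while (rot != 0) {
    const unsigned lsb = (unsigned)__builtin_ctzll(rot);          // least set bit of the rotated map
    const unsigned idx = (lsb + start) % 64;                      // index back into the original map
    printf("visit chunk %u\n", idx);
    rot &= (rot - 1);                                             // clear it and continue
  }
}

int main(void) {
  forall_set_bits_from(0x8000000000000081ull, 7);                 // visits 7 and 63, then wraps to 0
  return 0;
}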
- if (epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); // continue } } @@ -1172,183 +1086,48 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t return false; } -/* -------------------------------------------------------------------------------- - pairmap --------------------------------------------------------------------------------- */ -void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2) { - mi_assert_internal(mi_bitmap_chunk_count(bm1)==mi_bitmap_chunk_count(bm2)); - pairmap->bitmap1 = bm1; - pairmap->bitmap2 = bm2; -} - -static void mi_pairmap_from_pair_idx(mi_pairmap_t* pairmap, size_t pair_idx, mi_bitmap_t** bitmap, size_t* pidx) { - const size_t idx = 2*pair_idx; - const size_t maxbits = mi_bitmap_max_bits(pairmap->bitmap1); - mi_assert_internal(pair_idx < maxbits); - if (idx < maxbits) { - *bitmap = pairmap->bitmap1; - *pidx = idx; - } - else { - *bitmap = pairmap->bitmap2; - *pidx = idx - maxbits; - } -} - -bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx) { - mi_bitmap_t* bitmap; - size_t idx; - mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - return mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); -} - -bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx) { - mi_bitmap_t* bitmap; - size_t idx; - mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - return mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); -} - -bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx) { - mi_bitmap_t* bitmap; - size_t idx; - mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - return mi_bitmap_is_xset2(MI_BIT_CLEAR, bitmap, idx); -} - - - -/* -------------------------------------------------------------------------------- - pairmap clear while not busy --------------------------------------------------------------------------------- */ - -static inline bool mi_bfield_atomic_clear2_once_not_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). 
- mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); - const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); - mi_bfield_t bnew; - mi_bfield_t old = mi_atomic_load_relaxed(b); - do { - if mi_unlikely((old&mask)==mask_busy) { - old = mi_atomic_load_acquire(b); - if ((old&mask)==mask_busy) { _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); } - while ((old&mask)==mask_busy) { // busy wait - mi_atomic_yield(); - old = mi_atomic_load_acquire(b); - } - } - bnew = (old & ~mask); // clear - } while (!mi_atomic_cas_weak_acq_rel(b, &old, bnew)); - mi_assert_internal((old&mask) != mask_busy); // we should never clear a busy page - mi_assert_internal((old&mask) == mask); // in our case: we should only go from set to clear (when reclaiming an abandoned page from a free) - return ((old&mask) == mask); -} - -static inline bool mi_bitmap_chunk_clear2_once_not_busy(mi_bitmap_chunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BITMAP_CHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_clear2_once_not_busy(&chunk->bfields[i], idx); -} - -static bool mi_bitmap_clear2_once_not_busy(mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal((idx%2)==0); - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; - const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const mi_epoch_t epoch = mi_bitmap_chunkmap_epoch(bitmap, chunk_idx); - bool cleared = mi_bitmap_chunk_clear2_once_not_busy(&bitmap->chunks[chunk_idx], cidx); - if (cleared && epoch == mi_bitmap_chunkmap_epoch(bitmap, chunk_idx) && mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx, epoch); - } - return cleared; -} - -void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx) { - mi_bitmap_t* bitmap; - size_t idx; - mi_pairmap_from_pair_idx(pairmap, pair_idx, &bitmap, &idx); - mi_bitmap_clear2_once_not_busy(bitmap, idx); -} - - - -/* -------------------------------------------------------------------------------- - pairmap try and set busy --------------------------------------------------------------------------------- */ - -// Atomically go from set to busy, or return false otherwise and leave the bit field as-is. -static inline bool mi_bfield_atomic_try_set_busy(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal((idx%2)==0); // bit patterns are 00 (clear), 10 (busy), and 11 (set). 
- mi_assert_internal(idx < MI_BFIELD_BITS-1); - const mi_bfield_t mask = ((mi_bfield_t)MI_PAIR_SET << idx); - const mi_bfield_t mask_busy = ((mi_bfield_t)MI_PAIR_BUSY << idx); - mi_bfield_t old; - mi_bfield_t bnew; - do { - old = mi_atomic_load_relaxed(b); - if ((old & mask) != mask) return false; // no longer set - bnew = (old & ~mask) | mask_busy; - } while (!mi_atomic_cas_weak_acq_rel(b, &old, bnew)); - return true; -} - -static inline bool mi_bitmap_chunk_try_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx) { - for (int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { - while (true) { - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]) & MI_BFIELD_LO_BIT2; // only keep MI_PAIR_SET bits - size_t idx; - if (!mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - break; // not found: continue with the next field - } - else { - mi_assert_internal((idx%2)==0); - if mi_likely(mi_bfield_atomic_try_set_busy(&chunk->bfields[i], idx)) { - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS-1); - return true; - } - // else: try this word once again - } - } - } - return false; -} - - -static bool mi_bitmap_try_find_and_set_busy(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t idx_offset, size_t* ppair_idx, - mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2) +mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, + mi_claim_fun_t* claim, void* arg1, void* arg2) { mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) { - MI_UNUSED(epoch); MI_UNUSED(n); - mi_assert_internal(n==2); size_t cidx; - if mi_likely(mi_bitmap_chunk_try_find_and_set_busy(&bitmap->chunks[chunk_idx], &cidx)) { - const size_t idx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; - mi_assert_internal((idx%2)==0); - const size_t pair_idx = (idx + idx_offset)/2; - if (claim(pair_idx, arg1, arg2)) { // while busy, the claim function can read from the page - mi_bitmap_xset_pair(MI_BIT_CLEAR, bitmap, idx); // claimed, clear the entry - *ppair_idx = pair_idx; + if mi_likely(mi_bchunk_find_and_try_clear(&bitmap->chunks[chunk_idx], &cidx)) { + const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); + bool keep_set = true; + if ((*claim)(slice_index, arg1, arg2, &keep_set)) { + // success! + mi_assert_internal(!keep_set); + *pidx = slice_index; return true; } else { - mi_bitmap_xset_pair(MI_BIT_SET, bitmap, idx); // not claimed, reset the entry - // and continue + // failed to claim it, set abandoned mapping again (unless thet page was freed) + if (keep_set) { + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + mi_assert_internal(wasclear); MI_UNUSED(wasclear); + } + // continue } } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); + // continue + } } mi_bitmap_forall_chunks_end(); return false; } -// Used to find an abandoned page, and transition from set to busy. 
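The claim protocol of `mi_bitmap_try_find_and_claim` above, reduced to a single-word standalone sketch (hypothetical names and a simplified callback signature; C11 atomics plus a GCC/Clang builtin): clear a candidate bit, let the callback validate the claim, and republish the bit only when the claim fails and the entry should remain visible.

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

typedef bool (claim_fun_t)(size_t idx, void* arg, bool* keep_set);

static _Atomic(unsigned long long) amap;

static bool try_find_and_claim(claim_fun_t* claim, void* arg, size_t* pidx) {
  unsigned long long m = atomic_load(&amap);
  while (m != 0) {
    const size_t idx = (size_t)__builtin_ctzll(m);
    const unsigned long long bit = 1ull << idx;
    if (atomic_fetch_and(&amap, ~bit) & bit) {         // we cleared it: we own this candidate
      bool keep_set = true;
      if (claim(idx, arg, &keep_set)) { *pidx = idx; return true; }
      if (keep_set) { atomic_fetch_or(&amap, bit); }   // claim failed: publish it again
    }
    m &= ~bit;                                         // move on to the next candidate
  }
  return false;
}

static bool always_claim(size_t idx, void* arg, bool* keep_set) {
  (void)idx; (void)arg; *keep_set = false; return true;
}

int main(void) {
  atomic_store(&amap, 0x10);                           // one candidate at index 4
  size_t idx = 0;
  return (try_find_and_claim(&always_claim, NULL, &idx) && idx == 4) ? 0 : 1;
}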
-mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pair_idx, - mi_bitmap_claim_while_busy_fun_t* claim, void* arg1, void* arg2 ) { - if (mi_bitmap_try_find_and_set_busy(pairmap->bitmap1, 2, tseq, 0, pair_idx, claim, arg1, arg2)) return true; - return mi_bitmap_try_find_and_set_busy(pairmap->bitmap2, 2, tseq, mi_bitmap_max_bits(pairmap->bitmap1), pair_idx, claim, arg1, arg2); -} +// Clear a bit once it is set. +void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx); +} \ No newline at end of file diff --git a/src/bitmap.h b/src/bitmap.h index 78ee5380..9ef97d2f 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -19,35 +19,34 @@ Concurrent bitmap that can set/reset sequences of bits atomically each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB). We need 16K bits to represent a 1GiB arena. - `mi_bitmap_chunk_t`: a chunk of bfield's of a total of MI_BITMAP_CHUNK_BITS (= 512) + `mi_bchunk_t`: a chunk of bfield's of a total of MI_BCHUNK_BITS (= 512 on 64-bit, 256 on 32-bit) allocations never span across chunks -- so MI_ARENA_MAX_OBJ_SIZE is the number of bits in a chunk times the MI_ARENA_SLICE_SIZE (512 * 64KiB = 32 MiB). - These chunks are cache-aligned and we can use AVX2/AVX512/SVE/SVE2/etc. instructions + These chunks are cache-aligned and we can use AVX2/AVX512/NEON/SVE/SVE2/etc. instructions to scan for bits (perhaps) more efficiently. - `mi_chunkmap_t`: for each chunk we track if it has (potentially) any bit set. + `mi_bchunkmap_t` == `mi_bchunk_t`: for each chunk we track if it has (potentially) any bit set. The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set. This is used to avoid scanning every chunk. (and thus strictly an optimization) It is conservative: it is fine to a bit in the chunk map even if the chunk turns out - to have no bits set. + to have no bits set. It is also allowed to briefly have a clear bit even if the + chunk has bits set, as long as we guarantee that we set the bit later on -- this + allows us to set the chunkmap bit after we set a bit in the corresponding chunk. - When we (potentially) set a bit in a chunk, we first update the chunkmap. However, when we clear a bit in a chunk, and the chunk is indeed all clear, we cannot safely clear the bit corresponding to the chunk in the chunkmap since it - may race with another thread setting a bit in the same chunk (and we may clear the - bit even though a bit is set in the chunk which is not allowed). + may race with another thread setting a bit in the same chunk. Therefore, when + clearing, we first test if a chunk is clear, then clear the chunkmap bit, and + then test again to catch any set bits that we missed. - To fix this, the chunkmap contains 32-bits of bits for chunks, and a 32-bit "epoch" - counter that is increased everytime a bit is set. We only clear a bit if the epoch - stayed the same over our clear operation (so we know no other thread in the mean - time set a bit in any of the chunks corresponding to the chunkmap). 
- Since increasing the epoch and setting a bit must be atomic, we use only half-word - bits (32) (we could use 128-bit atomics if needed since modern hardware supports this) + Since the chunkmap may thus be briefly out-of-sync, this means that we may sometimes + not find a free page even though it's there (but we accept this as we avoid taking + full locks). (Another way to do this is to use an epoch but we like to avoid that complexity + for now). - `mi_bitmap_t`: a bitmap with N chunks. A bitmap always has MI_BITMAP_MAX_CHUNK_FIELDS (=16) - and can support arena's from few chunks up to 16 chunkmap's = 16 * 32 chunks = 16 GiB - The `chunk_count` can be anything from 1 to the max supported by the chunkmap's but - each chunk is always complete (512 bits, so 512 * 64KiB = 32MiB memory area's). + `mi_bitmap_t`: a bitmap with N chunks. A bitmap has a chunkmap of MI_BCHUNK_BITS (512) + and thus has at most 512 chunks (=2^18 bits x 64 KiB slices = 16 GiB max arena size). + The minimum is 1 chunk which is a 32 MiB arena. For now, the implementation assumes MI_HAS_FAST_BITSCAN and uses trailing-zero-count and pop-count (but we think it can be adapted work reasonably well on older hardware too) @@ -56,60 +55,49 @@ Concurrent bitmap that can set/reset sequences of bits atomically // A word-size bit field. typedef size_t mi_bfield_t; -#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) -#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) -#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) -#define MI_BFIELD_BITS_MOD_MASK (MI_BFIELD_BITS - 1) -#define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. -#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. +#define MI_BFIELD_BITS_SHIFT (MI_SIZE_SHIFT+3) +#define MI_BFIELD_BITS (1 << MI_BFIELD_BITS_SHIFT) +#define MI_BFIELD_SIZE (MI_BFIELD_BITS/8) +#define MI_BFIELD_LO_BIT8 (((~(mi_bfield_t)0))/0xFF) // 0x01010101 .. +#define MI_BFIELD_HI_BIT8 (MI_BFIELD_LO_BIT8 << 7) // 0x80808080 .. -#define MI_BITMAP_CHUNK_SIZE (MI_BITMAP_CHUNK_BITS / 8) -#define MI_BITMAP_CHUNK_FIELDS (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS) -#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1) - -// A bitmap chunk contains 512 bits of bfields on 64_bit (256 on 32-bit) -typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_chunk_s { - _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS]; -} mi_bitmap_chunk_t; +#define MI_BCHUNK_SIZE (MI_BCHUNK_BITS / 8) +#define MI_BCHUNK_FIELDS (MI_BCHUNK_BITS / MI_BFIELD_BITS) // 8 on both 64- and 32-bit -// for now 32-bit epoch + 32-bit bit-set (note: with ABA instructions we can double this) -typedef uint64_t mi_chunkmap_t; -typedef uint32_t mi_epoch_t; -typedef uint32_t mi_cmap_t; +// A bitmap chunk contains 512 bits on 64-bit (256 on 32-bit) +typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bchunk_s { + _Atomic(mi_bfield_t) bfields[MI_BCHUNK_FIELDS]; +} mi_bchunk_t; -#define MI_CHUNKMAP_BITS (32) // 1 chunkmap tracks 32 chunks +// The chunkmap has one bit per corresponding chunk that is set if the chunk potentially has bits set. +// The chunkmap is itself a chunk. 
+typedef mi_bchunk_t mi_bchunkmap_t; -#define MI_BITMAP_MAX_CHUNKMAPS (16) -#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BITMAP_MAX_CHUNKMAPS * MI_CHUNKMAP_BITS) -#define MI_BITMAP_MIN_CHUNK_COUNT (1 * MI_CHUNKMAP_BITS) // 1 GiB arena +#define MI_BCHUNKMAP_BITS MI_BCHUNK_BITS -#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 16 GiB arena -#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BITMAP_CHUNK_BITS) // 1 GiB arena +#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BCHUNKMAP_BITS) +#define MI_BITMAP_MIN_CHUNK_COUNT (1) +#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena +#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena // An atomic bitmap -typedef mi_decl_align(MI_BITMAP_CHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_map_count; // valid chunk_maps entries - _Atomic(size_t) chunk_count; // total count of chunks - size_t padding[MI_BITMAP_CHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc - _Atomic(mi_chunkmap_t) chunk_maps[MI_BITMAP_MAX_CHUNKMAPS]; - - mi_bitmap_chunk_t chunks[MI_BITMAP_MIN_BIT_COUNT]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT +typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc + mi_bchunkmap_t chunkmap; + mi_bchunk_t chunks[1]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; -static inline size_t mi_bitmap_chunk_map_count(const mi_bitmap_t* bitmap) { - return mi_atomic_load_relaxed(&bitmap->chunk_map_count); -} - static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) { return mi_atomic_load_relaxed(&bitmap->chunk_count); } static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { - return (mi_bitmap_chunk_count(bitmap) * MI_BITMAP_CHUNK_BITS); + return (mi_bitmap_chunk_count(bitmap) * MI_BCHUNK_BITS); } @@ -134,9 +122,22 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); // Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); + +static inline bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_xset(MI_BIT_SET, bitmap, idx); +} + +static inline bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_xset(MI_BIT_CLEAR, bitmap, idx); +} + + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! -// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared. +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +// If `already_xset` is not NULL, it is to all the bits were already all set/cleared. 
bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { @@ -162,7 +163,7 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n // Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) // and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { @@ -177,48 +178,11 @@ static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); +typedef bool (mi_claim_fun_t)(size_t slice_index, void* arg1, void* arg2, bool* keep_set); +mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, + mi_claim_fun_t* claim, void* arg1, void* arg2); -/* -------------------------------------------------------------------------------- - Atomic bitmap for a pair of bits. - - The valid pairs are CLEAR (0), SET (3), or BUSY (2). - - These bit pairs are used in the abandoned pages maps: when set, the entry has - an available page. When we scan for an available abandoned page and find an entry SET, - we first set it to BUSY, and try to claim the page atomically (since it can race - with a concurrent `mi_free` which also tries to claim the page). However, unlike `mi_free`, - we cannot be sure that a concurrent `mi_free` also didn't free (and decommit) the page - just when we got the entry. Therefore, a page can only be freed after `mi_arena_unabandon` - which (busy) waits until the BUSY flag is cleared to ensure all readers are done. - (and pair-bit operations must therefore be release_acquire). 
--------------------------------------------------------------------------------- */ - -#define MI_PAIR_CLEAR (0) -#define MI_PAIR_UNUSED (1) // should never occur -#define MI_PAIR_BUSY (2) -#define MI_PAIR_SET (3) - -// 0b....0101010101010101 -#define MI_BFIELD_LO_BIT2 ((MI_BFIELD_LO_BIT8 << 6)|(MI_BFIELD_LO_BIT8 << 4)|(MI_BFIELD_LO_BIT8 << 2)|MI_BFIELD_LO_BIT8) - -// A pairmap manipulates pairs of bits (and consists of 2 bitmaps) -typedef struct mi_pairmap_s { - mi_bitmap_t* bitmap1; - mi_bitmap_t* bitmap2; -} mi_pairmap_t; - -// initialize a pairmap to all clear; avoid a mem_zero if `already_zero` is true -void mi_pairmap_init(mi_pairmap_t* pairmap, mi_bitmap_t* bm1, mi_bitmap_t* bm2); -bool mi_pairmap_set(mi_pairmap_t* pairmap, size_t pair_idx); -bool mi_pairmap_clear(mi_pairmap_t* pairmap, size_t pair_idx); -bool mi_pairmap_is_clear(mi_pairmap_t* pairmap, size_t pair_idx); -void mi_pairmap_clear_once_not_busy(mi_pairmap_t* pairmap, size_t pair_idx); - -typedef bool (mi_bitmap_claim_while_busy_fun_t)(size_t pair_index, void* arg1, void* arg2); -mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx, - mi_bitmap_claim_while_busy_fun_t* claim, void* arg1 ,void* arg2 - ); - +void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); #endif // MI_BITMAP_H From 61436a92b9ec623220a92d1f2c166d39a64067a9 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 15:26:01 -0800 Subject: [PATCH 036/264] working simplified version without pairmaps and bitmap epoch --- src/bitmap.c | 48 +++++++++++++++++++++++----------------------- src/bitmap.h | 18 +++++++++++------ src/init.c | 2 +- src/page-map.c | 6 +++--- test/test-stress.c | 8 ++++---- 5 files changed, 44 insertions(+), 38 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 7df46070..0916aaae 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -42,9 +42,9 @@ static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { return mi_rotr(x,r); } -static inline mi_bfield_t mi_bfield_zero(void) { - return 0; -} +//static inline mi_bfield_t mi_bfield_zero(void) { +// return 0; +//} static inline mi_bfield_t mi_bfield_one(void) { return 1; @@ -147,10 +147,10 @@ static inline bool mi_bfield_atomic_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t // Tries to set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 // and otherwise false (leaving the bit unchanged) -static inline bool mi_bfield_atomic_try_set(_Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal(idx < MI_BFIELD_BITS); - return mi_bfield_atomic_set(b, idx); // for a single bit there is no difference -} +//static inline bool mi_bfield_atomic_try_set(_Atomic(mi_bfield_t)*b, size_t idx) { +// mi_assert_internal(idx < MI_BFIELD_BITS); +// return mi_bfield_atomic_set(b, idx); // for a single bit there is no difference +//} // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0. // `all_clear` is set to true if the new bfield is zero (and false otherwise) @@ -237,17 +237,17 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t by // Try to set a full field of bits atomically, and return true all bits transitioned from all 0's to 1's. // and false otherwise leaving the bit field as-is. 
-static inline bool mi_bfield_atomic_try_setX(_Atomic(mi_bfield_t)*b) { - mi_bfield_t old = 0; - return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_all_set()); -} +//static inline bool mi_bfield_atomic_try_setX(_Atomic(mi_bfield_t)*b) { +// mi_bfield_t old = 0; +// return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_all_set()); +//} // Try to clear a full field of bits atomically, and return true all bits transitioned from all 1's to 0's. // and false otherwise leaving the bit field as-is. -static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { - mi_bfield_t old = mi_bfield_all_set(); - return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_zero()); -} +//static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { +// mi_bfield_t old = mi_bfield_all_set(); +// return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_zero()); +//} // Check if all bits corresponding to a mask are set. @@ -328,7 +328,7 @@ static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size const mi_bfield_t mask = mi_bfield_mask(m, idx); size_t already_xset = 0; const bool transition = mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset); - mi_assert_internal((transition && already_xset == m) || (!transition && already_xset > 0)); + mi_assert_internal((transition && already_xset == 0) || (!transition && already_xset > 0)); all_transition = all_transition && transition; total_already_xset += already_xset; // next field @@ -605,9 +605,9 @@ static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx return mi_bchunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); } -static inline bool mi_bchunk_find_and_try_set(mi_bchunk_t* chunk, size_t* pidx) { - return mi_bchunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); -} +//static inline bool mi_bchunk_find_and_try_set(mi_bchunk_t* chunk, size_t* pidx) { +// return mi_bchunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); +//} // find least byte in a chunk with all bits set, and try unset it atomically @@ -763,7 +763,7 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - return (mi_mm256_is_zero(_mm256_or_epi64(vec1,vec2))); + return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2))); #else return mi_bchunk_all_are_clear(chunk); #endif @@ -810,7 +810,7 @@ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { mi_assert_internal(bit_count > 0); const size_t chunk_count = bit_count / MI_BCHUNK_BITS; mi_assert_internal(chunk_count >= 1); - const size_t size = sizeof(mi_bitmap_t) + ((chunk_count - 1) * MI_BCHUNK_SIZE); + const size_t size = offsetof(mi_bitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE); mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); if (pchunk_count != NULL) { *pchunk_count = chunk_count; } return size; @@ -1044,10 +1044,10 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0) { cmap = mi_rotr(cmap, chunkmap_start_idx); cmap_idx_shift = chunkmap_start_idx; } /* rotate right for the start position (on the first iteration) */ \ + if (_i == 0) { cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); cmap_idx_shift 
= chunkmap_start_idx; } /* rotate right for the start position (on the first iteration) */ \ \ size_t cmap_idx; \ - while (mi_bsf(cmap, &cmap_idx)) { /* find least bit that is set */ \ + while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ /* set the chunk idx */ \ size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \ @@ -1130,4 +1130,4 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx); -} \ No newline at end of file +} diff --git a/src/bitmap.h b/src/bitmap.h index 9ef97d2f..7b6000cc 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -75,12 +75,18 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bchunk_s { // The chunkmap is itself a chunk. typedef mi_bchunk_t mi_bchunkmap_t; -#define MI_BCHUNKMAP_BITS MI_BCHUNK_BITS +#define MI_BCHUNKMAP_BITS MI_BCHUNK_BITS -#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BCHUNKMAP_BITS) -#define MI_BITMAP_MIN_CHUNK_COUNT (1) -#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena -#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena +#define MI_BITMAP_MAX_CHUNK_COUNT (MI_BCHUNKMAP_BITS) +#define MI_BITMAP_MIN_CHUNK_COUNT (1) +#if MI_SIZE_BITS > 32 +#define MI_BITMAP_DEFAULT_CHUNK_COUNT (64) // 2 GiB on 64-bit -- this is for the page map +#else +#define MI_BITMAP_DEFAULT_CHUNK_COUNT (1) +#endif +#define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena +#define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena +#define MI_BITMAP_DEFAULT_BIT_COUNT (MI_BITMAP_DEFAULT_CHUNK_COUNT * MI_BCHUNK_BITS) // 2 GiB arena // An atomic bitmap @@ -88,7 +94,7 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc mi_bchunkmap_t chunkmap; - mi_bchunk_t chunks[1]; // or more, up to MI_BITMAP_MAX_CHUNK_COUNT + mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; diff --git a/src/init.c b/src/init.c index 64b31e1b..5d4a775a 100644 --- a/src/init.c +++ b/src/init.c @@ -400,7 +400,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->heap_backing = bheap; tld->heaps = NULL; tld->subproc = &mi_subproc_default; - tld->tseq = 0; // mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; } diff --git a/src/page-map.c b/src/page-map.c index 25693064..c292378b 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -13,7 +13,7 @@ mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; -static mi_bitmap_t mi_page_map_commit = { 1, MI_BITMAP_MIN_CHUNK_COUNT }; +static mi_bitmap_t mi_page_map_commit = { MI_BITMAP_DEFAULT_CHUNK_COUNT, { 0 }, { 0 }, { { 0 } } }; bool _mi_page_map_init(void) { size_t vbits = _mi_os_virtual_address_bits(); @@ -22,10 +22,10 @@ bool _mi_page_map_init(void) { // 64 KiB for 4 GiB address space (on 32-bit) const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); - mi_page_map_entries_per_commit_bit = 
_mi_divide_up(page_map_size, MI_BITMAP_MIN_BIT_COUNT); + mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_DEFAULT_BIT_COUNT); // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); - mi_page_map_all_committed = false; // _mi_os_has_overcommit(); // commit on-access on Linux systems? + mi_page_map_all_committed = (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); diff --git a/test/test-stress.c b/test/test-stress.c index 61891269..d5f106d5 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -41,11 +41,11 @@ static int THREADS = 8; static int SCALE = 10; static int ITER = 10; #elif 0 -static int THREADS = 1; +static int THREADS = 4; static int SCALE = 100; static int ITER = 10; #define ALLOW_LARGE false -#elif 1 +#elif 0 static int THREADS = 32; static int SCALE = 50; static int ITER = 50; @@ -343,9 +343,9 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - mi_debug_show_arenas(true, true, false); + //mi_debug_show_arenas(true, true, false); mi_collect(true); - // mi_debug_show_arenas(true,true,false); + mi_debug_show_arenas(true,true,false); #endif // mi_stats_print(NULL); #else From 5a5943ad33551c4fcd84d62be1564445f25d52d4 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 21:03:33 -0800 Subject: [PATCH 037/264] record max_clear bit --- src/arena.c | 32 ++++++++++++++++++++++++------ src/bitmap.c | 53 +++++++++++++++++++++++++++++++++++--------------- src/bitmap.h | 5 +++-- src/page-map.c | 5 ++++- 4 files changed, 70 insertions(+), 25 deletions(-) diff --git a/src/arena.c b/src/arena.c index fd609fe0..2c215264 100644 --- a/src/arena.c +++ b/src/arena.c @@ -476,23 +476,30 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static bool mi_arena_claim_abandoned(size_t slice_index, void* arg1, void* arg2, bool* keep_abandoned) { +static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* arg2, bool* keep_abandoned) { // found an abandoned page of the right size mi_arena_t* const arena = (mi_arena_t*)arg1; mi_subproc_t* const subproc = (mi_subproc_t*)arg2; mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); // can we claim ownership? if (!mi_page_try_claim_ownership(page)) { + // there was a concurrent free .. + // we need to keep it in the abandoned map as the free will call `mi_arena_page_unabandon`, + // and wait for readers (us!) to finish. This is why it is very important to set the abandoned + // bit again (or otherwise the unabandon will never stop waiting). *keep_abandoned = true; return false; } if (subproc != page->subproc) { - // wrong sub-process.. we need to unown again, and perhaps not keep it abandoned + // wrong sub-process.. we need to unown again + // (an unown might free the page, and depending on that we can keep it in the abandoned map or not) + // note: a minor wrinkle: the page will still be mapped but the abandoned map entry is (temporarily) clear at this point. + // so we cannot check in `mi_arena_free` for this invariant to hold. 
const bool freed = _mi_page_unown(page); *keep_abandoned = !freed; return false; } - // yes, we can reclaim it + // yes, we can reclaim it, keep the abandaned map entry clear *keep_abandoned = false; return true; } @@ -515,7 +522,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; - if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_claim_abandoned, arena, subproc)) { + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, subproc)) { // found an abandoned page of the right size // and claimed ownership. mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); @@ -703,6 +710,9 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); + // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may + // be (temporarily) not true if the free happens while trying to reclaim + // see `mi_arana_try_claim_abandoned` } #endif @@ -1087,10 +1097,11 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc return ENOMEM; } _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + // mi_debug_show_arenas(true, true, false); + return 0; } - // Manage a range of regular OS memory bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); @@ -1121,13 +1132,22 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); + size_t k = 0; mi_bchunk_t* chunk = &bitmap->chunks[i]; - for (size_t j = 0, k = 0; j < MI_BCHUNK_FIELDS; j++) { + + if (i<10) { buf[k++] = ' '; } + if (i<100) { itoa((int)i, buf+k, 10); k += (i < 10 ? 1 : 2); } + buf[k++] = ' '; + + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { buf[k++] = '\n'; _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix); buf[k++] = ' '; buf[k++] = ' '; + buf[k++] = ' '; + buf[k++] = ' '; + buf[k++] = ' '; } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; diff --git a/src/bitmap.c b/src/bitmap.c index 0916aaae..15401d8d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -87,7 +87,7 @@ static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bo } // Clear a bit but only when/once it is set. This is used by concurrent free's while -// the page is abandoned and mapped. +// the page is abandoned and mapped. 
static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[i+j]); - if (~b != 0) { - allset = false; - i += j; // no need to look again at the previous fields - break; + size_t idx; + if (mi_bfield_find_least_bit(~b,&idx)) { + if (m > idx) { + allset = false; + i += j; // no need to look again at the previous fields + break; + } + } + else { + // all bits in b were set + m -= MI_BFIELD_BITS; // note: can underflow } } while (++j < field_count); - + // if all set, we can try to atomically clear them if (allset) { const size_t cidx = i*MI_BFIELD_BITS; @@ -796,6 +804,11 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } + // record the max clear + size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); + do { + if mi_likely(chunk_idx <= oldmax) break; + } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); return true; } @@ -853,6 +866,7 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { if ((chunk_idx % MI_BFIELD_BITS) == 0 && (chunk_idx + MI_BFIELD_BITS <= end_chunk)) { // optimize: we can set a full bfield in the chunkmap mi_atomic_store_relaxed( &bitmap->chunkmap.bfields[chunk_idx/MI_BFIELD_BITS], mi_bfield_all_set()); + mi_bitmap_chunkmap_set(bitmap, chunk_idx + MI_BFIELD_BITS - 1); // track the max set chunk_idx += MI_BFIELD_BITS; } else { @@ -1032,20 +1046,24 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_start = 0; /* tseq % (1 + mi_bitmap_find_hi_chunk(bitmap)); */ \ + const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); /* mi_bitmap_chunk_count(bitmap) */ \ + const size_t chunk_start = 0; /* (chunk_max <= 1 ? 0 : (tseq % chunk_max)); */ /* space out threads */ \ const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ size_t i = (_i + chunkmap_start); \ - if (i >= chunkmap_max_bfield) { i -= chunkmap_max_bfield; } /* adjust for the start position */ \ - \ + if (i >= chunkmap_max_bfield) { \ + i -= chunkmap_max_bfield; /* adjust for the start position */ \ + } \ const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0) { cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); cmap_idx_shift = chunkmap_start_idx; } /* rotate right for the start position (on the first iteration) */ \ - \ + if (_i == 0) { \ + cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \ + cmap_idx_shift = chunkmap_start_idx; \ + } \ size_t cmap_idx; \ while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ /* set the chunk idx */ \ @@ -1065,6 +1083,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. 
// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +// (Used to find fresh free slices.) mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) @@ -1087,6 +1106,8 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t } +// Find a set bit in the bitmap and try to atomically clear it and claim it. +// (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, void* arg1, void* arg2) { @@ -1108,7 +1129,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t if (keep_set) { const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); mi_assert_internal(wasclear); MI_UNUSED(wasclear); - } + } // continue } } diff --git a/src/bitmap.h b/src/bitmap.h index 7b6000cc..7938bfa0 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -91,8 +91,9 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; diff --git a/src/page-map.c b/src/page-map.c index c292378b..ca0e2481 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -13,7 +13,10 @@ mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static mi_memid_t mi_page_map_memid; -static mi_bitmap_t mi_page_map_commit = { MI_BITMAP_DEFAULT_CHUNK_COUNT, { 0 }, { 0 }, { { 0 } } }; + +// (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) +static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), + { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; bool _mi_page_map_init(void) { size_t vbits = _mi_os_virtual_address_bits(); From 659a9dd51d1d02b620ea569d62fdda76dcb60c38 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 22:37:59 -0800 Subject: [PATCH 038/264] fix page info size and order; atomic page flags --- CMakeLists.txt | 2 +- include/mimalloc/atomic.h | 4 +- include/mimalloc/internal.h | 24 ++++++++--- include/mimalloc/types.h | 81 +++++++++++++++---------------------- src/arena.c | 36 ++++++++--------- src/bitmap.c | 6 +-- src/free.c | 5 ++- src/init.c | 10 ++--- src/os.c | 4 +- test/test-stress.c | 4 +- 10 files changed, 87 insertions(+), 89 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 2c04aea8..1a4cc1f1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -360,7 +360,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") else() - list(APPEND mi_cflags -ftls-model=initial-exec 
-march=haswell -mavx2) + list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2 -O2) endif() endif() if(MI_OVERRIDE) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 3a0d4892..caa90cf8 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -80,10 +80,12 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) #define mi_atomic_add_relaxed(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed)) -#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_add_acq_rel(p,x) mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_sub_relaxed(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_sub_acq_rel(p,x) mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_and_relaxed(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_and_acq_rel(p,x) mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel)) +#define mi_atomic_or_relaxed(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_or_acq_rel(p,x) mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel)) #define mi_atomic_increment_relaxed(p) mi_atomic_add_relaxed(p,(uintptr_t)1) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index d9c2cd6e..ad7c41c6 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -667,7 +667,8 @@ static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { static inline bool mi_page_is_huge(const mi_page_t* page) { - return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN)); + return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || + (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page)); } @@ -727,20 +728,33 @@ static inline bool _mi_page_unown(mi_page_t* page) { //----------------------------------------------------------- // Page flags //----------------------------------------------------------- +static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { + return mi_atomic_load_acquire(&page->xflags); +} + +static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { + if (set) { + mi_atomic_or_acq_rel(&page->xflags, newflag); + } + else { + mi_atomic_and_acq_rel(&page->xflags, ~newflag); + } +} + static inline bool mi_page_is_in_full(const mi_page_t* page) { - return page->flags.x.in_full; + return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0); } static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { - page->flags.x.in_full = in_full; + mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE); } static inline bool mi_page_has_aligned(const mi_page_t* page) { - return page->flags.x.has_aligned; + return ((mi_page_flags(page) & MI_PAGE_HAS_ALIGNED) != 0); } static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { - page->flags.x.has_aligned = has_aligned; + mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED); } /* ------------------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d78dbc59..5dfbb808 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -111,17 +111,17 
@@ terms of the MIT license. A copy of the license can be found in the file // Sizes are for 64-bit #ifndef MI_ARENA_SLICE_SHIFT -#ifdef MI_SMALL_PAGE_SHIFT // compatibility +#ifdef MI_SMALL_PAGE_SHIFT // compatibility #define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT #else #define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) #endif #endif #ifndef MI_BCHUNK_BITS_SHIFT -#define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) +#define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) #endif -#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) +#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) #define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) @@ -167,8 +167,8 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) { typedef struct mi_memid_os_info { void* base; // actual base address of the block (used for offset aligned allocations) - size_t alignment; // alignment at allocation size_t size; // allocated full size + // size_t alignment; // alignment at allocation } mi_memid_os_info_t; typedef struct mi_memid_arena_info { @@ -224,26 +224,11 @@ typedef enum mi_owned_e { } mi_owned_t; -// The `in_full` and `has_aligned` page flags are put in a union to efficiently -// test if both are false (`full_aligned == 0`) in the `mi_free` routine. -#if !MI_TSAN -typedef union mi_page_flags_s { - uint8_t full_aligned; - struct { - uint8_t in_full : 1; - uint8_t has_aligned : 1; - } x; -} mi_page_flags_t; -#else -// under thread sanitizer, use a byte for each flag to suppress warning, issue #130 -typedef union mi_page_flags_s { - uint32_t full_aligned; - struct { - uint8_t in_full; - uint8_t has_aligned; - } x; -} mi_page_flags_t; -#endif +// The `in_full` and `has_aligned` page flags are put in the same field +// to efficiently test if both are false (`full_aligned == 0`) in the `mi_free` routine. +#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) +#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) +typedef size_t mi_page_flags_t; // Thread free list. // We use the bottom bit of the pointer for `mi_owned_t` flags @@ -280,35 +265,33 @@ typedef struct mi_subproc_s mi_subproc_t; // the owning heap `thread_delayed_free` list. This guarantees that pages // will be freed correctly even if only other threads free blocks. typedef struct mi_page_s { - _Atomic(mi_threadid_t)xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. 
(= xheap->thread_id, or 0 if abandoned) - mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) - uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) - uint16_t reserved; // number of blocks reserved in memory - uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) - uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type + mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) + uint16_t used; // number of blocks in use (including blocks in `thread_free`) + uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) + uint16_t reserved; // number of blocks reserved in memory + uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) + uint8_t retire_expire; // expiration count for retired blocks - mi_page_flags_t flags; // `in_full` and `has_aligned` flags (8 bits) - uint8_t free_is_zero:1; // `true` if the blocks in the free list are zero initialized - uint8_t retire_expire:7; // expiration count for retired blocks - // padding - - mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - size_t block_size; // size available in each block (always `>0`) - uint8_t* page_start; // start of the blocks + mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) + _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads + _Atomic(mi_page_flags_t) xflags; // `in_full` and `has_aligned` flags + size_t block_size; // size available in each block (always `>0`) + uint8_t* page_start; // start of the blocks + uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type + bool free_is_zero; // `true` if the blocks in the free list are zero initialized + // padding #if (MI_ENCODE_FREELIST || MI_PADDING) - uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary + uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary #endif - _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - - mi_heap_t* heap; // heap this threads belong to. - struct mi_page_s* next; // next page owned by the heap with the same `block_size` - struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - mi_subproc_t* subproc; // sub-process of this heap - mi_memid_t memid; // provenance of the page memory + mi_heap_t* heap; // heap this threads belong to. + struct mi_page_s* next; // next page owned by the heap with the same `block_size` + struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + mi_subproc_t* subproc; // sub-process of this heap + mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -317,10 +300,10 @@ typedef struct mi_page_s { // ------------------------------------------------------ #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. 
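[editor note] Because both page flags now live in the single atomic `xflags` word above, one load can rule out both slow-path conditions at once; the sketch below (not part of the patch, with a name of my own choosing) shows the test that the `free.c` hunk later in this commit relies on.

// Sketch, not part of the patch: one atomic load of `xflags` covers both flags.
static inline bool page_needs_slow_free_sketch(const mi_page_t* page) {
  const mi_page_flags_t flags = mi_page_flags(page);
  if (flags == 0) return false;  // fast path: not in the full queue, no aligned blocks
  // otherwise at least one slow-path condition holds:
  //   (flags & MI_PAGE_IN_FULL_QUEUE) != 0  -> page must be moved out of the full queue
  //   (flags & MI_PAGE_HAS_ALIGNED)   != 0  -> the freed pointer may need un-aligning
  return true;
}
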
-#define MI_PAGE_MIN_BLOCK_ALIGN (32) // minimal block alignment in a page +#define MI_PAGE_MIN_BLOCK_ALIGN (64) // minimal block alignment in a page #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation -#if MI_DEBUG && MI_SIZE_SIZE == 8 +#if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 #define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) #else #define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) diff --git a/src/arena.c b/src/arena.c index 2c215264..45697081 100644 --- a/src/arena.c +++ b/src/arena.c @@ -483,7 +483,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* a mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); // can we claim ownership? if (!mi_page_try_claim_ownership(page)) { - // there was a concurrent free .. + // there was a concurrent free .. // we need to keep it in the abandoned map as the free will call `mi_arena_page_unabandon`, // and wait for readers (us!) to finish. This is why it is very important to set the abandoned // bit again (or otherwise the unabandon will never stop waiting). @@ -596,7 +596,9 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz } } #endif - mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)); + if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) { + _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small\n"); + }; const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); @@ -1126,28 +1128,22 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) { - _mi_output_message("%s%s:\n", prefix, header); +static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) { + _mi_output_message("%s:\n", header); size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); size_t k = 0; mi_bchunk_t* chunk = &bitmap->chunks[i]; - - if (i<10) { buf[k++] = ' '; } - if (i<100) { itoa((int)i, buf+k, 10); k += (i < 10 ? 
1 : 2); } - buf[k++] = ' '; + if (i<10) { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; } + else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } + else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { - buf[k++] = '\n'; - _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix); - buf[k++] = ' '; - buf[k++] = ' '; - buf[k++] = ' '; - buf[k++] = ' '; - buf[k++] = ' '; + buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5; } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; @@ -1164,9 +1160,9 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_ } bit_count += MI_BFIELD_BITS; } - _mi_output_message("%s %s\n", prefix, buf); + _mi_output_message(" %s\n", buf); } - _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count); + _mi_output_message(" total ('x'): %zu\n", bit_set_count); return bit_set_count; } @@ -1183,12 +1179,12 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) slice_total += arena->slice_count; _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, arena->slices_free, true); + free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true); } - mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, arena->slices_committed, false); + mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false); // todo: abandoned slices if (show_purge) { - purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, arena->slices_purge, false); + purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); diff --git a/src/bitmap.c b/src/bitmap.c index 15401d8d..2ef692cb 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -805,10 +805,10 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) return false; } // record the max clear - size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); + /*size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); do { if mi_likely(chunk_idx <= oldmax) break; - } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); + } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));*/ return true; } @@ -1046,7 +1046,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); /* mi_bitmap_chunk_count(bitmap) */ \ + /* const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); */ /* mi_bitmap_chunk_count(bitmap) */ \ const size_t chunk_start = 0; /* (chunk_max <= 1 ? 
0 : (tseq % chunk_max)); */ /* space out threads */ \ const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ diff --git a/src/free.c b/src/free.c index 0ff4bf60..afb23838 100644 --- a/src/free.c +++ b/src/free.c @@ -163,8 +163,9 @@ void mi_free(void* p) mi_attr_noexcept if mi_unlikely(page==NULL) return; const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); + const mi_page_flags_t flags = mi_page_flags(page); if mi_likely(is_local) { // thread-local free? - if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) + if mi_likely(flags == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); @@ -176,7 +177,7 @@ void mi_free(void* p) mi_attr_noexcept } else { // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) - if mi_likely(page->flags.full_aligned == 0) { + if mi_likely(flags == 0) { // blocks are aligned (and not a full page) mi_block_t* const block = (mi_block_t*)p; mi_free_block_mt(page,block); diff --git a/src/init.c b/src/init.c index 5d4a775a..4fbd50ed 100644 --- a/src/init.c +++ b/src/init.c @@ -20,21 +20,21 @@ const mi_page_t _mi_page_empty = { 0, // capacity 0, // reserved capacity 0, // block size shift - 0, // heap tag - { 0 }, // flags - false, // is_zero 0, // retire_expire NULL, // local_free + MI_ATOMIC_VAR_INIT(0), // xthread_free + MI_ATOMIC_VAR_INIT(0), // xflags 0, // block_size NULL, // page_start + 0, // heap tag + false, // is_zero #if (MI_PADDING || MI_ENCODE_FREELIST) { 0, 0 }, #endif - MI_ATOMIC_VAR_INIT(0), // xthread_free NULL, // xheap NULL, NULL, // next, prev NULL, // subproc - { {{ NULL, 0, 0}}, false, false, false, MI_MEM_NONE } // memid + { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) diff --git a/src/os.c b/src/os.c index c7f464c0..156a655b 100644 --- a/src/os.c +++ b/src/os.c @@ -128,7 +128,7 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me // different base? 
(due to alignment) if (memid.mem.os.base != base) { mi_assert(memid.mem.os.base <= addr); - mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); + // mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr); base = memid.mem.os.base; if (memid.mem.os.size==0) { csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); } } @@ -305,7 +305,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo if (p != NULL) { *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; - memid->mem.os.alignment = alignment; + // memid->mem.os.alignment = alignment; memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base); // todo: return from prim_alloc_aligned } return p; diff --git a/test/test-stress.c b/test/test-stress.c index d5f106d5..d46c2484 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,7 +40,7 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 0 +#elif 1 static int THREADS = 4; static int SCALE = 100; static int ITER = 10; @@ -347,6 +347,8 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,true,false); #endif + mi_collect(true); + mi_debug_show_arenas(true, true, false); // mi_stats_print(NULL); #else mi_stats_print(NULL); // so we see rss/commit/elapsed From bf9a2ddb59778dc11a39e380f9b7ba49c9f34ecb Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 23:07:10 -0800 Subject: [PATCH 039/264] compile for 32-bit as well --- include/mimalloc/types.h | 2 +- src/bitmap.c | 23 +++++++++++++---------- src/page-map.c | 4 ++-- test/main-override.cpp | 2 +- test/test-stress.c | 2 +- 5 files changed, 18 insertions(+), 15 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 5dfbb808..ba9a8864 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -300,7 +300,7 @@ typedef struct mi_page_s { // ------------------------------------------------------ #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. 
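[editor note] Since `MI_PAGE_INFO_SIZE` must stay at least `sizeof(mi_page_t)` (this series adds a runtime check for that in `mi_arena_page_alloc_fresh`), a compile-time guard placed after the `MI_PAGE_INFO_SIZE` definition is one way to catch layout regressions early. A sketch under that assumption, not part of the patch:

// Sketch, not part of the patch: compile-time variant of the runtime check
// added in mi_arena_page_alloc_fresh (requires C11 or later).
#if !defined(__cplusplus) && defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 201112L)
_Static_assert(sizeof(mi_page_t) <= MI_PAGE_INFO_SIZE,
               "MI_PAGE_INFO_SIZE must be >= sizeof(mi_page_t)");
#endif
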
-#define MI_PAGE_MIN_BLOCK_ALIGN (64) // minimal block alignment in a page +#define MI_PAGE_MIN_BLOCK_ALIGN MI_SIZE_BITS // minimal block alignment in a page (64b on 64-bit, 32b on 32-bit) #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation #if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 diff --git a/src/bitmap.c b/src/bitmap.c index 2ef692cb..7f4c8776 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -805,10 +805,10 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) return false; } // record the max clear - /*size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); + size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); do { if mi_likely(chunk_idx <= oldmax) break; - } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));*/ + } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); return true; } @@ -1042,21 +1042,23 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n -------------------------------------------------------------------------------- */ -#define mi_bitmap_forall_chunks(bitmap, tseq, name_epoch, name_chunk_idx) \ +#define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - /* const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); */ /* mi_bitmap_chunk_count(bitmap) */ \ - const size_t chunk_start = 0; /* (chunk_max <= 1 ? 0 : (tseq % chunk_max)); */ /* space out threads */ \ + const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \ const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ + const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\ const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ - size_t i = (_i + chunkmap_start); \ - if (i >= chunkmap_max_bfield) { \ - i -= chunkmap_max_bfield; /* adjust for the start position */ \ + size_t i; \ + if (_i < chunkmap_hi_bfield) { \ + i = _i + chunkmap_start; /* first the chunks up to chunk_hi */ \ + if (i >= chunkmap_hi_bfield) { i -= chunkmap_hi_bfield; } /* rotate */ \ } \ + else { i = _i; } /* the rest of the chunks above chunk_hi_idx */ \ const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ size_t cmap_idx_shift = 0; /* shift through the cmap */ \ @@ -1086,7 +1088,8 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // (Used to find fresh free slices.) 
mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); + mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { size_t cidx; if mi_likely(mi_bchunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { @@ -1111,7 +1114,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, void* arg1, void* arg2) { - mi_bitmap_forall_chunks(bitmap, tseq, epoch, chunk_idx) + mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { size_t cidx; if mi_likely(mi_bchunk_find_and_try_clear(&bitmap->chunks[chunk_idx], &cidx)) { diff --git a/src/page-map.c b/src/page-map.c index ca0e2481..d849e6a2 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -43,9 +43,9 @@ bool _mi_page_map_init(void) { bool is_zero; _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero, NULL); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(_mi_page_map, _mi_os_page_size()); } - _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL - mi_assert_internal(_mi_ptr_page(NULL)==NULL); } + _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL + mi_assert_internal(_mi_ptr_page(NULL)==NULL); return true; } diff --git a/test/main-override.cpp b/test/main-override.cpp index 3f64117a..5a1fc6d2 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -382,7 +382,7 @@ static void test_mt_shutdown() // issue #372 static void fail_aslr() { - size_t sz = (4ULL << 40); // 4TiB + uint64_t sz = (4ULL << 40); // 4TiB void* p = malloc(sz); printf("pointer p: %p: area up to %p\n", p, (uint8_t*)p + sz); *(int*)0x5FFFFFFF000 = 0; // should segfault diff --git a/test/test-stress.c b/test/test-stress.c index d46c2484..19edf2b5 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,7 +40,7 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 1 +#elif 0 static int THREADS = 4; static int SCALE = 100; static int ITER = 10; From 70115d8b8c0e52d8f196622901639fffed41ff9c Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 6 Dec 2024 23:25:53 -0800 Subject: [PATCH 040/264] small fixes --- include/mimalloc/internal.h | 41 +++++--------------- src/arena.c | 8 ++-- src/free.c | 12 +++--- src/heap.c | 15 +------- src/os.c | 6 +-- src/page-queue.c | 26 ++++--------- src/page.c | 77 ++----------------------------------- 7 files changed, 34 insertions(+), 151 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index ad7c41c6..28eca4bb 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -540,30 +540,16 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { if (heap != NULL) { - // mi_atomic_store_release(&page->xheap, (uintptr_t)heap); page->heap = heap; page->heap_tag = heap->tag; mi_atomic_store_release(&page->xthread_id, heap->thread_id); } else { - // mi_atomic_store_release(&page->xheap, (uintptr_t)heap->tld->subproc); page->heap = NULL; mi_atomic_store_release(&page->xthread_id,0); } } -//static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { -// mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING); -// if (heap != NULL) { -// mi_atomic_store_release(&page->xheap, 
(uintptr_t)heap); -// page->heap_tag = heap->tag; -// mi_atomic_store_release(&page->xthread_id, heap->thread_id); -// } -// else { -// mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc); -// mi_atomic_store_release(&page->xthread_id,0); -// } -//} // Thread free flag helpers static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { @@ -650,24 +636,24 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_atomic_load_acquire(&page->xthread_id) <= 1); + return (mi_atomic_load_relaxed(&page->xthread_id) <= 1); } static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return (mi_atomic_load_acquire(&page->xthread_id) == 1); + return (mi_atomic_load_relaxed(&page->xthread_id) == 1); } static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { - mi_atomic_or_acq_rel(&page->xthread_id, (uintptr_t)1); + mi_atomic_or_relaxed(&page->xthread_id, (uintptr_t)1); } static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { - mi_atomic_and_acq_rel(&page->xthread_id, ~(uintptr_t)1); + mi_atomic_and_relaxed(&page->xthread_id, ~(uintptr_t)1); } static inline bool mi_page_is_huge(const mi_page_t* page) { - return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || + return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page)); } @@ -683,15 +669,6 @@ static inline void _mi_page_unown_unconditional(mi_page_t* page) { mi_assert_internal(mi_page_thread_id(page)==0); const uintptr_t old = mi_atomic_and_acq_rel(&page->xthread_free, ~((uintptr_t)1)); mi_assert_internal((old&1)==1); MI_UNUSED(old); - /* - mi_thread_free_t tf_new; - mi_thread_free_t tf_old; - do { - tf_old = mi_atomic_load_relaxed(&page->xthread_free); - mi_assert_internal(mi_tf_is_owned(tf_old)); - tf_new = mi_tf_create(mi_tf_block(tf_old), false); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); - */ } @@ -721,7 +698,7 @@ static inline bool _mi_page_unown(mi_page_t* page) { } mi_assert_internal(mi_tf_block(tf_old)==NULL); tf_new = mi_tf_create(NULL, false); - } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf_old, tf_new)); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); return false; } @@ -729,15 +706,15 @@ static inline bool _mi_page_unown(mi_page_t* page) { // Page flags //----------------------------------------------------------- static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { - return mi_atomic_load_acquire(&page->xflags); + return mi_atomic_load_relaxed(&page->xflags); } static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { if (set) { - mi_atomic_or_acq_rel(&page->xflags, newflag); + mi_atomic_or_relaxed(&page->xflags, newflag); } else { - mi_atomic_and_acq_rel(&page->xflags, ~newflag); + mi_atomic_and_relaxed(&page->xflags, ~newflag); } } diff --git a/src/arena.c b/src/arena.c index 45697081..8362a31f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -944,7 +944,7 @@ void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { bool _mi_arena_contains(const void* p) { const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = 
mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { return true; } @@ -1140,7 +1140,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi if (i<10) { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; } else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } - + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5; @@ -1174,7 +1174,7 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) //size_t abandoned_total = 0; size_t purge_total = 0; for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; slice_total += arena->slice_count; _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); @@ -1324,7 +1324,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); if (max_arena == 0) return; // _mi_error_message(EFAULT, "purging not yet implemented\n"); diff --git a/src/free.c b/src/free.c index afb23838..ece55599 100644 --- a/src/free.c +++ b/src/free.c @@ -70,7 +70,7 @@ static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) do { mi_block_set_next(page, block, mi_tf_block(tf_old)); tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); // todo: release is enough? // and atomically try to collect the page if it was abandoned const bool is_owned_now = !mi_tf_is_owned(tf_old); @@ -207,17 +207,17 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { #endif // 1. free if the page is free now - if (mi_page_all_free(page)) + if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) - _mi_arena_page_unabandon(page); + _mi_arena_page_unabandon(page); // we can free the page directly _mi_arena_page_free(page); return; } - + // 2. if the page is not too full, we can try to reclaim it for ourselves - if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && + if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && !mi_page_is_used_at_frac(page,8)) { // the page has still some blocks in use (but not too many) @@ -234,7 +234,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? 
) (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) - { + { if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use // first remove it from the abandoned pages in the arena -- this waits for any readers to finish _mi_arena_page_unabandon(page); diff --git a/src/heap.c b/src/heap.c index 2ff40930..d687f25e 100644 --- a/src/heap.c +++ b/src/heap.c @@ -136,24 +136,11 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) _mi_arena_reclaim_all_abandoned(heap); } - // if abandoning, mark all pages to no longer add to delayed_free - //if (collect == MI_ABANDON) { - // mi_heap_visit_pages(heap, &mi_heap_page_never_delayed_free, NULL, NULL); - //} - - // free all current thread delayed blocks. - // (if abandoning, after this there are no more thread-delayed references into the pages.) - // _mi_heap_delayed_free_all(heap); - // collect retired pages _mi_heap_collect_retired(heap, force); // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - // mi_assert_internal( collect != MI_ABANDON || mi_atomic_load_ptr_acquire(mi_block_t,&heap->thread_delayed_free) == NULL ); - - // collect segments (purge pages, this can be expensive so don't force on abandonment) - // _mi_segments_collect(collect == MI_FORCE, &heap->tld->segments); // if forced, collect thread data cache on program-exit (or shared library unload) if (force && is_main_thread && mi_heap_is_backing(heap)) { @@ -219,7 +206,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool if (poolData != NULL) { heap->no_reclaim = true; } - } + } #endif if (heap == tld->heap_backing) { diff --git a/src/os.c b/src/os.c index 156a655b..b05068fd 100644 --- a/src/os.c +++ b/src/os.c @@ -113,8 +113,8 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_st if (err != 0) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } - if (still_committed) { - _mi_stat_decrease(&stats->committed, size); + if (still_committed) { + _mi_stat_decrease(&stats->committed, size); } _mi_stat_decrease(&stats->reserved, size); } @@ -556,7 +556,7 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { #endif } end = start + size; - } while (!mi_atomic_cas_strong_acq_rel(&mi_huge_start, &huge_start, end)); + } while (!mi_atomic_cas_weak_acq_rel(&mi_huge_start, &huge_start, end)); if (total_size != NULL) *total_size = size; return (uint8_t*)start; diff --git a/src/page-queue.c b/src/page-queue.c index ad616b1d..9e3aaacc 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -12,7 +12,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MI_IN_PAGE_C #error "this file should be included from 'page.c'" // include to help an IDE -#include "mimalloc.h" +#include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/atomic.h" #endif @@ -83,10 +83,10 @@ static inline uint8_t mi_bin(size_t size) { #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif - wsize--; + wsize--; mi_assert_internal(wsize!=0); // find the highest bit position - uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); + uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). 
// - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin @@ -211,8 +211,8 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); - mi_assert_internal(mi_page_block_size(page) == queue->block_size || - (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || + mi_assert_internal(mi_page_block_size(page) == queue->block_size || + (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_heap_t* heap = mi_page_heap(page); if (page->prev != NULL) page->prev->next = page->next; @@ -227,7 +227,6 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { heap->page_count--; page->next = NULL; page->prev = NULL; - // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), NULL); mi_page_set_in_full(page,false); } @@ -243,7 +242,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); mi_page_set_in_full(page, mi_page_queue_is_full(queue)); - // mi_atomic_store_ptr_release(mi_atomic_cast(void*, &page->heap), heap); + page->next = queue->first; page->prev = NULL; if (queue->first != NULL) { @@ -346,8 +345,8 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* page->prev = to->first; page->next = next; to->first->next = page; - if (next != NULL) { - next->prev = page; + if (next != NULL) { + next->prev = page; } else { to->last = page; @@ -385,15 +384,6 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue // set append pages to new heap and count size_t count = 0; for (mi_page_t* page = append->first; page != NULL; page = page->next) { - /* - // inline `mi_page_set_heap` to avoid wrong assertion during absorption; - // in this case it is ok to be delayed freeing since both "to" and "from" heap are still alive. - mi_atomic_store_release(&page->xheap, (uintptr_t)heap); - // set the flag to delayed free (not overriding NEVER_DELAYED_FREE) which has as a - // side effect that it spins until any DELAYED_FREEING is finished. This ensures - // that after appending only the new heap will be used for delayed free operations. - _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, false); - */ mi_page_set_heap(page, heap); count++; } diff --git a/src/page.c b/src/page.c index 056c9506..54e7b539 100644 --- a/src/page.c +++ b/src/page.c @@ -132,40 +132,6 @@ bool _mi_page_is_valid(mi_page_t* page) { } #endif -/* -void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { - while (!_mi_page_try_use_delayed_free(page, delay, override_never)) { - mi_atomic_yield(); - } -} - -bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never) { - mi_thread_free_t tfreex; - mi_delayed_t old_delay; - mi_thread_free_t tfree; - size_t yield_count = 0; - do { - tfree = mi_atomic_load_acquire(&page->xthread_free); // note: must acquire as we can break/repeat this loop and not do a CAS; - tfreex = mi_tf_set_delayed(tfree, delay); - old_delay = mi_tf_delayed(tfree); - if mi_unlikely(old_delay == MI_DELAYED_FREEING) { - if (yield_count >= 4) return false; // give up after 4 tries - yield_count++; - mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done. 
- // tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail - } - else if (delay == old_delay) { - break; // avoid atomic operation if already equal - } - else if (!override_never && old_delay == MI_NEVER_DELAYED_FREE) { - break; // leave never-delayed flag set - } - } while ((old_delay == MI_DELAYED_FREEING) || - !mi_atomic_cas_weak_release(&page->xthread_free, &tfree, tfreex)); - - return true; // success -} -*/ /* ----------------------------------------------------------- Page collect the `local_free` and `thread_free` lists @@ -181,7 +147,7 @@ static void _mi_page_thread_free_collect(mi_page_t* page) head = mi_tf_block(tfree); if (head == NULL) return; // return if the list is empty tfreex = mi_tf_create(NULL,mi_tf_is_owned(tfree)); // set the thread free list to NULL - } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); + } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough? mi_assert_internal(head != NULL); // find the tail -- also to get a proper count (without data races) @@ -334,43 +300,6 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { return page; } -/* ----------------------------------------------------------- - Do any delayed frees - (put there by other threads if they deallocated in a full page) ------------------------------------------------------------ */ -/* -void _mi_heap_delayed_free_all(mi_heap_t* heap) { - while (!_mi_heap_delayed_free_partial(heap)) { - mi_atomic_yield(); - } -} - -// returns true if all delayed frees were processed -bool _mi_heap_delayed_free_partial(mi_heap_t* heap) { - // take over the list (note: no atomic exchange since it is often NULL) - mi_block_t* block = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - while (block != NULL && !mi_atomic_cas_ptr_weak_acq_rel(mi_block_t, &heap->thread_delayed_free, &block, NULL)) { }; - bool all_freed = true; - - // and free them all - while(block != NULL) { - mi_block_t* next = mi_block_nextx(heap,block, heap->keys); - // use internal free instead of regular one to keep stats etc correct - if (!_mi_free_delayed_block(block)) { - // we might already start delayed freeing while another thread has not yet - // reset the delayed_freeing flag; in that case delay it further by reinserting the current block - // into the delayed free list - all_freed = false; - mi_block_t* dfree = mi_atomic_load_ptr_relaxed(mi_block_t, &heap->thread_delayed_free); - do { - mi_block_set_nextx(heap, block, dfree, heap->keys); - } while (!mi_atomic_cas_ptr_weak_release(mi_block_t,&heap->thread_delayed_free, &dfree, block)); - } - block = next; - } - return all_freed; -} -*/ /* ----------------------------------------------------------- Unfull, abandon, free and retire @@ -765,7 +694,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m #if MI_STAT size_t count = 0; #endif - long candidate_limit = 0; // we reset this on the first candidate to limit the search + long candidate_limit = 0; // we reset this on the first candidate to limit the search long full_page_retain = _mi_option_get_fast(mi_option_full_page_retain); mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -777,7 +706,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m count++; #endif candidate_limit--; - + // collect freed blocks by us and other threads _mi_page_free_collect(page, false); From 
9631b0d4d2259c2bc2cf9808b40c444cee7ea3f2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 14:03:51 -0800 Subject: [PATCH 041/264] revise visiting arenas, better bitmap scanning --- src/arena.c | 83 ++++++++++++----------- src/bitmap.c | 188 +++++++++++++++++++++++++++++++++------------------ 2 files changed, 164 insertions(+), 107 deletions(-) diff --git a/src/arena.c b/src/arena.c index 8362a31f..8b9ab4da 100644 --- a/src/arena.c +++ b/src/arena.c @@ -202,20 +202,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // set the dirty bits if (arena->memid.initially_zero) { - // size_t dirty_count = 0; - memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL); - //if (dirty_count>0) { - // if (memid->initially_zero) { - // _mi_error_message(EFAULT, "ouch1\n"); - // } - // // memid->initially_zero = false; - //} - //else { - // if (!memid->initially_zero) { - // _mi_error_message(EFAULT, "ouch2\n"); - // } - // // memid->initially_zero = true; - //} + memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL); } // set commit state @@ -235,7 +222,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( #if MI_DEBUG > 1 if (memid->initially_zero) { if (!mi_mem_is_zero(p, mi_size_of_slices(slice_count))) { - _mi_error_message(EFAULT, "arena allocation was not zero-initialized!\n"); + _mi_error_message(EFAULT, "interal error: arena allocation was not zero-initialized!\n"); memid->initially_zero = false; } } @@ -327,31 +314,47 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are return true; } -#define MI_THREADS_PER_ARENA (16) -#define mi_forall_arenas(req_arena_id, allow_large, tseq, var_arena_id, var_arena) \ +#define mi_forall_arenas(req_arena_id, tseq, name_arena) \ { \ - size_t _max_arena; \ - size_t _start; \ - if (req_arena_id == _mi_arena_id_none()) { \ - _max_arena = mi_atomic_load_relaxed(&mi_arena_count); \ - _start = (_max_arena <= 2 ? 0 : (tseq % (_max_arena-1))); \ - } \ - else { \ - _max_arena = 1; \ - _start = mi_arena_id_index(req_arena_id); \ - mi_assert_internal(mi_atomic_load_relaxed(&mi_arena_count) > _start); \ - } \ - for (size_t i = 0; i < _max_arena; i++) { \ - size_t _idx = i + _start; \ - if (_idx >= _max_arena) { _idx -= _max_arena; } \ - const mi_arena_id_t var_arena_id = mi_arena_id_create(_idx); MI_UNUSED(var_arena_id);\ - mi_arena_t* const var_arena = mi_arena_from_index(_idx); \ - if (var_arena != NULL && mi_arena_is_suitable(var_arena,req_arena_id,-1 /* todo: numa node */,allow_large)) \ - { + const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \ + if (_arena_count > 0) { \ + const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \ + size_t _start; \ + if (req_arena_id == _mi_arena_id_none()) { \ + /* always start searching in an arena 1 below the max */ \ + _start = (_arena_cycle <= 1 ? 
0 : (tseq % _arena_cycle)); \ + } \ + else { \ + _start = mi_arena_id_index(req_arena_id); \ + mi_assert_internal(_start < _arena_count); \ + } \ + for (size_t _i = 0; _i < _arena_count; _i++) { \ + size_t _idx; \ + if (_i < _arena_cycle) { \ + _idx = _i + _start; \ + if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \ + } \ + else { \ + _idx = _i; \ + } \ + mi_arena_t* const name_arena = mi_arena_from_index(_idx); \ + if (name_arena != NULL) \ + { -#define mi_forall_arenas_end() }}} +#define mi_forall_arenas_end() \ + } \ + if (req_arena_id != _mi_arena_id_none()) break; \ + } \ + }} +#define mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, name_arena) \ + mi_forall_arenas(req_arena_id,tseq,name_arena) { \ + if (mi_arena_is_suitable(name_arena, req_arena_id, -1 /* todo: numa node */, allow_large)) { \ + +#define mi_forall_suitable_arenas_end() \ + }} \ + mi_forall_arenas_end() /* ----------------------------------------------------------- Arena allocation @@ -369,12 +372,12 @@ static mi_decl_noinline void* mi_arena_try_find_free( // search arena's const size_t tseq = tld->tseq; - mi_forall_arenas(req_arena_id, allow_large, tseq, arena_id, arena) + mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) { void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); if (p != NULL) return p; } - mi_forall_arenas_end(); + mi_forall_suitable_arenas_end(); return NULL; } @@ -517,7 +520,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // search arena's const bool allow_large = true; size_t tseq = tld->tseq; - mi_forall_arenas(req_arena_id, allow_large, tseq, arena_id, arena) + mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) { size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; @@ -545,7 +548,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl return page; } } - mi_forall_arenas_end(); + mi_forall_suitable_arenas_end(); return NULL; } diff --git a/src/bitmap.c b/src/bitmap.c index 7f4c8776..fb8468fa 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -42,9 +42,9 @@ static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { return mi_rotr(x,r); } -//static inline mi_bfield_t mi_bfield_zero(void) { -// return 0; -//} +static inline mi_bfield_t mi_bfield_zero(void) { + return 0; +} static inline mi_bfield_t mi_bfield_one(void) { return 1; @@ -64,9 +64,9 @@ static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { // Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). // return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) { - return mi_bfield_find_least_bit((set ? ~x : x), idx); -} +//static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) { +// return mi_bfield_find_least_bit((set ? ~x : x), idx); +//} // Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { @@ -244,10 +244,10 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t by // Try to clear a full field of bits atomically, and return true all bits transitioned from all 1's to 0's. 
// and false otherwise leaving the bit field as-is. -//static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { -// mi_bfield_t old = mi_bfield_all_set(); -// return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_zero()); -//} +static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { + mi_bfield_t old = mi_bfield_all_set(); + return mi_atomic_cas_strong_acq_rel(b, &old, mi_bfield_zero()); +} // Check if all bits corresponding to a mask are set. @@ -514,31 +514,33 @@ static inline __m256i mi_mm256_zero(void) { static inline __m256i mi_mm256_ones(void) { return _mm256_set1_epi64x(~0); } -static inline bool mi_mm256_is_ones(__m256i vec) { - return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); -} +//static inline bool mi_mm256_is_ones(__m256i vec) { +// return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); +//} static inline bool mi_mm256_is_zero( __m256i vec) { return _mm256_testz_si256(vec,vec); } #endif -// find least 0/1-bit in a chunk and try to set/clear it atomically +// Find least 1-bit in a chunk and try to clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. +// This is used to find free slices and abandoned pages and should be efficient. // todo: try neon version -static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk, size_t* pidx) { -#if defined(__AVX2__) && (MI_BCHUNK_BITS==256) +static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) - // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); size_t cidx; - if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear - if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically + if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; @@ -546,39 +548,42 @@ static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk } // try again } -#elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; - #if 1 + #if 0 + // one vector at a time __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - if ((set ? 
mi_mm256_is_ones(vec) : mi_mm256_is_zero(vec))) { + if (mi_mm256_is_zero(vec)) { chunk_idx += 4; vec = _mm256_load_si256(((const __m256i*)chunk->bfields) + 1); } - const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? mi_mm256_ones() : mi_mm256_zero())); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits are all set or clear) - // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 chunk_idx += _tzcnt_u32(mask) / 8; #else + // a cache line is 64b so we can just as well load all at the same time const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); - const __m256i cmpv = (set ? mi_mm256_ones() : mi_mm256_zero()); - const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) - const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == ~0 / 0 ? 0xFF : 0) + const __m256i cmpv = mi_mm256_zero(); + const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (elem64 == 0 ? 0xFF : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (elem64 == 0 ? 0xFF : 0) const uint32_t mask1 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) - const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte (so each 8 bits are all set or clear) + const uint32_t mask2 = ~_mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte (so each 8 bits are all set or clear) const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; - // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a zero / one bit (and thus can be set/cleared) + // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
- const size_t chunk_idx = _tzcnt_u64(mask) / 8; + chunk_idx = _tzcnt_u64(mask) / 8; #endif mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); size_t cidx; - if (mi_bfield_find_least_to_xset(set, chunk->bfields[chunk_idx], &cidx)) { // find the bit-idx that is set/clear - if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[chunk_idx], cidx)) { // set/clear it atomically + if (mi_bfield_find_least_bit(b, &cidx)) { // find the bit-idx that is clear + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; @@ -586,11 +591,12 @@ static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk } // try again } -#else + #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t idx; - if mi_unlikely(mi_bfield_find_least_to_xset(set, chunk->bfields[i], &idx)) { // find least 0-bit - if mi_likely(mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx)) { // try to set it atomically + if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically *pidx = (i*MI_BFIELD_BITS + idx); mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; @@ -598,48 +604,49 @@ static inline bool mi_bchunk_find_and_try_xset(mi_xset_t set, mi_bchunk_t* chunk } } return false; -#endif + #endif } -static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) { - return mi_bchunk_find_and_try_xset(MI_BIT_CLEAR, chunk, pidx); -} - -//static inline bool mi_bchunk_find_and_try_set(mi_bchunk_t* chunk, size_t* pidx) { -// return mi_bchunk_find_and_try_xset(MI_BIT_SET, chunk, pidx); -//} // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. +// Used to find medium size pages in the free blocks. // todo: try neon version static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) - while(true) { - const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vcmp = _mm256_cmpeq_epi8(vec, mi_mm256_ones()); // (byte == ~0 ? -1 : 0) - const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte - if (mask == 0) return false; - const size_t i = _tzcnt_u32(mask); - mi_assert_internal(8*i < MI_BCHUNK_BITS); - const size_t chunk_idx = i / MI_BFIELD_SIZE; - const size_t byte_idx = i % MI_BFIELD_SIZE; - if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); - mi_assert_internal(*pidx < MI_BCHUNK_BITS); + #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + while (true) { + // since a cache-line is 64b, load all at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); + const __m256i cmpv = mi_mm256_ones(); + const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 
0xFF : 0) + const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte + const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte + const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; + // mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared) + if (mask==0) return false; + const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk + const size_t chunk_idx = bidx / 8; + const size_t byte_idx = bidx % 8; // byte index of the byte in the bfield + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx; + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; } // try again } #else for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t x = chunk->bfields[i]; + const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]); // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 >> 7; // shift high bit to low bit size_t idx; - if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit + if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); mi_assert_internal((idx%8)==0); const size_t byte_idx = idx/8; @@ -656,14 +663,58 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid } + +// find least bfield in a chunk with all bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. +// Used to find large size pages in the free blocks. +// todo: try neon version +static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pidx) { +#if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + while (true) { + // since a cache-line is 64b, load all at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); + const __m256i cmpv = mi_mm256_ones(); + const __m256i vcmp1 = _mm256_cmpeq_epi64(vec1, cmpv); // (bfield == ~0 ? -1 : 0) + const __m256i vcmp2 = _mm256_cmpeq_epi64(vec2, cmpv); // (bfield == ~0 ? -1 : 0) + const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte + const uint32_t mask2 = _mm256_movemask_epi8(vcmp2); // mask of most significant bit of each byte + const uint64_t mask = ((uint64_t)mask2 << 32) | mask1; + // mask is inverted, so each 8-bits are set iff the corresponding elem64 has all bits set (and thus can be cleared) + if (mask==0) return false; + mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
+ const size_t chunk_idx = _tzcnt_u64(mask) / 8; + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx])) { + *pidx = chunk_idx*MI_BFIELD_BITS; + mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); + return true; + } + // try again + } +#else + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + if (~b==0 && mi_bfield_atomic_try_clearX(&chunk->bfields[i])) { + *pidx = i*MI_BFIELD_BITS; + mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); + return true; + } + } + return false; +#endif +} + + // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +// (We do not cross bfield boundaries) static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - mi_bfield_t b = chunk->bfields[i]; + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t bshift = 0; size_t idx; while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit @@ -680,8 +731,9 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* return true; } else { - // if failed to atomically commit, try again from this position - b = (chunk->bfields[i] >> bshift); + // if failed to atomically commit, reload b and try again from this position + bshift -= idx; + b = mi_atomic_load_relaxed(&chunk->bfields[i]) >> bshift; } } else { @@ -699,11 +751,11 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* // find a sequence of `n` bits in a chunk with `n < MI_BCHUNK_BITS` with all bits set, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. +// This can cross bfield boundaries. static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - // if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); - - // we align an a field, and require `field_count` fields to be all clear. 
+ + // we align at a bfield, and scan `field_count` fields // n >= MI_BFIELD_BITS; find a first field that is 0 const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++) @@ -740,14 +792,16 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* return true; } } + // continue } return false; } static inline bool mi_bchunk_find_and_try_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); - if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); + if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); // small pages + if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearX(chunk, pidx); // large pages if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); return mi_bchunk_find_and_try_clearN_(chunk, n, pidx); From 6b52b19e3b6bd28eb61739ef0a21297993940b28 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 15:02:27 -0800 Subject: [PATCH 042/264] arch specific optimizations --- CMakeLists.txt | 2 +- ide/vs2022/mimalloc.vcxproj | 1 - src/bitmap.c | 57 ++++++++++++++++++------------------- 3 files changed, 28 insertions(+), 32 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 89dad3b5..b1f66f5c 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -385,7 +385,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM list(APPEND mi_cflags_dynamic -ftls-model=initial-exec) message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)") else() - list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2 -O2) + list(APPEND mi_cflags -ftls-model=initial-exec) endif() endif() if(MI_OVERRIDE) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index d03fd281..e9a4a339 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -120,7 +120,6 @@ CompileAsCpp false stdcpp20 - AdvancedVectorExtensions2 diff --git a/src/bitmap.c b/src/bitmap.c index fb8468fa..8479555c 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -505,7 +505,7 @@ static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); } -// ------ find_and_try_xset -------- +// ------ try_find_and_clear -------- #if defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { @@ -526,7 +526,7 @@ static inline bool mi_mm256_is_zero( __m256i vec) { // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // This is used to find free slices and abandoned pages and should be efficient. // todo: try neon version -static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx) { +static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) { #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -613,10 +613,10 @@ static inline bool mi_bchunk_find_and_try_clear(mi_bchunk_t* chunk, size_t* pidx // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find medium size pages in the free blocks. 
// todo: try neon version -static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pidx) { +static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { - // since a cache-line is 64b, load all at once + // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); const __m256i cmpv = mi_mm256_ones(); @@ -628,9 +628,9 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid // mask is inverted, so each bit is 0xFF iff the corresponding byte has a bit set (and thus can be cleared) if (mask==0) return false; const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk - const size_t chunk_idx = bidx / 8; + const size_t chunk_idx = bidx / 8; const size_t byte_idx = bidx % 8; // byte index of the byte in the bfield - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); @@ -668,10 +668,10 @@ static inline bool mi_bchunk_find_and_try_clear8(mi_bchunk_t* chunk, size_t* pid // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find large size pages in the free blocks. // todo: try neon version -static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pidx) { -#if defined(__AVX2__) && (MI_BCHUNK_BITS==512) +static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { - // since a cache-line is 64b, load all at once + // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); const __m256i cmpv = mi_mm256_ones(); @@ -689,7 +689,7 @@ static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pid *pidx = chunk_idx*MI_BFIELD_BITS; mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); return true; - } + } // try again } #else @@ -710,7 +710,7 @@ static inline bool mi_bchunk_find_and_try_clearX(mi_bchunk_t* chunk, size_t* pid // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. // (We do not cross bfield boundaries) -static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +static mi_decl_noinline bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -752,10 +752,10 @@ static bool mi_bchunk_find_and_try_clearNX(mi_bchunk_t* chunk, size_t n, size_t* // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. // This can cross bfield boundaries. 
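/* A scalar model (not taken from the patches above) of the index arithmetic shared by the
   64-bit-lane AVX2 searches: `_mm256_cmpeq_epi64` marks a whole lane, `_mm256_movemask_epi8`
   turns that into 8 identical mask bits per lane, so `tzcnt(mask)/8` is the index of the
   first matching bfield. The function name is hypothetical, `__builtin_ctz` stands in for
   `_tzcnt_u32`, and only one 256-bit vector (4 lanes) is modeled; the 512-bit chunk paths
   combine two such masks into a 64-bit mask. This models the `try_find_and_clearX` case
   (first bfield with all bits set); `try_find_and_clear` instead compares against zero and
   inverts the mask to find the first bfield with any bit set. */

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool first_full_field(const uint64_t fields[4], size_t* field_idx) {
  uint32_t mask = 0;
  for (size_t k = 0; k < 4; k++) {                      // models cmpeq_epi64 + movemask_epi8
    if (fields[k] == ~(uint64_t)0) { mask |= (0xFFu << (8*k)); }
  }
  if (mask == 0) return false;                          // no fully-set field among these lanes
  *field_idx = (size_t)__builtin_ctz(mask) / 8;         // 8 mask bits per 64-bit lane, hence the /8
  return true;
}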
-static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - - // we align at a bfield, and scan `field_count` fields + + // we align at a bfield, and scan `field_count` fields // n >= MI_BFIELD_BITS; find a first field that is 0 const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++) @@ -780,7 +780,7 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* m -= MI_BFIELD_BITS; // note: can underflow } } while (++j < field_count); - + // if all set, we can try to atomically clear them if (allset) { const size_t cidx = i*MI_BFIELD_BITS; @@ -798,13 +798,13 @@ static bool mi_bchunk_find_and_try_clearN_(mi_bchunk_t* chunk, size_t n, size_t* } -static inline bool mi_bchunk_find_and_try_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bchunk_find_and_try_clear(chunk, pidx); // small pages - if (n==8) return mi_bchunk_find_and_try_clear8(chunk, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearX(chunk, pidx); // large pages +static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages + if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bchunk_find_and_try_clearNX(chunk, n, pidx); - return mi_bchunk_find_and_try_clearN_(chunk, n, pidx); + if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); + return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); } @@ -858,7 +858,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } - // record the max clear + // record the max clear size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); do { if mi_likely(chunk_idx <= oldmax) break; @@ -1139,23 +1139,22 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -// (Used to find fresh free slices.) +// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS) mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { size_t cidx; - if mi_likely(mi_bchunk_find_and_try_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { + if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx <= mi_bitmap_max_bits(bitmap) - n); + mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); return true; } else { // we may find that all are cleared only on a second iteration but that is ok as // the chunkmap is a conservative approximation. 
mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - // continue } } mi_bitmap_forall_chunks_end(); @@ -1171,7 +1170,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { size_t cidx; - if mi_likely(mi_bchunk_find_and_try_clear(&bitmap->chunks[chunk_idx], &cidx)) { + if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) { const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); bool keep_set = true; @@ -1182,19 +1181,17 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t return true; } else { - // failed to claim it, set abandoned mapping again (unless thet page was freed) + // failed to claim it, set abandoned mapping again (unless the page was freed) if (keep_set) { const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); mi_assert_internal(wasclear); MI_UNUSED(wasclear); } - // continue } } else { // we may find that all are cleared only on a second iteration but that is ok as // the chunkmap is a conservative approximation. mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - // continue } } mi_bitmap_forall_chunks_end(); From bf42759d976bd965eacd8a0b4c13c6dc9e6182d9 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 15:13:17 -0800 Subject: [PATCH 043/264] check heaptag on abandonded page allocation --- include/mimalloc/types.h | 13 ++++++++++++- src/arena.c | 17 ++++++++--------- src/bitmap.c | 4 ++-- src/bitmap.h | 4 ++-- 4 files changed, 24 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ba9a8864..d883ec52 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -237,6 +237,8 @@ typedef uintptr_t mi_thread_free_t; // Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) typedef struct mi_subproc_s mi_subproc_t; +// A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython) +typedef uint8_t mi_heaptag_t; // A page contains blocks of one specific size (`block_size`). // Each page has three list of free blocks: @@ -280,7 +282,7 @@ typedef struct mi_page_s { size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks - uint8_t heap_tag; // tag of the owning heap, used to separate heaps by object type + mi_heaptag_t heap_tag; // tag of the owning heap, used to separate heaps by object type bool free_is_zero; // `true` if the blocks in the free list are zero initialized // padding #if (MI_ENCODE_FREELIST || MI_PADDING) @@ -411,7 +413,16 @@ struct mi_heap_s { mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; +// ------------------------------------------------------ +// Arena's +// These are large reserved areas of memory allocated from +// the OS that are managed by mimalloc to efficiently +// allocate MI_SLICE_SIZE slices of memory for the +// mimalloc pages. +// ------------------------------------------------------ +// A large memory arena where pages are allocated in. 
+typedef struct mi_arena_s mi_arena_t; // ------------------------------------------------------ // Debug diff --git a/src/arena.c b/src/arena.c index 8b9ab4da..f6c0f0a3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -479,11 +479,9 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* arg2, bool* keep_abandoned) { +static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_abandoned) { // found an abandoned page of the right size - mi_arena_t* const arena = (mi_arena_t*)arg1; - mi_subproc_t* const subproc = (mi_subproc_t*)arg2; - mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); // can we claim ownership? if (!mi_page_try_claim_ownership(page)) { // there was a concurrent free .. @@ -493,8 +491,9 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* a *keep_abandoned = true; return false; } - if (subproc != page->subproc) { - // wrong sub-process.. we need to unown again + if (subproc != page->subproc || heap_tag != page->heap_tag) { + // wrong sub-process or heap_tag.. we need to unown again + // note: this normally never happens unless subprocesses/heaptags are actually used. // (an unown might free the page, and depending on that we can keep it in the abandoned map or not) // note: a minor wrinkle: the page will still be mapped but the abandoned map entry is (temporarily) clear at this point. // so we cannot check in `mi_arena_free` for this invariant to hold. @@ -507,7 +506,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* a return true; } -static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_heaptag_t heaptag, mi_tld_t* tld) { MI_UNUSED(slice_count); const size_t bin = _mi_bin(block_size); @@ -525,7 +524,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; - if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, subproc)) { + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, subproc, heaptag)) { // found an abandoned page of the right size // and claimed ownership. mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); @@ -632,7 +631,7 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size mi_tld_t* const tld = heap->tld; // 1. look for an abandoned page - mi_page_t* page = mi_arena_page_try_find_abandoned(slice_count, block_size, req_arena_id, tld); + mi_page_t* page = mi_arena_page_try_find_abandoned(slice_count, block_size, req_arena_id, heap->tag, tld); if (page != NULL) { return page; // return as abandoned } diff --git a/src/bitmap.c b/src/bitmap.c index 8479555c..cdeeb009 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1165,7 +1165,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t // Find a set bit in the bitmap and try to atomically clear it and claim it. 
// (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, void* arg1, void* arg2) + mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ) { mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) { @@ -1174,7 +1174,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); bool keep_set = true; - if ((*claim)(slice_index, arg1, arg2, &keep_set)) { + if ((*claim)(slice_index, arena, subproc, heap_tag, &keep_set)) { // success! mi_assert_internal(!keep_set); *pidx = slice_index; diff --git a/src/bitmap.h b/src/bitmap.h index 7938bfa0..aaa552ad 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -185,10 +185,10 @@ static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); -typedef bool (mi_claim_fun_t)(size_t slice_index, void* arg1, void* arg2, bool* keep_set); +typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_set); mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, void* arg1, void* arg2); + mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ); void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); From d0c86f3f0e625236da685c9668378657cc8e79ba Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 16:26:07 -0800 Subject: [PATCH 044/264] specialize bitmap operations for common page sizes --- src/bitmap.c | 372 ++++++++++++++++++++----------------------------- src/bitmap.h | 38 ++--- src/page-map.c | 8 +- 3 files changed, 175 insertions(+), 243 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index cdeeb009..b76dfc77 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -27,10 +27,6 @@ static inline size_t mi_bfield_popcount(mi_bfield_t x) { return mi_popcount(x); } -//static inline size_t mi_bfield_clz(mi_bfield_t x) { -// return mi_clz(x); -//} - // find the least significant bit that is set (i.e. count trailing zero's) // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -55,18 +51,13 @@ static inline mi_bfield_t mi_bfield_all_set(void) { } static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { + mi_assert_internal(bit_count > 0); mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS); const mi_bfield_t mask0 = (bit_count < MI_BFIELD_BITS ? (mi_bfield_one() << bit_count)-1 : mi_bfield_all_set()); return (mask0 << shiftl); } - -// Find the least significant bit that can be xset (0 for MI_BIT_SET, 1 for MI_BIT_CLEAR). -// return false if `x==~0` (for MI_BIT_SET) or `x==0` for MI_BIT_CLEAR (with `*idx` undefined) and true otherwise, -// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -//static inline bool mi_bfield_find_least_to_xset(mi_xset_t set, mi_bfield_t x, size_t* idx) { -// return mi_bfield_find_least_bit((set ? ~x : x), idx); -//} +// ------- mi_bfield_atomic_set --------------------------------------- // Set a bit atomically. 
Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { @@ -105,15 +96,6 @@ static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_ mi_assert_internal((old&mask)==mask); // we should only clear when it was set } -// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). -static inline bool mi_bfield_atomic_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { - if (set) { - return mi_bfield_atomic_set(b, idx); - } - else { - return mi_bfield_atomic_clear(b, idx, NULL); - } -} // Set a mask set of bits atomically, and return true of the mask bits transitioned from all 0's to 1's. static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, size_t* already_set) { @@ -144,13 +126,33 @@ static inline bool mi_bfield_atomic_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t } } +static inline bool mi_bfield_atomic_set8(_Atomic(mi_bfield_t)*b, size_t byte_idx) { + mi_assert_internal(byte_idx < MI_BFIELD_SIZE); + const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); + return mi_bfield_atomic_xset_mask(MI_BIT_SET, b, mask, NULL); +} + +static inline bool mi_bfield_atomic_clear8(_Atomic(mi_bfield_t)*b, size_t byte_idx, bool* all_clear) { + mi_assert_internal(byte_idx < MI_BFIELD_SIZE); + const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); + mi_bfield_t old = mi_atomic_load_relaxed(b); + while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)) {}; // try to atomically clear the mask bits until success + if (all_clear!=NULL) { *all_clear = ((old&~mask)==0); } + return ((old&mask) == mask); +} + +static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b) { + const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_all_set()); + return (old==0); +} + +static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b) { + const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero()); + return (~old==0); +} + +// ------- mi_bfield_atomic_try_xset --------------------------------------- -// Tries to set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 -// and otherwise false (leaving the bit unchanged) -//static inline bool mi_bfield_atomic_try_set(_Atomic(mi_bfield_t)*b, size_t idx) { -// mi_assert_internal(idx < MI_BFIELD_BITS); -// return mi_bfield_atomic_set(b, idx); // for a single bit there is no difference -//} // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0. // `all_clear` is set to true if the new bfield is zero (and false otherwise) @@ -162,14 +164,6 @@ static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx return ((old&mask) == mask); } -// Tries to set/clear a bit atomically, and returns true if the bit atomically transitioned from 0 to 1 (or 1 to 0) -static inline bool mi_bfield_atomic_try_xset( mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { - mi_assert_internal(idx < MI_BFIELD_BITS); - // for a single bit, we can always just set/clear and test afterwards if it was actually us that changed it first - return mi_bfield_atomic_xset(set, b, idx); -} - - // Tries to set a mask atomically, and returns true if the mask bits atomically transitioned from 0 to mask // and false otherwise (leaving the bit field as is). 
static inline bool mi_bfield_atomic_try_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { @@ -211,13 +205,6 @@ static inline bool mi_bfield_atomic_try_xset_mask(mi_xset_t set, _Atomic(mi_bfie } } -// Tries to set a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF -// and false otherwise (leaving the bit field as is). -static inline bool mi_bfield_atomic_try_set8(_Atomic(mi_bfield_t)*b, size_t byte_idx) { - mi_assert_internal(byte_idx < MI_BFIELD_SIZE); - const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); - return mi_bfield_atomic_try_set_mask(b, mask); -} // Tries to clear a byte atomically, and returns true if the byte atomically transitioned from 0xFF to 0 static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t byte_idx, bool* all_clear) { @@ -226,22 +213,6 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t by return mi_bfield_atomic_try_clear_mask(b, mask, all_clear); } -//// Tries to set/clear a byte atomically, and returns true if the byte atomically transitioned from 0 to 0xFF (or 0xFF to 0) -//// and false otherwise (leaving the bit field as is). -//static inline bool mi_bfield_atomic_try_xset8(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t byte_idx) { -// mi_assert_internal(byte_idx < MI_BFIELD_SIZE); -// const mi_bfield_t mask = ((mi_bfield_t)0xFF)<<(byte_idx*8); -// return mi_bfield_atomic_try_xset_mask(set, b, mask); -//} - - -// Try to set a full field of bits atomically, and return true all bits transitioned from all 0's to 1's. -// and false otherwise leaving the bit field as-is. -//static inline bool mi_bfield_atomic_try_setX(_Atomic(mi_bfield_t)*b) { -// mi_bfield_t old = 0; -// return mi_atomic_cas_weak_acq_rel(b, &old, mi_bfield_all_set()); -//} - // Try to clear a full field of bits atomically, and return true all bits transitioned from all 1's to 0's. // and false otherwise leaving the bit field as-is. static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { @@ -250,6 +221,9 @@ static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b) { } +// ------- mi_bfield_atomic_is_set --------------------------------------- + + // Check if all bits corresponding to a mask are set. 
static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { mi_assert_internal(mask != 0); @@ -275,26 +249,12 @@ static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfiel } -// Check if a bit is set/clear -// static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, size_t idx) { -// mi_assert_internal(idx < MI_BFIELD_BITS); -// const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx); -//} +// ------- mi_bchunk_xset --------------------------------------- static inline bool mi_bchunk_set(mi_bchunk_t* chunk, size_t cidx) { mi_assert_internal(cidx < MI_BCHUNK_BITS); @@ -310,6 +270,30 @@ static inline bool mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_ return mi_bfield_atomic_clear(&chunk->bfields[i], idx, maybe_all_clear); } +static inline bool mi_bchunk_set8(mi_bchunk_t* chunk, size_t byte_idx) { + mi_assert_internal(byte_idx < MI_BCHUNK_SIZE); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t bidx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_set8(&chunk->bfields[i], bidx); +} + +static inline bool mi_bchunk_clear8(mi_bchunk_t* chunk, size_t byte_idx, bool* maybe_all_clear) { + mi_assert_internal(byte_idx < MI_BCHUNK_SIZE); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t bidx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_clear8(&chunk->bfields[i], bidx, maybe_all_clear); +} + +static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t field_idx) { + mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); + return mi_bfield_atomic_setX(&chunk->bfields[field_idx]); +} + +static inline bool mi_bchunk_clearX(mi_bchunk_t* chunk, size_t field_idx, bool* maybe_all_clear) { + mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); + if (maybe_all_clear != NULL) { *maybe_all_clear = true; } + return mi_bfield_atomic_clearX(&chunk->bfields[field_idx]); +} // Set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). @@ -340,7 +324,6 @@ static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size return all_transition; } - static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { return mi_bchunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); } @@ -351,74 +334,46 @@ static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, s -// ------ is_xset -------- +// ------- mi_bchunk_is_xset --------------------------------------- // Check if a sequence of `n` bits within a chunk are all set/cleared. 
-static bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) { - mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); - mi_assert_internal(n>0); - size_t idx = cidx % MI_BFIELD_BITS; - size_t field = cidx / MI_BFIELD_BITS; +// This can cross bfield's +mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) { + mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS); while (n > 0) { size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field if (m > n) { m = n; } mi_assert_internal(idx + m <= MI_BFIELD_BITS); - mi_assert_internal(field < MI_BCHUNK_FIELDS); + mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); const size_t mask = mi_bfield_mask(m, idx); - if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mask)) { + if (!mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field_idx], mask)) { return false; } // next field - field++; + field_idx++; idx = 0; n -= m; } return true; } - -// ------ try_xset -------- - -static inline bool mi_bchunk_try_xset(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_try_xset(set, &chunk->bfields[i], idx); -} - -static inline bool mi_bchunk_try_set(mi_bchunk_t* chunk, size_t cidx) { - return mi_bchunk_try_xset(MI_BIT_SET, chunk, cidx); -} - -static inline bool mi_bchunk_try_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, maybe_all_clear); +// Check if a sequence of `n` bits within a chunk are all set/cleared. +static inline bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); + mi_assert_internal(n>0); + if (n==0) return true; + size_t field = cidx / MI_BFIELD_BITS; + size_t idx = cidx % MI_BFIELD_BITS; + if mi_likely(n<=MI_BFIELD_BITS) { + return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mi_bfield_mask(n, idx)); + } + else { + return mi_bchunk_is_xsetN_(set, chunk, field, idx, n); + } } -//static inline bool mi_bchunk_try_xset8(mi_xset_t set, mi_bchunk_t* chunk, size_t byte_idx) { -// mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); -// const size_t i = byte_idx / MI_BFIELD_SIZE; -// const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; -// return mi_bfield_atomic_try_xset8(set, &chunk->bfields[i], ibyte_idx); -//} - -static inline bool mi_bchunk_try_set8(mi_bchunk_t* chunk, size_t byte_idx) { - mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_set8(&chunk->bfields[i], ibyte_idx); -} - -static inline bool mi_bchunk_try_clear8(mi_bchunk_t* chunk, size_t byte_idx, bool* maybe_all_clear) { - mi_assert_internal(byte_idx*8 < MI_BCHUNK_BITS); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_try_clear8(&chunk->bfields[i], ibyte_idx, maybe_all_clear); -} - +// ------- mi_bchunk_try_xset --------------------------------------- // Try to atomically set/clear a sequence of `n` bits within a chunk. 
// Returns true if all bits transitioned from 0 to 1 (or 1 to 0), @@ -490,22 +445,16 @@ restore: return false; } -static inline bool mi_bchunk_try_setN(mi_bchunk_t* chunk, size_t cidx, size_t n) { - return mi_bchunk_try_xsetN(MI_BIT_SET, chunk, cidx, n, NULL); -} +// static inline bool mi_bchunk_try_setN(mi_bchunk_t* chunk, size_t cidx, size_t n) { +// return mi_bchunk_try_xsetN(MI_BIT_SET, chunk, cidx, n, NULL); +// } static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { return mi_bchunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n, maybe_all_clear); } -static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); -} -// ------ try_find_and_clear -------- +// ------- mi_bchunk_try_find_and_clear --------------------------------------- #if defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { @@ -808,6 +757,18 @@ static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, s } +// ------- mi_bchunk_clear_once_set --------------------------------------- + +static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + mi_bfield_atomic_clear_once_set(&chunk->bfields[i], idx); +} + + +// ------- mi_bitmap_all_are_clear --------------------------------------- + // are all bits in a bitmap chunk clear? (this uses guaranteed atomic reads) static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -831,12 +792,6 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #endif } - -/* -------------------------------------------------------------------------------- - chunkmap --------------------------------------------------------------------------------- */ - - /* -------------------------------------------------------------------------------- bitmap chunkmap -------------------------------------------------------------------------------- */ @@ -866,6 +821,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) return true; } + /* -------------------------------------------------------------------------------- bitmap -------------------------------------------------------------------------------- */ @@ -941,82 +897,9 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { } -// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), -// and false otherwise leaving the bitmask as is. 
-static bool mi_bitmap_try_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool ok = mi_bchunk_try_set(&bitmap->chunks[chunk_idx], cidx); - if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards - return ok; - } - else { - bool maybe_all_clear; - const bool ok = mi_bchunk_try_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return ok; - } -} -// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) -// and false otherwise leaving the bitmask as is. -static bool mi_bitmap_try_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - mi_assert_internal(idx%8 == 0); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t byte_idx = (idx % MI_BCHUNK_BITS)/8; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool ok = mi_bchunk_try_set8(&bitmap->chunks[chunk_idx], byte_idx); - if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards - return ok; - } - else { - bool maybe_all_clear; - const bool ok = mi_bchunk_try_clear8(&bitmap->chunks[chunk_idx], byte_idx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return ok; - } -} - -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -static bool mi_bitmap_try_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BCHUNK_BITS); - mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); - if (n==0 || idx + n > mi_bitmap_max_bits(bitmap)) return false; - - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia - if (set) { - const bool ok = mi_bchunk_try_setN(&bitmap->chunks[chunk_idx], cidx, n); - if (ok) { mi_bitmap_chunkmap_set(bitmap,chunk_idx); } // set afterwards - return ok; - } - else { - bool maybe_all_clear; - const bool ok = mi_bchunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return ok; - } -} - -mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); - if (n==1) return mi_bitmap_try_xset(set, bitmap, idx); - if (n==8) return mi_bitmap_try_xset8(set, bitmap, idx); - // todo: add 32/64 for large pages ? 
- return mi_bitmap_try_xsetN_(set, bitmap, idx, n); -} +// ------- mi_bitmap_xset --------------------------------------- // Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { @@ -1037,6 +920,48 @@ bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { } } +// Set/clear aligned 8-bits in the bitmap (with `(idx%8)==0`). +// Returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +static bool mi_bitmap_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); + mi_assert_internal((idx%8)==0); + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t byte_idx = (idx % MI_BCHUNK_BITS)/8; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (set) { + const bool wasclear = mi_bchunk_set8(&bitmap->chunks[chunk_idx], byte_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return wasclear; + } + else { + bool maybe_all_clear; + const bool wasset = mi_bchunk_clear8(&bitmap->chunks[chunk_idx], byte_idx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return wasset; + } +} + +// Set/clear a field of bits. +// Returns `true` if atomically transitioned from 0 to ~0 (or ~0 to 0) +static bool mi_bitmap_xsetX(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); + mi_assert_internal((idx%MI_BFIELD_BITS)==0); + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t field_idx = (idx % MI_BCHUNK_BITS)/MI_BFIELD_BITS; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (set) { + const bool wasclear = mi_bchunk_setX(&bitmap->chunks[chunk_idx],field_idx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return wasclear; + } + else { + bool maybe_all_clear; + const bool wasset = mi_bchunk_clearX(&bitmap->chunks[chunk_idx], field_idx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return wasset; + } +} + // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { @@ -1067,14 +992,15 @@ static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, siz // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); - //TODO: specialize? - //if (n==1) return mi_bitmap_xset(set, bitmap, idx); - //if (n==2) return mi_bitmap_xset(set, bitmap, idx); - //if (n==8) return mi_bitmap_xset8(set, bitmap, idx); + if (n==1) return mi_bitmap_xset(set, bitmap, idx); + if (n==8) return mi_bitmap_xset8(set, bitmap, idx); + if (n==MI_BFIELD_BITS) return mi_bitmap_xsetX(set, bitmap, idx); return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); } +// ------- mi_bitmap_is_xset --------------------------------------- + // Is a sequence of n bits already all set/cleared? 
bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_assert_internal(n>0); @@ -1091,10 +1017,11 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } -/* -------------------------------------------------------------------------------- - bitmap try_find_and_clear --------------------------------------------------------------------------------- */ +/* -------------------------------------------------------------------------------- + bitmap try_find_and_clear + (used to find free pages) +-------------------------------------------------------------------------------- */ #define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ { \ @@ -1116,7 +1043,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0) { \ + if (_i == 0 && chunkmap_start_idx > 0) { \ cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \ cmap_idx_shift = chunkmap_start_idx; \ } \ @@ -1162,6 +1089,11 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t } +/* -------------------------------------------------------------------------------- + bitmap try_find_and_claim + (used to allocate abandoned pages) +-------------------------------------------------------------------------------- */ + // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, diff --git a/src/bitmap.h b/src/bitmap.h index aaa552ad..7d6d8f97 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -82,7 +82,7 @@ typedef mi_bchunk_t mi_bchunkmap_t; #if MI_SIZE_BITS > 32 #define MI_BITMAP_DEFAULT_CHUNK_COUNT (64) // 2 GiB on 64-bit -- this is for the page map #else -#define MI_BITMAP_DEFAULT_CHUNK_COUNT (1) +#define MI_BITMAP_DEFAULT_CHUNK_COUNT (1) #endif #define MI_BITMAP_MAX_BIT_COUNT (MI_BITMAP_MAX_CHUNK_COUNT * MI_BCHUNK_BITS) // 16 GiB arena #define MI_BITMAP_MIN_BIT_COUNT (MI_BITMAP_MIN_CHUNK_COUNT * MI_BCHUNK_BITS) // 32 MiB arena @@ -92,7 +92,7 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared + _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT @@ -126,7 +126,8 @@ size_t mi_bitmap_size(size_t bit_count, size_t* chunk_count); // returns the size of the bitmap. size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); -// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). +// Not atomic so only use if still local to a thread. 
void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); @@ -144,7 +145,8 @@ static inline bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { // Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -// If `already_xset` is not NULL, it is to all the bits were already all set/cleared. +// If `already_xset` is not NULL, it is set to the count of bits that were already set/cleared. +// (this is used for correct statistics if committing over a partially committed area) bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { @@ -159,6 +161,8 @@ static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // Is a sequence of n bits already all set/cleared? bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); +// Is a sequence of n bits already set? +// (Used to check if a memory range is already committed) static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); } @@ -168,28 +172,24 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { } -// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) -// and false otherwise leaving the bitmask as is. -// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); - -static inline bool mi_bitmap_try_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_try_xsetN(MI_BIT_SET, bitmap, idx, n); - } - -static inline bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_try_xsetN(MI_BIT_CLEAR, bitmap, idx, n); -} - -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); + +// Called once a bit is cleared to see if the memory slice can be claimed. typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_set); -mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, +// Find a set bit in the bitmap, atomically clear it, and check if `claim` returns true. +// If not claimed, continue on (potentially setting the bit again depending on `keep_set`). +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ); + +// Atomically clear a bit but only if it is set. Will block otherwise until the bit is set. +// This is used to delay freeing a page that is at the same time being considered to be +// allocated from `mi_arena_try_abandoned` (and is in the `claim` function of `mi_bitmap_try_find_and_claim`).
void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); #endif // MI_BITMAP_H diff --git a/src/page-map.c b/src/page-map.c index d849e6a2..7a00d172 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -55,14 +55,14 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_xsetN(MI_BIT_CLEAR, &mi_page_map_commit, i, 1)) { + if (mi_bitmap_is_clearN(&mi_page_map_commit, i, 1)) { // this may race, in which case we do multiple commits (which is ok) bool is_zero; uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); const size_t size = mi_page_map_entries_per_commit_bit; - _mi_os_commit(start, size, &is_zero, NULL); + _mi_os_commit(start, size, &is_zero, NULL); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } - mi_bitmap_xsetN(MI_BIT_SET, &mi_page_map_commit, i, 1, NULL); + mi_bitmap_set(&mi_page_map_commit, i); } } #if MI_DEBUG > 0 @@ -119,7 +119,7 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { uintptr_t idx = ((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_xsetN(MI_BIT_SET, &mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { + if (!mi_page_map_all_committed || mi_bitmap_is_setN(&mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { From c33de86da35b23cebf0dbadea10ac9316a2441b4 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 7 Dec 2024 17:11:11 -0800 Subject: [PATCH 045/264] check for running in a threadpool to disable page reclaim --- include/mimalloc/prim.h | 3 ++- include/mimalloc/types.h | 19 ++++++++++--------- src/arena.c | 12 ++++++++---- src/bitmap.h | 2 +- src/free.c | 2 +- src/heap.c | 30 +++++++++++------------------- src/init.c | 17 ++++++++++------- src/prim/emscripten/prim.c | 5 ++++- src/prim/unix/prim.c | 4 ++++ src/prim/wasi/prim.c | 4 ++++ src/prim/windows/prim.c | 25 +++++++++++++++++++++++-- test/test-stress.c | 4 ++-- 12 files changed, 80 insertions(+), 47 deletions(-) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 8a627438..65f65376 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -117,7 +117,8 @@ void _mi_prim_thread_done_auto_done(void); // Called when the default heap for a thread changes void _mi_prim_thread_associate_default_heap(mi_heap_t* heap); - +// Is this thread part of a thread pool? +bool _mi_prim_thread_is_in_threadpool(void); //------------------------------------------------------------------- // Thread id: `_mi_prim_thread_id()` diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d883ec52..e10786a0 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -399,7 +399,7 @@ struct mi_heap_s { size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. 
mi_heap_t* next; // list of heaps per thread - bool no_reclaim; // `true` if this heap should not reclaim abandoned pages + bool allow_page_reclaim; // `true` if this heap can reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types #if MI_GUARDED @@ -568,14 +568,15 @@ typedef struct mi_os_tld_s { // Thread local data struct mi_tld_s { - unsigned long long heartbeat; // monotonic heartbeat count - bool recurse; // true if deferred was called; used to prevent infinite recursion. - mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) - mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - mi_subproc_t* subproc; // sub-process this thread belongs to. - size_t tseq; // thread sequence id - mi_os_tld_t os; // os tld - mi_stats_t stats; // statistics + unsigned long long heartbeat; // monotonic heartbeat count + mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) + mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) + mi_subproc_t* subproc; // sub-process this thread belongs to. + size_t tseq; // thread sequence id + bool recurse; // true if deferred was called; used to prevent infinite recursion. + bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) + mi_os_tld_t os; // os tld + mi_stats_t stats; // statistics }; #endif diff --git a/src/arena.c b/src/arena.c index f6c0f0a3..fa7d53ed 100644 --- a/src/arena.c +++ b/src/arena.c @@ -585,21 +585,25 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); - // claimed free slices: initialize the page partly + // claimed free slices: initialize the page partly if (!memid.initially_zero) { + mi_track_mem_undefined(page, slice_count * MI_ARENA_SLICE_SIZE); _mi_memzero_aligned(page, sizeof(*page)); } - #if MI_DEBUG > 1 else { + mi_track_mem_defined(page, slice_count * MI_ARENA_SLICE_SIZE); + } + #if MI_DEBUG > 1 + if (memid.initially_zero) { if (!mi_mem_is_zero(page, mi_size_of_slices(slice_count))) { - _mi_error_message(EFAULT, "page memory was not zero initialized!\n"); + _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n"); memid.initially_zero = false; _mi_memzero_aligned(page, sizeof(*page)); } } #endif if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) { - _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small\n"); + _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small.\n"); }; const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); diff --git a/src/bitmap.h b/src/bitmap.h index 7d6d8f97..40c4df42 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. 
A copy of the license can be found in the file "LICENSE" at the root of this distribution. diff --git a/src/free.c b/src/free.c index ece55599..d45507e7 100644 --- a/src/free.c +++ b/src/free.c @@ -230,7 +230,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); if ((tagheap != NULL) && // don't reclaim across heap object types - (!tagheap->no_reclaim) && // we are allowed to reclaim abandoned pages + (tagheap->allow_page_reclaim) && // we are allowed to reclaim abandoned pages (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) ) diff --git a/src/heap.c b/src/heap.c index d687f25e..3bf8b976 100644 --- a/src/heap.c +++ b/src/heap.c @@ -128,7 +128,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) #else collect >= MI_FORCE #endif - && is_main_thread && mi_heap_is_backing(heap) && !heap->no_reclaim) + && is_main_thread && mi_heap_is_backing(heap) && heap->allow_page_reclaim) { // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. // if all memory is freed by now, all segments should be freed. @@ -192,23 +192,14 @@ void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool heap->tld = tld; heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; - heap->no_reclaim = noreclaim; + heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->tag = tag; - - #if defined(WIN32) && (MI_ARCH_X64 || MI_ARCH_X86) - // disallow reclaim for threads running in the windows threadpool - const DWORD winVersion = GetVersion(); - const DWORD winMajorVersion = (DWORD)(LOBYTE(LOWORD(winVersion))); - if (winMajorVersion >= 6) { - _TEB* const teb = NtCurrentTeb(); - void* const poolData = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778))); - if (poolData != NULL) { - heap->no_reclaim = true; - } + if (tld->is_in_threadpool) { + // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. 
+ // (but abandoning is good in this case) + heap->allow_page_reclaim = false; } - #endif - if (heap == tld->heap_backing) { _mi_random_init(&heap->random); } @@ -364,7 +355,8 @@ static bool mi_cdecl mi_heap_track_block_free(const mi_heap_t* heap, const mi_he void mi_heap_destroy(mi_heap_t* heap) { mi_assert(heap != NULL); mi_assert(mi_heap_is_initialized(heap)); - mi_assert(heap->no_reclaim); + mi_assert(!heap->allow_page_reclaim); + mi_assert(!heap->allow_page_abandon); mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; #if MI_GUARDED @@ -372,9 +364,9 @@ void mi_heap_destroy(mi_heap_t* heap) { mi_heap_delete(heap); return; #else - if (!heap->no_reclaim) { + if (heap->allow_page_reclaim) { _mi_warning_message("'mi_heap_destroy' called but ignored as the heap was not created with 'allow_destroy' (heap at %p)\n", heap); - // don't free in case it may contain reclaimed pages + // don't free in case it may contain reclaimed pages, mi_heap_delete(heap); } else { @@ -395,7 +387,7 @@ void _mi_heap_unsafe_destroy_all(void) { mi_heap_t* curr = bheap->tld->heaps; while (curr != NULL) { mi_heap_t* next = curr->next; - if (curr->no_reclaim) { + if (!curr->allow_page_reclaim) { mi_heap_destroy(curr); } else { diff --git a/src/init.c b/src/init.c index 4fbd50ed..b66efc69 100644 --- a/src/init.c +++ b/src/init.c @@ -131,12 +131,14 @@ extern mi_heap_t _mi_heap_main; static mi_decl_cache_align mi_subproc_t mi_subproc_default; static mi_decl_cache_align mi_tld_t tld_main = { - 0, false, + 0, &_mi_heap_main, &_mi_heap_main, - &mi_subproc_default, // subproc - 0, // tseq - { 0, &tld_main.stats }, // os - { MI_STATS_NULL } // stats + &mi_subproc_default, // subproc + 0, // tseq + false, // recurse + false, // is_in_threadpool + { 0, &tld_main.stats }, // os + { MI_STATS_NULL } // stats }; mi_decl_cache_align mi_heap_t _mi_heap_main = { @@ -150,8 +152,8 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next heap - false, // can reclaim - true, // eager abandon + true, // allow page reclaim + true, // allow page abandon 0, // tag #if MI_GUARDED 0, 0, 0, 0, 0, @@ -402,6 +404,7 @@ void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { tld->subproc = &mi_subproc_default; tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->os.stats = &tld->stats; + tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); } // Free the thread local default heap (called from `mi_thread_done`) diff --git a/src/prim/emscripten/prim.c b/src/prim/emscripten/prim.c index 82147de7..d3dcca93 100644 --- a/src/prim/emscripten/prim.c +++ b/src/prim/emscripten/prim.c @@ -239,6 +239,9 @@ void _mi_prim_thread_done_auto_done(void) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); - } #endif + +bool _mi_prim_thread_is_in_threadpool(void) { + return false; +} diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 5a4440c3..e1ca3964 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -886,3 +886,7 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { } #endif + +bool _mi_prim_thread_is_in_threadpool(void) { + return false; +} diff --git a/src/prim/wasi/prim.c b/src/prim/wasi/prim.c index e1e7de5e..def09985 100644 --- a/src/prim/wasi/prim.c +++ b/src/prim/wasi/prim.c @@ -277,3 +277,7 @@ void _mi_prim_thread_done_auto_done(void) { void _mi_prim_thread_associate_default_heap(mi_heap_t* heap) { MI_UNUSED(heap); } + +bool _mi_prim_thread_is_in_threadpool(void) { + return 
false; +} diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 276da85c..80522f47 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -17,6 +17,9 @@ terms of the MIT license. A copy of the license can be found in the file // Dynamically bind Windows API points for portability //--------------------------------------------- +static DWORD win_major_version = 6; +static DWORD win_minor_version = 0; + // We use VirtualAlloc2 for aligned allocation, but it is only supported on Windows 10 and Windows Server 2016. // So, we need to look it up dynamically to run on older systems. (use __stdcall for 32-bit compatibility) // NtAllocateVirtualAllocEx is used for huge OS page allocation (1GiB) @@ -115,6 +118,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) config->has_overcommit = false; config->has_partial_free = false; config->has_virtual_reserve = true; + // windows version + const DWORD win_version = GetVersion(); + win_major_version = (DWORD)(LOBYTE(LOWORD(win_version))); + win_minor_version = (DWORD)(HIBYTE(LOWORD(win_version))); // get the page size SYSTEM_INFO si; GetSystemInfo(&si); @@ -134,7 +141,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) { config->physical_memory = (size_t)(memInKiB * MI_KiB); } - } + } // get the VirtualAlloc2 function HINSTANCE hDll; hDll = LoadLibrary(TEXT("kernelbase.dll")); @@ -809,4 +816,18 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { void _mi_allocator_done(void) { mi_allocator_done(); } -#endif \ No newline at end of file +#endif + + +bool _mi_prim_thread_is_in_threadpool(void) { + #if (MI_ARCH_X64 || MI_ARCH_X86) + if (win_major_version >= 6) { + // check if this thread belongs to a windows threadpool + // see: + _TEB* const teb = NtCurrentTeb(); + void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778))); + return (pool_data != NULL); + } + #endif + return false; +} diff --git a/test/test-stress.c b/test/test-stress.c index 19edf2b5..915c953f 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -347,8 +347,8 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,true,false); #endif - mi_collect(true); - mi_debug_show_arenas(true, true, false); + // mi_collect(true); + // mi_debug_show_arenas(true, true, false); // mi_stats_print(NULL); #else mi_stats_print(NULL); // so we see rss/commit/elapsed From 5a06d2aeba381d47371fcb3189cf24b9ceda2865 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 09:03:25 -0800 Subject: [PATCH 046/264] update bit primitives --- include/mimalloc/bits.h | 200 +++++++++++----------------------------- src/libc.c | 75 +++++++++++++-- src/os.c | 2 +- 3 files changed, 122 insertions(+), 155 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index e1951cf7..3afac04d 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -36,6 +36,12 @@ terms of the MIT license. 
A copy of the license can be found in the file #error platform pointers must be 32, 64, or 128 bits #endif +#if (INTPTR_MAX) > LONG_MAX +# define MI_PU(x) x##ULL +#else +# define MI_PU(x) x##UL +#endif + #if SIZE_MAX == UINT64_MAX # define MI_SIZE_SHIFT (3) typedef int64_t mi_ssize_t; @@ -43,15 +49,13 @@ typedef int64_t mi_ssize_t; # define MI_SIZE_SHIFT (2) typedef int32_t mi_ssize_t; #else -#error platform objects must be 32 or 64 bits +#error platform objects must be 32 or 64 bits in size #endif #if (SIZE_MAX/2) > LONG_MAX # define MI_ZU(x) x##ULL -# define MI_ZI(x) x##LL #else # define MI_ZU(x) x##UL -# define MI_ZI(x) x##L #endif #define MI_INTPTR_SIZE (1< @@ -352,30 +272,15 @@ static inline size_t mi_rotr(size_t x, size_t r) { #endif } -static inline uint32_t mi_rotr32(uint32_t x, uint32_t r) { - #if mi_has_builtin(rotateright32) - return mi_builtin(rotateright32)(x, r); - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - return _lrotr(x, (int)r); - #else - // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to - // avoid UB when `rshift==0`. See - const unsigned int rshift = (unsigned int)(r) & 31; - return ((x >> rshift) | (x << ((-rshift) & 31))); - #endif -} - static inline size_t mi_rotl(size_t x, size_t r) { #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64) return mi_builtin(rotateleft64)(x,r); #elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32) return mi_builtin(rotateleft32)(x,r); - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - #if MI_SIZE_BITS==32 - return _lrotl(x,(int)r); - #else - return _rotl64(x,(int)r); - #endif + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64) + return _rotl64(x, (int)r); + #elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32) + return _lrotl(x, (int)r); #else // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to // avoid UB when `rshift==0`. See @@ -385,5 +290,4 @@ static inline size_t mi_rotl(size_t x, size_t r) { } - #endif // MI_BITS_H diff --git a/src/libc.c b/src/libc.c index 20e9e38b..3fdbf3e7 100644 --- a/src/libc.c +++ b/src/libc.c @@ -1,5 +1,5 @@ /* ---------------------------------------------------------------------------- -Copyright (c) 2018-2023, Microsoft Research, Daan Leijen +Copyright (c) 2018-2024, Microsoft Research, Daan Leijen This is free software; you can redistribute it and/or modify it under the terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. @@ -277,10 +277,12 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) 
{ // -------------------------------------------------------- -// generic trailing and leading zero count +// generic trailing and leading zero count, and popcount // -------------------------------------------------------- -uint32_t _mi_ctz_generic32(uint32_t x) { +#if !MI_HAS_FAST_BITSCAN + +static size_t mi_ctz_generic32(uint32_t x) { // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, @@ -319,10 +321,71 @@ size_t _mi_clz_generic(size_t x) { size_t _mi_ctz_generic(size_t x) { if (x==0) return MI_SIZE_BITS; #if (MI_SIZE_BITS <= 32) - return _mi_ctz_generic32((uint32_t)x); + return mi_ctz_generic32((uint32_t)x); #else - const size_t count = _mi_ctz_generic32((uint32_t)x); + const size_t count = mi_ctz_generic32((uint32_t)x); if (count < 32) return count; - return (32 + _mi_ctz_generic32((uint32_t)(x>>32))); + return (32 + mi_ctz_generic32((uint32_t)(x>>32))); #endif } + +#endif // bit scan + +#if !MI_HAS_FAST_POPCOUNT + +#if MI_SIZE_SIZE == 4 +#define mi_mask_even_bits32 (0x55555555) +#define mi_mask_even_pairs32 (0x33333333) +#define mi_mask_even_nibbles32 (0x0F0F0F0F) + +// sum of all the bytes in `x` if it is guaranteed that the sum < 256! +static size_t mi_byte_sum32(uint32_t x) { + // perform `x * 0x01010101`: the highest byte contains the sum of all bytes. + x += (x << 8); + x += (x << 16); + return (size_t)(x >> 24); +} + +static size_t mi_popcount_generic32(uint32_t x) { + // first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10 + // in other words, `a - (a>>1)`; to do this in parallel, we need to mask to prevent spilling a bit pair + // into the lower bit-pair: + x = x - ((x >> 1) & mi_mask_even_bits32); + // add the 2-bit pair results + x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32); + // add the 4-bit nibble results + x = (x + (x >> 4)) & mi_mask_even_nibbles32; + // each byte now has a count of its bits, we can sum them now: + return mi_byte_sum32(x); +} + +size_t _mi_popcount_generic(size_t x) { + return mi_popcount_generic32(x); +} + +#else +#define mi_mask_even_bits64 (0x5555555555555555) +#define mi_mask_even_pairs64 (0x3333333333333333) +#define mi_mask_even_nibbles64 (0x0F0F0F0F0F0F0F0F) + +// sum of all the bytes in `x` if it is guaranteed that the sum < 256! 
+static size_t mi_byte_sum64(uint64_t x) { + x += (x << 8); + x += (x << 16); + x += (x << 32); + return (size_t)(x >> 56); +} + +static size_t mi_popcount_generic64(uint64_t x) { + x = x - ((x >> 1) & mi_mask_even_bits64); + x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64); + x = (x + (x >> 4)) & mi_mask_even_nibbles64; + return mi_byte_sum64(x); +} + +size_t _mi_popcount_generic(size_t x) { + return mi_popcount_generic64(x); +} +#endif + +#endif // popcount diff --git a/src/os.c b/src/os.c index b05068fd..0c020302 100644 --- a/src/os.c +++ b/src/os.c @@ -175,7 +175,7 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm MI_UNUSED(tld_stats); mi_stats_t* stats = &_mi_stats_main; - mi_stat_counter_increase(stats->mmap_calls, 1); + _mi_stat_counter_increase(&stats->mmap_calls, 1); if (p != NULL) { _mi_stat_increase(&stats->reserved, size); if (commit) { From 2ed6e03d276dc90072236a644e22aea87b108180 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 09:14:16 -0800 Subject: [PATCH 047/264] update optimization on haswell --- CMakeLists.txt | 25 ++++++++++++++++--------- 1 file changed, 16 insertions(+), 9 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 58895d56..52bb60b4 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -14,7 +14,7 @@ option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhea option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF) option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) -option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for arm64: '-march=armv8.1-a' (2016))" ON) +option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON) option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) @@ -388,21 +388,28 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM list(APPEND mi_cflags -ftls-model=initial-exec) endif() endif() +endif() + +if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel") if(MI_OVERRIDE) list(APPEND mi_cflags -fno-builtin-malloc) endif() if(MI_OPT_ARCH) - if(MI_ARCH STREQUAL "arm64") - set(MI_ARCH_OPT_FLAGS "-march=armv8.1-a") # fast atomics, since ~ 2016 + if(MI_ARCH STREQUAL "x64") + set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2") # fast bit scan (since 2013) + elseif(MI_ARCH STREQUAL "arm64") + set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics (since 2016) endif() endif() endif() -if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) +if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) # vs2017+ list(APPEND mi_cflags /Zc:__cplusplus) if(MI_OPT_ARCH) - if(MI_ARCH STREQUAL "arm64") - set(MI_OPT_ARCH_FLAGS "/arch:armv8.1") # fast atomics, since ~ 2016 + if(MI_ARCH STREQUAL "x64") + set(MI_OPT_ARCH_FLAGS "/arch:AVX2") # fast bit scan (since 2013) + elseif(MI_ARCH STREQUAL "arm64") + set(MI_OPT_ARCH_FLAGS "/arch:armv8.1") # fast atomics (since 2016) endif() endif() endif() @@ -411,9 +418,9 @@ if(MINGW) add_definitions(-D_WIN32_WINNT=0x600) endif() -if(MI_ARCH_OPT_FLAGS) - list(APPEND mi_cflags ${MI_ARCH_OPT_FLAGS}) - message(STATUS "Architecture specific 
optimization is enabled (with ${MI_ARCH_OPT_FLAGS}) (MI_OPT_ARCH=ON)") +if(MI_OPT_ARCH_FLAGS) + list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS}) + message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)") endif() # extra needed libraries From 67cc424ada05652c22417edef72bfe1a227ec309 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 09:19:05 -0800 Subject: [PATCH 048/264] delete old files --- src/arena-abandon.c | 357 ----------- src/arena-old.c | 988 ------------------------------ src/arena-page.c | 20 - src/bitmap-old.c | 419 ------------- src/bitmap-old.h | 110 ---- src/page.c | 53 -- src/segment-map.c | 126 ---- src/segment.c | 1387 ------------------------------------------- 8 files changed, 3460 deletions(-) delete mode 100644 src/arena-abandon.c delete mode 100644 src/arena-old.c delete mode 100644 src/arena-page.c delete mode 100644 src/bitmap-old.c delete mode 100644 src/bitmap-old.h delete mode 100644 src/segment-map.c delete mode 100644 src/segment.c diff --git a/src/arena-abandon.c b/src/arena-abandon.c deleted file mode 100644 index 14712886..00000000 --- a/src/arena-abandon.c +++ /dev/null @@ -1,357 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -#if !defined(MI_IN_ARENA_C) -#error "this file should be included from 'arena.c' (so mi_arena_t is visible)" -// add includes help an IDE -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "bitmap.h" -#endif - -// Minimal exports for arena-abandoned. -size_t mi_arena_id_index(mi_arena_id_t id); -mi_arena_t* mi_arena_from_index(size_t idx); -size_t mi_arena_get_count(void); -void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex); -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index); - -/* ----------------------------------------------------------- - Abandoned blocks/segments: - - _mi_arena_segment_clear_abandoned - _mi_arena_segment_mark_abandoned - - This is used to atomically abandon/reclaim segments - (and crosses the arena API but it is convenient to have here). - - Abandoned segments still have live blocks; they get reclaimed - when a thread frees a block in it, or when a thread needs a fresh - segment. - - Abandoned segments are atomically marked in the `block_abandoned` - bitmap of arenas. Any segments allocated outside arenas are put - in the sub-process `abandoned_os_list`. This list is accessed - using locks but this should be uncommon and generally uncontended. - Reclaim and visiting either scan through the `block_abandoned` - bitmaps of the arena's, or visit the `abandoned_os_list` - - A potentially nicer design is to use arena's for everything - and perhaps have virtual arena's to map OS allocated memory - but this would lack the "density" of our current arena's. TBC. ------------------------------------------------------------ */ - - -// reclaim a specific OS abandoned segment; `true` on success. -// sets the thread_id. 
-static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena, remove from list of abandoned os segments - mi_subproc_t* const subproc = segment->subproc; - if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) { - return false; // failed to acquire the lock, we just give up - } - // remove atomically from the abandoned os list (if possible!) - bool reclaimed = false; - mi_segment_t* const next = segment->abandoned_os_next; - mi_segment_t* const prev = segment->abandoned_os_prev; - if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) { - #if MI_DEBUG>3 - // find ourselves in the abandoned list (and check the count) - bool found = false; - size_t count = 0; - for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) { - if (current == segment) { found = true; } - count++; - } - mi_assert_internal(found); - mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count)); - #endif - // remove (atomically) from the list and reclaim - if (prev != NULL) { prev->abandoned_os_next = next; } - else { subproc->abandoned_os_list = next; } - if (next != NULL) { next->abandoned_os_prev = prev; } - else { subproc->abandoned_os_list_tail = prev; } - segment->abandoned_os_next = NULL; - segment->abandoned_os_prev = NULL; - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count); - if (take_lock) { // don't reset the thread_id when iterating - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - reclaimed = true; - } - if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); } - return reclaimed; -} - -// reclaim a specific abandoned segment; `true` on success. -// sets the thread_id. -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) { - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */); - } - // arena segment: use the blocks_abandoned bitmap. - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // reclaim atomically - bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - if (was_marked) { - mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0); - mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - // mi_assert_internal(was_marked); - mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return was_marked; -} - - -// mark a specific OS segment as abandoned -static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena; we use a list of abandoned segments - mi_subproc_t* const subproc = segment->subproc; - if (!mi_lock_acquire(&subproc->abandoned_os_lock)) { - _mi_error_message(EFAULT, "internal error: failed to acquire the abandoned (os) segment lock to mark abandonment"); - // we can continue but cannot visit/reclaim such blocks.. 
- } - else { - // push on the tail of the list (important for the visitor) - mi_segment_t* prev = subproc->abandoned_os_list_tail; - mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL); - mi_assert_internal(segment->abandoned_os_prev == NULL); - mi_assert_internal(segment->abandoned_os_next == NULL); - if (prev != NULL) { prev->abandoned_os_next = segment; } - else { subproc->abandoned_os_list = segment; } - subproc->abandoned_os_list_tail = segment; - segment->abandoned_os_prev = prev; - segment->abandoned_os_next = NULL; - mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count); - mi_atomic_increment_relaxed(&subproc->abandoned_count); - // and release the lock - mi_lock_release(&subproc->abandoned_os_lock); - } - return; -} - -// mark a specific segment as abandoned -// clears the thread_id. -void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) -{ - mi_assert_internal(segment->used == segment->abandoned); - mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - mi_arena_segment_os_mark_abandoned(segment); - return; - } - // segment is in an arena, mark it in the arena `blocks_abandoned` bitmap - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // set abandonment atomically - mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned - const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); } - mi_assert_internal(was_unmarked); - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); -} - - -/* ----------------------------------------------------------- - Iterate through the abandoned blocks/segments using a cursor. - This is used for reclaiming and abandoned block visiting. ------------------------------------------------------------ */ - -// start a cursor at a randomized arena -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) { - mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); - current->bitmap_idx = 0; - current->subproc = subproc; - current->visit_all = visit_all; - current->hold_visit_lock = false; - const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count); - const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count); - const size_t max_arena = mi_arena_get_count(); - if (heap != NULL && heap->arena_id != _mi_arena_id_none()) { - // for a heap that is bound to one arena, only visit that arena - current->start = mi_arena_id_index(heap->arena_id); - current->end = current->start + 1; - current->os_list_count = 0; - } - else { - // otherwise visit all starting at a random location - if (abandoned_count > abandoned_list_count && max_arena > 0) { - current->start = (heap == NULL || max_arena == 0 ? 
0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); - current->end = current->start + max_arena; - } - else { - current->start = 0; - current->end = 0; - } - current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list - } - mi_assert_internal(current->start <= max_arena); -} - -void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) { - if (current->hold_visit_lock) { - mi_lock_release(¤t->subproc->abandoned_os_visit_lock); - current->hold_visit_lock = false; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { - // try to reclaim an abandoned segment in the arena atomically - if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL; - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - // check that the segment belongs to our sub-process - // note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled. - // without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process. - // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock. - if (segment->subproc != subproc) { - // it is from another sub-process, re-mark it and continue searching - const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - mi_assert_internal(was_zero); MI_UNUSED(was_zero); - return NULL; - } - else { - // success, we unabandoned a segment in our sub-process - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - return segment; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) { - const size_t max_arena = mi_arena_get_count(); - size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); - size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx); - // visit arena's (from the previous cursor) - for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) { - // index wraps around - size_t arena_idx = (previous->start >= max_arena ? previous->start % max_arena : previous->start); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - if (arena != NULL) { - bool has_lock = false; - // visit the abandoned fields (starting at previous_idx) - for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) { - size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); - if mi_unlikely(field != 0) { // skip zero fields quickly - // we only take the arena lock if there are actually abandoned segments present - if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { - has_lock = (previous->visit_all ? mi_lock_acquire(&arena->abandoned_visit_lock) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); - if (!has_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); - } - // skip to next arena - break; - } - } - mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned)); - // visit each set bit in the field (todo: maybe use `ctz` here?) 
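
/*
  The "todo: maybe use `ctz` here?" above refers to a standard trick: instead of
  testing every bit position in the field, repeatedly take the index of the
  lowest set bit with a count-trailing-zeros intrinsic and then clear that bit
  with `x &= x - 1`. A standalone sketch, assuming a GCC/Clang-style
  __builtin_ctzll; the `demo_` name is illustrative.
*/
#include <stdint.h>
#include <stdio.h>

static void demo_visit_set_bits(uint64_t field) {
  while (field != 0) {
    const unsigned bit = (unsigned)__builtin_ctzll(field); // index of the lowest set bit
    printf("visit bit %u\n", bit);                         // e.g. try to reclaim the segment at this bit
    field &= (field - 1);                                  // clear the lowest set bit
  }
}

int main(void) {
  demo_visit_set_bits(0x0000000000012005ULL);  // visits bits 0, 2, 13, 16
  return 0;
}
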
- for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { - // pre-check if the bit is set - size_t mask = ((size_t)1 << bit_idx); - if mi_unlikely((field & mask) == mask) { - mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); - if (segment != NULL) { - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration - return segment; - } - } - } - } - } - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - } - } - return NULL; -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) { - // go through the abandoned_os_list - // we only allow one thread per sub-process to do to visit guarded by the `abandoned_os_visit_lock`. - // The lock is released when the cursor is released. - if (!previous->hold_visit_lock) { - previous->hold_visit_lock = (previous->visit_all ? mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock) - : mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock)); - if (!previous->hold_visit_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock"); - } - return NULL; // we cannot get the lock, give up - } - } - // One list entry at a time - while (previous->os_list_count > 0) { - previous->os_list_count--; - const bool has_lock = mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free` - if (has_lock) { - mi_segment_t* segment = previous->subproc->abandoned_os_list; - // pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries) - if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) { - mi_lock_release(&previous->subproc->abandoned_os_lock); - return segment; - } - // already abandoned, try again - mi_lock_release(&previous->subproc->abandoned_os_lock); - } - else { - _mi_error_message(EFAULT, "failed to acquire abandoned OS list lock during abandoned block visit\n"); - return NULL; - } - } - // done - mi_assert_internal(previous->os_list_count == 0); - return NULL; -} - - -// reclaim abandoned segments -// this does not set the thread id (so it appears as still abandoned) -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) { - if (previous->start < previous->end) { - // walk the arena - mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous); - if (segment != NULL) { return segment; } - } - // no entries in the arena's anymore, walk the abandoned OS list - mi_assert_internal(previous->start == previous->end); - return mi_arena_segment_clear_abandoned_next_list(previous); -} - - -bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - // (unfortunately) the visit_abandoned option must be enabled from the start. 
- // This is to avoid taking locks if abandoned list visiting is not required (as for most programs) - if (!mi_option_is_enabled(mi_option_visit_abandoned)) { - _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); - return false; - } - mi_arena_field_cursor_t current;0 - _mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, ¤t); - mi_segment_t* segment; - bool ok = true; - while (ok && (segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { - ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg); - _mi_arena_segment_mark_abandoned(segment); - } - _mi_arena_field_cursor_done(¤t); - return ok; -} diff --git a/src/arena-old.c b/src/arena-old.c deleted file mode 100644 index 3f41e9c7..00000000 --- a/src/arena-old.c +++ /dev/null @@ -1,988 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -"Arenas" are fixed area's of OS memory from which we can allocate -large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB). -In contrast to the rest of mimalloc, the arenas are shared between -threads and need to be accessed using atomic operations. - -Arenas are also used to for huge OS page (1GiB) reservations or for reserving -OS memory upfront which can be improve performance or is sometimes needed -on embedded devices. We can also employ this with WASI or `sbrk` systems -to reserve large arenas upfront and be able to reuse the memory more effectively. - -The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. ------------------------------------------------------------------------------*/ - -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/atomic.h" -#include "bitmap.h" - - -/* ----------------------------------------------------------- - Arena allocation ------------------------------------------------------------ */ - -// A memory arena descriptor -typedef struct mi_arena_s { - mi_arena_id_t id; // arena id; 0 for non-specific - mi_memid_t memid; // memid of the memory area - _Atomic(uint8_t*)start; // the start of the memory area - size_t block_count; // size of the area in arena blocks (of `MI_ARENA_SLICE_SIZE`) - size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`) - size_t meta_size; // size of the arena structure itself (including its bitmaps) - mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) - int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena - bool is_large; // memory area consists of large- or huge OS pages (always committed) - mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited - _Atomic(size_t)search_idx; // optimization to start the search for free blocks - _Atomic(mi_msecs_t)purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. - mi_bitmap_field_t* blocks_dirty; // are the blocks potentially non-zero? 
- mi_bitmap_field_t* blocks_committed; // are the blocks committed? (can be NULL for memory that cannot be decommitted) - mi_bitmap_field_t* blocks_purge; // blocks that can be (reset) decommitted. (can be NULL for memory that cannot be (reset) decommitted) - mi_bitmap_field_t* blocks_abandoned; // blocks that start with an abandoned segment. (This crosses API's but it is convenient to have here) - mi_bitmap_field_t blocks_inuse[1]; // in-place bitmap of in-use blocks (of size `field_count`) - // do not add further fields here as the dirty, committed, purged, and abandoned bitmaps follow the inuse bitmap fields. -} mi_arena_t; - - -#define MI_ARENA_SLICE_SIZE (MI_SEGMENT_SIZE) // 64MiB (must be at least MI_SEGMENT_ALIGN) -#define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_SLICE_SIZE/2) // 32MiB -#define MI_MAX_ARENAS (132) // Limited as the reservation exponentially increases (and takes up .bss) - -// The available arenas -static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; -static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - -#define MI_IN_ARENA_C -#include "arena-abandon.c" -#undef MI_IN_ARENA_C - -/* ----------------------------------------------------------- - Arena id's - id = arena_index + 1 ------------------------------------------------------------ */ - -size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); -} - -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; -} - -mi_arena_id_t _mi_arena_id_none(void) { - return 0; -} - -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); -} - -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { - if (memid.memkind == MI_MEM_ARENA) { - return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); - } - else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); - } -} - -size_t mi_arena_get_count(void) { - return mi_atomic_load_relaxed(&mi_arena_count); -} - -mi_arena_t* mi_arena_from_index(size_t idx) { - mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); -} - - -/* ----------------------------------------------------------- - Arena allocations get a (currently) 16-bit memory id where the - lower 8 bits are the arena id, and the upper bits the block index. 
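
/*
  A standalone illustration of the encoding described in the comment above: a
  single word holds the arena id in the low 8 bits and the block index in the
  remaining upper bits. The field widths and `demo_` names are illustrative
  only, not the exact mimalloc memid layout.
*/
#include <assert.h>
#include <stddef.h>

#define DEMO_ARENA_ID_BITS  (8)
#define DEMO_ARENA_ID_MASK  (((size_t)1 << DEMO_ARENA_ID_BITS) - 1)

static size_t demo_memid_encode(size_t arena_id, size_t block_index) {
  assert(arena_id <= DEMO_ARENA_ID_MASK);
  return (block_index << DEMO_ARENA_ID_BITS) | arena_id;
}

static void demo_memid_decode(size_t memid, size_t* arena_id, size_t* block_index) {
  *arena_id    = (memid & DEMO_ARENA_ID_MASK);
  *block_index = (memid >> DEMO_ARENA_ID_BITS);
}

int main(void) {
  size_t id, idx;
  demo_memid_decode(demo_memid_encode(5, 1234), &id, &idx);
  assert(id == 5 && idx == 1234);  // round-trips
  return 0;
}
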
------------------------------------------------------------ */ - -static size_t mi_block_count_of_size(size_t size) { - return _mi_divide_up(size, MI_ARENA_SLICE_SIZE); -} - -static size_t mi_arena_block_size(size_t bcount) { - return (bcount * MI_ARENA_SLICE_SIZE); -} - -static size_t mi_arena_size(mi_arena_t* arena) { - return mi_arena_block_size(arena->block_count); -} - -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { - mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); - memid.mem.arena.id = id; - memid.mem.arena.block_index = bitmap_index; - memid.mem.arena.is_exclusive = is_exclusive; - return memid; -} - -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { - mi_assert_internal(memid.memkind == MI_MEM_ARENA); - *arena_index = mi_arena_id_index(memid.mem.arena.id); - *bitmap_index = memid.mem.arena.block_index; - return memid.mem.arena.is_exclusive; -} - - - -/* ----------------------------------------------------------- - Special static area for mimalloc internal structures - to avoid OS calls (for example, for the arena metadata (~= 256b)) ------------------------------------------------------------ */ - -#define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit - -static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 -static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; - -static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { - *memid = _mi_memid_none(); - if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; - const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); - if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; - - // try to claim space - if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } - const size_t oversize = size + alignment - 1; - if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; - const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); - size_t top = oldtop + oversize; - if (top > MI_ARENA_STATIC_MAX) { - // try to roll back, ok if this fails - mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); - return NULL; - } - - // success - *memid = _mi_memid_create(MI_MEM_STATIC); - memid->initially_zero = true; - const size_t start = _mi_align_up(oldtop, alignment); - uint8_t* const p = &mi_arena_static[start]; - _mi_memzero_aligned(p, size); - return p; -} - -void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { - *memid = _mi_memid_none(); - - // try static - void* p = mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); - if (p != NULL) return p; - - // or fall back to the OS - p = _mi_os_alloc(size, memid, &_mi_stats_main); - if (p == NULL) return NULL; - - // zero the OS memory if needed - if (!memid->initially_zero) { - _mi_memzero_aligned(p, size); - memid->initially_zero = true; - } - return p; -} - -void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { - if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid, &_mi_stats_main); - } - else { - mi_assert(memid.memkind == MI_MEM_STATIC); - } -} - -void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { - return (arena->start + mi_arena_block_size(mi_bitmap_index_bit(bindex))); -} - - -/* ----------------------------------------------------------- - Thread safe allocation in an arena ------------------------------------------------------------ */ - -// 
claim the `blocks_inuse` bits -static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) -{ - size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter - if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { - mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around - return true; - }; - return false; -} - - -/* ----------------------------------------------------------- - Arena Allocation ------------------------------------------------------------ */ - -static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, - bool commit, mi_memid_t* memid, mi_os_tld_t* tld) -{ - MI_UNUSED(arena_index); - mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); - - mi_bitmap_index_t bitmap_index; - if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return NULL; - - // claimed it! - void* p = mi_arena_block_start(arena, bitmap_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); - memid->is_pinned = arena->memid.is_pinned; - - // none of the claimed blocks should be scheduled for a decommit - if (arena->blocks_purge != NULL) { - // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`). - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index); - } - - // set the dirty bits (todo: no need for an atomic op here?) - if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { - memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); - } - - // set commit state - if (arena->blocks_committed == NULL) { - // always committed - memid->initially_committed = true; - } - else if (commit) { - // commit requested, but the range may not be committed as a whole: ensure it is committed now - memid->initially_committed = true; - bool any_uncommitted; - _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); - if (any_uncommitted) { - bool commit_zero = false; - if (!_mi_os_commit(p, mi_arena_block_size(needed_bcount), &commit_zero, tld->stats)) { - memid->initially_committed = false; - } - else { - if (commit_zero) { memid->initially_zero = true; } - } - } - } - else { - // no need to commit, but check if already fully committed - memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); - } - - return p; -} - -// allocate in a speficic arena -static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED_RELEASE(alignment); - mi_assert(alignment <= MI_SEGMENT_ALIGN); - const size_t bcount = mi_block_count_of_size(size); - const size_t arena_index = mi_arena_id_index(arena_id); - mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); - mi_assert_internal(size <= mi_arena_block_size(bcount)); - - // Check arena suitability - mi_arena_t* arena = mi_arena_from_index(arena_index); - if (arena == NULL) return NULL; - if 
(!allow_large && arena->is_large) return NULL; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; - if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity - const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); - if (match_numa_node) { if (!numa_suitable) return NULL; } - else { if (numa_suitable) return NULL; } - } - - // try to allocate - void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld); - mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); - return p; -} - - -// allocate from an arena with fallback to the OS -static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) -{ - MI_UNUSED(alignment); - mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - if mi_likely(max_arena == 0) return NULL; - - if (req_arena_id != _mi_arena_id_none()) { - // try a specific arena if requested - if (mi_arena_id_index(req_arena_id) < max_arena) { - void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - else { - // try numa affine allocation - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - - // try from another numa node instead.. - if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already - for (size_t i = 0; i < max_arena; i++) { - void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - } - return NULL; -} - -// try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) -{ - if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); - if (arena_count > (MI_MAX_ARENAS - 4)) return false; - - size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); - if (arena_reserve == 0) return false; - - if (!_mi_os_has_virtual_reserve()) { - arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) - } - arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); - arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); - if (arena_count >= 8 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); - size_t reserve = 0; - if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { - arena_reserve = reserve; - } - } - if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size - - // commit eagerly? 
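
/*
  A standalone sketch of the reservation scaling used in `mi_arena_reserve`
  above: every 8 arenas the reserve size doubles (with the exponent capped),
  guarded by a plain multiplication-overflow check. The `demo_` names are
  illustrative only.
*/
#include <stddef.h>
#include <stdint.h>

static size_t demo_clamp(size_t v, size_t lo, size_t hi) {
  return (v < lo ? lo : (v > hi ? hi : v));
}

// returns 0 if the scaled size would overflow a size_t
static size_t demo_scaled_reserve(size_t base_reserve, size_t arena_count) {
  size_t reserve = base_reserve;
  if (arena_count >= 8 && arena_count <= 128) {
    const size_t multiplier = (size_t)1 << demo_clamp(arena_count / 8, 0, 16);
    if (base_reserve != 0 && multiplier > SIZE_MAX / base_reserve) return 0;  // would overflow
    reserve = multiplier * base_reserve;
  }
  return reserve;
}
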
- bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } - else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); -} - - -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - mi_assert_internal(memid != NULL && tld != NULL); - mi_assert_internal(size > 0); - *memid = _mi_memid_none(); - - const int numa_node = _mi_os_numa_node(tld); // current numa node - - // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? - if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { - void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - - // otherwise, try to first eagerly reserve a new arena - if (req_arena_id == _mi_arena_id_none()) { - mi_arena_id_t arena_id = 0; - if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { - // and try allocate in there - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); - if (p != NULL) return p; - } - } - } - } - - // if we cannot use OS allocation, return NULL - if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { - errno = ENOMEM; - return NULL; - } - - // finally, fall back to the OS - if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); - } - else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); - } -} - -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) -{ - return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); -} - - -void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { - if (size != NULL) *size = 0; - size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); - if (arena == NULL) return NULL; - if (size != NULL) { *size = mi_arena_block_size(arena->block_count); } - return arena->start; -} - - -/* ----------------------------------------------------------- - Arena purge ------------------------------------------------------------ */ - -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} - -// reset or decommit in an arena and update the committed/decommit bitmaps -// assumes we own the area (i.e. 
blocks_in_use is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_arena_block_size(blocks); - void* const p = mi_arena_block_start(arena, bitmap_idx); - bool needs_recommit; - if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { - // all blocks are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); - } - else { - // some blocks are not committed -- this can happen when a partially committed block is freed - // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), - // and also undo the decommit stats (as it was already adjusted) - mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } - } - - // clear the purged blocks - _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); - // update committed bitmap - if (needs_recommit) { - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); - } -} - -// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. -// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { - mi_assert_internal(arena->blocks_purge != NULL); - const long delay = mi_arena_purge_delay(); - if (delay < 0) return; // is purging allowed at all? - - if (_mi_preloading() || delay == 0) { - // decommit directly - mi_arena_purge(arena, bitmap_idx, blocks, stats); - } - else { - // schedule decommit - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire != 0) { - mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay - } - else { - mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); - } - _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); - } -} - -// purge a range of blocks -// return true if the full range was purged. -// assumes we own the area (i.e. 
blocks_in_use is claimed by us) -static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startidx, size_t bitlen, size_t purge, mi_stats_t* stats) { - const size_t endidx = startidx + bitlen; - size_t bitidx = startidx; - bool all_purged = false; - while (bitidx < endidx) { - // count consecutive ones in the purge mask - size_t count = 0; - while (bitidx + count < endidx && (purge & ((size_t)1 << (bitidx + count))) != 0) { - count++; - } - if (count > 0) { - // found range to be purged - const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitidx); - mi_arena_purge(arena, range_idx, count, stats); - if (count == bitlen) { - all_purged = true; - } - } - bitidx += (count+1); // +1 to skip the zero bit (or end) - } - return all_purged; -} - -// returns true if anything was purged -static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) -{ - if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire == 0) return false; - if (!force && expire > now) return false; - - // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); - - // potential purges scheduled, walk through the bitmap - bool any_purged = false; - bool full_purge = true; - for (size_t i = 0; i < arena->field_count; i++) { - size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); - if (purge != 0) { - size_t bitidx = 0; - while (bitidx < MI_BITMAP_FIELD_BITS) { - // find consecutive range of ones in the purge mask - size_t bitlen = 0; - while (bitidx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitidx + bitlen))) != 0) { - bitlen++; - } - // temporarily claim the purge range as "in-use" to be thread-safe with allocation - // try to claim the longest range of corresponding in_use bits - const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitidx); - while( bitlen > 0 ) { - if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { - break; - } - bitlen--; - } - // actual claimed bits at `in_use` - if (bitlen > 0) { - // read purge again now that we have the in_use bits - purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); - if (!mi_arena_purge_range(arena, i, bitidx, bitlen, purge, stats)) { - full_purge = false; - } - any_purged = true; - // release the claimed `in_use` bits again - _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); - } - bitidx += (bitlen+1); // +1 to skip the zero (or end) - } // while bitidx - } // purge != 0 - } - // if not fully purged, make sure to purge again in the future - if (!full_purge) { - const long delay = mi_arena_purge_delay(); - mi_msecs_t expected = 0; - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); - } - return any_purged; -} - -static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { - if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - - const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); - if (max_arena == 0) return; - - // allow only one thread to purge at a time - static mi_atomic_guard_t purge_guard; - mi_atomic_guard(&purge_guard) - { - mi_msecs_t now = _mi_clock_now(); - size_t max_purge_count = (visit_all ? 
max_arena : 1); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - if (mi_arena_try_purge(arena, now, force, stats)) { - if (max_purge_count <= 1) break; - max_purge_count--; - } - } - } - } -} - - -/* ----------------------------------------------------------- - Arena free ------------------------------------------------------------ */ - -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { - mi_assert_internal(size > 0 && stats != NULL); - mi_assert_internal(committed_size <= size); - if (p==NULL) return; - if (size==0) return; - const bool all_committed = (committed_size == size); - - // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) - mi_track_mem_undefined(p,size); - - if (mi_memkind_is_os(memid.memkind)) { - // was a direct OS allocation, pass through - if (!all_committed && committed_size > 0) { - // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - _mi_os_free(p, size, memid, stats); - } - else if (memid.memkind == MI_MEM_ARENA) { - // allocated in an arena - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); - mi_assert_internal(arena_idx < MI_MAX_ARENAS); - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); - mi_assert_internal(arena != NULL); - const size_t blocks = mi_block_count_of_size(size); - - // checks - if (arena == NULL) { - _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); - if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { - _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); - return; - } - - // potentially decommit - if (arena->memid.is_pinned || arena->blocks_committed == NULL) { - mi_assert_internal(all_committed); - } - else { - mi_assert_internal(arena->blocks_committed != NULL); - mi_assert_internal(arena->blocks_purge != NULL); - - if (!all_committed) { - // mark the entire range as no longer committed (so we recommit the full range when re-using) - _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); - mi_track_mem_noaccess(p,size); - if (committed_size > 0) { - // if partially committed, adjust the committed stats (is it will be recommitted when re-using) - // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - // note: if not all committed, it may be that the purge will reset/decommit the entire range - // that contains already decommitted parts. Since purge consistently uses reset or decommit that - // works (as we should never reset decommitted parts). 
- } - // (delay) purge the entire range - mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); - } - - // and make it available to others again - bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); - if (!all_inuse) { - _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); - return; - }; - } - else { - // arena was none, external, or static; nothing to do - mi_assert_internal(memid.memkind < MI_MEM_OS); - } - - // purge expired decommits - mi_arenas_try_purge(false, false, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. -static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - size_t new_max_arena = 0; - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL) { - mi_lock_done(&arena->abandoned_visit_lock); - if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); - } - else { - new_max_arena = i; - } - _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); - } - } - - // try to lower the max arena. - size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); -} - -// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); -} - -// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` -// for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { - mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas -} - -// Is a pointer inside any of our arenas? -bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_arena_block_size(arena->block_count) > (const uint8_t*)p) { - return true; - } - } - return false; -} - -/* ----------------------------------------------------------- - Add an arena. 
------------------------------------------------------------ */ - -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { - mi_assert_internal(arena != NULL); - mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); - mi_assert_internal(arena->block_count > 0); - if (arena_id != NULL) { *arena_id = -1; } - - size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); - if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); - return false; - } - _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } - return true; -} - -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept -{ - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - if (size < MI_ARENA_SLICE_SIZE) return false; - - if (is_large) { - mi_assert_internal(memid.initially_committed && memid.is_pinned); - } - - const size_t bcount = size / MI_ARENA_SLICE_SIZE; - const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); - const size_t bitmaps = (memid.is_pinned ? 3 : 5); - const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); - mi_memid_t meta_memid; - mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); - if (arena == NULL) return false; - - // already zero'd due to zalloc - // _mi_memzero(arena, asize); - arena->id = _mi_arena_id_none(); - arena->memid = memid; - arena->exclusive = exclusive; - arena->meta_size = asize; - arena->meta_memid = meta_memid; - arena->block_count = bcount; - arena->field_count = fields; - arena->start = (uint8_t*)start; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) - arena->is_large = is_large; - arena->purge_expire = 0; - arena->search_idx = 0; - mi_lock_init(&arena->abandoned_visit_lock); - // consecutive bitmaps - arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap - arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap - arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap - arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap - // initialize committed bitmap? 
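
/*
  A standalone sketch of the "consecutive bitmaps" layout used just above: one
  allocation holds the arena descriptor followed by several bitmaps of `fields`
  words each, and the individual bitmap pointers are simply offsets into that
  single block. The `demo_` names are illustrative only.
*/
#include <stdlib.h>
#include <stddef.h>

typedef size_t demo_bitmap_field_t;

typedef struct demo_arena_s {
  size_t               field_count;   // words per bitmap
  demo_bitmap_field_t* dirty;         // just after the in-use bitmap
  demo_bitmap_field_t* committed;     // just after the dirty bitmap
  demo_bitmap_field_t  inuse[1];      // in-place first bitmap; the others follow it
} demo_arena_t;

static demo_arena_t* demo_arena_create(size_t fields) {
  const size_t bitmaps = 3;  // in-use, dirty, committed
  const size_t asize = sizeof(demo_arena_t) + (bitmaps * fields * sizeof(demo_bitmap_field_t));
  demo_arena_t* arena = (demo_arena_t*)calloc(1, asize);   // zeroed, like the meta zalloc
  if (arena == NULL) return NULL;
  arena->field_count = fields;
  arena->dirty     = &arena->inuse[fields];      // bitmap #2
  arena->committed = &arena->inuse[2 * fields];  // bitmap #3
  return arena;
}
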
- if (arena->blocks_committed != NULL && arena->memid.initially_committed) { - memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning - } - - // and claim leftover blocks if needed (so we never allocate there) - ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; - mi_assert_internal(post >= 0); - if (post > 0) { - // don't use leftover bits at the end - mi_bitmap_index_t postidx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); - _mi_bitmap_claim(arena->blocks_inuse, fields, post, postidx, NULL); - } - return mi_arena_add(arena, arena_id, &_mi_stats_main); - -} - -bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); - memid.initially_committed = is_committed; - memid.initially_zero = is_zero; - memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = _mi_arena_id_none(); - size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one block - mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); - if (start == NULL) return ENOMEM; - const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); - _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); - return ENOMEM; - } - _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); - return 0; -} - - -// Manage a range of regular OS memory -bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { - return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); -} - -// Reserve a range of regular OS memory -int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { - return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); -} - - -/* ----------------------------------------------------------- - Debugging ------------------------------------------------------------ */ - -static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { - _mi_verbose_message("%s%s:\n", prefix, header); - size_t bcount = 0; - size_t inuse_count = 0; - for (size_t i = 0; i < field_count; i++) { - char buf[MI_BITMAP_FIELD_BITS + 1]; - uintptr_t field = mi_atomic_load_relaxed(&fields[i]); - for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { - if (bcount < block_count) { - bool inuse = ((((uintptr_t)1 << bit) & field) != 0); - if (inuse) inuse_count++; - buf[bit] = (inuse ? 
'x' : '.'); - } - else { - buf[bit] = ' '; - } - } - buf[MI_BITMAP_FIELD_BITS] = 0; - _mi_verbose_message("%s %s\n", prefix, buf); - } - _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); - return inuse_count; -} - -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); - size_t inuse_total = 0; - size_t abandoned_total = 0; - size_t purge_total = 0; - for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); - if (arena == NULL) break; - _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_SLICE_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); - if (show_inuse) { - inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); - } - if (arena->blocks_committed != NULL) { - mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); - } - if (show_abandoned) { - abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); - } - if (show_purge && arena->blocks_purge != NULL) { - purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); - } - } - if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); - if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); - if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); -} - - -/* ----------------------------------------------------------- - Reserve a huge page arena. ------------------------------------------------------------ */ -// reserve at a specific numa node -int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; - if (pages==0) return 0; - if (numa_node < -1) numa_node = -1; - if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); - size_t hsize = 0; - size_t pages_reserved = 0; - mi_memid_t memid; - void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); - if (p==NULL || pages_reserved==0) { - _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); - return ENOMEM; - } - _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); - return ENOMEM; - } - return 0; -} - -int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { - return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); -} - -// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) -int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { - if (pages == 0) return 0; - - // pages per numa node - size_t numa_count = (numa_nodes > 0 ? 
numa_nodes : _mi_os_numa_node_count()); - if (numa_count <= 0) numa_count = 1; - const size_t pages_per = pages / numa_count; - const size_t pages_mod = pages % numa_count; - const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); - - // reserve evenly among numa nodes - for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { - size_t node_pages = pages_per; // can be 0 - if (numa_node < pages_mod) node_pages++; - int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); - if (err) return err; - if (pages < node_pages) { - pages = 0; - } - else { - pages -= node_pages; - } - } - - return 0; -} - -int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { - MI_UNUSED(max_secs); - _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); - if (pages_reserved != NULL) *pages_reserved = 0; - int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); - if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; - return err; -} - - diff --git a/src/arena-page.c b/src/arena-page.c deleted file mode 100644 index 93d25dbf..00000000 --- a/src/arena-page.c +++ /dev/null @@ -1,20 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- - ------------------------------------------------------------------------------*/ - -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "bitmap.h" - - -/* ----------------------------------------------------------- - Arena allocation ------------------------------------------------------------ */ - diff --git a/src/bitmap-old.c b/src/bitmap-old.c deleted file mode 100644 index 3e6311dc..00000000 --- a/src/bitmap-old.c +++ /dev/null @@ -1,419 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. 
(This is used in arena allocation) ----------------------------------------------------------------------------- */ - -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/bits.h" -#include "bitmap.h" - -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ - -// The bit mask for a given number of blocks at a specified bit index. -static inline size_t mi_bitmap_mask_(size_t count, size_t bitidx) { - mi_assert_internal(count + bitidx <= MI_BITMAP_FIELD_BITS); - mi_assert_internal(count > 0); - if (count >= MI_BITMAP_FIELD_BITS) return MI_BITMAP_FIELD_FULL; - if (count == 0) return 0; - return ((((size_t)1 << count) - 1) << bitidx); -} - - - -/* ----------------------------------------------------------- - Claim a bit sequence atomically ------------------------------------------------------------ */ - -// Try to atomically claim a sequence of `count` bits in a single -// field at `idx` in `bitmap`. Returns `true` on success. -bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx) -{ - mi_assert_internal(bitmap_idx != NULL); - mi_assert_internal(count <= MI_BITMAP_FIELD_BITS); - mi_bitmap_field_t* field = &bitmap[idx]; - size_t map = mi_atomic_load_relaxed(field); - if (map==MI_BITMAP_FIELD_FULL) return false; // short cut - - // search for 0-bit sequence of length count - const size_t mask = mi_bitmap_mask_(count, 0); - const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; - -#if MI_HAS_FAST_BITSCAN - size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible -#else - size_t bitidx = 0; // otherwise start at 0 -#endif - size_t m = (mask << bitidx); // invariant: m == mask shifted by bitidx - - // scan linearly for a free range of zero bits - while (bitidx <= bitidx_max) { - const size_t mapm = (map & m); - if (mapm == 0) { // are the mask bits free at bitidx? - mi_assert_internal((m >> bitidx) == mask); // no overflow? - const size_t newmap = (map | m); - mi_assert_internal((newmap^map) >> bitidx == mask); - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { // TODO: use weak cas here? - // no success, another thread claimed concurrently.. keep going (with updated `map`) - continue; - } - else { - // success, we claimed the bits! - *bitmap_idx = mi_bitmap_index_create(idx, bitidx); - return true; - } - } - else { - // on to the next bit range -#if MI_HAS_FAST_BITSCAN - mi_assert_internal(mapm != 0); - const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); - mi_assert_internal(shift > 0 && shift <= count); -#else - const size_t shift = 1; -#endif - bitidx += shift; - m <<= shift; - } - } - // no bits found - return false; -} - - -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx) { - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - return true; - } - } - return false; -} - - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. 
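
/*
  A standalone sketch of the atomic claim step used in the functions above:
  build a mask of `count` bits at `bitidx` and only publish it with a
  compare-exchange when all of those bits are still zero; releasing uses a
  single fetch-and. Assumes `count >= 1` and `bitidx + count` fits in one
  word; the `demo_` names are illustrative only.
*/
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

static size_t demo_mask(size_t count, size_t bitidx) {
  if (count >= sizeof(size_t) * 8) return ~(size_t)0;
  return (((size_t)1 << count) - 1) << bitidx;
}

// try to flip `count` bits at `bitidx` from 0 to 1; true on success
static bool demo_try_claim(_Atomic(size_t)* field, size_t bitidx, size_t count) {
  const size_t mask = demo_mask(count, bitidx);
  size_t expected = atomic_load_explicit(field, memory_order_relaxed);
  do {
    if ((expected & mask) != 0) return false;  // some bit already claimed concurrently
  } while (!atomic_compare_exchange_weak_explicit(field, &expected, expected | mask,
                                                  memory_order_acq_rel, memory_order_relaxed));
  return true;
}

// clear the same bits again; true if they were all set before
static bool demo_unclaim(_Atomic(size_t)* field, size_t bitidx, size_t count) {
  const size_t mask = demo_mask(count, bitidx);
  const size_t prev = atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel);
  return ((prev & mask) == mask);
}
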
-bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - // mi_assert_internal((bitmap[idx] & mask) == mask); - const size_t prev = mi_atomic_and_acq_rel(&bitmap[idx], ~mask); - return ((prev & mask) == mask); -} - - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - //mi_assert_internal(any_zero != NULL || (bitmap[idx] & mask) == 0); - size_t prev = mi_atomic_or_acq_rel(&bitmap[idx], mask); - if (any_zero != NULL) { *any_zero = ((prev & mask) != mask); } - return ((prev & mask) == 0); -} - -// Returns `true` if all `count` bits were 1. `any_ones` is `true` if there was at least one bit set to one. -static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_ones) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - const size_t field = mi_atomic_load_relaxed(&bitmap[idx]); - if (any_ones != NULL) { *any_ones = ((field & mask) != 0); } - return ((field & mask) == mask); -} - -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. -bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - const size_t idx = mi_bitmap_index_field(bitmap_idx); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - const size_t mask = mi_bitmap_mask_(count, bitidx); - mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); - size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); - do { - if ((expected & mask) != 0) return false; - } - while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); - mi_assert_internal((expected & mask) == 0); - return true; -} - - -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, NULL); -} - -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; -} - - -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. 
This is used in arena allocation -//-------------------------------------------------------------------------- - -// Try to atomically claim a sequence of `count` bits starting from the field -// at `idx` in `bitmap` and crossing into subsequent fields. Returns `true` on success. -// Only needs to consider crossing into the next fields (see `mi_bitmap_try_find_from_claim_across`) -static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t idx, const size_t count, const size_t retries, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) -{ - mi_assert_internal(bitmap_idx != NULL); - - // check initial trailing zeros - mi_bitmap_field_t* field = &bitmap[idx]; - size_t map = mi_atomic_load_relaxed(field); - const size_t initial = mi_clz(map); // count of initial zeros starting at idx - mi_assert_internal(initial <= MI_BITMAP_FIELD_BITS); - if (initial == 0) return false; - if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) - if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries - - // scan ahead - size_t found = initial; - size_t mask = 0; // mask bits for the final field - while(found < count) { - field++; - map = mi_atomic_load_relaxed(field); - const size_t mask_bits = (found + MI_BITMAP_FIELD_BITS <= count ? MI_BITMAP_FIELD_BITS : (count - found)); - mi_assert_internal(mask_bits > 0 && mask_bits <= MI_BITMAP_FIELD_BITS); - mask = mi_bitmap_mask_(mask_bits, 0); - if ((map & mask) != 0) return false; // some part is already claimed - found += mask_bits; - } - mi_assert_internal(field < &bitmap[bitmap_fields]); - - // we found a range of contiguous zeros up to the final field; mask contains mask in the final field - // now try to claim the range atomically - mi_bitmap_field_t* const final_field = field; - const size_t final_mask = mask; - mi_bitmap_field_t* const initial_field = &bitmap[idx]; - const size_t initial_idx = MI_BITMAP_FIELD_BITS - initial; - const size_t initial_mask = mi_bitmap_mask_(initial, initial_idx); - - // initial field - size_t newmap; - field = initial_field; - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | initial_mask); - if ((map & initial_mask) != 0) { goto rollback; }; - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - - // intermediate fields - while (++field < final_field) { - newmap = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); - map = 0; - if (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)) { goto rollback; } - } - - // final field - mi_assert_internal(field == final_field); - map = mi_atomic_load_relaxed(field); - do { - newmap = (map | final_mask); - if ((map & final_mask) != 0) { goto rollback; } - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - - // claimed! 
- mi_stat_counter_increase(stats->arena_crossover_count,1); - *bitmap_idx = mi_bitmap_index_create(idx, initial_idx); - return true; - -rollback: - // roll back intermediate fields - // (we just failed to claim `field` so decrement first) - while (--field > initial_field) { - newmap = 0; - map = mi_bitmap_mask_(MI_BITMAP_FIELD_BITS, 0); - mi_assert_internal(mi_atomic_load_relaxed(field) == map); - mi_atomic_store_release(field, newmap); - } - if (field == initial_field) { // (if we failed on the initial field, `field + 1 == initial_field`) - map = mi_atomic_load_relaxed(field); - do { - mi_assert_internal((map & initial_mask) == initial_mask); - newmap = (map & ~initial_mask); - } while (!mi_atomic_cas_strong_acq_rel(field, &map, newmap)); - } - mi_stat_counter_increase(stats->arena_rollback_count,1); - // retry? (we make a recursive call instead of goto to be able to use const declarations) - if (retries <= 2) { - return mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, retries+1, bitmap_idx, stats); - } - else { - return false; - } -} - - -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats) { - mi_assert_internal(count > 0); - if (count <= 2) { - // we don't bother with crossover fields for small counts - return _mi_bitmap_try_find_from_claim(bitmap, bitmap_fields, start_field_idx, count, bitmap_idx); - } - - // visit the fields - size_t idx = start_field_idx; - for (size_t visited = 0; visited < bitmap_fields; visited++, idx++) { - if (idx >= bitmap_fields) { idx = 0; } // wrap - // first try to claim inside a field - /* - if (count <= MI_BITMAP_FIELD_BITS) { - if (_mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx)) { - return true; - } - } - */ - // if that fails, then try to claim across fields - if (mi_bitmap_try_find_claim_field_across(bitmap, bitmap_fields, idx, count, 0, bitmap_idx, stats)) { - return true; - } - } - return false; -} - -// Helper for masks across fields; returns the mid count, post_mask may be 0 -static size_t mi_bitmap_mask_across(mi_bitmap_index_t bitmap_idx, size_t bitmap_fields, size_t count, size_t* pre_mask, size_t* mid_mask, size_t* post_mask) { - MI_UNUSED(bitmap_fields); - const size_t bitidx = mi_bitmap_index_bit_in_field(bitmap_idx); - if mi_likely(bitidx + count <= MI_BITMAP_FIELD_BITS) { - *pre_mask = mi_bitmap_mask_(count, bitidx); - *mid_mask = 0; - *post_mask = 0; - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) < bitmap_fields); - return 0; - } - else { - const size_t pre_bits = MI_BITMAP_FIELD_BITS - bitidx; - mi_assert_internal(pre_bits < count); - *pre_mask = mi_bitmap_mask_(pre_bits, bitidx); - count -= pre_bits; - const size_t mid_count = (count / MI_BITMAP_FIELD_BITS); - *mid_mask = MI_BITMAP_FIELD_FULL; - count %= MI_BITMAP_FIELD_BITS; - *post_mask = (count==0 ? 0 : mi_bitmap_mask_(count, 0)); - mi_assert_internal(mi_bitmap_index_field(bitmap_idx) + mid_count + (count==0 ? 0 : 1) < bitmap_fields); - return mid_count; - } -} - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. 
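/* Illustration only, not part of this patch: how mi_bitmap_mask_across above splits
   a bit range that crosses 64-bit fields into a head mask in the first field, a
   count of fully covered middle fields, and a tail mask in the last field. The
   64-bit field width and the demo_* names are assumptions of the sketch. */
#include <stddef.h>
#include <stdint.h>

#define DEMO_FIELD_BITS 64

static inline uint64_t demo_span_mask(size_t count, size_t bitidx) {
  return (count >= DEMO_FIELD_BITS ? ~UINT64_C(0)
                                   : ((UINT64_C(1) << count) - 1) << bitidx);
}

typedef struct demo_span_s {
  uint64_t pre_mask;    // bits in the first field, starting at `bitidx`
  size_t   mid_count;   // number of middle fields that are covered completely
  uint64_t post_mask;   // bits in the final field, starting at bit 0 (0 if none)
} demo_span_t;

static demo_span_t demo_split(size_t bitidx, size_t count) {
  demo_span_t s;
  if (bitidx + count <= DEMO_FIELD_BITS) {             // the range fits in one field
    s.pre_mask  = demo_span_mask(count, bitidx);
    s.mid_count = 0;
    s.post_mask = 0;
  }
  else {
    const size_t pre_bits = DEMO_FIELD_BITS - bitidx;  // bits up to the end of field 0
    s.pre_mask  = demo_span_mask(pre_bits, bitidx);
    count      -= pre_bits;
    s.mid_count = count / DEMO_FIELD_BITS;             // full middle fields
    count      %= DEMO_FIELD_BITS;
    s.post_mask = (count == 0 ? 0 : demo_span_mask(count, 0));
  }
  return s;
}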
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_one = true; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_and_acq_rel(field++, ~pre_mask); // clear first part - if ((prev & pre_mask) != pre_mask) all_one = false; - while(mid_count-- > 0) { - prev = mi_atomic_and_acq_rel(field++, ~mid_mask); // clear mid part - if ((prev & mid_mask) != mid_mask) all_one = false; - } - if (post_mask!=0) { - prev = mi_atomic_and_acq_rel(field, ~post_mask); // clear end part - if ((prev & post_mask) != post_mask) all_one = false; - } - return all_one; -} - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_zero = true; - bool any_zero = false; - _Atomic(size_t)*field = &bitmap[idx]; - size_t prev = mi_atomic_or_acq_rel(field++, pre_mask); - if ((prev & pre_mask) != 0) all_zero = false; - if ((prev & pre_mask) != pre_mask) any_zero = true; - while (mid_count-- > 0) { - prev = mi_atomic_or_acq_rel(field++, mid_mask); - if ((prev & mid_mask) != 0) all_zero = false; - if ((prev & mid_mask) != mid_mask) any_zero = true; - } - if (post_mask!=0) { - prev = mi_atomic_or_acq_rel(field, post_mask); - if ((prev & post_mask) != 0) all_zero = false; - if ((prev & post_mask) != post_mask) any_zero = true; - } - if (pany_zero != NULL) { *pany_zero = any_zero; } - return all_zero; -} - - -// Returns `true` if all `count` bits were 1. -// `any_ones` is `true` if there was at least one bit set to one. 
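/* Illustration only, not part of this patch: the claim functions above learn the
   previous state of a whole bit range from the value returned by a single atomic
   OR -- it tells both whether every bit was free and whether any bit was free.
   A 64-bit field and the demo_* names are assumptions of the sketch. */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct demo_claim_result_s {
  bool all_were_zero;   // every bit in `mask` was 0 before the OR (range freshly claimed)
  bool any_was_zero;    // at least one bit in `mask` was 0 before the OR
} demo_claim_result_t;

static demo_claim_result_t demo_claim_mask(_Atomic(uint64_t)* field, uint64_t mask) {
  const uint64_t prev = atomic_fetch_or_explicit(field, mask, memory_order_acq_rel);
  demo_claim_result_t r;
  r.all_were_zero = ((prev & mask) == 0);
  r.any_was_zero  = ((prev & mask) != mask);
  return r;
}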
-static bool mi_bitmap_is_claimedx_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_ones) { - size_t idx = mi_bitmap_index_field(bitmap_idx); - size_t pre_mask; - size_t mid_mask; - size_t post_mask; - size_t mid_count = mi_bitmap_mask_across(bitmap_idx, bitmap_fields, count, &pre_mask, &mid_mask, &post_mask); - bool all_ones = true; - bool any_ones = false; - mi_bitmap_field_t* field = &bitmap[idx]; - size_t prev = mi_atomic_load_relaxed(field++); - if ((prev & pre_mask) != pre_mask) all_ones = false; - if ((prev & pre_mask) != 0) any_ones = true; - while (mid_count-- > 0) { - prev = mi_atomic_load_relaxed(field++); - if ((prev & mid_mask) != mid_mask) all_ones = false; - if ((prev & mid_mask) != 0) any_ones = true; - } - if (post_mask!=0) { - prev = mi_atomic_load_relaxed(field); - if ((prev & post_mask) != post_mask) all_ones = false; - if ((prev & post_mask) != 0) any_ones = true; - } - if (pany_ones != NULL) { *pany_ones = any_ones; } - return all_ones; -} - -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - return mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, NULL); -} - -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { - bool any_ones; - mi_bitmap_is_claimedx_across(bitmap, bitmap_fields, count, bitmap_idx, &any_ones); - return any_ones; -} diff --git a/src/bitmap-old.h b/src/bitmap-old.h deleted file mode 100644 index f8898935..00000000 --- a/src/bitmap-old.h +++ /dev/null @@ -1,110 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023 Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -/* ---------------------------------------------------------------------------- -Concurrent bitmap that can set/reset sequences of bits atomically, -represented as an array of fields where each field is a machine word (`size_t`) - -There are two api's; the standard one cannot have sequences that cross -between the bitmap fields (and a sequence must be <= MI_BITMAP_FIELD_BITS). -(this is used in region allocation) - -The `_across` postfixed functions do allow sequences that can cross over -between the fields. (This is used in arena allocation) ----------------------------------------------------------------------------- */ -#pragma once -#ifndef MI_BITMAP_H -#define MI_BITMAP_H - -/* ----------------------------------------------------------- - Bitmap definition ------------------------------------------------------------ */ - -#define MI_BITMAP_FIELD_BITS (8*MI_SIZE_SIZE) -#define MI_BITMAP_FIELD_FULL (~((size_t)0)) // all bits set - -// An atomic bitmap of `size_t` fields -typedef _Atomic(size_t) mi_bitmap_field_t; -typedef mi_bitmap_field_t* mi_bitmap_t; - -// A bitmap index is the index of the bit in a bitmap. -typedef size_t mi_bitmap_index_t; - -// Create a bit index. 
-static inline mi_bitmap_index_t mi_bitmap_index_create_ex(size_t idx, size_t bitidx) { - mi_assert_internal(bitidx <= MI_BITMAP_FIELD_BITS); - return (idx*MI_BITMAP_FIELD_BITS) + bitidx; -} -static inline mi_bitmap_index_t mi_bitmap_index_create(size_t idx, size_t bitidx) { - mi_assert_internal(bitidx < MI_BITMAP_FIELD_BITS); - return mi_bitmap_index_create_ex(idx,bitidx); -} - -// Get the field index from a bit index. -static inline size_t mi_bitmap_index_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx / MI_BITMAP_FIELD_BITS); -} - -// Get the bit index in a bitmap field -static inline size_t mi_bitmap_index_bit_in_field(mi_bitmap_index_t bitmap_idx) { - return (bitmap_idx % MI_BITMAP_FIELD_BITS); -} - -// Get the full bit index -static inline size_t mi_bitmap_index_bit(mi_bitmap_index_t bitmap_idx) { - return bitmap_idx; -} - -/* ----------------------------------------------------------- - Claim a bit sequence atomically ------------------------------------------------------------ */ - -// Try to atomically claim a sequence of `count` bits in a single -// field at `idx` in `bitmap`. Returns `true` on success. -bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_t count, mi_bitmap_index_t* bitmap_idx); - -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -// For now, `count` can be at most MI_BITMAP_FIELD_BITS and will never cross fields. -bool _mi_bitmap_try_find_from_claim(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. -bool _mi_bitmap_unclaim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. -// Returns `true` if successful when all previous `count` bits were 0. -bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* any_zero); - -bool _mi_bitmap_is_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - - -//-------------------------------------------------------------------------- -// the `_across` functions work on bitmaps where sequences can cross over -// between the fields. This is used in arena allocation -//-------------------------------------------------------------------------- - -// Find `count` bits of zeros and set them to 1 atomically; returns `true` on success. -// Starts at idx, and wraps around to search in all `bitmap_fields` fields. -bool _mi_bitmap_try_find_from_claim_across(mi_bitmap_t bitmap, const size_t bitmap_fields, const size_t start_field_idx, const size_t count, mi_bitmap_index_t* bitmap_idx, mi_stats_t* stats); - -// Set `count` bits at `bitmap_idx` to 0 atomically -// Returns `true` if all `count` bits were 1 previously. 
-bool _mi_bitmap_unclaim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -// Set `count` bits at `bitmap_idx` to 1 atomically -// Returns `true` if all `count` bits were 0 previously. `any_zero` is `true` if there was at least one zero bit. -bool _mi_bitmap_claim_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx, bool* pany_zero); - -bool _mi_bitmap_is_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); -bool _mi_bitmap_is_any_claimed_across(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx); - -#endif diff --git a/src/page.c b/src/page.c index 54e7b539..f21bf91f 100644 --- a/src/page.c +++ b/src/page.c @@ -339,59 +339,6 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { } } -/* -// Abandon a page with used blocks at the end of a thread. -// Note: only call if it is ensured that no references exist from -// the `page->heap->thread_delayed_free` into this page. -// Currently only called through `mi_heap_collect_ex` which ensures this. -void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { - mi_assert_internal(page != NULL); - mi_assert_expensive(_mi_page_is_valid(page)); - mi_assert_internal(pq == mi_page_queue_of(page)); - mi_assert_internal(mi_page_heap(page) != NULL); - - mi_heap_t* pheap = mi_page_heap(page); - - // remove from our page list - mi_page_queue_remove(pq, page); - - // page is no longer associated with our heap - mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); - mi_page_set_heap(page, NULL); - -#if (MI_DEBUG>1) && !MI_TRACK_ENABLED - // check there are no references left.. - for (mi_block_t* block = (mi_block_t*)pheap->thread_delayed_free; block != NULL; block = mi_block_nextx(pheap, block, pheap->keys)) { - mi_assert_internal(_mi_ptr_page(block) != page); - } -#endif - - // and abandon it - mi_assert_internal(mi_page_is_abandoned(page)); - _mi_arena_page_abandon(page, pheap->tld); -} - -// force abandon a page -void _mi_page_force_abandon(mi_page_t* page) { - mi_heap_t* heap = mi_page_heap(page); - // mark page as not using delayed free - _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); - - // ensure this page is no longer in the heap delayed free list - _mi_heap_delayed_free_all(heap); - // TODO: can we still access the page meta-info even if it is freed? - if (page->capacity == 0) return; // it may have been freed now - - // and now unlink it from the page queue and abandon (or free) - mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); - if (mi_page_all_free(page)) { - _mi_page_free(page, pq, false); - } - else { - _mi_page_abandon(page, pq); - } -} -*/ // Free a page with no more free blocks void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) { diff --git a/src/segment-map.c b/src/segment-map.c deleted file mode 100644 index 2c3964fe..00000000 --- a/src/segment-map.c +++ /dev/null @@ -1,126 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2023, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. 
------------------------------------------------------------------------------*/ - -/* ----------------------------------------------------------- - The following functions are to reliably find the segment or - block that encompasses any pointer p (or NULL if it is not - in any of our segments). - We maintain a bitmap of all memory with 1 bit per MI_SEGMENT_SIZE (64MiB) - set to 1 if it contains the segment meta data. ------------------------------------------------------------ */ -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/atomic.h" - -// Reduce total address space to reduce .bss (due to the `mi_segment_map`) -#if (MI_INTPTR_SIZE > 4) && MI_TRACK_ASAN -#define MI_SEGMENT_MAP_MAX_ADDRESS (128*1024ULL*MI_GiB) // 128 TiB (see issue #881) -#elif (MI_INTPTR_SIZE > 4) -#define MI_SEGMENT_MAP_MAX_ADDRESS (48*1024ULL*MI_GiB) // 48 TiB -#else -#define MI_SEGMENT_MAP_MAX_ADDRESS (UINT32_MAX) -#endif - -#define MI_SEGMENT_MAP_PART_SIZE (MI_INTPTR_SIZE*MI_KiB - 128) // 128 > sizeof(mi_memid_t) ! -#define MI_SEGMENT_MAP_PART_BITS (8*MI_SEGMENT_MAP_PART_SIZE) -#define MI_SEGMENT_MAP_PART_ENTRIES (MI_SEGMENT_MAP_PART_SIZE / MI_INTPTR_SIZE) -#define MI_SEGMENT_MAP_PART_BIT_SPAN (MI_SEGMENT_ALIGN) -#define MI_SEGMENT_MAP_PART_SPAN (MI_SEGMENT_MAP_PART_BITS * MI_SEGMENT_MAP_PART_BIT_SPAN) -#define MI_SEGMENT_MAP_MAX_PARTS ((MI_SEGMENT_MAP_MAX_ADDRESS / MI_SEGMENT_MAP_PART_SPAN) + 1) - -// A part of the segment map. -typedef struct mi_segmap_part_s { - mi_memid_t memid; - _Atomic(uintptr_t) map[MI_SEGMENT_MAP_PART_ENTRIES]; -} mi_segmap_part_t; - -// Allocate parts on-demand to reduce .bss footprint -static _Atomic(mi_segmap_part_t*) mi_segment_map[MI_SEGMENT_MAP_MAX_PARTS]; // = { NULL, .. } - -static mi_segmap_part_t* mi_segment_map_index_of(const mi_segment_t* segment, bool create_on_demand, size_t* idx, size_t* bitidx) { - // note: segment can be invalid or NULL. - mi_assert_internal(_mi_ptr_segment(segment + 1) == segment); // is it aligned on MI_SEGMENT_SIZE? - *idx = 0; - *bitidx = 0; - if ((uintptr_t)segment >= MI_SEGMENT_MAP_MAX_ADDRESS) return NULL; - const uintptr_t segindex = ((uintptr_t)segment) / MI_SEGMENT_MAP_PART_SPAN; - if (segindex >= MI_SEGMENT_MAP_MAX_PARTS) return NULL; - mi_segmap_part_t* part = mi_atomic_load_ptr_relaxed(mi_segmap_part_t, &mi_segment_map[segindex]); - - // allocate on demand to reduce .bss footprint - if (part == NULL) { - if (!create_on_demand) return NULL; - mi_memid_t memid; - part = (mi_segmap_part_t*)_mi_os_alloc(sizeof(mi_segmap_part_t), &memid, NULL); - if (part == NULL) return NULL; - mi_segmap_part_t* expected = NULL; - if (!mi_atomic_cas_ptr_strong_release(mi_segmap_part_t, &mi_segment_map[segindex], &expected, part)) { - _mi_os_free(part, sizeof(mi_segmap_part_t), memid, NULL); - part = expected; - if (part == NULL) return NULL; - } - } - mi_assert(part != NULL); - const uintptr_t offset = ((uintptr_t)segment) % MI_SEGMENT_MAP_PART_SPAN; - const uintptr_t bitofs = offset / MI_SEGMENT_MAP_PART_BIT_SPAN; - *idx = bitofs / MI_INTPTR_BITS; - *bitidx = bitofs % MI_INTPTR_BITS; - return part; -} - -void _mi_segment_map_allocated_at(const mi_segment_t* segment) { - if (segment->memid.memkind == MI_MEM_ARENA) return; // we lookup segments first in the arena's and don't need the segment map - size_t index; - size_t bitidx; - mi_segmap_part_t* part = mi_segment_map_index_of(segment, true /* alloc map if needed */, &index, &bitidx); - if (part == NULL) return; // outside our address range.. 
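/* Illustration only, not part of this patch: the address-to-bit mapping used by the
   segment map above. An address is first mapped to an on-demand allocated part, and
   within that part to a word and a bit, with one bit per segment-aligned span. The
   constants and demo_* names below are simplified assumptions of the sketch. */
#include <stddef.h>
#include <stdint.h>

#define DEMO_BIT_SPAN   (64u * 1024u * 1024u)        // one bit per 64 MiB span (assumed)
#define DEMO_PART_BITS  (8u * (8u * 1024u - 128u))   // bits per part, as in the map above
#define DEMO_PART_SPAN  ((uint64_t)DEMO_PART_BITS * DEMO_BIT_SPAN)

typedef struct demo_map_pos_s {
  size_t part;      // which on-demand allocated part
  size_t word;      // which word inside that part's bit array
  size_t bit;       // which bit inside that word
} demo_map_pos_t;

static demo_map_pos_t demo_map_index_of(uintptr_t addr) {
  demo_map_pos_t pos;
  pos.part = (size_t)(addr / DEMO_PART_SPAN);
  const uint64_t bitofs = (addr % DEMO_PART_SPAN) / DEMO_BIT_SPAN;
  pos.word = (size_t)(bitofs / 64);                  // 64-bit words assumed
  pos.bit  = (size_t)(bitofs % 64);
  return pos;
}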
- uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); - uintptr_t newmask; - do { - newmask = (mask | ((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); -} - -void _mi_segment_map_freed_at(const mi_segment_t* segment) { - if (segment->memid.memkind == MI_MEM_ARENA) return; - size_t index; - size_t bitidx; - mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* don't alloc if not present */, &index, &bitidx); - if (part == NULL) return; // outside our address range.. - uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); - uintptr_t newmask; - do { - newmask = (mask & ~((uintptr_t)1 << bitidx)); - } while (!mi_atomic_cas_weak_release(&part->map[index], &mask, newmask)); -} - -// Determine the segment belonging to a pointer or NULL if it is not in a valid segment. -static mi_segment_t* _mi_segment_of(const void* p) { - if (p == NULL) return NULL; - mi_segment_t* segment = _mi_ptr_segment(p); // segment can be NULL - size_t index; - size_t bitidx; - mi_segmap_part_t* part = mi_segment_map_index_of(segment, false /* dont alloc if not present */, &index, &bitidx); - if (part == NULL) return NULL; - const uintptr_t mask = mi_atomic_load_relaxed(&part->map[index]); - if mi_likely((mask & ((uintptr_t)1 << bitidx)) != 0) { - bool cookie_ok = (_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(cookie_ok); MI_UNUSED(cookie_ok); - return segment; // yes, allocated by us - } - return NULL; -} - -// Is this a valid pointer in our heap? -static bool mi_is_valid_pointer(const void* p) { - // first check if it is in an arena, then check if it is OS allocated - return (_mi_arena_contains(p) || _mi_segment_of(p) != NULL); -} - -mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - return mi_is_valid_pointer(p); -} diff --git a/src/segment.c b/src/segment.c deleted file mode 100644 index 74abcdbc..00000000 --- a/src/segment.c +++ /dev/null @@ -1,1387 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2018-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "mimalloc/atomic.h" - -#include // memset -#include - -#define MI_PAGE_HUGE_ALIGN (256*1024) - -static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size); - -/* -------------------------------------------------------------------------------- - Segment allocation - We allocate pages inside bigger "segments" (4MiB on 64-bit). This is to avoid - splitting VMA's on Linux and reduce fragmentation on other OS's. - Each thread owns its own segments. - - Currently we have: - - small pages (64KiB), 64 in one segment - - medium pages (512KiB), 8 in one segment - - large pages (4MiB), 1 in one segment - - huge segments have 1 page in one segment that can be larger than `MI_SEGMENT_SIZE`. - it is used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or with alignment `> MI_BLOCK_ALIGNMENT_MAX`. - - The memory for a segment is usually committed on demand. - (i.e. we are careful to not touch the memory until we actually allocate a block there) - - If a thread ends, it "abandons" pages that still contain live blocks. 
- Such segments are abondoned and these can be reclaimed by still running threads, - (much like work-stealing). --------------------------------------------------------------------------------- */ - - -/* ----------------------------------------------------------- - Queue of segments containing free pages ------------------------------------------------------------ */ - -#if (MI_DEBUG>=3) -static bool mi_segment_queue_contains(const mi_segment_queue_t* queue, const mi_segment_t* segment) { - mi_assert_internal(segment != NULL); - mi_segment_t* list = queue->first; - while (list != NULL) { - if (list == segment) break; - mi_assert_internal(list->next==NULL || list->next->prev == list); - mi_assert_internal(list->prev==NULL || list->prev->next == list); - list = list->next; - } - return (list == segment); -} -#endif - -/* -static bool mi_segment_queue_is_empty(const mi_segment_queue_t* queue) { - return (queue->first == NULL); -} -*/ - -static void mi_segment_queue_remove(mi_segment_queue_t* queue, mi_segment_t* segment) { - mi_assert_expensive(mi_segment_queue_contains(queue, segment)); - if (segment->prev != NULL) segment->prev->next = segment->next; - if (segment->next != NULL) segment->next->prev = segment->prev; - if (segment == queue->first) queue->first = segment->next; - if (segment == queue->last) queue->last = segment->prev; - segment->next = NULL; - segment->prev = NULL; -} - -static void mi_segment_enqueue(mi_segment_queue_t* queue, mi_segment_t* segment) { - mi_assert_expensive(!mi_segment_queue_contains(queue, segment)); - segment->next = NULL; - segment->prev = queue->last; - if (queue->last != NULL) { - mi_assert_internal(queue->last->next == NULL); - queue->last->next = segment; - queue->last = segment; - } - else { - queue->last = queue->first = segment; - } -} - -static mi_segment_queue_t* mi_segment_free_queue_of_kind(mi_page_kind_t kind, mi_segments_tld_t* tld) { - if (kind == MI_PAGE_SMALL) return &tld->small_free; - else if (kind == MI_PAGE_MEDIUM) return &tld->medium_free; - else return NULL; -} - -static mi_segment_queue_t* mi_segment_free_queue(const mi_segment_t* segment, mi_segments_tld_t* tld) { - return mi_segment_free_queue_of_kind(segment->page_kind, tld); -} - -// remove from free queue if it is in one -static void mi_segment_remove_from_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_segment_queue_t* queue = mi_segment_free_queue(segment, tld); // may be NULL - bool in_queue = (queue!=NULL && (segment->next != NULL || segment->prev != NULL || queue->first == segment)); - if (in_queue) { - mi_segment_queue_remove(queue, segment); - } -} - -static void mi_segment_insert_in_free_queue(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_segment_enqueue(mi_segment_free_queue(segment, tld), segment); -} - - -/* ----------------------------------------------------------- - Invariant checking ------------------------------------------------------------ */ - -#if (MI_DEBUG >= 2) || (MI_SECURE >= 2) -static size_t mi_segment_page_size(const mi_segment_t* segment) { - if (segment->capacity > 1) { - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); - return ((size_t)1 << segment->page_shift); - } - else { - mi_assert_internal(segment->page_kind >= MI_PAGE_LARGE); - return segment->segment_size; - } -} -#endif - -#if (MI_DEBUG>=2) -static bool mi_pages_purge_contains(const mi_page_t* page, mi_segments_tld_t* tld) { - mi_page_t* p = tld->pages_purge.first; - while (p != NULL) { - if (p == page) return true; - p = p->next; - } - return false; -} 
-#endif - -#if (MI_DEBUG>=3) -static bool mi_segment_is_valid(const mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(segment != NULL); - mi_assert_internal(_mi_ptr_cookie(segment) == segment->cookie); - mi_assert_internal(segment->used <= segment->capacity); - mi_assert_internal(segment->abandoned <= segment->used); - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM || segment->capacity == 1); // one large or huge page per segment - size_t nfree = 0; - for (size_t i = 0; i < segment->capacity; i++) { - const mi_page_t* const page = &segment->pages[i]; - if (!page->segment_in_use) { - nfree++; - } - if (page->segment_in_use) { - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - } - mi_assert_internal(page->is_huge == (segment->page_kind == MI_PAGE_HUGE)); - } - mi_assert_internal(nfree + segment->used == segment->capacity); - // mi_assert_internal(segment->thread_id == _mi_thread_id() || (segment->thread_id==0)); // or 0 - mi_assert_internal(segment->page_kind == MI_PAGE_HUGE || - (mi_segment_page_size(segment) * segment->capacity == segment->segment_size)); - return true; -} -#endif - -static bool mi_page_not_in_queue(const mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(page != NULL); - if (page->next != NULL || page->prev != NULL) { - mi_assert_internal(mi_pages_purge_contains(page, tld)); - return false; - } - else { - // both next and prev are NULL, check for singleton list - return (tld->pages_purge.first != page && tld->pages_purge.last != page); - } -} - - -/* ----------------------------------------------------------- - Guard pages ------------------------------------------------------------ */ - -static void mi_segment_protect_range(void* p, size_t size, bool protect) { - if (protect) { - _mi_os_protect(p, size); - } - else { - _mi_os_unprotect(p, size); - } -} - -static void mi_segment_protect(mi_segment_t* segment, bool protect, mi_os_tld_t* tld) { - // add/remove guard pages - if (MI_SECURE != 0) { - // in secure mode, we set up a protected page in between the segment info and the page data - const size_t os_psize = _mi_os_page_size(); - mi_assert_internal((segment->segment_info_size - os_psize) >= (sizeof(mi_segment_t) + ((segment->capacity - 1) * sizeof(mi_page_t)))); - mi_assert_internal(((uintptr_t)segment + segment->segment_info_size) % os_psize == 0); - mi_segment_protect_range((uint8_t*)segment + segment->segment_info_size - os_psize, os_psize, protect); - #if (MI_SECURE >= 2) - if (segment->capacity == 1) - #endif - { - // and protect the last (or only) page too - mi_assert_internal(MI_SECURE <= 1 || segment->page_kind >= MI_PAGE_LARGE); - uint8_t* start = (uint8_t*)segment + segment->segment_size - os_psize; - if (protect && !segment->memid.initially_committed) { - if (protect) { - // ensure secure page is committed - if (_mi_os_commit(start, os_psize, NULL, tld->stats)) { // if this fails that is ok (as it is an unaccessible page) - mi_segment_protect_range(start, os_psize, protect); - } - } - } - else { - mi_segment_protect_range(start, os_psize, protect); - } - } - #if (MI_SECURE >= 2) - else { - // or protect every page - const size_t page_size = mi_segment_page_size(segment); - for (size_t i = 0; i < segment->capacity; i++) { - if (segment->pages[i].is_committed) { - mi_segment_protect_range((uint8_t*)segment + (i+1)*page_size - os_psize, os_psize, protect); - } - } - } - #endif - } -} - -/* ----------------------------------------------------------- - Page reset 
------------------------------------------------------------ */ - -static void mi_page_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { - // todo: should we purge the guard page as well when MI_SECURE>=2 ? - mi_assert_internal(page->is_committed); - mi_assert_internal(!page->segment_in_use); - if (!segment->allow_purge) return; - mi_assert_internal(page->used == 0); - mi_assert_internal(page->free == NULL); - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - size_t psize; - void* start = mi_segment_raw_page_start(segment, page, &psize); - const bool needs_recommit = _mi_os_purge(start, psize, tld->stats); - if (needs_recommit) { page->is_committed = false; } -} - -static bool mi_page_ensure_committed(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { - if (page->is_committed) return true; - mi_assert_internal(segment->allow_decommit); - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - - size_t psize; - uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); - bool is_zero = false; - const size_t gsize = (MI_SECURE >= 2 ? _mi_os_page_size() : 0); - bool ok = _mi_os_commit(start, psize + gsize, &is_zero, tld->stats); - if (!ok) return false; // failed to commit! - page->is_committed = true; - page->used = 0; - page->free = NULL; - page->is_zero_init = is_zero; - if (gsize > 0) { - mi_segment_protect_range(start + psize, gsize, true); - } - return true; -} - - -/* ----------------------------------------------------------- - The free page queue ------------------------------------------------------------ */ - -// we re-use the `free` field for the expiration counter. Since this is a -// a pointer size field while the clock is always 64-bit we need to guard -// against overflow, we use substraction to check for expiry which works -// as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) -static uint32_t mi_page_get_expire( mi_page_t* page ) { - return (uint32_t)((uintptr_t)page->free); -} - -static void mi_page_set_expire( mi_page_t* page, uint32_t expire ) { - page->free = (mi_block_t*)((uintptr_t)expire); -} - -static void mi_page_purge_set_expire(mi_page_t* page) { - mi_assert_internal(mi_page_get_expire(page)==0); - uint32_t expire = (uint32_t)_mi_clock_now() + mi_option_get(mi_option_purge_delay); - mi_page_set_expire(page, expire); -} - -// we re-use the `free` field for the expiration counter. Since this is a -// a pointer size field while the clock is always 64-bit we need to guard -// against overflow, we use substraction to check for expiry which work -// as long as the reset delay is under (2^30 - 1) milliseconds (~12 days) -static bool mi_page_purge_is_expired(mi_page_t* page, mi_msecs_t now) { - int32_t expire = (int32_t)mi_page_get_expire(page); - return (((int32_t)now - expire) >= 0); -} - -static void mi_segment_schedule_purge(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(!page->segment_in_use); - mi_assert_internal(mi_page_not_in_queue(page,tld)); - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - mi_assert_internal(_mi_page_segment(page)==segment); - if (!segment->allow_purge) return; - - if (mi_option_get(mi_option_purge_delay) == 0) { - // purge immediately? 
- mi_page_purge(segment, page, tld); - } - else if (mi_option_get(mi_option_purge_delay) > 0) { // no purging if the delay is negative - // otherwise push on the delayed page reset queue - mi_page_queue_t* pq = &tld->pages_purge; - // push on top - mi_page_purge_set_expire(page); - page->next = pq->first; - page->prev = NULL; - if (pq->first == NULL) { - mi_assert_internal(pq->last == NULL); - pq->first = pq->last = page; - } - else { - pq->first->prev = page; - pq->first = page; - } - } -} - -static void mi_page_purge_remove(mi_page_t* page, mi_segments_tld_t* tld) { - if (mi_page_not_in_queue(page,tld)) return; - - mi_page_queue_t* pq = &tld->pages_purge; - mi_assert_internal(pq!=NULL); - mi_assert_internal(!page->segment_in_use); - mi_assert_internal(mi_page_get_expire(page) != 0); - mi_assert_internal(mi_pages_purge_contains(page, tld)); - if (page->prev != NULL) page->prev->next = page->next; - if (page->next != NULL) page->next->prev = page->prev; - if (page == pq->last) pq->last = page->prev; - if (page == pq->first) pq->first = page->next; - page->next = page->prev = NULL; - mi_page_set_expire(page,0); -} - -static void mi_segment_remove_all_purges(mi_segment_t* segment, bool force_purge, mi_segments_tld_t* tld) { - if (segment->memid.is_pinned) return; // never reset in huge OS pages - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (!page->segment_in_use) { - mi_page_purge_remove(page, tld); - if (force_purge && page->is_committed) { - mi_page_purge(segment, page, tld); - } - } - else { - mi_assert_internal(mi_page_not_in_queue(page,tld)); - } - } -} - -static void mi_pages_try_purge(bool force, mi_segments_tld_t* tld) { - if (mi_option_get(mi_option_purge_delay) < 0) return; // purging is not allowed - - mi_msecs_t now = _mi_clock_now(); - mi_page_queue_t* pq = &tld->pages_purge; - // from oldest up to the first that has not expired yet - mi_page_t* page = pq->last; - while (page != NULL && (force || mi_page_purge_is_expired(page,now))) { - mi_page_t* const prev = page->prev; // save previous field - mi_page_purge_remove(page, tld); // remove from the list to maintain invariant for mi_page_purge - mi_page_purge(_mi_page_segment(page), page, tld); - page = prev; - } - // discard the reset pages from the queue - pq->last = page; - if (page != NULL){ - page->next = NULL; - } - else { - pq->first = NULL; - } -} - - -/* ----------------------------------------------------------- - Segment size calculations ------------------------------------------------------------ */ - -static size_t mi_segment_raw_page_size(const mi_segment_t* segment) { - return (segment->page_kind == MI_PAGE_HUGE ? segment->segment_size : (size_t)1 << segment->page_shift); -} - -// Raw start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) -// The raw start is not taking aligned block allocation into consideration. 
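/* Illustration only, not part of this patch: the wrap-safe expiration check used by
   the purge queue above. The 64-bit clock is truncated to 32 bits when stored in the
   page's `free` field, and expiry is tested via signed subtraction so the comparison
   keeps working after the 32-bit value wraps, as long as the purge delay stays far
   below 2^31 milliseconds. The demo_* names are assumptions of the sketch. */
#include <stdbool.h>
#include <stdint.h>

typedef int64_t demo_msecs_t;    // stands in for the 64-bit millisecond clock

static inline uint32_t demo_make_expire(demo_msecs_t now, uint32_t delay_ms) {
  return (uint32_t)now + delay_ms;                   // truncated; may wrap around
}

static inline bool demo_is_expired(uint32_t expire, demo_msecs_t now) {
  // difference of the truncated clocks, interpreted as signed: it becomes >= 0
  // exactly when `now` has reached or passed `expire`, wrap-around included
  return ((int32_t)((uint32_t)now - expire) >= 0);
}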
-static uint8_t* mi_segment_raw_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) { - size_t psize = mi_segment_raw_page_size(segment); - uint8_t* p = (uint8_t*)segment + page->segment_idx * psize; - - if (page->segment_idx == 0) { - // the first page starts after the segment info (and possible guard page) - p += segment->segment_info_size; - psize -= segment->segment_info_size; - } - -#if (MI_SECURE > 1) // every page has an os guard page - psize -= _mi_os_page_size(); -#elif (MI_SECURE==1) // the last page has an os guard page at the end - if (page->segment_idx == segment->capacity - 1) { - psize -= _mi_os_page_size(); - } -#endif - - if (page_size != NULL) *page_size = psize; - mi_assert_internal(page->block_size == 0 || _mi_ptr_page(p) == page); - mi_assert_internal(_mi_ptr_segment(p) == segment); - return p; -} - -// Start of the page available memory; can be used on uninitialized pages (only `segment_idx` must be set) -uint8_t* _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t* page, size_t* page_size) -{ - size_t psize; - uint8_t* p = mi_segment_raw_page_start(segment, page, &psize); - const size_t block_size = mi_page_block_size(page); - if (/*page->segment_idx == 0 &&*/ block_size > 0 && block_size <= MI_MAX_ALIGN_GUARANTEE) { - // for small and medium objects, ensure the page start is aligned with the block size (PR#66 by kickunderscore) - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); - size_t adjust = block_size - ((uintptr_t)p % block_size); - if (adjust < block_size && psize >= block_size + adjust) { - p += adjust; - psize -= adjust; - mi_assert_internal((uintptr_t)p % block_size == 0); - } - } - mi_assert_internal(_mi_is_aligned(p, MI_MAX_ALIGN_SIZE)); - mi_assert_internal(block_size == 0 || block_size > MI_MAX_ALIGN_GUARANTEE || _mi_is_aligned(p,block_size)); - - if (page_size != NULL) *page_size = psize; - mi_assert_internal(_mi_ptr_page(p) == page); - mi_assert_internal(_mi_ptr_segment(p) == segment); - return p; -} - - -static size_t mi_segment_calculate_sizes(size_t capacity, size_t required, size_t* pre_size, size_t* info_size) -{ - const size_t minsize = sizeof(mi_segment_t) + ((capacity - 1) * sizeof(mi_page_t)) + 16 /* padding */; - size_t guardsize = 0; - size_t isize = 0; - - - if (MI_SECURE == 0) { - // normally no guard pages - #if MI_GUARDED - isize = _mi_align_up(minsize, _mi_os_page_size()); - #else - isize = _mi_align_up(minsize, 16 * MI_MAX_ALIGN_SIZE); - #endif - } - else { - // in secure mode, we set up a protected page in between the segment info - // and the page data (and one at the end of the segment) - const size_t page_size = _mi_os_page_size(); - isize = _mi_align_up(minsize, page_size); - guardsize = page_size; - //required = _mi_align_up(required, isize + guardsize); - } - - if (info_size != NULL) *info_size = isize; - if (pre_size != NULL) *pre_size = isize + guardsize; - return (required==0 ? MI_SEGMENT_SIZE : _mi_align_up( required + isize + 2*guardsize, MI_PAGE_HUGE_ALIGN) ); -} - - -/* ---------------------------------------------------------------------------- -Segment caches -We keep a small segment cache per thread to increase local -reuse and avoid setting/clearing guard pages in secure mode. 
-------------------------------------------------------------------------------- */ - -static void mi_segments_track_size(long segment_size, mi_segments_tld_t* tld) { - if (segment_size>=0) _mi_stat_increase(&tld->stats->segments,1); - else _mi_stat_decrease(&tld->stats->segments,1); - tld->count += (segment_size >= 0 ? 1 : -1); - if (tld->count > tld->peak_count) tld->peak_count = tld->count; - tld->current_size += segment_size; - if (tld->current_size > tld->peak_size) tld->peak_size = tld->current_size; -} - -static void mi_segment_os_free(mi_segment_t* segment, size_t segment_size, mi_segments_tld_t* tld) { - segment->thread_id = 0; - _mi_segment_map_freed_at(segment); - mi_segments_track_size(-((long)segment_size),tld); - if (segment->was_reclaimed) { - tld->reclaim_count--; - segment->was_reclaimed = false; - } - - if (MI_SECURE != 0) { - mi_assert_internal(!segment->memid.is_pinned); - mi_segment_protect(segment, false, tld->os); // ensure no more guard pages are set - } - - bool fully_committed = true; - size_t committed_size = 0; - const size_t page_size = mi_segment_raw_page_size(segment); - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (page->is_committed) { committed_size += page_size; } - if (!page->is_committed) { fully_committed = false; } - } - MI_UNUSED(fully_committed); - mi_assert_internal((fully_committed && committed_size == segment_size) || (!fully_committed && committed_size < segment_size)); - - _mi_arena_free(segment, segment_size, committed_size, segment->memid, tld->stats); -} - -// called from `heap_collect`. -void _mi_segments_collect(bool force, mi_segments_tld_t* tld) { - mi_pages_try_purge(force,tld); - #if MI_DEBUG>=2 - if (!_mi_is_main_thread()) { - mi_assert_internal(tld->pages_purge.first == NULL); - mi_assert_internal(tld->pages_purge.last == NULL); - } - #endif -} - - -/* ----------------------------------------------------------- - Segment allocation ------------------------------------------------------------ */ - -static mi_segment_t* mi_segment_os_alloc(bool eager_delayed, size_t page_alignment, mi_arena_id_t req_arena_id, - size_t pre_size, size_t info_size, bool commit, size_t segment_size, - mi_segments_tld_t* tld, mi_os_tld_t* tld_os) -{ - mi_memid_t memid; - bool allow_large = (!eager_delayed && (MI_SECURE == 0)); // only allow large OS pages once we are no longer lazy - size_t align_offset = 0; - size_t alignment = MI_SEGMENT_SIZE; - if (page_alignment > 0) { - alignment = page_alignment; - align_offset = _mi_align_up(pre_size, MI_SEGMENT_SIZE); - segment_size = segment_size + (align_offset - pre_size); // adjust the segment size - } - - mi_segment_t* segment = (mi_segment_t*)_mi_arena_alloc_aligned(segment_size, alignment, align_offset, commit, allow_large, req_arena_id, &memid, tld_os); - if (segment == NULL) { - return NULL; // failed to allocate - } - - if (!memid.initially_committed) { - // ensure the initial info is committed - mi_assert_internal(!memid.is_pinned); - bool ok = _mi_os_commit(segment, pre_size, NULL, tld_os->stats); - if (!ok) { - // commit failed; we cannot touch the memory: free the segment directly and return `NULL` - _mi_arena_free(segment, segment_size, 0, memid, tld_os->stats); - return NULL; - } - } - - MI_UNUSED(info_size); - segment->memid = memid; - segment->allow_decommit = !memid.is_pinned; - segment->allow_purge = segment->allow_decommit && (mi_option_get(mi_option_purge_delay) >= 0); - segment->segment_size = segment_size; - segment->subproc = tld->subproc; - 
mi_segments_track_size((long)(segment_size), tld); - _mi_segment_map_allocated_at(segment); - return segment; -} - -// Allocate a segment from the OS aligned to `MI_SEGMENT_SIZE` . -static mi_segment_t* mi_segment_alloc(size_t required, mi_page_kind_t page_kind, size_t page_shift, size_t page_alignment, - mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) -{ - // required is only > 0 for huge page allocations - mi_assert_internal((required > 0 && page_kind > MI_PAGE_LARGE)|| (required==0 && page_kind <= MI_PAGE_LARGE)); - - // calculate needed sizes first - size_t capacity; - if (page_kind == MI_PAGE_HUGE) { - mi_assert_internal(page_shift == MI_SEGMENT_SHIFT + 1 && required > 0); - capacity = 1; - } - else { - mi_assert_internal(required == 0 && page_alignment == 0); - size_t page_size = (size_t)1 << page_shift; - capacity = MI_SEGMENT_SIZE / page_size; - mi_assert_internal(MI_SEGMENT_SIZE % page_size == 0); - mi_assert_internal(capacity >= 1 && capacity <= MI_SMALL_PAGES_PER_SEGMENT); - } - size_t info_size; - size_t pre_size; - const size_t init_segment_size = mi_segment_calculate_sizes(capacity, required, &pre_size, &info_size); - mi_assert_internal(init_segment_size >= required); - - // Initialize parameters - const bool eager_delayed = (page_kind <= MI_PAGE_MEDIUM && // don't delay for large objects - // !_mi_os_has_overcommit() && // never delay on overcommit systems - _mi_current_thread_count() > 1 && // do not delay for the first N threads - tld->peak_count < (size_t)mi_option_get(mi_option_eager_commit_delay)); - const bool eager = !eager_delayed && mi_option_is_enabled(mi_option_eager_commit); - const bool init_commit = eager; // || (page_kind >= MI_PAGE_LARGE); - - // Allocate the segment from the OS (segment_size can change due to alignment) - mi_segment_t* segment = mi_segment_os_alloc(eager_delayed, page_alignment, req_arena_id, pre_size, info_size, init_commit, init_segment_size, tld, os_tld); - if (segment == NULL) return NULL; - mi_assert_internal(segment != NULL && (uintptr_t)segment % MI_SEGMENT_SIZE == 0); - mi_assert_internal(segment->memid.is_pinned ? 
segment->memid.initially_committed : true); - - // zero the segment info (but not the `mem` fields) - ptrdiff_t ofs = offsetof(mi_segment_t, next); - _mi_memzero((uint8_t*)segment + ofs, info_size - ofs); - - // initialize pages info - const bool is_huge = (page_kind == MI_PAGE_HUGE); - for (size_t i = 0; i < capacity; i++) { - mi_assert_internal(i <= 255); - segment->pages[i].segment_idx = (uint8_t)i; - segment->pages[i].is_committed = segment->memid.initially_committed; - segment->pages[i].is_zero_init = segment->memid.initially_zero; - segment->pages[i].is_huge = is_huge; - } - - // initialize - segment->page_kind = page_kind; - segment->capacity = capacity; - segment->page_shift = page_shift; - segment->segment_info_size = pre_size; - segment->thread_id = _mi_thread_id(); - segment->cookie = _mi_ptr_cookie(segment); - - // set protection - mi_segment_protect(segment, true, tld->os); - - // insert in free lists for small and medium pages - if (page_kind <= MI_PAGE_MEDIUM) { - mi_segment_insert_in_free_queue(segment, tld); - } - - return segment; -} - - -static void mi_segment_free(mi_segment_t* segment, bool force, mi_segments_tld_t* tld) { - MI_UNUSED(force); - mi_assert(segment != NULL); - - // in `mi_segment_force_abandon` we set this to true to ensure the segment's memory stays valid - if (segment->dont_free) return; - - // don't purge as we are freeing now - mi_segment_remove_all_purges(segment, false /* don't force as we are about to free */, tld); - mi_segment_remove_from_free_queue(segment, tld); - - mi_assert_expensive(!mi_segment_queue_contains(&tld->small_free, segment)); - mi_assert_expensive(!mi_segment_queue_contains(&tld->medium_free, segment)); - mi_assert(segment->next == NULL); - mi_assert(segment->prev == NULL); - _mi_stat_decrease(&tld->stats->page_committed, segment->segment_info_size); - - // return it to the OS - mi_segment_os_free(segment, segment->segment_size, tld); -} - -/* ----------------------------------------------------------- - Free page management inside a segment ------------------------------------------------------------ */ - - -static bool mi_segment_has_free(const mi_segment_t* segment) { - return (segment->used < segment->capacity); -} - -static bool mi_segment_page_claim(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert_internal(_mi_page_segment(page) == segment); - mi_assert_internal(!page->segment_in_use); - mi_page_purge_remove(page, tld); - - // check commit - if (!mi_page_ensure_committed(segment, page, tld)) return false; - - // set in-use before doing unreset to prevent delayed reset - page->segment_in_use = true; - segment->used++; - mi_assert_internal(page->segment_in_use && page->is_committed && page->used==0 && !mi_pages_purge_contains(page,tld)); - mi_assert_internal(segment->used <= segment->capacity); - if (segment->used == segment->capacity && segment->page_kind <= MI_PAGE_MEDIUM) { - // if no more free pages, remove from the queue - mi_assert_internal(!mi_segment_has_free(segment)); - mi_segment_remove_from_free_queue(segment, tld); - } - return true; -} - - -/* ----------------------------------------------------------- - Free ------------------------------------------------------------ */ - -static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld); - -// clear page data; can be called on abandoned segments -static void mi_segment_page_clear(mi_segment_t* segment, mi_page_t* page, mi_segments_tld_t* tld) -{ - mi_assert_internal(page->segment_in_use); - 
mi_assert_internal(mi_page_all_free(page)); - mi_assert_internal(page->is_committed); - mi_assert_internal(mi_page_not_in_queue(page, tld)); - - size_t inuse = page->capacity * mi_page_block_size(page); - _mi_stat_decrease(&tld->stats->page_committed, inuse); - _mi_stat_decrease(&tld->stats->pages, 1); - - page->is_zero_init = false; - page->segment_in_use = false; - - // zero the page data, but not the segment fields and capacity, page start, and block_size (for page size calculations) - size_t block_size = page->block_size; - uint8_t block_size_shift = page->block_size_shift; - uint8_t heap_tag = page->heap_tag; - uint8_t* page_start = page->page_start; - uint16_t capacity = page->capacity; - uint16_t reserved = page->reserved; - ptrdiff_t ofs = offsetof(mi_page_t,capacity); - _mi_memzero((uint8_t*)page + ofs, sizeof(*page) - ofs); - page->capacity = capacity; - page->reserved = reserved; - page->block_size = block_size; - page->block_size_shift = block_size_shift; - page->heap_tag = heap_tag; - page->page_start = page_start; - segment->used--; - - // schedule purge - mi_segment_schedule_purge(segment, page, tld); - - page->capacity = 0; // after purge these can be zero'd now - page->reserved = 0; -} - -void _mi_segment_page_free(mi_page_t* page, bool force, mi_segments_tld_t* tld) -{ - mi_assert(page != NULL); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_expensive(mi_segment_is_valid(segment,tld)); - mi_pages_try_purge(false /*force?*/, tld); - - // mark it as free now - mi_segment_page_clear(segment, page, tld); - - if (segment->used == 0) { - // no more used pages; remove from the free list and free the segment - mi_segment_free(segment, force, tld); - } - else { - if (segment->used == segment->abandoned) { - // only abandoned pages; remove from free list and abandon - mi_segment_abandon(segment,tld); - } - else if (segment->used + 1 == segment->capacity) { - mi_assert_internal(segment->page_kind <= MI_PAGE_MEDIUM); // large and huge pages are always the single page in a segment - if (segment->page_kind <= MI_PAGE_MEDIUM) { - // move back to segments free list - mi_segment_insert_in_free_queue(segment,tld); - } - } - } -} - - -/* ----------------------------------------------------------- -Abandonment - -When threads terminate, they can leave segments with -live blocks (reached through other threads). Such segments -are "abandoned" and will be reclaimed by other threads to -reuse their pages and/or free them eventually. The -`thread_id` of such segments is 0. - -When a block is freed in an abandoned segment, the segment -is reclaimed into that thread. - -Moreover, if threads are looking for a fresh segment, they -will first consider abondoned segments -- these can be found -by scanning the arena memory -(segments outside arena memoryare only reclaimed by a free). ------------------------------------------------------------ */ - -/* ----------------------------------------------------------- - Abandon segment/page ------------------------------------------------------------ */ - -static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(segment->used == segment->abandoned); - mi_assert_internal(segment->used > 0); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - - // Potentially force purge. Only abandoned segments in arena memory can be - // reclaimed without a free so if a segment is not from an arena we force purge here to be conservative. 
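/* Illustration only, not part of this patch: the partial-reset idiom used by
   mi_segment_page_clear above -- zero a struct from a chosen member onward using
   offsetof, then restore the few fields that must survive the reset. The struct
   layout and demo_* names are assumptions of the sketch. */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

typedef struct demo_page_s {
  uint8_t  segment_idx;   // kept as-is: lives before the cleared region
  uint16_t capacity;      // cleared from this member onward...
  uint16_t reserved;
  size_t   block_size;
  void*    free_list;
} demo_page_t;

static void demo_partial_clear(demo_page_t* page) {
  const size_t   block_size = page->block_size;           // save the survivors
  const uint16_t capacity   = page->capacity;
  const size_t ofs = offsetof(demo_page_t, capacity);     // first member to clear
  memset((uint8_t*)page + ofs, 0, sizeof(*page) - ofs);   // clear the tail of the struct
  page->capacity   = capacity;                            // put the survivors back
  page->block_size = block_size;
}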
- mi_pages_try_purge(false /*force?*/,tld); - const bool force_purge = (segment->memid.memkind != MI_MEM_ARENA) || mi_option_is_enabled(mi_option_abandoned_page_purge); - mi_segment_remove_all_purges(segment, force_purge, tld); - - // remove the segment from the free page queue if needed - mi_segment_remove_from_free_queue(segment, tld); - mi_assert_internal(segment->next == NULL && segment->prev == NULL); - - // all pages in the segment are abandoned; add it to the abandoned list - _mi_stat_increase(&tld->stats->segments_abandoned, 1); - mi_segments_track_size(-((long)segment->segment_size), tld); - segment->abandoned_visits = 0; - if (segment->was_reclaimed) { - tld->reclaim_count--; - segment->was_reclaimed = false; - } - _mi_arena_segment_mark_abandoned(segment); -} - -void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) { - mi_assert(page != NULL); - mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); - mi_assert_internal(mi_page_heap(page) == NULL); - mi_segment_t* segment = _mi_page_segment(page); - mi_assert_expensive(!mi_pages_purge_contains(page, tld)); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - segment->abandoned++; - _mi_stat_increase(&tld->stats->pages_abandoned, 1); - mi_assert_internal(segment->abandoned <= segment->used); - if (segment->used == segment->abandoned) { - // all pages are abandoned, abandon the entire segment - mi_segment_abandon(segment, tld); - } -} - -/* ----------------------------------------------------------- - Reclaim abandoned pages ------------------------------------------------------------ */ - -// Possibly clear pages and check if free space is available -static bool mi_segment_check_free(mi_segment_t* segment, size_t block_size, bool* all_pages_free) -{ - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - bool has_page = false; - size_t pages_used = 0; - size_t pages_used_empty = 0; - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (page->segment_in_use) { - pages_used++; - // ensure used count is up to date and collect potential concurrent frees - _mi_page_free_collect(page, false); - if (mi_page_all_free(page)) { - // if everything free already, page can be reused for some block size - // note: don't clear the page yet as we can only OS reset it once it is reclaimed - pages_used_empty++; - has_page = true; - } - else if (mi_page_block_size(page) == block_size && mi_page_has_any_available(page)) { - // a page has available free blocks of the right size - has_page = true; - } - } - else { - // whole empty page - has_page = true; - } - } - mi_assert_internal(pages_used == segment->used && pages_used >= pages_used_empty); - if (all_pages_free != NULL) { - *all_pages_free = ((pages_used - pages_used_empty) == 0); - } - return has_page; -} - - -// Reclaim a segment; returns NULL if the segment was freed -// set `right_page_reclaimed` to `true` if it reclaimed a page of the right `block_size` that was not full. -static mi_segment_t* mi_segment_reclaim(mi_segment_t* segment, mi_heap_t* heap, size_t requested_block_size, bool* right_page_reclaimed, mi_segments_tld_t* tld) { - if (right_page_reclaimed != NULL) { *right_page_reclaimed = false; } - // can be 0 still with abandoned_next, or already a thread id for segments outside an arena that are reclaimed on a free. 
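
[editor's note] `mi_segment_check_free` above boils down to counting: how many pages are still in use, and how many of those turn out to be completely free once pending remote frees are collected. A standalone model of that decision (page states simplified to an enum; the real code additionally matches block sizes when looking for a reusable page):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>

typedef enum page_state_e {
  PAGE_UNUSED,        // slot not claimed at all
  PAGE_IN_USE,        // claimed and still holds live blocks
  PAGE_IN_USE_EMPTY   // claimed, but all its blocks turned out to be free
} page_state_t;

// Returns true if the segment has a page worth reclaiming; sets
// *all_pages_free when no page holds live blocks anymore.
static bool check_free(const page_state_t* pages, size_t capacity, bool* all_pages_free) {
  size_t used = 0, used_empty = 0;
  bool has_page = false;
  for (size_t i = 0; i < capacity; i++) {
    if (pages[i] == PAGE_UNUSED) { has_page = true; }
    else {
      used++;
      if (pages[i] == PAGE_IN_USE_EMPTY) { used_empty++; has_page = true; }
    }
  }
  if (all_pages_free != NULL) { *all_pages_free = (used == used_empty); }
  return has_page;
}

int main(void) {
  page_state_t pages[4] = { PAGE_IN_USE_EMPTY, PAGE_IN_USE_EMPTY, PAGE_UNUSED, PAGE_UNUSED };
  bool all_free;
  assert(check_free(pages, 4, &all_free) && all_free);
  pages[0] = PAGE_IN_USE;
  assert(check_free(pages, 4, &all_free) && !all_free);
  return 0;
}
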
- mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0 || mi_atomic_load_relaxed(&segment->thread_id) == _mi_thread_id()); - mi_assert_internal(segment->subproc == heap->tld->segments.subproc); // only reclaim within the same subprocess - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - segment->abandoned_visits = 0; - segment->was_reclaimed = true; - tld->reclaim_count++; - mi_segments_track_size((long)segment->segment_size, tld); - mi_assert_internal(segment->next == NULL && segment->prev == NULL); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - _mi_stat_decrease(&tld->stats->segments_abandoned, 1); - - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (page->segment_in_use) { - mi_assert_internal(page->is_committed); - mi_assert_internal(mi_page_not_in_queue(page, tld)); - mi_assert_internal(mi_page_thread_free_flag(page)==MI_NEVER_DELAYED_FREE); - mi_assert_internal(mi_page_heap(page) == NULL); - segment->abandoned--; - mi_assert(page->next == NULL); - _mi_stat_decrease(&tld->stats->pages_abandoned, 1); - // get the target heap for this thread which has a matching heap tag (so we reclaim into a matching heap) - mi_heap_t* target_heap = _mi_heap_by_tag(heap, page->heap_tag); // allow custom heaps to separate objects - if (target_heap == NULL) { - target_heap = heap; - _mi_error_message(EFAULT, "page with tag %u cannot be reclaimed by a heap with the same tag (using heap tag %u instead)\n", page->heap_tag, heap->tag ); - } - // associate the heap with this page, and allow heap thread delayed free again. - mi_page_set_heap(page, target_heap); - _mi_page_use_delayed_free(page, MI_USE_DELAYED_FREE, true); // override never (after heap is set) - _mi_page_free_collect(page, false); // ensure used count is up to date - if (mi_page_all_free(page)) { - // if everything free already, clear the page directly - mi_segment_page_clear(segment, page, tld); // reset is ok now - } - else { - // otherwise reclaim it into the heap - _mi_page_reclaim(target_heap, page); - if (requested_block_size == mi_page_block_size(page) && mi_page_has_any_available(page) && heap == target_heap) { - if (right_page_reclaimed != NULL) { *right_page_reclaimed = true; } - } - } - } - /* expired - else if (page->is_committed) { // not in-use, and not reset yet - // note: do not reset as this includes pages that were not touched before - // mi_pages_purge_add(segment, page, tld); - } - */ - } - mi_assert_internal(segment->abandoned == 0); - if (segment->used == 0) { - mi_assert_internal(right_page_reclaimed == NULL || !(*right_page_reclaimed)); - mi_segment_free(segment, false, tld); - return NULL; - } - else { - if (segment->page_kind <= MI_PAGE_MEDIUM && mi_segment_has_free(segment)) { - mi_segment_insert_in_free_queue(segment, tld); - } - return segment; - } -} - - -// attempt to reclaim a particular segment (called from multi threaded free `alloc.c:mi_free_block_mt`) -bool _mi_segment_attempt_reclaim(mi_heap_t* heap, mi_segment_t* segment) { - if (mi_atomic_load_relaxed(&segment->thread_id) != 0) return false; // it is not abandoned - if (segment->subproc != heap->tld->segments.subproc) return false; // only reclaim within the same subprocess - if (!_mi_heap_memid_is_suitable(heap,segment->memid)) return false; // don't reclaim between exclusive and non-exclusive arena's - const long target = _mi_option_get_fast(mi_option_target_segments_per_thread); - if (target > 0 && (size_t)target <= heap->tld->segments.count) return false; // don't 
reclaim if going above the target count - - // don't reclaim more from a `free` call than half the current segments - // this is to prevent a pure free-ing thread to start owning too many segments - // (but not for out-of-arena segments as that is the main way to be reclaimed for those) - if (segment->memid.memkind == MI_MEM_ARENA && heap->tld->segments.reclaim_count * 2 > heap->tld->segments.count) { - return false; - } - if (_mi_arena_segment_clear_abandoned(segment)) { // atomically unabandon - mi_segment_t* res = mi_segment_reclaim(segment, heap, 0, NULL, &heap->tld->segments); - mi_assert_internal(res == segment); - return (res != NULL); - } - return false; -} - -void _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld) { - mi_segment_t* segment; - mi_arena_field_cursor_t current; - _mi_arena_field_cursor_init(heap, tld->subproc, true /* visit all, blocking */, ¤t); - while ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - _mi_arena_field_cursor_done(¤t); -} - - -static bool segment_count_is_within_target(mi_segments_tld_t* tld, size_t* ptarget) { - const size_t target = (size_t)mi_option_get_clamp(mi_option_target_segments_per_thread, 0, 1024); - if (ptarget != NULL) { *ptarget = target; } - return (target == 0 || tld->count < target); -} - -static long mi_segment_get_reclaim_tries(mi_segments_tld_t* tld) { - // limit the tries to 10% (default) of the abandoned segments with at least 8 and at most 1024 tries. - const size_t perc = (size_t)mi_option_get_clamp(mi_option_max_segment_reclaim, 0, 100); - if (perc <= 0) return 0; - const size_t total_count = mi_atomic_load_relaxed(&tld->subproc->abandoned_count); - if (total_count == 0) return 0; - const size_t relative_count = (total_count > 10000 ? (total_count / 100) * perc : (total_count * perc) / 100); // avoid overflow - long max_tries = (long)(relative_count <= 1 ? 1 : (relative_count > 1024 ? 1024 : relative_count)); - if (max_tries < 8 && total_count > 8) { max_tries = 8; } - return max_tries; -} - -static mi_segment_t* mi_segment_try_reclaim(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, bool* reclaimed, mi_segments_tld_t* tld) -{ - *reclaimed = false; - long max_tries = mi_segment_get_reclaim_tries(tld); - if (max_tries <= 0) return NULL; - - mi_segment_t* result = NULL; - mi_segment_t* segment = NULL; - mi_arena_field_cursor_t current; - _mi_arena_field_cursor_init(heap, tld->subproc, false /* non-blocking */, ¤t); - while (segment_count_is_within_target(tld,NULL) && (max_tries-- > 0) && ((segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL)) - { - mi_assert(segment->subproc == heap->tld->segments.subproc); // cursor only visits segments in our sub-process - segment->abandoned_visits++; - // todo: should we respect numa affinity for abondoned reclaim? perhaps only for the first visit? - // todo: an arena exclusive heap will potentially visit many abandoned unsuitable segments and use many tries - // Perhaps we can skip non-suitable ones in a better way? - bool is_suitable = _mi_heap_memid_is_suitable(heap, segment->memid); - bool all_pages_free; - bool has_page = mi_segment_check_free(segment,block_size,&all_pages_free); // try to free up pages (due to concurrent frees) - if (all_pages_free) { - // free the segment (by forced reclaim) to make it available to other threads. - // note1: we prefer to free a segment as that might lead to reclaiming another - // segment that is still partially used. 
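
[editor's note] The try-count in `mi_segment_get_reclaim_tries` above is pure arithmetic: roughly `perc` percent (default 10%) of the abandoned segments, computed overflow-safely, then clamped to at most 1024 and bumped to at least 8 when enough segments are abandoned. A standalone sketch mirroring that calculation (here `perc` is a plain parameter; the real code reads and clamps the `max_segment_reclaim` option first):

#include <assert.h>
#include <stddef.h>

static long reclaim_tries(size_t total_abandoned, size_t perc) {
  if (perc == 0 || total_abandoned == 0) return 0;
  // compute (total * perc) / 100 without overflowing for very large counts
  const size_t relative = (total_abandoned > 10000
                             ? (total_abandoned / 100) * perc
                             : (total_abandoned * perc) / 100);
  long max_tries = (long)(relative <= 1 ? 1 : (relative > 1024 ? 1024 : relative));
  if (max_tries < 8 && total_abandoned > 8) { max_tries = 8; }
  return max_tries;
}

int main(void) {
  assert(reclaim_tries(0, 10)       == 0);    // nothing abandoned, nothing to try
  assert(reclaim_tries(50, 10)      == 8);    // 10% would be 5, bumped to the minimum of 8
  assert(reclaim_tries(2000, 10)    == 200);  // plain 10%
  assert(reclaim_tries(2000000, 10) == 1024); // clamped at the maximum
  return 0;
}
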
- // note2: we could in principle optimize this by skipping reclaim and directly - // freeing but that would violate some invariants temporarily) - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - else if (has_page && segment->page_kind == page_kind && is_suitable) { - // found a free page of the right kind, or page of the right block_size with free space - // we return the result of reclaim (which is usually `segment`) as it might free - // the segment due to concurrent frees (in which case `NULL` is returned). - result = mi_segment_reclaim(segment, heap, block_size, reclaimed, tld); - break; - } - else if (segment->abandoned_visits > 3 && is_suitable) { - // always reclaim on 3rd visit to limit the abandoned segment count. - mi_segment_reclaim(segment, heap, 0, NULL, tld); - } - else { - // otherwise, mark it back as abandoned - // todo: reset delayed pages in the segment? - _mi_arena_segment_mark_abandoned(segment); - } - } - _mi_arena_field_cursor_done(¤t); - return result; -} - - -/* ----------------------------------------------------------- - Force abandon a segment that is in use by our thread ------------------------------------------------------------ */ - -// force abandon a segment -static void mi_segment_force_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) -{ - mi_assert_internal(segment->abandoned < segment->used); - mi_assert_internal(!segment->dont_free); - - // ensure the segment does not get free'd underneath us (so we can check if a page has been freed in `mi_page_force_abandon`) - segment->dont_free = true; - - // for all pages - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* page = &segment->pages[i]; - if (page->segment_in_use) { - // abandon the page if it is still in-use (this will free the page if possible as well (but not our segment)) - mi_assert_internal(segment->used > 0); - if (segment->used == segment->abandoned+1) { - // the last page.. abandon and return as the segment will be abandoned after this - // and we should no longer access it. - segment->dont_free = false; - _mi_page_force_abandon(page); - return; - } - else { - // abandon and continue - _mi_page_force_abandon(page); - } - } - } - segment->dont_free = false; - mi_assert(segment->used == segment->abandoned); - mi_assert(segment->used == 0); - if (segment->used == 0) { // paranoia - // all free now - mi_segment_free(segment, false, tld); - } - else { - // perform delayed purges - mi_pages_try_purge(false /* force? */, tld); - } -} - - -// try abandon segments. -// this should be called from `reclaim_or_alloc` so we know all segments are (about) fully in use. -static void mi_segments_try_abandon_to_target(mi_heap_t* heap, size_t target, mi_segments_tld_t* tld) { - if (target <= 1) return; - const size_t min_target = (target > 4 ? (target*3)/4 : target); // 75% - // todo: we should maintain a list of segments per thread; for now, only consider segments from the heap full pages - for (int i = 0; i < 64 && tld->count >= min_target; i++) { - mi_page_t* page = heap->pages[MI_BIN_FULL].first; - while (page != NULL && mi_page_is_huge(page)) { - page = page->next; - } - if (page==NULL) { - break; - } - mi_segment_t* segment = _mi_page_segment(page); - mi_segment_force_abandon(segment, tld); - mi_assert_internal(page != heap->pages[MI_BIN_FULL].first); // as it is just abandoned - } -} - -// try abandon segments. -// this should be called from `reclaim_or_alloc` so we know all segments are (about) fully in use. 
-static void mi_segments_try_abandon(mi_heap_t* heap, mi_segments_tld_t* tld) { - // we call this when we are about to add a fresh segment so we should be under our target segment count. - size_t target = 0; - if (segment_count_is_within_target(tld, &target)) return; - mi_segments_try_abandon_to_target(heap, target, tld); -} - -void mi_collect_reduce(size_t target_size) mi_attr_noexcept { - mi_collect(true); - mi_heap_t* heap = mi_heap_get_default(); - mi_segments_tld_t* tld = &heap->tld->segments; - size_t target = target_size / MI_SEGMENT_SIZE; - if (target == 0) { - target = (size_t)mi_option_get_clamp(mi_option_target_segments_per_thread, 1, 1024); - } - mi_segments_try_abandon_to_target(heap, target, tld); -} - -/* ----------------------------------------------------------- - Reclaim or allocate ------------------------------------------------------------ */ - -static mi_segment_t* mi_segment_reclaim_or_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t page_kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) -{ - mi_assert_internal(page_kind <= MI_PAGE_LARGE); - mi_assert_internal(block_size <= MI_LARGE_OBJ_SIZE_MAX); - - // try to abandon some segments to increase reuse between threads - mi_segments_try_abandon(heap,tld); - - // 1. try to reclaim an abandoned segment - bool reclaimed; - mi_segment_t* segment = mi_segment_try_reclaim(heap, block_size, page_kind, &reclaimed, tld); - mi_assert_internal(segment == NULL || _mi_arena_memid_is_suitable(segment->memid, heap->arena_id)); - if (reclaimed) { - // reclaimed the right page right into the heap - mi_assert_internal(segment != NULL && segment->page_kind == page_kind && page_kind <= MI_PAGE_LARGE); - return NULL; // pretend out-of-memory as the page will be in the page queue of the heap with available blocks - } - else if (segment != NULL) { - // reclaimed a segment with empty pages (of `page_kind`) in it - return segment; - } - // 2. otherwise allocate a fresh segment - return mi_segment_alloc(0, page_kind, page_shift, 0, heap->arena_id, tld, os_tld); -} - - -/* ----------------------------------------------------------- - Small page allocation ------------------------------------------------------------ */ - -static mi_page_t* mi_segment_find_free(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(mi_segment_has_free(segment)); - mi_assert_expensive(mi_segment_is_valid(segment, tld)); - for (size_t i = 0; i < segment->capacity; i++) { // TODO: use a bitmap instead of search? - mi_page_t* page = &segment->pages[i]; - if (!page->segment_in_use) { - bool ok = mi_segment_page_claim(segment, page, tld); - if (ok) return page; - } - } - mi_assert(false); - return NULL; -} - -// Allocate a page inside a segment. 
Requires that the page has free pages -static mi_page_t* mi_segment_page_alloc_in(mi_segment_t* segment, mi_segments_tld_t* tld) { - mi_assert_internal(mi_segment_has_free(segment)); - return mi_segment_find_free(segment, tld); -} - -static mi_page_t* mi_segment_page_try_alloc_in_queue(mi_heap_t* heap, mi_page_kind_t kind, mi_segments_tld_t* tld) { - // find an available segment the segment free queue - mi_segment_queue_t* const free_queue = mi_segment_free_queue_of_kind(kind, tld); - for (mi_segment_t* segment = free_queue->first; segment != NULL; segment = segment->next) { - if (_mi_arena_memid_is_suitable(segment->memid, heap->arena_id) && mi_segment_has_free(segment)) { - return mi_segment_page_alloc_in(segment, tld); - } - } - return NULL; -} - -static mi_page_t* mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, mi_page_kind_t kind, size_t page_shift, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_page_t* page = mi_segment_page_try_alloc_in_queue(heap, kind, tld); - if (page == NULL) { - // possibly allocate or reclaim a fresh segment - mi_segment_t* const segment = mi_segment_reclaim_or_alloc(heap, block_size, kind, page_shift, tld, os_tld); - if (segment == NULL) return NULL; // return NULL if out-of-memory (or reclaimed) - mi_assert_internal(segment->page_kind==kind); - mi_assert_internal(segment->used < segment->capacity); - mi_assert_internal(_mi_arena_memid_is_suitable(segment->memid, heap->arena_id)); - page = mi_segment_page_try_alloc_in_queue(heap, kind, tld); // this should now succeed - } - mi_assert_internal(page != NULL); - #if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN - // verify it is committed - mi_segment_raw_page_start(_mi_page_segment(page), page, NULL)[0] = 0; - #endif - return page; -} - -static mi_page_t* mi_segment_small_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - return mi_segment_page_alloc(heap, block_size, MI_PAGE_SMALL,MI_SMALL_PAGE_SHIFT,tld,os_tld); -} - -static mi_page_t* mi_segment_medium_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - return mi_segment_page_alloc(heap, block_size, MI_PAGE_MEDIUM, MI_MEDIUM_PAGE_SHIFT, tld, os_tld); -} - -/* ----------------------------------------------------------- - large page allocation ------------------------------------------------------------ */ - -static mi_page_t* mi_segment_large_page_alloc(mi_heap_t* heap, size_t block_size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_segment_t* segment = mi_segment_reclaim_or_alloc(heap,block_size,MI_PAGE_LARGE,MI_LARGE_PAGE_SHIFT,tld,os_tld); - if (segment == NULL) return NULL; - mi_page_t* page = mi_segment_find_free(segment, tld); - mi_assert_internal(page != NULL); -#if MI_DEBUG>=2 && !MI_TRACK_ENABLED // && !MI_TSAN - mi_segment_raw_page_start(segment, page, NULL)[0] = 0; -#endif - return page; -} - -static mi_page_t* mi_segment_huge_page_alloc(size_t size, size_t page_alignment, mi_arena_id_t req_arena_id, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) -{ - mi_segment_t* segment = mi_segment_alloc(size, MI_PAGE_HUGE, MI_SEGMENT_SHIFT + 1, page_alignment, req_arena_id, tld, os_tld); - if (segment == NULL) return NULL; - mi_assert_internal(mi_segment_page_size(segment) - segment->segment_info_size - (2*(MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= size); - #if MI_HUGE_PAGE_ABANDON - segment->thread_id = 0; // huge pages are immediately abandoned - mi_segments_track_size(-(long)segment->segment_size, tld); - #endif - mi_page_t* page = mi_segment_find_free(segment, tld); - mi_assert_internal(page != NULL); - mi_assert_internal(page->is_huge); - - // for huge pages we initialize the block_size as we may - // overallocate to accommodate large alignments. - size_t psize; - uint8_t* start = mi_segment_raw_page_start(segment, page, &psize); - page->block_size = psize; - - // reset the part of the page that will not be used; this can be quite large (close to MI_SEGMENT_SIZE) - if (page_alignment > 0 && segment->allow_decommit && page->is_committed) { - uint8_t* aligned_p = (uint8_t*)_mi_align_up((uintptr_t)start, page_alignment); - mi_assert_internal(_mi_is_aligned(aligned_p, page_alignment)); - mi_assert_internal(psize - (aligned_p - start) >= size); - uint8_t* decommit_start = start + sizeof(mi_block_t); // for the free list - ptrdiff_t decommit_size = aligned_p - decommit_start; - _mi_os_reset(decommit_start, decommit_size, os_tld->stats); // do not decommit as it may be in a region - } - - return page; -} - -#if MI_HUGE_PAGE_ABANDON -// free huge block from another thread -void _mi_segment_huge_page_free(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { - // huge page segments are always abandoned and can be freed immediately by any thread - mi_assert_internal(segment->page_kind==MI_PAGE_HUGE); - mi_assert_internal(segment == _mi_page_segment(page)); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id)==0); - - // claim it and free - mi_heap_t* heap = mi_heap_get_default(); // issue #221; don't use the internal get_default_heap as we need to ensure the thread is initialized. 
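
[editor's note] For an over-aligned huge page, the code above keeps the first block header committed and resets only the gap between it and the aligned payload. A standalone sketch of that pointer arithmetic, with a local `align_up` helper and illustrative addresses and sizes (the real code uses `_mi_align_up`, `sizeof(mi_block_t)` and `_mi_os_reset`):

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uintptr_t align_up(uintptr_t p, size_t align) {
  return (p + align - 1) & ~(uintptr_t)(align - 1);   // align must be a power of two
}

int main(void) {
  // Illustrative numbers: a raw page that must deliver a 1 MiB aligned block.
  const uintptr_t start          = 0x10010000;           // raw page start
  const size_t    page_alignment = 0x100000;             // 1 MiB
  const size_t    block_header   = 16;                   // stands in for sizeof(mi_block_t)

  const uintptr_t aligned_p   = align_up(start, page_alignment);
  const uintptr_t reset_start = start + block_header;    // keep room for the free-list link
  const size_t    reset_size  = (size_t)(aligned_p - reset_start);

  assert(aligned_p % page_alignment == 0);
  assert(aligned_p == 0x10100000);
  assert(reset_size == 0x10100000 - 0x10010010);
  // [reset_start, reset_start + reset_size) is the range handed to the OS reset;
  // the aligned payload starting at aligned_p stays untouched.
  return 0;
}
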
- // paranoia: if this it the last reference, the cas should always succeed - size_t expected_tid = 0; - if (mi_atomic_cas_strong_acq_rel(&segment->thread_id, &expected_tid, heap->thread_id)) { - mi_block_set_next(page, block, page->free); - page->free = block; - page->used--; - page->is_zero_init = false; - mi_assert(page->used == 0); - mi_tld_t* tld = heap->tld; - mi_segments_track_size((long)segment->segment_size, &tld->segments); - _mi_segment_page_free(page, true, &tld->segments); - } -#if (MI_DEBUG!=0) - else { - mi_assert_internal(false); - } -#endif -} - -#else -// reset memory of a huge block from another thread -void _mi_segment_huge_page_reset(mi_segment_t* segment, mi_page_t* page, mi_block_t* block) { - mi_assert_internal(segment->page_kind == MI_PAGE_HUGE); - mi_assert_internal(segment == _mi_page_segment(page)); - mi_assert_internal(page->used == 1); // this is called just before the free - mi_assert_internal(page->free == NULL); - if (segment->allow_decommit && page->is_committed) { - size_t usize = mi_usable_size(block); - if (usize > sizeof(mi_block_t)) { - usize = usize - sizeof(mi_block_t); - uint8_t* p = (uint8_t*)block + sizeof(mi_block_t); - _mi_os_reset(p, usize, &_mi_stats_main); - } - } -} -#endif - -/* ----------------------------------------------------------- - Page allocation ------------------------------------------------------------ */ - -mi_page_t* _mi_segment_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment, mi_segments_tld_t* tld, mi_os_tld_t* os_tld) { - mi_page_t* page; - if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) { - mi_assert_internal(_mi_is_power_of_two(page_alignment)); - mi_assert_internal(page_alignment >= MI_SEGMENT_SIZE); - //mi_assert_internal((MI_SEGMENT_SIZE % page_alignment) == 0); - if (page_alignment < MI_SEGMENT_SIZE) { page_alignment = MI_SEGMENT_SIZE; } - page = mi_segment_huge_page_alloc(block_size, page_alignment, heap->arena_id, tld, os_tld); - } - else if (block_size <= MI_SMALL_OBJ_SIZE_MAX) { - page = mi_segment_small_page_alloc(heap, block_size, tld, os_tld); - } - else if (block_size <= MI_MEDIUM_OBJ_SIZE_MAX) { - page = mi_segment_medium_page_alloc(heap, block_size, tld, os_tld); - } - else if (block_size <= MI_LARGE_OBJ_SIZE_MAX /* || mi_is_good_fit(block_size, MI_LARGE_PAGE_SIZE - sizeof(mi_segment_t)) */ ) { - page = mi_segment_large_page_alloc(heap, block_size, tld, os_tld); - } - else { - page = mi_segment_huge_page_alloc(block_size, page_alignment, heap->arena_id, tld, os_tld); - } - mi_assert_expensive(page == NULL || mi_segment_is_valid(_mi_page_segment(page),tld)); - mi_assert_internal(page == NULL || (mi_segment_page_size(_mi_page_segment(page)) - (MI_SECURE == 0 ? 
0 : _mi_os_page_size())) >= block_size); - // mi_segment_try_purge(tld); - mi_assert_internal(page == NULL || mi_page_not_in_queue(page, tld)); - mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); - return page; -} - - -/* ----------------------------------------------------------- - Visit blocks in a segment (only used for abandoned segments) ------------------------------------------------------------ */ - -static bool mi_segment_visit_page(mi_page_t* page, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - mi_heap_area_t area; - _mi_heap_area_init(&area, page); - if (!visitor(NULL, &area, NULL, area.block_size, arg)) return false; - if (visit_blocks) { - return _mi_heap_area_visit_blocks(&area, page, visitor, arg); - } - else { - return true; - } -} - -bool _mi_segment_visit_blocks(mi_segment_t* segment, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - for (size_t i = 0; i < segment->capacity; i++) { - mi_page_t* const page = &segment->pages[i]; - if (page->segment_in_use) { - if (heap_tag < 0 || (int)page->heap_tag == heap_tag) { - if (!mi_segment_visit_page(page, visit_blocks, visitor, arg)) return false; - } - } - } - return true; -} From 2084df3dde95aa2fb2bb73e8fc3eff2f7edc6662 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 12:20:54 -0800 Subject: [PATCH 049/264] add dedicated meta data allocation for threads and tld --- CMakeLists.txt | 1 + ide/vs2022/mimalloc-override.vcxproj | 1 + ide/vs2022/mimalloc-override.vcxproj.filters | 1 + ide/vs2022/mimalloc.vcxproj | 1 + ide/vs2022/mimalloc.vcxproj.filters | 1 + include/mimalloc/internal.h | 251 +++++++++---------- include/mimalloc/types.h | 32 ++- src/arena-meta.c | 156 ++++++++++++ src/arena.c | 90 +++---- src/heap.c | 50 ++-- src/init.c | 184 ++++++-------- src/os.c | 133 +++++----- src/page-map.c | 6 +- src/prim/windows/prim.c | 14 +- src/static.c | 1 + 15 files changed, 511 insertions(+), 411 deletions(-) create mode 100644 src/arena-meta.c diff --git a/CMakeLists.txt b/CMakeLists.txt index 5e47cfe6..6df4ba5a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,6 +49,7 @@ set(mi_sources src/alloc-aligned.c src/alloc-posix.c src/arena.c + src/arena-meta.c src/bitmap.c src/heap.c src/init.c diff --git a/ide/vs2022/mimalloc-override.vcxproj b/ide/vs2022/mimalloc-override.vcxproj index a5d5c34c..eebc4d8a 100644 --- a/ide/vs2022/mimalloc-override.vcxproj +++ b/ide/vs2022/mimalloc-override.vcxproj @@ -236,6 +236,7 @@ + diff --git a/ide/vs2022/mimalloc-override.vcxproj.filters b/ide/vs2022/mimalloc-override.vcxproj.filters index 60c7a1fb..0e63822c 100644 --- a/ide/vs2022/mimalloc-override.vcxproj.filters +++ b/ide/vs2022/mimalloc-override.vcxproj.filters @@ -58,6 +58,7 @@ Sources + diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index e9a4a339..d8cc25b1 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -214,6 +214,7 @@ + false diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index a47efddd..7fc4ba9c 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,6 +58,7 @@ Sources + diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 28eca4bb..4c8256a0 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -27,8 +27,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning -#pragma warning(disable:28159) // don't use GetVersion -#pragma warning(disable:4996) // don't use GetVersion #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) #define mi_decl_align(a) __declspec(align(a)) @@ -58,42 +56,52 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_decl_externc #endif +// "libc.c" +#include +void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); +void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); +char _mi_toupper(char c); +int _mi_strnicmp(const char* s, const char* t, size_t n); +void _mi_strlcpy(char* dest, const char* src, size_t dest_size); +void _mi_strlcat(char* dest, const char* src, size_t dest_size); +size_t _mi_strlen(const char* s); +size_t _mi_strnlen(const char* s, size_t max_len); +bool _mi_getenv(const char* name, char* result, size_t result_size); // "options.c" -void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); -void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); -void _mi_warning_message(const char* fmt, ...); -void _mi_verbose_message(const char* fmt, ...); -void _mi_trace_message(const char* fmt, ...); -void _mi_output_message(const char* fmt, ...); -void _mi_options_init(void); -long _mi_option_get_fast(mi_option_t option); -void _mi_error_message(int err, const char* fmt, ...); +void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); +void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); +void _mi_warning_message(const char* fmt, ...); +void _mi_verbose_message(const char* fmt, ...); +void _mi_trace_message(const char* fmt, ...); +void _mi_output_message(const char* fmt, ...); +void _mi_options_init(void); +long _mi_option_get_fast(mi_option_t option); +void _mi_error_message(int err, const char* fmt, ...); // random.c -void _mi_random_init(mi_random_ctx_t* ctx); -void _mi_random_init_weak(mi_random_ctx_t* ctx); -void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); -void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); -uintptr_t _mi_random_next(mi_random_ctx_t* ctx); -uintptr_t _mi_heap_random_next(mi_heap_t* heap); -uintptr_t _mi_os_random_weak(uintptr_t extra_seed); +void _mi_random_init(mi_random_ctx_t* ctx); +void _mi_random_init_weak(mi_random_ctx_t* ctx); +void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); +void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); +uintptr_t _mi_random_next(mi_random_ctx_t* ctx); +uintptr_t _mi_heap_random_next(mi_heap_t* heap); +uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c extern mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_cache_align const mi_page_t _mi_page_empty; -void _mi_process_load(void); +void _mi_process_load(void); void mi_cdecl _mi_process_done(void); -bool _mi_is_redirected(void); -bool _mi_allocator_init(const char** message); -void _mi_allocator_done(void); -bool _mi_is_main_thread(void); -size_t _mi_current_thread_count(void); -bool _mi_preloading(void); // true while the C runtime is not initialized yet -void _mi_thread_done(mi_heap_t* heap); -void _mi_thread_data_collect(void); -void _mi_tld_init(mi_tld_t* tld, mi_heap_t* 
bheap); +bool _mi_is_redirected(void); +bool _mi_allocator_init(const char** message); +void _mi_allocator_done(void); +bool _mi_is_main_thread(void); +size_t _mi_current_thread_count(void); +bool _mi_preloading(void); // true while the C runtime is not initialized yet +void _mi_thread_done(mi_heap_t* heap); +mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; size_t _mi_thread_seq_id(void) mi_attr_noexcept; @@ -103,116 +111,94 @@ mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); void _mi_heap_guarded_init(mi_heap_t* heap); // os.c -void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); -void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); -void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); -void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); +void _mi_os_init(void); // called from process init +void* _mi_os_alloc(size_t size, mi_memid_t* memid); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid); +void _mi_os_free(void* p, size_t size, mi_memid_t memid); +void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); -size_t _mi_os_page_size(void); -size_t _mi_os_good_alloc_size(size_t size); -bool _mi_os_has_overcommit(void); -bool _mi_os_has_virtual_reserve(void); -size_t _mi_os_virtual_address_bits(void); +size_t _mi_os_page_size(void); +size_t _mi_os_good_alloc_size(size_t size); +bool _mi_os_has_overcommit(void); +bool _mi_os_has_virtual_reserve(void); +size_t _mi_os_virtual_address_bits(void); -bool _mi_os_reset(void* addr, size_t size, mi_stats_t* tld_stats); -bool _mi_os_commit(void* p, size_t size, bool* is_zero, mi_stats_t* stats); -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_protect(void* addr, size_t size); -bool _mi_os_unprotect(void* addr, size_t size); -bool _mi_os_purge(void* p, size_t size, mi_stats_t* stats); -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats); +bool _mi_os_reset(void* addr, size_t size); +bool _mi_os_commit(void* p, size_t size, bool* is_zero); +bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_protect(void* addr, size_t size); +bool _mi_os_unprotect(void* addr, size_t size); +bool _mi_os_purge(void* p, size_t size); +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats); -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* tld_stats); +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); -void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); -bool _mi_os_use_large_page(size_t size, size_t alignment); -size_t _mi_os_large_page_size(void); +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); +bool _mi_os_use_large_page(size_t size, size_t alignment); +size_t _mi_os_large_page_size(void); -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); +void* 
_mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); // arena.c mi_arena_id_t _mi_arena_id_none(void); -void _mi_arena_init(void); -void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid, mi_stats_t* stats); -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld); -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); -bool _mi_arena_contains(const void* p); -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats); -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats); +void _mi_arena_init(void); +void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid); +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); +bool _mi_arena_contains(const void* p); +void _mi_arenas_collect(bool force_purge); +void _mi_arena_unsafe_destroy_all(void); -mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); -void _mi_arena_page_free(mi_page_t* page); -void _mi_arena_page_abandon(mi_page_t* page); -void _mi_arena_page_unabandon(mi_page_t* page); -bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); +mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); +void _mi_arena_page_free(mi_page_t* page); +void _mi_arena_page_abandon(mi_page_t* page); +void _mi_arena_page_unabandon(mi_page_t* page); +bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); -bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page); -void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap); - -void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid); -void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size); - -/* -typedef struct mi_arena_field_cursor_s { // abstract struct - size_t os_list_count; // max entries to visit in the OS abandoned list - size_t start; // start arena idx (may need to be wrapped) - size_t end; // end arena idx (exclusive, may need to be wrapped) - size_t bitmap_idx; // current bit idx for an arena - mi_subproc_t* subproc; // only visit blocks in this sub-process - bool visit_all; // ensure all abandoned blocks are seen (blocking) - bool hold_visit_lock; // if the subproc->abandoned_os_visit_lock is held -} mi_arena_field_cursor_t; -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current); -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous); -void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current); -*/ +// arena-meta.c +void* _mi_meta_zalloc( size_t size, mi_memid_t* memid ); +void _mi_meta_free(void* p, size_t size, mi_memid_t memid); // "page-map.c" -bool _mi_page_map_init(void); -void _mi_page_map_register(mi_page_t* page); -void _mi_page_map_unregister(mi_page_t* page); - +bool _mi_page_map_init(void); +void _mi_page_map_register(mi_page_t* page); 
+void _mi_page_map_unregister(mi_page_t* page); // "page.c" -void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; +void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; -void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks -void _mi_page_unfull(mi_page_t* page); -void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq); // free the page -void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... -void _mi_page_force_abandon(mi_page_t* page); +void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks +void _mi_page_unfull(mi_page_t* page); +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq); // free the page +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... +void _mi_page_force_abandon(mi_page_t* page); +void _mi_heap_collect_retired(mi_heap_t* heap, bool force); -// void _mi_heap_delayed_free_all(mi_heap_t* heap); -// bool _mi_heap_delayed_free_partial(mi_heap_t* heap); -void _mi_heap_collect_retired(mi_heap_t* heap, bool force); +size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); +void _mi_deferred_free(mi_heap_t* heap, bool force); -size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); -void _mi_deferred_free(mi_heap_t* heap, bool force); +void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_init(mi_heap_t* heap, mi_page_t* page); -void _mi_page_free_collect(mi_page_t* page,bool force); -// void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page); // callback from segments -void _mi_page_init(mi_heap_t* heap, mi_page_t* page); - -size_t _mi_bin_size(uint8_t bin); // for stats -uint8_t _mi_bin(size_t size); // for stats +size_t _mi_bin_size(uint8_t bin); // for stats +uint8_t _mi_bin(size_t size); // for stats // "heap.c" -void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); -void _mi_heap_destroy_pages(mi_heap_t* heap); -void _mi_heap_collect_abandon(mi_heap_t* heap); -void _mi_heap_set_default_direct(mi_heap_t* heap); -bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); -void _mi_heap_unsafe_destroy_all(void); -mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); -void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); -bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); -void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); +void _mi_heap_destroy_pages(mi_heap_t* heap); +void _mi_heap_collect_abandon(mi_heap_t* heap); +void _mi_heap_set_default_direct(mi_heap_t* heap); +bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); +void _mi_heap_unsafe_destroy_all(void); +mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); +void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); // "stats.c" -void _mi_stats_done(mi_stats_t* stats); +void 
_mi_stats_done(mi_stats_t* stats); mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); @@ -226,20 +212,6 @@ void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, siz void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); -// bool _mi_free_delayed_block(mi_block_t* block); - - -// "libc.c" -#include -void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); -void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); -char _mi_toupper(char c); -int _mi_strnicmp(const char* s, const char* t, size_t n); -void _mi_strlcpy(char* dest, const char* src, size_t dest_size); -void _mi_strlcat(char* dest, const char* src, size_t dest_size); -size_t _mi_strlen(const char* s); -size_t _mi_strnlen(const char* s, size_t max_len); -bool _mi_getenv(const char* name, char* result, size_t result_size); #if MI_DEBUG>1 bool _mi_page_is_valid(mi_page_t* page); @@ -449,9 +421,6 @@ static inline uintptr_t _mi_ptr_cookie(const void* p) { return ((uintptr_t)p ^ _mi_heap_main.cookie); } -static inline mi_tld_t* _mi_tld(void) { - return mi_heap_get_default()->tld; -} /* ----------------------------------------------------------- Pages @@ -908,6 +877,16 @@ static inline mi_memid_t _mi_memid_create_os(void* base, size_t size, bool commi return memid; } +static inline mi_memid_t _mi_memid_create_meta(void* mpage, size_t block_idx, size_t block_count) { + mi_memid_t memid = _mi_memid_create(MI_MEM_META); + memid.mem.meta.meta_page = mpage; + memid.mem.meta.block_index = (uint32_t)block_idx; + memid.mem.meta.block_count = (uint32_t)block_count; + memid.initially_committed = true; + memid.initially_zero = true; + memid.is_pinned = true; + return memid; +} // ------------------------------------------------------------------- // Fast "random" shuffle @@ -937,13 +916,13 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) { // Optimize numa node access for the common case (= one node) // ------------------------------------------------------------------- -int _mi_os_numa_node_get(mi_os_tld_t* tld); +int _mi_os_numa_node_get(void); size_t _mi_os_numa_node_count_get(void); extern _Atomic(size_t) _mi_numa_node_count; -static inline int _mi_os_numa_node(mi_os_tld_t* tld) { +static inline int _mi_os_numa_node(void) { if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; } - else return _mi_os_numa_node_get(tld); + else return _mi_os_numa_node_get(); } static inline size_t _mi_os_numa_node_count(void) { const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index e10786a0..d0a77c5f 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -155,6 +155,7 @@ typedef enum mi_memkind_e { MI_MEM_NONE, // not allocated MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) + MI_MEM_META, // allocated with the meta data allocator MI_MEM_OS, // allocated from the OS MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. 
using `mremap`) @@ -165,6 +166,11 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) { return (memkind >= MI_MEM_OS && memkind <= MI_MEM_OS_REMAP); } +static inline bool mi_memkind_needs_no_free(mi_memkind_t memkind) { + return (memkind <= MI_MEM_STATIC); +} + + typedef struct mi_memid_os_info { void* base; // actual base address of the block (used for offset aligned allocations) size_t size; // allocated full size @@ -178,10 +184,17 @@ typedef struct mi_memid_arena_info { bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; +typedef struct mi_memid_meta_info { + void* meta_page; // meta-page that contains the block + uint32_t block_index; // block index in the meta-data page + uint32_t block_count; // allocated blocks +} mi_memid_meta_info_t; + typedef struct mi_memid_s { union { mi_memid_os_info_t os; // only used for MI_MEM_OS mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA + mi_memid_meta_info_t meta; // only used for MI_MEM_META } mem; bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) bool initially_committed;// `true` if the memory was originally allocated as committed @@ -190,6 +203,14 @@ typedef struct mi_memid_s { } mi_memid_t; +static inline bool mi_memid_is_os(mi_memid_t memid) { + return mi_memkind_is_os(memid.memkind); +} + +static inline bool mi_memid_needs_no_free(mi_memid_t memid) { + return mi_memkind_needs_no_free(memid.memkind); +} + // ------------------------------------------------------ // Mimalloc pages contain allocated blocks // ------------------------------------------------------ @@ -399,7 +420,8 @@ struct mi_heap_s { size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread - bool allow_page_reclaim; // `true` if this heap can reclaim abandoned pages + mi_memid_t memid; // provenance of the heap struct itseft (meta or os) + bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types #if MI_GUARDED @@ -560,12 +582,6 @@ struct mi_subproc_s { typedef int64_t mi_msecs_t; -// OS thread local data -typedef struct mi_os_tld_s { - size_t region_idx; // start point for next allocation - mi_stats_t* stats; // points to tld stats -} mi_os_tld_t; - // Thread local data struct mi_tld_s { unsigned long long heartbeat; // monotonic heartbeat count @@ -573,9 +589,9 @@ struct mi_tld_s { mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) mi_subproc_t* subproc; // sub-process this thread belongs to. size_t tseq; // thread sequence id + mi_memid_t memid; // provenance of the tld memory itself (meta or OS) bool recurse; // true if deferred was called; used to prevent infinite recursion. 
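
[editor's note] The new `MI_MEM_META` kind slots in between the static kinds (never individually freed) and the OS kinds, and `mi_memkind_needs_no_free` relies purely on that enum ordering; this is also why `mi_heap_t` and `mi_tld_t` now carry a `memid` recording the provenance of their own storage. A standalone sketch of the ordering test, using a shortened enum with illustrative names:

#include <assert.h>
#include <stdbool.h>

typedef enum kind_e {
  KIND_NONE,      // not allocated
  KIND_EXTERNAL,  // provided externally, not owned
  KIND_STATIC,    // static area, never freed
  KIND_META,      // meta-data allocator
  KIND_OS         // OS allocation
} kind_t;

// Same shape of test as `mi_memkind_needs_no_free`: everything up to and
// including the static kind needs no explicit free; meta and OS kinds do.
static bool kind_needs_no_free(kind_t k) {
  return (k <= KIND_STATIC);
}

int main(void) {
  assert(kind_needs_no_free(KIND_STATIC));
  assert(!kind_needs_no_free(KIND_META));
  assert(!kind_needs_no_free(KIND_OS));
  return 0;
}
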
bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) - mi_os_tld_t os; // os tld mi_stats_t stats; // statistics }; diff --git a/src/arena-meta.c b/src/arena-meta.c new file mode 100644 index 00000000..0fb4dfa5 --- /dev/null +++ b/src/arena-meta.c @@ -0,0 +1,156 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + We have a special "mini" allocator just for allocation of meta-data like + the heap (`mi_heap_t`) or thread-local data (`mi_tld_t`). + + We reuse the bitmap of the arena's for allocation of 64b blocks inside + an arena slice (64KiB). + We always ensure that meta data is zero'd (we zero on `free`) +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "bitmap.h" + +/* ----------------------------------------------------------- + Meta data allocation +----------------------------------------------------------- */ + +#define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE +#define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN + +#define MI_META_BLOCK_SIZE (64) +#define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE +#define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 +#define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE) + +typedef struct mi_meta_page_s { + _Atomic(struct mi_meta_page_s*) next; // a linked list of meta-data pages (never released) + mi_memid_t memid; // provenance of the meta-page memory itself + mi_bitmap_t blocks_free; // a small bitmap with 1 bit per block. +} mi_meta_page_t; + +static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_VAR_INIT(NULL); + + +#if MI_DEBUG > 1 +static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) { + mi_meta_page_t* mpage = (mi_meta_page_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN); + if (block_idx != NULL) { + *block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE; + } + return mpage; +} +#endif + +static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) { + return mi_atomic_load_ptr_acquire(mi_meta_page_t, &mpage->next); +} + +static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { + mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); + mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE); + void* p = ((uint8_t*)mpage + (block_idx * MI_META_BLOCK_SIZE)); + mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL)); + return p; +} + +// allocate a fresh meta page and add it to the global list. 
+static mi_meta_page_t* mi_meta_page_zalloc(void) { + // allocate a fresh arena slice + mi_memid_t memid; + mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, + true /* commit*/, true /* allow large */, + _mi_arena_id_none(), 0 /* tseq */, &memid ); + if (mpage == NULL) return NULL; + mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); + if (!memid.initially_zero) { + _mi_memzero_aligned(mpage, MI_ARENA_SLICE_SIZE); + } + + // initialize the page + mpage->memid = memid; + mi_bitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); + const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); + const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); + mi_assert_internal(info_blocks < MI_META_BLOCKS_PER_PAGE); + mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); + + // push atomically in front of the meta page list + // (note: there is no ABA issue since we never free meta-pages) + mi_meta_page_t* old = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages); + do { + mi_atomic_store_ptr_release(mi_meta_page_t, &mpage->next, old); + } while(!mi_atomic_cas_ptr_weak_acq_rel(mi_meta_page_t,&mi_meta_pages,&old,mpage)); + return mpage; +} + + +// allocate meta-data +void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) +{ + mi_assert_internal(pmemid != NULL); + size = _mi_align_up(size,MI_META_BLOCK_SIZE); + if (size == 0 || size > MI_META_MAX_SIZE) return NULL; + const size_t block_count = _mi_divide_up(size,MI_META_BLOCK_SIZE); + mi_assert_internal(block_count > 0 && block_count < MI_BCHUNK_BITS); + mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages); + mi_meta_page_t* mpage = mpage0; + while (mpage != NULL) { + size_t block_idx; + if (mi_bitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + // found and claimed `block_count` blocks + *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); + return mi_meta_block_start(mpage,block_idx); + } + else { + mpage = mi_meta_page_next(mpage); + } + } + // failed to find space in existing pages + if (mi_atomic_load_ptr_acquire(mi_meta_page_t,&mi_meta_pages) != mpage0) { + // the page list was updated by another thread in the meantime, retry + return _mi_meta_zalloc(size,pmemid); + } + // otherwise, allocate a fresh metapage and try once more + mpage = mi_meta_page_zalloc(); + if (mpage != NULL) { + size_t block_idx; + if (mi_bitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + // found and claimed `block_count` blocks + *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); + return mi_meta_block_start(mpage,block_idx); + } + } + // if all this failed, allocate from the OS + return _mi_os_alloc(size, pmemid); +} + +// free meta-data +void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { + if (p==NULL) return; + if (memid.memkind == MI_MEM_META) { + mi_assert_internal(_mi_divide_up(size, MI_META_BLOCK_SIZE) == memid.mem.meta.block_count); + const size_t block_count = memid.mem.meta.block_count; + const size_t block_idx = memid.mem.meta.block_index; + mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page; + mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage); + mi_assert_internal(block_idx + block_count < MI_META_BLOCKS_PER_PAGE); + mi_assert_internal(mi_bitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); + 
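
[editor's note] The allocation path in `_mi_meta_zalloc` above is a first-fit scan over a per-page bitmap of 64-byte blocks: find a run of free bits, claim it, otherwise move to the next meta page (or allocate a fresh one, or fall back to the OS). A miniature, self-contained model of that scan over a single 64-bit word, with a set bit meaning "block is free" (the real bitmap covers 1024 blocks and uses `mi_bitmap_try_find_and_clearN`):

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// Try to claim `n` consecutive free blocks; clears their bits on success.
static bool claim_blocks(uint64_t* bitmap, size_t n, size_t* block_idx) {
  assert(n >= 1 && n <= 64);
  const uint64_t mask = (n == 64 ? ~UINT64_C(0) : ((UINT64_C(1) << n) - 1));
  for (size_t i = 0; i + n <= 64; i++) {
    if (((*bitmap >> i) & mask) == mask) {   // n free blocks starting at i?
      *bitmap &= ~(mask << i);               // claim them
      *block_idx = i;
      return true;
    }
  }
  return false;  // caller would allocate a fresh meta page or fall back to the OS
}

int main(void) {
  uint64_t bitmap = ~UINT64_C(0);   // all 64 blocks free
  size_t idx;
  assert(claim_blocks(&bitmap, 3, &idx) && idx == 0);   // first fit at block 0
  assert(claim_blocks(&bitmap, 2, &idx) && idx == 3);   // next fit right after
  bitmap = 0;                                           // page exhausted
  assert(!claim_blocks(&bitmap, 1, &idx));
  return 0;
}
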
// we zero on free (and on the initial page allocation) so we don't need a "dirty" map + _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); + mi_bitmap_clearN(&mpage->blocks_free, block_idx, block_count); + } + else if (mi_memid_is_os(memid)) { + _mi_os_free(p, size, memid); + } + else { + mi_assert_internal(mi_memid_needs_no_free(memid)); + } +} diff --git a/src/arena.c b/src/arena.c index fa7d53ed..2558165a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -214,7 +214,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // not fully committed: commit the full range and set the commit bits // (this may race and we may double-commit which is fine) bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero, NULL)) { + if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) { memid->initially_committed = false; } else { @@ -364,14 +364,13 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are static mi_decl_noinline void* mi_arena_try_find_free( size_t slice_count, size_t alignment, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) + mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); if (alignment > MI_ARENA_SLICE_ALIGN) return NULL; // search arena's - const size_t tseq = tld->tseq; mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) { void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); @@ -385,14 +384,14 @@ static mi_decl_noinline void* mi_arena_try_find_free( static mi_decl_noinline void* mi_arena_try_alloc( size_t slice_count, size_t alignment, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) + mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; again: // try to find free slices in the arena's - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); + p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; // did we need a specific arena? 
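
[editor's note] A companion to the claim sketch above for the free path: under the same set-bit-means-free convention used by the allocation scan, returning blocks means zeroing their memory first (zero-on-free avoids a separate "dirty" map, as the comment in the hunk notes) and then marking their bits as free again. This is an illustrative standalone model under that assumed convention, not the patch's `_mi_meta_free` itself:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>
#include <string.h>

// Free `n` blocks starting at `block_idx`: zero their contents, then hand
// the bits back to the free map.
static void release_blocks(uint64_t* bitmap, uint8_t* page, size_t block_idx, size_t n) {
  const size_t block_size = 64;                               // MI_META_BLOCK_SIZE in the patch
  memset(page + block_idx * block_size, 0, n * block_size);   // zero on free
  const uint64_t mask = (n == 64 ? ~UINT64_C(0) : ((UINT64_C(1) << n) - 1));
  assert((*bitmap & (mask << block_idx)) == 0);               // blocks must currently be claimed
  *bitmap |= (mask << block_idx);                             // mark them free again
}

int main(void) {
  uint8_t page[64 * 64];                 // one toy "meta page" of 64 blocks of 64 bytes
  uint64_t bitmap = ~UINT64_C(0) << 3;   // blocks 0..2 are currently claimed
  memset(page, 0xAA, 3 * 64);            // pretend they hold meta data
  release_blocks(&bitmap, page, 0, 3);
  assert(bitmap == ~UINT64_C(0));        // all blocks free again
  assert(page[0] == 0 && page[191] == 0);// and their contents are zeroed
  return 0;
}
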
@@ -406,7 +405,7 @@ again: if (ok) { // and try allocate in there mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); + p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; } } @@ -423,7 +422,7 @@ again: static void* mi_arena_os_alloc_aligned( size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) + mi_arena_id_t req_arena_id, mi_memid_t* memid) { // if we cannot use OS allocation, return NULL if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { @@ -432,10 +431,10 @@ static void* mi_arena_os_alloc_aligned( } if (align_offset > 0) { - return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, &tld->stats); + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid); } else { - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, &tld->stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); } } @@ -444,9 +443,9 @@ static void* mi_arena_os_alloc_aligned( void* _mi_arena_alloc_aligned( size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) + mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { - mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(memid != NULL); mi_assert_internal(size > 0); // *memid = _mi_memid_none(); @@ -459,18 +458,18 @@ void* _mi_arena_alloc_aligned( alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { const size_t slice_count = mi_slice_count_of_size(size); - void* p = mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, memid, tld); + void* p = mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; } // fall back to the OS - void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid, tld); + void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid); return p; } -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld) +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { - return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); + return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, tseq, memid); } @@ -566,7 +565,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { - page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, tld->tseq, &memid); } // otherwise fall back to the OS @@ -574,10 +573,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz if (os_align) { // note: slice_count already includes the page mi_assert_internal(slice_count >= 
mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid); } else { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid); } } @@ -725,7 +724,7 @@ void _mi_arena_page_free(mi_page_t* page) { #endif _mi_page_map_unregister(page); - _mi_arena_free(page, 1, 1, page->memid, NULL); + _mi_arena_free(page, 1, 1, page->memid); } /* ----------------------------------------------------------- @@ -831,16 +830,15 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { /* ----------------------------------------------------------- Arena free ----------------------------------------------------------- */ -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats); -static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); +static void mi_arenas_try_purge(bool force, bool visit_all); -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid) { mi_assert_internal(size > 0); mi_assert_internal(committed_size <= size); if (p==NULL) return; if (size==0) return; const bool all_committed = (committed_size == size); - if (stats==NULL) { stats = &_mi_stats_main; } // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
mi_track_mem_undefined(p, size); @@ -851,7 +849,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) _mi_stat_decrease(&_mi_stats_main.committed, committed_size); } - _mi_os_free(p, size, memid, stats); + _mi_os_free(p, size, memid); } else if (memid.memkind == MI_MEM_ARENA) { // allocated in an arena @@ -894,7 +892,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } */ // (delay) purge the entire range - mi_arena_schedule_purge(arena, slice_index, slice_count, stats); + mi_arena_schedule_purge(arena, slice_index, slice_count); } // and make it available to others again @@ -904,13 +902,16 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi return; }; } + else if (memid.memkind == MI_MEM_META) { + _mi_meta_free(p, size, memid); + } else { // arena was none, external, or static; nothing to do - mi_assert_internal(memid.memkind < MI_MEM_OS); + mi_assert_internal(mi_memid_needs_no_free(memid)); } // purge expired decommits - mi_arenas_try_purge(false, false, stats); + mi_arenas_try_purge(false, false); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` @@ -924,7 +925,7 @@ static void mi_arenas_unsafe_destroy(void) { mi_lock_done(&arena->abandoned_visit_lock); if (mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main); + _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); } } } @@ -935,15 +936,15 @@ static void mi_arenas_unsafe_destroy(void) { } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +void _mi_arenas_collect(bool force_purge) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { +void _mi_arena_unsafe_destroy_all(void) { mi_arenas_unsafe_destroy(); - _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas + _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } // Is a pointer inside any of our arenas? 
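For orientation, a hedged sketch of how a caller is expected to use the updated `_mi_arena_free` above (the wrapper name is illustrative): the dispatch is driven entirely by the `mi_memid_t`, so no `mi_stats_t*` needs to be threaded through anymore.

static void my_release_example(void* p, size_t size, mi_memid_t memid) {
  // `_mi_arena_free` routes the memory back based on `memid.memkind`:
  // OS memory goes to `_mi_os_free`, arena slices are marked free again
  // (and scheduled for a delayed purge), meta-data blocks go to
  // `_mi_meta_free`, and static/external memory needs no free at all.
  _mi_arena_free(p, size, size /* fully committed */, memid);
}
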
@@ -1036,7 +1037,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // commit & zero if needed bool is_zero = memid.initially_zero; if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_slices(info_slices), NULL, &_mi_stats_main); + _mi_os_commit(arena, mi_size_of_slices(info_slices), NULL); } if (!is_zero) { _mi_memzero(arena, mi_size_of_slices(info_slices)); @@ -1096,11 +1097,11 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc if (arena_id != NULL) *arena_id = _mi_arena_id_none(); size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one slice mi_memid_t memid; - void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { - _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; } @@ -1219,7 +1220,7 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { - _mi_os_free(p, hsize, memid, &_mi_stats_main); + _mi_os_free(p, hsize, memid); return ENOMEM; } return 0; @@ -1281,14 +1282,14 @@ static long mi_arena_purge_delay(void) { // reset or decommit in an arena and update the committed/decommit bitmaps // assumes we own the area (i.e. slices_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats) { +static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices) { mi_assert_internal(!arena->memid.is_pinned); const size_t size = mi_size_of_slices(slices); void* const p = mi_arena_slice_start(arena, slice_index); bool needs_recommit; if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slices)) { // all slices are committed, we can purge freely - needs_recommit = _mi_os_purge(p, size, stats); + needs_recommit = _mi_os_purge(p, size); } else { // some slices are not committed -- this can happen when a partially committed slice is freed @@ -1296,7 +1297,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), // and also undo the decommit stats (as it was already adjusted) mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */); if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } } @@ -1312,13 +1313,13 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices, // Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. 
// Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices, mi_stats_t* stats) { +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices) { const long delay = mi_arena_purge_delay(); if (delay < 0) return; // is purging allowed at all? if (_mi_preloading() || delay == 0) { // decommit directly - mi_arena_purge(arena, slice_index, slices, stats); + mi_arena_purge(arena, slice_index, slices); } else { // schedule decommit @@ -1327,14 +1328,13 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ } -static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { +static void mi_arenas_try_purge(bool force, bool visit_all) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); if (max_arena == 0) return; // _mi_error_message(EFAULT, "purging not yet implemented\n"); - MI_UNUSED(stats); MI_UNUSED(visit_all); MI_UNUSED(force); } @@ -1385,7 +1385,7 @@ void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { if (p != NULL) return p; // or fall back to the OS - p = _mi_os_alloc(size, memid, &_mi_stats_main); + p = _mi_os_alloc(size, memid); if (p == NULL) return NULL; // zero the OS memory if needed @@ -1398,7 +1398,7 @@ void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid, &_mi_stats_main); + _mi_os_free(p, size, memid); } else { mi_assert(memid.memkind == MI_MEM_STATIC); diff --git a/src/heap.c b/src/heap.c index 3bf8b976..d2914361 100644 --- a/src/heap.c +++ b/src/heap.c @@ -119,36 +119,31 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) _mi_deferred_free(heap, force); // python/cpython#112532: we may be called from a thread that is not the owner of the heap - const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); + // const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); // note: never reclaim on collect but leave it to threads that need storage to reclaim - if ( - #ifdef NDEBUG - collect == MI_FORCE - #else - collect >= MI_FORCE - #endif - && is_main_thread && mi_heap_is_backing(heap) && heap->allow_page_reclaim) - { - // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. - // if all memory is freed by now, all segments should be freed. - // note: this only collects in the current subprocess - _mi_arena_reclaim_all_abandoned(heap); - } + //if ( + //#ifdef NDEBUG + // collect == MI_FORCE + //#else + // collect >= MI_FORCE + //#endif + // && is_main_thread && mi_heap_is_backing(heap) && heap->allow_page_reclaim) + //{ + // // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. + // // if all memory is freed by now, all segments should be freed. 
+ // // note: this only collects in the current subprocess + // _mi_arena_reclaim_all_abandoned(heap); + //} // collect retired pages _mi_heap_collect_retired(heap, force); // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - - // if forced, collect thread data cache on program-exit (or shared library unload) - if (force && is_main_thread && mi_heap_is_backing(heap)) { - _mi_thread_data_collect(); // collect thread data cache - } - + // collect arenas (this is program wide so don't force purges on abandonment of threads) - _mi_arenas_collect(collect == MI_FORCE /* force purge? */, &heap->tld->stats); + _mi_arenas_collect(collect == MI_FORCE /* force purge? */); } void _mi_heap_collect_abandon(mi_heap_t* heap) { @@ -187,24 +182,25 @@ mi_heap_t* mi_heap_get_backing(void) { return bheap; } -void _mi_heap_init(mi_heap_t* heap, mi_tld_t* tld, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = tld; + heap->tld = _mi_tld(); heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->tag = tag; - if (tld->is_in_threadpool) { + if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. // (but abandoning is good in this case) heap->allow_page_reclaim = false; } - if (heap == tld->heap_backing) { + if (heap->tld->heap_backing == NULL) { + heap->tld->heap_backing = heap; // first heap becomes the backing heap _mi_random_init(&heap->random); } else { - _mi_random_split(&tld->heap_backing->random, &heap->random); + _mi_random_split(&heap->tld->heap_backing->random, &heap->random); } heap->cookie = _mi_heap_random_next(heap) | 1; heap->keys[0] = _mi_heap_random_next(heap); @@ -220,7 +216,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? if (heap == NULL) return NULL; mi_assert(heap_tag >= 0 && heap_tag < 256); - _mi_heap_init(heap, bheap->tld, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */); + _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? 
*/, (uint8_t)heap_tag /* heap tag */); return heap; } diff --git a/src/init.c b/src/init.c index b66efc69..f09821b4 100644 --- a/src/init.c +++ b/src/init.c @@ -96,6 +96,8 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- +#define MI_MEMID_STATIC {{{0}},true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } + mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free @@ -107,6 +109,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next + MI_MEMID_STATIC, // memid false, // can reclaim true, // can eager abandon 0, // tag @@ -135,9 +138,9 @@ static mi_decl_cache_align mi_tld_t tld_main = { &_mi_heap_main, &_mi_heap_main, &mi_subproc_default, // subproc 0, // tseq + MI_MEMID_STATIC, // memid false, // recurse false, // is_in_threadpool - { 0, &tld_main.stats }, // os { MI_STATS_NULL } // stats }; @@ -152,6 +155,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { 0, // page count MI_BIN_FULL, 0, // page retired min/max NULL, // next heap + MI_MEMID_STATIC, // memid true, // allow page reclaim true, // allow page abandon 0, // tag @@ -230,6 +234,47 @@ mi_heap_t* _mi_heap_main_get(void) { } +/* ----------------------------------------------------------- + Thread local data +----------------------------------------------------------- */ + +// Thread sequence number +static _Atomic(size_t) mi_tcount; + +// The mimalloc thread local data +mi_decl_thread mi_tld_t* mi_tld; + +// Allocate fresh tld +static mi_tld_t* mi_tld_alloc(void) { + if (_mi_is_main_thread()) { + return &tld_main; + } + else { + mi_memid_t memid; + mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid); + if (tld==NULL) { + _mi_error_message(ENOMEM, "unable to allocate memory for thread local data\n"); + return NULL; + } + tld->memid = memid; + tld->heap_backing = NULL; + tld->heaps = NULL; + tld->subproc = &mi_subproc_default; + tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); + return tld; + } +} + +mi_tld_t* _mi_tld(void) { + if (mi_tld==NULL) { + mi_tld = mi_tld_alloc(); + } + return mi_tld; +} + + + /* ----------------------------------------------------------- Sub process ----------------------------------------------------------- */ @@ -239,11 +284,11 @@ mi_subproc_id_t mi_subproc_main(void) { } mi_subproc_id_t mi_subproc_new(void) { - mi_memid_t memid = _mi_memid_none(); - mi_subproc_t* subproc = (mi_subproc_t*)_mi_arena_meta_zalloc(sizeof(mi_subproc_t), &memid); + mi_memid_t memid; + mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid); if (subproc == NULL) return NULL; - subproc->memid = memid; subproc->abandoned_os_list = NULL; + subproc->memid = memid; mi_lock_init(&subproc->abandoned_os_lock); mi_lock_init(&subproc->abandoned_os_visit_lock); return subproc; @@ -269,7 +314,7 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { // todo: should we refcount subprocesses? 
mi_lock_done(&subproc->abandoned_os_lock); mi_lock_done(&subproc->abandoned_os_visit_lock); - _mi_arena_meta_free(subproc, subproc->memid, sizeof(mi_subproc_t)); + _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { @@ -281,94 +326,10 @@ void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { } - /* ----------------------------------------------------------- - Initialization and freeing of the thread local heaps + Allocate heap data ----------------------------------------------------------- */ -// note: in x64 in release build `sizeof(mi_thread_data_t)` is under 4KiB (= OS page size). -typedef struct mi_thread_data_s { - mi_heap_t heap; // must come first due to cast in `_mi_heap_done` - mi_tld_t tld; - mi_memid_t memid; // must come last due to zero'ing -} mi_thread_data_t; - - -// Thread meta-data is allocated directly from the OS. For -// some programs that do not use thread pools and allocate and -// destroy many OS threads, this may causes too much overhead -// per thread so we maintain a small cache of recently freed metadata. - -#define TD_CACHE_SIZE (32) -static _Atomic(mi_thread_data_t*) td_cache[TD_CACHE_SIZE]; - -static mi_thread_data_t* mi_thread_data_zalloc(void) { - // try to find thread metadata in the cache - bool is_zero = false; - mi_thread_data_t* td = NULL; - for (int i = 0; i < TD_CACHE_SIZE; i++) { - td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); - if (td != NULL) { - // found cached allocation, try use it - td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); - if (td != NULL) { - break; - } - } - } - - // if that fails, allocate as meta data - if (td == NULL) { - mi_memid_t memid; - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); - if (td == NULL) { - // if this fails, try once more. 
(issue #257) - td = (mi_thread_data_t*)_mi_os_alloc(sizeof(mi_thread_data_t), &memid, &_mi_stats_main); - if (td == NULL) { - // really out of memory - _mi_error_message(ENOMEM, "unable to allocate thread local heap metadata (%zu bytes)\n", sizeof(mi_thread_data_t)); - } - } - if (td != NULL) { - td->memid = memid; - is_zero = memid.initially_zero; - } - } - - if (td != NULL && !is_zero) { - _mi_memzero_aligned(td, offsetof(mi_thread_data_t,memid)); - } - return td; -} - -static void mi_thread_data_free( mi_thread_data_t* tdfree ) { - // try to add the thread metadata to the cache - for (int i = 0; i < TD_CACHE_SIZE; i++) { - mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); - if (td == NULL) { - mi_thread_data_t* expected = NULL; - if (mi_atomic_cas_ptr_weak_acq_rel(mi_thread_data_t, &td_cache[i], &expected, tdfree)) { - return; - } - } - } - // if that fails, just free it directly - _mi_os_free(tdfree, sizeof(mi_thread_data_t), tdfree->memid, &_mi_stats_main); -} - -void _mi_thread_data_collect(void) { - // free all thread metadata from the cache - for (int i = 0; i < TD_CACHE_SIZE; i++) { - mi_thread_data_t* td = mi_atomic_load_ptr_relaxed(mi_thread_data_t, &td_cache[i]); - if (td != NULL) { - td = mi_atomic_exchange_ptr_acq_rel(mi_thread_data_t, &td_cache[i], NULL); - if (td != NULL) { - _mi_os_free(td, sizeof(mi_thread_data_t), td->memid, &_mi_stats_main); - } - } - } -} - // Initialize the thread local default heap, called from `mi_thread_init` static bool _mi_thread_heap_init(void) { if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; @@ -380,32 +341,21 @@ static bool _mi_thread_heap_init(void) { //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { - // use `_mi_os_alloc` to allocate directly from the OS - mi_thread_data_t* td = mi_thread_data_zalloc(); - if (td == NULL) return false; - - mi_tld_t* tld = &td->tld; - mi_heap_t* heap = &td->heap; - _mi_tld_init(tld, heap); // must be before `_mi_heap_init` - _mi_heap_init(heap, tld, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); + // allocate heap and thread local data + mi_tld_t* tld = _mi_tld(); // allocates & initializes tld if needed + mi_memid_t memid; + mi_heap_t* heap = (tld == NULL ? 
NULL : (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid)); + if (heap==NULL || tld==NULL) { + _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); + return false; + } + heap->memid = memid; + _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); _mi_heap_set_default_direct(heap); } return false; } -// Thread sequence number -static _Atomic(size_t) mi_tcount; - -// initialize thread local data -void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap) { - _mi_memzero_aligned(tld,sizeof(mi_tld_t)); - tld->heap_backing = bheap; - tld->heaps = NULL; - tld->subproc = &mi_subproc_default; - tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); - tld->os.stats = &tld->stats; - tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); -} // Free the thread local default heap (called from `mi_thread_done`) static bool _mi_thread_heap_done(mi_heap_t* heap) { @@ -441,7 +391,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // free if not the main thread if (heap != &_mi_heap_main) { - mi_thread_data_free((mi_thread_data_t*)heap); + _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); } else { #if 0 @@ -533,7 +483,13 @@ void _mi_thread_done(mi_heap_t* heap) if (heap->thread_id != _mi_thread_id()) return; // abandon the thread local heap - if (_mi_thread_heap_done(heap)) return; // returns true if already ran + _mi_thread_heap_done(heap); // returns true if already ran + + // free thread local data + if (mi_tld != NULL) { + _mi_meta_free(mi_tld, sizeof(mi_tld_t), mi_tld->memid); + mi_tld = NULL; + } } void _mi_heap_set_default_direct(mi_heap_t* heap) { @@ -689,7 +645,7 @@ void mi_cdecl _mi_process_done(void) { if (mi_option_is_enabled(mi_option_destroy_on_exit)) { mi_collect(true /* force */); _mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) - _mi_arena_unsafe_destroy_all(& _mi_heap_main_get()->tld->stats); + _mi_arena_unsafe_destroy_all(); } if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { diff --git a/src/os.c b/src/os.c index 0c020302..b913fb1c 100644 --- a/src/os.c +++ b/src/os.c @@ -9,6 +9,8 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/atomic.h" #include "mimalloc/prim.h" +// always use main stats for OS calls +#define os_stats (&_mi_stats_main) /* ----------------------------------------------------------- Initialization. 
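Condensing the `init.c` changes above into one hedged sketch (error handling and page abandonment omitted; function names as in the patch): both the thread-local `mi_tld_t` and the default heap are now carved out of meta-data pages and given back on thread exit.

static void my_thread_lifecycle_sketch(void) {
  // on the first allocation in a thread (see `_mi_thread_heap_init`):
  mi_tld_t* tld = _mi_tld();            // lazily meta-allocates the tld
  mi_memid_t heap_memid;
  mi_heap_t* heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &heap_memid);
  heap->memid = heap_memid;
  _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */);
  _mi_heap_set_default_direct(heap);

  // ... the thread allocates and frees ...

  // on thread exit (see `_mi_thread_done`):
  _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid);
  _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid);
}
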
@@ -89,8 +91,8 @@ void _mi_os_init(void) { /* ----------------------------------------------------------- Util -------------------------------------------------------------- */ -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* stats); -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats); +bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_commit(void* addr, size_t size, bool* is_zero); void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { MI_UNUSED(try_alignment); MI_UNUSED(size); @@ -102,11 +104,9 @@ void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { Free memory -------------------------------------------------------------- */ -static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats); +static void mi_os_free_huge_os_pages(void* p, size_t size); -static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +static void mi_os_prim_free(void* addr, size_t size, bool still_committed) { mi_assert_internal((size % _mi_os_page_size()) == 0); if (addr == NULL || size == 0) return; // || _mi_os_is_huge_reserved(addr) int err = _mi_prim_free(addr, size); @@ -114,13 +114,12 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed, mi_st _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } if (still_committed) { - _mi_stat_decrease(&stats->committed, size); + _mi_stat_decrease(&os_stats->committed, size); } - _mi_stat_decrease(&stats->reserved, size); + _mi_stat_decrease(&os_stats->reserved, size); } -void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats) { - if (stats == NULL) stats = &_mi_stats_main; +void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) { if (mi_memkind_is_os(memid.memkind)) { size_t csize = memid.mem.os.size; if (csize==0) { _mi_os_good_alloc_size(size); } @@ -135,10 +134,10 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me // free it if (memid.memkind == MI_MEM_OS_HUGE) { mi_assert(memid.is_pinned); - mi_os_free_huge_os_pages(base, csize, stats); + mi_os_free_huge_os_pages(base, csize); } else { - mi_os_prim_free(base, csize, still_committed, stats); + mi_os_prim_free(base, csize, still_committed); } } else { @@ -147,9 +146,8 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me } } -void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats) { - if (stats == NULL) stats = &_mi_stats_main; - _mi_os_free_ex(p, size, true, memid, stats); +void _mi_os_free(void* p, size_t size, mi_memid_t memid) { + _mi_os_free_ex(p, size, true, memid); } @@ -159,7 +157,7 @@ void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats) { // Note: the `try_alignment` is just a hint and the returned pointer is not guaranteed to be aligned. // Also `hint_addr` is a hint and may be ignored. 
-static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) { +static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_zero != NULL); mi_assert_internal(is_large != NULL); @@ -173,13 +171,11 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large); } - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; - _mi_stat_counter_increase(&stats->mmap_calls, 1); + _mi_stat_counter_increase(&os_stats->mmap_calls, 1); if (p != NULL) { - _mi_stat_increase(&stats->reserved, size); + _mi_stat_increase(&os_stats->reserved, size); if (commit) { - _mi_stat_increase(&stats->committed, size); + _mi_stat_increase(&os_stats->committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } @@ -190,14 +186,14 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm return p; } -static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, mi_stats_t* tld_stats) { - return mi_os_prim_alloc_at(NULL, size, try_alignment, commit, allow_large, is_large, is_zero, tld_stats); +static void* mi_os_prim_alloc(size_t size, size_t try_alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero) { + return mi_os_prim_alloc_at(NULL, size, try_alignment, commit, allow_large, is_large, is_zero); } // Primitive aligned allocation from the OS. // This function guarantees the allocated memory is aligned. -static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base, mi_stats_t* stats) { +static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_large, bool* is_zero, void** base) { mi_assert_internal(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0)); mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(is_large != NULL); @@ -213,7 +209,7 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) void* p = NULL; if (try_direct_alloc) { - p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero, stats); + p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero); } // aligned already? 
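The assertions and the "aligned already?" check above rely on the usual power-of-two bit tricks; a tiny standalone illustration (not from the patch):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

// a power of two has exactly one bit set, so `x & (x-1)` clears it to zero
static bool my_is_power_of_two(size_t alignment) {
  return (alignment != 0 && (alignment & (alignment - 1)) == 0);
}

// for a power-of-two alignment, an aligned address has all low bits zero
static bool my_is_aligned(const void* p, size_t alignment) {
  return (((uintptr_t)p & (alignment - 1)) == 0);
}
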
@@ -227,13 +223,13 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit _mi_warning_message("unable to allocate aligned OS memory directly, fall back to over-allocation (size: 0x%zx bytes, address: %p, alignment: 0x%zx, commit: %d)\n", size, p, alignment, commit); } #endif - if (p != NULL) { mi_os_prim_free(p, size, commit, stats); } + if (p != NULL) { mi_os_prim_free(p, size, commit); } if (size >= (SIZE_MAX - alignment)) return NULL; // overflow const size_t over_size = size + alignment; if (!mi_os_mem_config.has_partial_free) { // win32 virtualAlloc cannot free parts of an allocated block // over-allocate uncommitted (virtual) memory - p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero, stats); + p = mi_os_prim_alloc(over_size, 1 /*alignment*/, false /* commit? */, false /* allow_large */, is_large, is_zero); if (p == NULL) return NULL; // set p to the aligned part in the full region @@ -244,12 +240,12 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit // explicitly commit only the aligned part if (commit) { - _mi_os_commit(p, size, NULL, stats); + _mi_os_commit(p, size, NULL); } } else { // mmap can free inside an allocation // overallocate... - p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero, stats); + p = mi_os_prim_alloc(over_size, 1, commit, false, is_large, is_zero); if (p == NULL) return NULL; // and selectively unmap parts around the over-allocated area. @@ -258,8 +254,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit size_t mid_size = _mi_align_up(size, _mi_os_page_size()); size_t post_size = over_size - pre_size - mid_size; mi_assert_internal(pre_size < over_size&& post_size < over_size&& mid_size >= size); - if (pre_size > 0) { mi_os_prim_free(p, pre_size, commit, stats); } - if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit, stats); } + if (pre_size > 0) { mi_os_prim_free(p, pre_size, commit); } + if (post_size > 0) { mi_os_prim_free((uint8_t*)aligned_p + mid_size, post_size, commit); } // we can return the aligned pointer on `mmap` systems p = aligned_p; *base = aligned_p; // since we freed the pre part, `*base == p`. 
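A small worked example of the over-allocation fallback above (the sizes and the address are made up): asking for 1 MiB aligned to 4 MiB on an mmap-style system over-allocates 5 MiB, aligns the pointer up inside that region, and returns the slack before and after the aligned block to the OS.

#include <stdint.h>
#include <stdio.h>

int main(void) {
  const size_t KiB = 1024, MiB = 1024*KiB;
  const size_t size = 1*MiB, alignment = 4*MiB, page_size = 4*KiB;
  const size_t over_size = size + alignment;                           // 5 MiB over-allocation
  const uintptr_t p = 0x10001000;                                      // example (page-aligned) mmap result
  const uintptr_t aligned_p = (p + alignment - 1) & ~(uintptr_t)(alignment - 1); // align up inside the region
  const size_t pre_size  = (size_t)(aligned_p - p);                    // freed before the aligned part
  const size_t mid_size  = (size + page_size - 1) & ~(page_size - 1);  // kept (size rounded up to a page)
  const size_t post_size = over_size - pre_size - mid_size;            // freed after the aligned part
  printf("pre=%zu KiB, mid=%zu KiB, post=%zu KiB\n", pre_size/KiB, mid_size/KiB, post_size/KiB);
  return 0;  // prints: pre=4092 KiB, mid=1024 KiB, post=4 KiB
}
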
@@ -275,33 +271,31 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit OS API: alloc and alloc_aligned ----------------------------------------------------------- */ -void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_os_alloc(size_t size, mi_memid_t* memid) { *memid = _mi_memid_none(); if (size == 0) return NULL; - if (stats == NULL) stats = &_mi_stats_main; size = _mi_os_good_alloc_size(size); bool os_is_large = false; bool os_is_zero = false; - void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero, stats); + void* p = mi_os_prim_alloc(size, 0, true, false, &os_is_large, &os_is_zero); if (p != NULL) { *memid = _mi_memid_create_os(p, size, true, os_is_zero, os_is_large); } return p; } -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid) { MI_UNUSED(&_mi_os_get_aligned_hint); // suppress unused warnings *memid = _mi_memid_none(); if (size == 0) return NULL; - if (stats == NULL) stats = &_mi_stats_main; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; - void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base, stats ); + void* p = mi_os_prim_alloc_aligned(size, alignment, commit, allow_large, &os_is_large, &os_is_zero, &os_base); if (p != NULL) { *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large); memid->mem.os.base = os_base; @@ -311,9 +305,8 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo return p; } -void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { - MI_UNUSED(stats); - void* p = _mi_os_alloc(size, memid, &_mi_stats_main); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid) { + void* p = _mi_os_alloc(size, memid); if (p == NULL) return NULL; // zero the OS memory if needed @@ -332,27 +325,26 @@ void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { to use the actual start of the memory region. 
----------------------------------------------------------- */ -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid, mi_stats_t* stats) { +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t offset, bool commit, bool allow_large, mi_memid_t* memid) { mi_assert(offset <= size); mi_assert((alignment % _mi_os_page_size()) == 0); *memid = _mi_memid_none(); - if (stats == NULL) stats = &_mi_stats_main; if (offset == 0) { // regular aligned allocation - return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, stats); + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid); } else { // overallocate to align at an offset const size_t extra = _mi_align_up(offset, alignment) - offset; const size_t oversize = size + extra; - void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid, stats); + void* const start = _mi_os_alloc_aligned(oversize, alignment, commit, allow_large, memid); if (start == NULL) return NULL; void* const p = (uint8_t*)start + extra; mi_assert(_mi_is_aligned((uint8_t*)p + offset, alignment)); // decommit the overallocation at the start if (commit && extra > _mi_os_page_size()) { - _mi_os_decommit(start, extra, stats); + _mi_os_decommit(start, extra); } return p; } @@ -386,12 +378,10 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* return mi_os_page_align_areax(true, addr, size, newsize); } -bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { if (is_zero != NULL) { *is_zero = false; } - _mi_stat_increase(&stats->committed, size); // use size for precise commit vs. decommit - _mi_stat_counter_increase(&stats->commit_calls, 1); + _mi_stat_increase(&os_stats->committed, size); // use size for precise commit vs. decommit + _mi_stat_counter_increase(&os_stats->commit_calls, 1); // page align range size_t csize; @@ -417,11 +407,9 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* tld_stats return true; } -static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_stats_t* tld_stats) { - MI_UNUSED(tld_stats); - mi_stats_t* stats = &_mi_stats_main; +static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) { mi_assert_internal(needs_recommit!=NULL); - _mi_stat_decrease(&stats->committed, size); + _mi_stat_decrease(&os_stats->committed, size); // page align size_t csize; @@ -438,9 +426,9 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, mi_ return (err == 0); } -bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { +bool _mi_os_decommit(void* addr, size_t size) { bool needs_recommit; - return mi_os_decommit_ex(addr, size, &needs_recommit, tld_stats); + return mi_os_decommit_ex(addr, size, &needs_recommit); } @@ -448,13 +436,13 @@ bool _mi_os_decommit(void* addr, size_t size, mi_stats_t* tld_stats) { // but may be used later again. This will release physical memory // pages and reduce swapping while keeping the memory committed. // We page align to a conservative area inside the range to reset. 
-bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { +bool _mi_os_reset(void* addr, size_t size) { // page align conservatively within the range size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) - _mi_stat_increase(&stats->reset, csize); - _mi_stat_counter_increase(&stats->reset_calls, 1); + _mi_stat_increase(&os_stats->reset, csize); + _mi_stat_counter_increase(&os_stats->reset_calls, 1); #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN memset(start, 0, csize); // pretend it is eagerly reset @@ -470,22 +458,22 @@ bool _mi_os_reset(void* addr, size_t size, mi_stats_t* stats) { // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) { if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? - _mi_stat_counter_increase(&stats->purge_calls, 1); - _mi_stat_increase(&stats->purged, size); + _mi_stat_counter_increase(&os_stats->purge_calls, 1); + _mi_stat_increase(&os_stats->purged, size); if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? !_mi_preloading()) // don't decommit during preloading (unsafe) { bool needs_recommit = true; - mi_os_decommit_ex(p, size, &needs_recommit, stats); + mi_os_decommit_ex(p, size, &needs_recommit); return needs_recommit; } else { if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed - _mi_os_reset(p, size, stats); + _mi_os_reset(p, size); } return false; // needs no recommit } @@ -493,8 +481,8 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, mi_stats_t* stats) // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. -bool _mi_os_purge(void* p, size_t size, mi_stats_t * stats) { - return _mi_os_purge_ex(p, size, true, stats); +bool _mi_os_purge(void* p, size_t size) { + return _mi_os_purge_ex(p, size, true); } @@ -601,15 +589,15 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // no success, issue a warning and break if (p != NULL) { _mi_warning_message("could not allocate contiguous huge OS page %zu at %p\n", page, addr); - mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, true, &_mi_stats_main); + mi_os_prim_free(p, MI_HUGE_OS_PAGE_SIZE, true); } break; } // success, record it page++; // increase before timeout check (see issue #711) - _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE); - _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE); + _mi_stat_increase(&os_stats->committed, MI_HUGE_OS_PAGE_SIZE); + _mi_stat_increase(&os_stats->reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout if (max_msecs > 0) { @@ -643,11 +631,11 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // free every huge page in a range individually (as we allocated per page) // note: needed with VirtualAlloc but could potentially be done in one go on mmap'd systems. 
-static void mi_os_free_huge_os_pages(void* p, size_t size, mi_stats_t* stats) { +static void mi_os_free_huge_os_pages(void* p, size_t size) { if (p==NULL || size==0) return; uint8_t* base = (uint8_t*)p; while (size >= MI_HUGE_OS_PAGE_SIZE) { - mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, true, stats); + mi_os_prim_free(base, MI_HUGE_OS_PAGE_SIZE, true); size -= MI_HUGE_OS_PAGE_SIZE; base += MI_HUGE_OS_PAGE_SIZE; } @@ -676,8 +664,7 @@ size_t _mi_os_numa_node_count_get(void) { return count; } -int _mi_os_numa_node_get(mi_os_tld_t* tld) { - MI_UNUSED(tld); +int _mi_os_numa_node_get(void) { size_t numa_count = _mi_os_numa_node_count(); if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0 // never more than the node count and >= 0 diff --git a/src/page-map.c b/src/page-map.c index 7a00d172..5c712346 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -29,7 +29,7 @@ bool _mi_page_map_init(void) { // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); mi_page_map_all_committed = (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? - _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid, NULL); + _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); return false; @@ -41,7 +41,7 @@ bool _mi_page_map_init(void) { // commit the first part so NULL pointers get resolved without an access violation if (!mi_page_map_all_committed) { bool is_zero; - _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero, NULL); + _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(_mi_page_map, _mi_os_page_size()); } } _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL @@ -60,7 +60,7 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { bool is_zero; uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); const size_t size = mi_page_map_entries_per_commit_bit; - _mi_os_commit(start, size, &is_zero, NULL); + _mi_os_commit(start, size, &is_zero); if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } mi_bitmap_set(&mi_page_map_commit, i); } diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 80522f47..e06b278d 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -17,6 +17,11 @@ terms of the MIT license. 
A copy of the license can be found in the file // Dynamically bind Windows API points for portability //--------------------------------------------- +#if defined(_MSC_VER) +#pragma warning(disable:28159) // don't use GetVersion +#pragma warning(disable:4996) // don't use GetVersion +#endif + static DWORD win_major_version = 6; static DWORD win_minor_version = 0; @@ -126,8 +131,8 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) SYSTEM_INFO si; GetSystemInfo(&si); if (si.dwPageSize > 0) { config->page_size = si.dwPageSize; } - if (si.dwAllocationGranularity > 0) { - config->alloc_granularity = si.dwAllocationGranularity; + if (si.dwAllocationGranularity > 0) { + config->alloc_granularity = si.dwAllocationGranularity; win_allocation_granularity = si.dwAllocationGranularity; } // get virtual address bits @@ -141,7 +146,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) if (memInKiB > 0 && memInKiB < (SIZE_MAX / MI_KiB)) { config->physical_memory = (size_t)(memInKiB * MI_KiB); } - } + } // get the VirtualAlloc2 function HINSTANCE hDll; hDll = LoadLibrary(TEXT("kernelbase.dll")); @@ -818,14 +823,13 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { } #endif - bool _mi_prim_thread_is_in_threadpool(void) { #if (MI_ARCH_X64 || MI_ARCH_X86) if (win_major_version >= 6) { // check if this thread belongs to a windows threadpool // see: _TEB* const teb = NtCurrentTeb(); - void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778))); + void* const pool_data = *((void**)((uint8_t*)teb + (MI_SIZE_BITS == 32 ? 0x0F90 : 0x1778))); return (pool_data != NULL); } #endif diff --git a/src/static.c b/src/static.c index 0a8fa447..dd874f16 100644 --- a/src/static.c +++ b/src/static.c @@ -24,6 +24,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "alloc-aligned.c" #include "alloc-posix.c" #include "arena.c" +#include "arena-meta.c" #include "bitmap.c" #include "heap.c" #include "init.c" From 2a4af6f169087a60d769ffa61c192961960f5e11 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 17:21:17 -0800 Subject: [PATCH 050/264] comments --- src/init.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/src/init.c b/src/init.c index f09821b4..7e3e5f86 100644 --- a/src/init.c +++ b/src/init.c @@ -96,7 +96,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -#define MI_MEMID_STATIC {{{0}},true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } +#define MI_MEMID_STATIC {{{0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, @@ -250,6 +250,9 @@ static mi_tld_t* mi_tld_alloc(void) { return &tld_main; } else { + // allocate tld meta-data + // note: we need to be careful to not access the tld from `_mi_meta_zalloc` + // (and in turn from `_mi_arena_alloc_aligned` and `_mi_os_alloc_aligned`). mi_memid_t memid; mi_tld_t* tld = (mi_tld_t*)_mi_meta_zalloc(sizeof(mi_tld_t), &memid); if (tld==NULL) { @@ -414,7 +417,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // 1. windows dynamic library: // call from DllMain on DLL_THREAD_DETACH // 2. windows static library: -// use `FlsAlloc` to call a destructor when the thread is done +// use special linker section to call a destructor when the thread is done // 3. 
unix, pthreads: // use a pthread key to call a destructor when a pthread is done // From bf2f2a8bf42397dc1fba1a9e353628013740661a Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 18:48:56 -0800 Subject: [PATCH 051/264] fix bug where only the first chunkmap field would be considered --- src/bitmap.c | 2 +- src/options.c | 2 +- test/test-stress.c | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 44113429..45a82ba3 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1028,7 +1028,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \ - const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \ + const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS ); \ const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\ const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ diff --git a/src/options.c b/src/options.c index f2e9297f..e47f1b6e 100644 --- a/src/options.c +++ b/src/options.c @@ -68,7 +68,7 @@ typedef struct mi_option_desc_s { // in KiB #ifndef MI_DEFAULT_ARENA_RESERVE #if (MI_INTPTR_SIZE>4) - #define MI_DEFAULT_ARENA_RESERVE 1024L*1024L + #define MI_DEFAULT_ARENA_RESERVE 8*1024L*1024L #else #define MI_DEFAULT_ARENA_RESERVE 128L*1024L #endif diff --git a/test/test-stress.c b/test/test-stress.c index 96cf702d..915c953f 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -57,7 +57,7 @@ static int ITER = 10; #define ALLOW_LARGE true #else static int THREADS = 32; // more repeatable if THREADS <= #processors -static int SCALE = 50; // scaling factor +static int SCALE = 25; // scaling factor static int ITER = 50; // N full iterations destructing and re-creating all threads #endif From 68ac94c1baccbeac37a8dd75dddb34542b08e8f0 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 8 Dec 2024 18:53:43 -0800 Subject: [PATCH 052/264] set default arena reserve back to 1GiB --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index e47f1b6e..f2e9297f 100644 --- a/src/options.c +++ b/src/options.c @@ -68,7 +68,7 @@ typedef struct mi_option_desc_s { // in KiB #ifndef MI_DEFAULT_ARENA_RESERVE #if (MI_INTPTR_SIZE>4) - #define MI_DEFAULT_ARENA_RESERVE 8*1024L*1024L + #define MI_DEFAULT_ARENA_RESERVE 1024L*1024L #else #define MI_DEFAULT_ARENA_RESERVE 128L*1024L #endif From d5ed0cc71ef02b5ab986fa7ffc06b4c6e65dd622 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 14:31:43 -0800 Subject: [PATCH 053/264] various improvements --- include/mimalloc/atomic.h | 3 + include/mimalloc/bits.h | 15 ++- include/mimalloc/types.h | 6 +- src/arena.c | 52 ++++++--- src/bitmap.c | 238 +++++++++++++++++++++----------------- src/bitmap.h | 20 +++- src/free.c | 7 +- src/init.c | 2 +- src/os.c | 13 +-- src/random.c | 19 ++- 10 files changed, 223 insertions(+), 152 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index caa90cf8..3b0ff559 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -74,8 +74,11 @@ terms of the MIT license. 
A copy of the license can be found in the file #define mi_atomic_store_relaxed(p,x) mi_atomic(store_explicit)(p,x,mi_memory_order(relaxed)) #define mi_atomic_exchange_release(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(release)) #define mi_atomic_exchange_acq_rel(p,x) mi_atomic(exchange_explicit)(p,x,mi_memory_order(acq_rel)) + +#define mi_atomic_cas_weak_relaxed(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_release(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_weak_acq_rel(p,exp,des) mi_atomic_cas_weak(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) +#define mi_atomic_cas_strong_relaxed(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(relaxed),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_release(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(release),mi_memory_order(relaxed)) #define mi_atomic_cas_strong_acq_rel(p,exp,des) mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire)) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 3afac04d..e47d8a76 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -229,7 +229,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { unsigned long i; return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false); #else - return (x!=0 ? (*idx = mi_ctz(x), true) : false); + return (x!=0 ? (*idx = mi_ctz(x), true) : false); #endif } @@ -289,5 +289,18 @@ static inline size_t mi_rotl(size_t x, size_t r) { #endif } +static inline uint32_t mi_rotl32(uint32_t x, uint32_t r) { + #if mi_has_builtin(rotateleft32) + return mi_builtin(rotateleft32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return _lrotl(x, (int)r); + #else + // The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & 31; + return ((x << rshift) | (x >> ((-rshift) & 31))); + #endif +} + #endif // MI_BITS_H diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index d507ca69..71edb397 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -334,9 +334,9 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~16KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // ~128KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // ~2MiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index ab74b988..24835f42 100644 --- a/src/arena.c +++ b/src/arena.c @@ -29,7 +29,8 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo ----------------------------------------------------------- */ #define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) - +#define MI_ARENA_MIN_SIZE (MI_BCHUNK_BITS * MI_ARENA_SLICE_SIZE) // 32 MiB (or 8 MiB on 32-bit) +#define MI_ARENA_MAX_SIZE (MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE) // A memory arena descriptor typedef struct mi_arena_s { @@ -105,7 +106,7 @@ size_t mi_arena_get_count(void) { mi_arena_t* mi_arena_from_index(size_t idx) { mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); + return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]); } mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { @@ -235,6 +236,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } } } + if (memid->initially_zero) { + mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE); + } + else { + mi_track_mem_undefined(p, slice_count * MI_ARENA_SLICE_SIZE); + } } else { // no need to commit, but check if already fully committed @@ -253,7 +260,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // try to reserve a fresh arena space static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { - if (_mi_preloading()) return false; // use OS only while pre loading + // if (_mi_preloading()) return false; // use OS only while pre loading if (req_arena_id != _mi_arena_id_none()) return false; const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); @@ -269,8 +276,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_SLICE_SIZE); if (arena_count >= 1 && arena_count <= 128) { - // scale up the arena sizes exponentially every 8 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + // scale up the arena sizes exponentially every 4 entries + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; @@ -278,8 +285,8 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = 8 * MI_ARENA_SLICE_SIZE; // hope that fits minimal bitmaps? 
- const size_t max_reserve = MI_BITMAP_MAX_BIT_COUNT * MI_ARENA_SLICE_SIZE; // 16 GiB + const size_t min_reserve = MI_ARENA_MIN_SIZE; + const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; } @@ -294,7 +301,17 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } - return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); + // and try to reserve the arena + int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + if (err != 0) { + // failed, try a smaller size? + const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); + if (arena_reserve > small_arena_reserve) { + // try again + err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + } + } + return (err==0); } @@ -317,12 +334,12 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are #define mi_forall_arenas(req_arena_id, tseq, name_arena) \ { \ - const size_t _arena_count = mi_atomic_load_relaxed(&mi_arena_count); \ + const size_t _arena_count = mi_arena_get_count(); \ if (_arena_count > 0) { \ const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \ size_t _start; \ if (req_arena_id == _mi_arena_id_none()) { \ - /* always start searching in an arena 1 below the max */ \ + /* always start searching in the arena's below the max */ \ _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \ } \ else { \ @@ -333,10 +350,10 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are size_t _idx; \ if (_i < _arena_cycle) { \ _idx = _i + _start; \ - if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate */ \ + if (_idx >= _arena_cycle) { _idx -= _arena_cycle; } /* adjust so we rotate through the cycle */ \ } \ else { \ - _idx = _i; \ + _idx = _i; /* remaining arena's */ \ } \ mi_arena_t* const name_arena = mi_arena_from_index(_idx); \ if (name_arena != NULL) \ @@ -397,6 +414,9 @@ again: // did we need a specific arena? if (req_arena_id != _mi_arena_id_none()) return NULL; + // don't create arena's while preloading (todo: or should we?) + if (_mi_preloading()) return NULL; + // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { mi_arena_id_t arena_id = 0; @@ -917,7 +937,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); size_t new_max_arena = 0; for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); @@ -949,7 +969,7 @@ void _mi_arena_unsafe_destroy_all(void) { // Is a pointer inside any of our arenas? 
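
/* ----------------------------------------------------------------------------
   Illustrative sketch (not part of the patch): the visiting order implemented
   by the `mi_forall_arenas` macro above. Each thread starts at `tseq % cycle`
   within the arenas below the last one and rotates through that cycle, then
   visits the remaining arenas in order. `visit_order` is a hypothetical name.
---------------------------------------------------------------------------- */
#include <stdio.h>

static void visit_order(size_t arena_count, size_t tseq) {
  if (arena_count == 0) return;
  const size_t cycle = arena_count - 1;            // first search the arenas below the last one
  const size_t start = (cycle <= 1 ? 0 : tseq % cycle);
  for (size_t i = 0; i < arena_count; i++) {
    size_t idx;
    if (i < cycle) {
      idx = i + start;
      if (idx >= cycle) { idx -= cycle; }          // rotate through the cycle
    }
    else {
      idx = i;                                     // the remaining arenas (including the last)
    }
    printf("tseq=%zu visits arena %zu\n", tseq, idx);
  }
}

int main(void) {
  visit_order(4, 0);   // 0, 1, 2, 3
  visit_order(4, 1);   // 1, 2, 0, 3 -- different threads start in different arenas
  return 0;
}
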
bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { @@ -1175,7 +1195,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { MI_UNUSED(show_abandoned); - size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; @@ -1331,7 +1351,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + const size_t max_arena = mi_arena_get_count(); if (max_arena == 0) return; // _mi_error_message(EFAULT, "purging not yet implemented\n"); diff --git a/src/bitmap.c b/src/bitmap.c index 45a82ba3..2f563066 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -14,6 +14,8 @@ Concurrent bitmap that can set/reset sequences of bits atomically #include "mimalloc/bits.h" #include "bitmap.h" +#define MI_USE_SIMD 0 + /* -------------------------------------------------------------------------------- bfields -------------------------------------------------------------------------------- */ @@ -34,9 +36,9 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } -static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { - return mi_rotr(x,r); -} +//static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { +// return mi_rotr(x,r); +//} static inline mi_bfield_t mi_bfield_zero(void) { return 0; @@ -456,7 +458,7 @@ static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t // ------- mi_bchunk_try_find_and_clear --------------------------------------- -#if defined(__AVX2__) +#if MI_USE_SIMD && defined(__AVX2__) static inline __m256i mi_mm256_zero(void) { return _mm256_setzero_si256(); } @@ -471,12 +473,27 @@ static inline bool mi_mm256_is_zero( __m256i vec) { } #endif +static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) { + mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); + size_t cidx; + if (!allow_allset && (~b == 0)) return false; + if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + return true; + } + } + return false; +} + // Find least 1-bit in a chunk and try to clear it atomically // set `*pidx` to the bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // This is used to find free slices and abandoned pages and should be efficient. 
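
/* ----------------------------------------------------------------------------
   Minimal standalone sketch (hypothetical helper, not the patch's code) of the
   find-and-clear pattern used by `mi_bchunk_try_find_and_clear_at` above:
   load the field, locate the least significant 1-bit, and try to clear just
   that bit atomically; if another thread raced us and cleared it first, report
   failure so the caller can retry. Assumes 64-bit fields, C11 atomics, and the
   GCC/Clang `__builtin_ctzll` builtin.
---------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static bool try_find_and_clear(_Atomic(uint64_t)* field, size_t* bit_idx) {
  const uint64_t b = atomic_load_explicit(field, memory_order_relaxed);
  if (b == 0) return false;                        // no set bit in this field
  const size_t idx = (size_t)__builtin_ctzll(b);   // least significant 1-bit
  const uint64_t mask = (uint64_t)1 << idx;
  // clear only that bit; the returned old value tells us whether we won the race
  const uint64_t old = atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel);
  if ((old & mask) == 0) return false;             // another thread claimed it already
  *bit_idx = idx;
  return true;
}
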
// todo: try neon version static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) while (true) { const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) @@ -485,19 +502,10 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again } - #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; #if 0 @@ -528,42 +536,50 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. chunk_idx = _tzcnt_u64(mask) / 8; #endif - mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the bit-idx that is clear - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again } #else + // try first to find a field that is not all set (to reduce fragmentation) for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); - size_t idx; - if (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[i], idx, NULL)) { // try to clear it atomically - *pidx = (i*MI_BFIELD_BITS + idx); - mi_assert_internal(*pidx < MI_BCHUNK_BITS); - return true; - } - } + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true; + } + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true; } return false; #endif } +static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { + const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); + if (!allow_all_set && (~b == 0)) return false; + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = + ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + const size_t 
byte_idx = idx/8; + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // unset the byte atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); + return true; + } + } + return false; +} // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find medium size pages in the free blocks. // todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -588,24 +604,12 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // try again } #else + // first skip allset fields to reduce fragmentation for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - const mi_bfield_t x = mi_atomic_load_relaxed(&chunk->bfields[i]); - // has_set8 has low bit in each byte set if the byte in x == 0xFF - const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F - (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 - >> 7; // shift high bit to low bit - size_t idx; - if (mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit - mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); - mi_assert_internal((idx%8)==0); - const size_t byte_idx = idx/8; - if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[i],byte_idx,NULL)) { // unset the byte atomically - *pidx = (i*MI_BFIELD_BITS) + idx; - mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); - return true; - } - // else continue - } + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; + } + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true; } return false; #endif @@ -618,7 +622,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // Used to find large size pages in the free blocks. 
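
/* ----------------------------------------------------------------------------
   Illustrative sketch (not from the patch) of the `has_set8` bit trick above.
   Detecting a 0xFF byte in `b` is the classic "has zero byte" SWAR trick
   applied to `~b`:  ((~b - 0x01..01) & b & 0x80..80).  Borrows in the
   subtraction only propagate towards higher bytes, so the *lowest* flagged
   byte is always genuinely 0xFF -- which is why the code scans from the least
   significant bit. Helper names are hypothetical; assumes GCC/Clang builtins.
---------------------------------------------------------------------------- */
#include <assert.h>
#include <stddef.h>
#include <stdint.h>

#define LO_BIT8  UINT64_C(0x0101010101010101)
#define HI_BIT8  UINT64_C(0x8080808080808080)

static int lowest_full_byte_swar(uint64_t b) {      // byte index of lowest 0xFF byte, or -1
  const uint64_t has_set8 = ((~b - LO_BIT8) & (b & HI_BIT8)) >> 7;
  return (has_set8 == 0 ? -1 : __builtin_ctzll(has_set8) / 8);
}

static int lowest_full_byte_naive(uint64_t b) {
  for (int i = 0; i < 8; i++) {
    if (((b >> (8*i)) & 0xFF) == 0xFF) return i;
  }
  return -1;
}

int main(void) {
  const uint64_t samples[] = {
    0, ~UINT64_C(0), UINT64_C(0x00FF000000000000),
    UINT64_C(0x12FF34FF56789ABC), UINT64_C(0x80FF7F01FF000001)
  };
  for (size_t i = 0; i < sizeof(samples)/sizeof(samples[0]); i++) {
    assert(lowest_full_byte_swar(samples[i]) == lowest_full_byte_naive(samples[i]));
  }
  return 0;
}
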
// todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -747,14 +751,14 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, } -static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { - if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages - if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages - if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); - return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); -} +//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +// if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages +// if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages +// if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages +// if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk +// if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); +// return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); +//} // ------- mi_bchunk_clear_once_set --------------------------------------- @@ -779,10 +783,10 @@ static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { // are all bits in a bitmap chunk clear? 
static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { - #if defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return mi_mm256_is_zero(vec); - #elif defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); @@ -796,9 +800,17 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { bitmap chunkmap -------------------------------------------------------------------------------- */ +static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) { + size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); + if mi_unlikely(chunk_idx > oldmax) { + mi_atomic_cas_strong_relaxed(&bitmap->chunk_max_accessed, &oldmax, chunk_idx); + } +} + static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); } static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { @@ -813,11 +825,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } - // record the max clear - size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); - do { - if mi_likely(chunk_idx <= oldmax) break; - } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx)); + mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); return true; } @@ -894,6 +902,9 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); } + + // reset max_accessed + mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0); } @@ -1027,31 +1038,27 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ MI_UNUSED(tseq); \ - const size_t chunk_start = 0; /* (tseq % (1+chunk_hi_idx)); */ /* space out threads? */ \ - const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS ); \ - const size_t chunkmap_hi_bfield = chunkmap_max_bfield; /* chunk_hi_idx / MI_BFIELD_BITS; */\ - const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ - const size_t chunkmap_start_idx = chunk_start % MI_BFIELD_BITS; \ + const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \ + const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? 
*/ \ + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \ + const size_t chunkmap_max_acc = _mi_divide_up(chunk_max_acc,MI_BFIELD_BITS); \ + const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ /* for each chunkmap entry `i` */ \ - for (size_t _i = 0; _i < chunkmap_max_bfield; _i++) { \ + for (size_t _i = 0; _i < chunkmap_max; _i++) { \ size_t i; \ - if (_i < chunkmap_hi_bfield) { \ - i = _i + chunkmap_start; /* first the chunks up to chunk_hi */ \ - if (i >= chunkmap_hi_bfield) { i -= chunkmap_hi_bfield; } /* rotate */ \ + if (_i < chunkmap_max_acc) { /* first the chunks up to chunk_max_accessed */ \ + i = _i + chunkmap_start; \ + if (i >= chunkmap_max_acc) { i -= chunkmap_max_acc; } /* rotate */ \ } \ - else { i = _i; } /* the rest of the chunks above chunk_hi_idx */ \ + else { i = _i; } /* the rest of the chunks above chunk_max_accessed */ \ const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ - size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - if (_i == 0 && chunkmap_start_idx > 0) { \ - cmap = mi_bfield_rotate_right(cmap, chunkmap_start_idx); /* rotate right for the start position (on the first iteration) */ \ - cmap_idx_shift = chunkmap_start_idx; \ - } \ + /* todo: space out threads within a chunkmap (2GiB) as well? */ \ + size_t cmap_idx_shift = 0; /* shift through the cmap */ \ size_t cmap_idx; \ while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ /* set the chunk idx */ \ size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ - mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); \ /* try to find and clear N bits in that chunk */ \ { @@ -1064,28 +1071,45 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } \ }} -// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -// (Used to find fresh free slices -- optimized for n=1, 8, and MI_BFIELD_BITS) -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) -{ - // const size_t chunk_hi_idx = mi_atomic_load_relaxed(&bitmap->chunk_max_clear); - mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) - { - size_t cidx; - if mi_likely(mi_bchunk_try_find_and_clearN(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); - return true; - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - } - } - mi_bitmap_forall_chunks_end(); - return false; + +#define mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NSUF, NPAR) { \ + mi_bitmap_forall_chunks(bitmap, tseq, _chunk_idx) { \ + size_t _cidx; \ + if mi_likely(mi_bchunk_try_find_and_clear##NSUF(&bitmap->chunks[_chunk_idx] NPAR, &_cidx)) { \ + *pidx = (_chunk_idx * MI_BCHUNK_BITS) + _cidx; \ + return true; \ + } \ + else { \ + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ \ + mi_bitmap_chunkmap_try_clear(bitmap, _chunk_idx); \ + } \ + } \ + mi_bitmap_forall_chunks_end(); \ + return false; \ +} + +#define COMMA , + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, , ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, 8, ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, X, ); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NX, COMMA n); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, N_, COMMA n); } diff --git a/src/bitmap.h b/src/bitmap.h index 40c4df42..b26791cc 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -91,8 +91,8 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { - _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - _Atomic(size_t) chunk_max_clear; // max chunk index that was once cleared + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT @@ -172,9 +172,23 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n } +// Specialized versions for common bit sequence sizes +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS + // Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all. // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. 
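
/* ----------------------------------------------------------------------------
   Small standalone illustration (hypothetical names, not from the patch) of
   the token-pasting dispatch used by `mi_bitmap_forall_chunks_try_find_and_clear`
   above: the `NSUF` argument is pasted onto the callee name to pick the
   specialization, and because a bare `,` would split macro arguments, the
   optional extra parameter is smuggled in via `#define COMMA ,`.
---------------------------------------------------------------------------- */
#include <stdio.h>

static int find1 (int x)        { return x + 1; }
static int findN_(int x, int n) { return x + n; }

#define COMMA ,
#define DISPATCH(x, NSUF, NPAR)  find##NSUF(x NPAR)

int main(void) {
  printf("%d\n", DISPATCH(10, 1, ));          // expands to find1(10)
  printf("%d\n", DISPATCH(10, N_, COMMA 5));  // expands to findN_(10, 5)
  return 0;
}
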
-mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); +mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { + if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages + if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); + return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); +} // Called once a bit is cleared to see if the memory slice can be claimed. diff --git a/src/free.c b/src/free.c index d45507e7..0da0332e 100644 --- a/src/free.c +++ b/src/free.c @@ -217,8 +217,11 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 2. if the page is not too full, we can try to reclaim it for ourselves + // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && - !mi_page_is_used_at_frac(page,8)) + !mi_page_is_used_at_frac(page,4) + // && !mi_page_is_abandoned_mapped(page) + ) { // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again @@ -247,7 +250,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!mi_page_is_used_at_frac(page, 4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!mi_page_is_used_at_frac(page,4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arena_page_try_reabandon_to_mapped(page)) { diff --git a/src/init.c b/src/init.c index 2396f594..2070405d 100644 --- a/src/init.c +++ b/src/init.c @@ -96,7 +96,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -#define MI_MEMID_STATIC {{{0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } +#define MI_MEMID_STATIC {{{NULL,0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, diff --git a/src/os.c b/src/os.c index b913fb1c..55f7428e 100644 --- a/src/os.c +++ b/src/os.c @@ -203,10 +203,9 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (!(alignment >= _mi_os_page_size() && ((alignment & (alignment - 1)) == 0))) return NULL; size = _mi_align_up(size, _mi_os_page_size()); - // try a direct allocation if the alignment is below the default, or if larger than 1/64 fraction of the size (to avoid waste). - const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/64); + // try a direct allocation if the alignment is below the default, or if larger than 1/8 fraction of the size. 
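
/* ----------------------------------------------------------------------------
   Rough standalone sketch (hypothetical helper, not the patch's code) of the
   fallback used in this hunk when the OS cannot honor the alignment directly:
   over-allocate by `alignment`, hand out the aligned pointer inside the larger
   region, and remember the original base, since that base (not the aligned
   pointer) is what must be given back to the OS to free. `malloc` stands in
   for the OS primitive; `alignment` must be a power of two.
---------------------------------------------------------------------------- */
#include <stdint.h>
#include <stdlib.h>

typedef struct aligned_result_s {
  void* base;     // actual allocation start (needed for free)
  void* aligned;  // base rounded up to `alignment`
} aligned_result_t;

static aligned_result_t alloc_aligned_fallback(size_t size, size_t alignment) {
  aligned_result_t r = { NULL, NULL };
  void* base = malloc(size + alignment);   // over-allocate so an aligned block of `size` always fits
  if (base == NULL) return r;
  r.base    = base;
  r.aligned = (void*)(((uintptr_t)base + alignment - 1) & ~(uintptr_t)(alignment - 1));
  return r;
}

int main(void) {
  aligned_result_t r = alloc_aligned_fallback(1000, 256);
  if (r.base != NULL) { free(r.base); }    // free the base, never the aligned pointer
  return 0;
}
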
+ const bool try_direct_alloc = (alignment <= mi_os_mem_config.alloc_granularity || alignment > size/8); - // try first with a requested alignment hint (this will usually be aligned directly on Win 10+ or BSD) void* p = NULL; if (try_direct_alloc) { p = mi_os_prim_alloc(size, alignment, commit, allow_large, is_large, is_zero); @@ -233,8 +232,8 @@ static void* mi_os_prim_alloc_aligned(size_t size, size_t alignment, bool commit if (p == NULL) return NULL; // set p to the aligned part in the full region - // note: this is dangerous on Windows as VirtualFree needs the actual base pointer - // this is handled though by having the `base` field in the memid's + // note: on Windows VirtualFree needs the actual base pointer + // this is handledby having the `base` field in the memid. *base = p; // remember the base p = _mi_align_up_ptr(p, alignment); @@ -361,7 +360,7 @@ static void* mi_os_page_align_areax(bool conservative, void* addr, size_t size, if (newsize != NULL) *newsize = 0; if (size == 0 || addr == NULL) return NULL; - // page align conservatively within the range + // page align conservatively within the range, or liberally straddling pages outside the range void* start = (conservative ? _mi_align_up_ptr(addr, _mi_os_page_size()) : mi_align_down_ptr(addr, _mi_os_page_size())); void* end = (conservative ? mi_align_down_ptr((uint8_t*)addr + size, _mi_os_page_size()) @@ -472,7 +471,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) return needs_recommit; } else { - if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed + if (allow_reset) { // this can sometimes be not allowed if the range is not fully committed (on Windows, we cannot reset uncommitted memory) _mi_os_reset(p, size); } return false; // needs no recommit diff --git a/src/random.c b/src/random.c index 4fc8b2f8..35e2718a 100644 --- a/src/random.c +++ b/src/random.c @@ -7,7 +7,6 @@ terms of the MIT license. 
A copy of the license can be found in the file #include "mimalloc.h" #include "mimalloc/internal.h" #include "mimalloc/prim.h" // _mi_prim_random_buf -#include // memset /* ---------------------------------------------------------------------------- We use our own PRNG to keep predictable performance of random number generation @@ -33,15 +32,11 @@ The implementation uses regular C code which compiles very well on modern compil (gcc x64 has no register spills, and clang 6+ uses SSE instructions) -----------------------------------------------------------------------------*/ -static inline uint32_t rotl(uint32_t x, uint32_t shift) { - return (x << shift) | (x >> (32 - shift)); -} - static inline void qround(uint32_t x[16], size_t a, size_t b, size_t c, size_t d) { - x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 16); - x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 12); - x[a] += x[b]; x[d] = rotl(x[d] ^ x[a], 8); - x[c] += x[d]; x[b] = rotl(x[b] ^ x[c], 7); + x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 16); + x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 12); + x[a] += x[b]; x[d] = mi_rotl32(x[d] ^ x[a], 8); + x[c] += x[d]; x[b] = mi_rotl32(x[b] ^ x[c], 7); } static void chacha_block(mi_random_ctx_t* ctx) @@ -99,7 +94,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no // since we only use chacha for randomness (and not encryption) we // do not _need_ to read 32-bit values as little endian but we do anyways // just for being compatible :-) - memset(ctx, 0, sizeof(*ctx)); + _mi_memzero(ctx, sizeof(*ctx)); for (size_t i = 0; i < 4; i++) { const uint8_t* sigma = (uint8_t*)"expand 32-byte k"; ctx->input[i] = read32(sigma,i); @@ -114,7 +109,7 @@ static void chacha_init(mi_random_ctx_t* ctx, const uint8_t key[32], uint64_t no } static void chacha_split(mi_random_ctx_t* ctx, uint64_t nonce, mi_random_ctx_t* ctx_new) { - memset(ctx_new, 0, sizeof(*ctx_new)); + _mi_memzero(ctx_new, sizeof(*ctx_new)); _mi_memcpy(ctx_new->input, ctx->input, sizeof(ctx_new->input)); ctx_new->input[12] = 0; ctx_new->input[13] = 0; @@ -160,7 +155,7 @@ If we cannot get good randomness, we fall back to weak randomness based on a tim uintptr_t _mi_os_random_weak(uintptr_t extra_seed) { uintptr_t x = (uintptr_t)&_mi_os_random_weak ^ extra_seed; // ASLR makes the address random - x ^= _mi_prim_clock_now(); + x ^= _mi_prim_clock_now(); // and do a few randomization steps uintptr_t max = ((x ^ (x >> 17)) & 0x0F) + 1; for (uintptr_t i = 0; i < max; i++) { From 351cb0c7407ef95ba152d7a6c5d22b407a76b784 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 15:16:36 -0800 Subject: [PATCH 054/264] small fixes for macOS --- CMakeLists.txt | 20 +++++++++----------- include/mimalloc/internal.h | 2 +- src/page-map.c | 10 ++++++++-- 3 files changed, 18 insertions(+), 14 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6df4ba5a..553b279d 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -330,20 +330,18 @@ endif() # Determine architecture set(MI_OPT_ARCH_FLAGS "") -set(MI_ARCH "unknown") -if(APPLE) - list(FIND CMAKE_OSX_ARCHITECTURES "x86_64" x64_index) - list(FIND CMAKE_OSX_ARCHITECTURES "arm64" arm64_index) - if(x64_index GREATER_EQUAL 0) - set(MI_ARCH "x64") - elseif(arm64_index GREATER_EQUAL 0) - set(MI_ARCH "arm64") - endif() -elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64") +set(MI_ARCH "") +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR + CMAKE_GENERATOR_PLATFORM STREQUAL "x64") # msvc set(MI_ARCH "x64") -elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" 
OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") +elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR + CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR # apple + CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") # msvc set(MI_ARCH "arm64") endif() +if(MI_ARCH) + message(STATUS "Architecture: ${MI_ARCH}") +endif() # Check /proc/cpuinfo for an SV39 MMU and limit the virtual address bits. # (this will skip the aligned hinting in that case. Issue #939, #949) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 4c8256a0..176c1de8 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -459,7 +459,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { } static inline mi_page_t* _mi_ptr_page(const void* p) { - #if MI_DEBUG + #if MI_DEBUG || defined(__APPLE__) return _mi_checked_ptr_page(p); #else return _mi_ptr_page_ex(p,NULL); diff --git a/src/page-map.c b/src/page-map.c index 5c712346..475e8fc2 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -12,6 +12,7 @@ terms of the MIT license. A copy of the license can be found in the file mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; +static void* mi_page_map_max_address = NULL; static mi_memid_t mi_page_map_memid; // (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) @@ -23,12 +24,13 @@ bool _mi_page_map_init(void) { if (vbits >= 48) vbits = 47; // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) // 64 KiB for 4 GiB address space (on 32-bit) + mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_DEFAULT_BIT_COUNT); // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); - mi_page_map_all_committed = (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + mi_page_map_all_committed = true; // (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? 
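
/* ----------------------------------------------------------------------------
   Worked sizing example (a sketch, not part of the patch): with one byte of
   page map per slice and a slice shift of 16 (64 KiB slices -- an assumption
   that matches the "2 GiB for 128 TiB address space" comment above), the page
   map size is simply 1 << (vbits - slice_shift).
---------------------------------------------------------------------------- */
#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void) {
  const size_t slice_shift = 16;                  // 64 KiB per slice (assumed)
  const size_t vbits[]     = { 47, 39, 32 };      // 128 TiB, 512 GiB (e.g. SV39), 4 GiB
  for (size_t i = 0; i < 3; i++) {
    const uint64_t map_size = (uint64_t)1 << (vbits[i] - slice_shift);
    printf("vbits=%2zu -> page map of %llu KiB\n", vbits[i], (unsigned long long)(map_size/1024));
  }
  return 0;
}
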
_mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); @@ -118,8 +120,12 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + // if mi_unlikely(_mi_page_map==NULL) { // happens on macOS during loading + // _mi_page_map_init(); + // } + if mi_unlikely(p >= mi_page_map_max_address) return false; uintptr_t idx = ((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); - if (!mi_page_map_all_committed || mi_bitmap_is_setN(&mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { + if (mi_page_map_all_committed || mi_bitmap_is_setN(&mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { return (_mi_page_map[idx] != 0); } else { From 8f5449d2715c66f67a2d3fb2c3f0800ce59ced9a Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 15:39:15 -0800 Subject: [PATCH 055/264] various fixes for test pipeline --- src/alloc-aligned.c | 4 ++-- src/alloc.c | 7 +++---- src/free.c | 2 +- src/page.c | 2 +- 4 files changed, 7 insertions(+), 8 deletions(-) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index b1e6329c..4b142a1e 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -29,7 +29,7 @@ static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, si mi_assert_internal(alignment > 0 && alignment < MI_BLOCK_ALIGNMENT_MAX); const size_t oversize = size + alignment - 1; void* base = _mi_heap_malloc_guarded(heap, oversize, zero); - void* p = mi_align_up_ptr(base, alignment); + void* p = _mi_align_up_ptr(base, alignment); mi_track_align(base, p, (uint8_t*)p - (uint8_t*)base, size); mi_assert_internal(mi_usable_size(p) >= size); mi_assert_internal(_mi_is_aligned(p, alignment)); @@ -175,7 +175,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t } #if MI_GUARDED - if (offset==0 && alignment < MI_BLOCK_ALIGNMENT_MAX && mi_heap_malloc_use_guarded(heap,size)) { + if (offset==0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN && mi_heap_malloc_use_guarded(heap,size)) { return mi_heap_malloc_guarded_aligned(heap, size, alignment, zero); } #endif diff --git a/src/alloc.c b/src/alloc.c index 840d34fe..b0c89e65 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -619,7 +619,6 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { block->next = MI_BLOCK_TAG_GUARDED; // set guard page at the end of the block - mi_segment_t* const segment = _mi_page_segment(page); const size_t block_size = mi_page_block_size(page); // must use `block_size` to match `mi_free_local` const size_t os_page_size = _mi_os_page_size(); mi_assert_internal(block_size >= obj_size + os_page_size + sizeof(mi_block_t)); @@ -630,7 +629,7 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { } uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size; mi_assert_internal(_mi_is_aligned(guard_page, os_page_size)); - if (segment->allow_decommit && _mi_is_aligned(guard_page, os_page_size)) { + if (!page->memid.is_pinned && _mi_is_aligned(guard_page, os_page_size)) { _mi_os_protect(guard_page, os_page_size); } else { @@ -640,9 +639,9 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { // align pointer just in front of the guard page size_t offset = block_size - os_page_size - obj_size; mi_assert_internal(offset > 
sizeof(mi_block_t)); - if (offset > MI_BLOCK_ALIGNMENT_MAX) { + if (offset > MI_PAGE_MAX_OVERALLOC_ALIGN) { // give up to place it right in front of the guard page if the offset is too large for unalignment - offset = MI_BLOCK_ALIGNMENT_MAX; + offset = MI_PAGE_MAX_OVERALLOC_ALIGN; } void* p = (uint8_t*)block + offset; mi_track_align(block, p, offset, obj_size); diff --git a/src/free.c b/src/free.c index 0da0332e..49bf8bf6 100644 --- a/src/free.c +++ b/src/free.c @@ -519,7 +519,7 @@ static void mi_block_unguard(mi_page_t* page, mi_block_t* block, void* p) { const size_t bsize = mi_page_block_size(page); const size_t psize = _mi_os_page_size(); mi_assert_internal(bsize > psize); - mi_assert_internal(_mi_page_segment(page)->allow_decommit); + mi_assert_internal(!page->memid.is_pinned); void* gpage = (uint8_t*)block + bsize - psize; mi_assert_internal(_mi_is_aligned(gpage, psize)); _mi_os_unprotect(gpage, psize); diff --git a/src/page.c b/src/page.c index f21bf91f..98319e53 100644 --- a/src/page.c +++ b/src/page.c @@ -756,7 +756,7 @@ static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { if (page != NULL) { #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { - mi_page_extend_free(heap, page, heap->tld); + mi_page_extend_free(heap, page); mi_assert_internal(mi_page_immediate_available(page)); } else From 3f732a981f8b4a8a7122b2b59f5c1a1b1141c848 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 15:49:20 -0800 Subject: [PATCH 056/264] fix debug build of MI_GUARDED --- src/alloc-aligned.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 4b142a1e..38e0371d 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -26,7 +26,7 @@ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { #if MI_GUARDED static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, size_t size, size_t alignment, bool zero) mi_attr_noexcept { // use over allocation for guarded blocksl - mi_assert_internal(alignment > 0 && alignment < MI_BLOCK_ALIGNMENT_MAX); + mi_assert_internal(alignment > 0 && alignment < MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t oversize = size + alignment - 1; void* base = _mi_heap_malloc_guarded(heap, oversize, zero); void* p = _mi_align_up_ptr(base, alignment); From bbcbd3cd1fee630547542c20f60b51d5eb62a001 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 19:06:06 -0800 Subject: [PATCH 057/264] add cast to avoid errors on clang 7 --- include/mimalloc/internal.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 176c1de8..c6d9ae36 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -534,7 +534,7 @@ static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { // Thread id of thread that owns this page static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { - return mi_atomic_load_relaxed(&page->xthread_id); + return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id); } // Thread free access @@ -605,11 +605,11 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_atomic_load_relaxed(&page->xthread_id) <= 
1); + return (mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id) <= 1); } static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return (mi_atomic_load_relaxed(&page->xthread_id) == 1); + return (mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id) == 1); } static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { @@ -675,7 +675,7 @@ static inline bool _mi_page_unown(mi_page_t* page) { // Page flags //----------------------------------------------------------- static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { - return mi_atomic_load_relaxed(&page->xflags); + return mi_atomic_load_relaxed(&((mi_page_t*)page)->xflags); } static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { From f28d5c7029976ce97565fe07ea5382c180c5f361 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 19:12:03 -0800 Subject: [PATCH 058/264] add cast to avoid errors on clang 7 --- src/bitmap.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.h b/src/bitmap.h index b26791cc..191b6864 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -100,7 +100,7 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { static inline size_t mi_bitmap_chunk_count(const mi_bitmap_t* bitmap) { - return mi_atomic_load_relaxed(&bitmap->chunk_count); + return mi_atomic_load_relaxed(&((mi_bitmap_t*)bitmap)->chunk_count); } static inline size_t mi_bitmap_max_bits(const mi_bitmap_t* bitmap) { From 56a1bd7f9ec5b37f65de8c8500ee5c4a4497d553 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 19:43:00 -0800 Subject: [PATCH 059/264] fix 32 bit multiply in generic ctz/clz --- include/mimalloc/bits.h | 4 ++-- src/libc.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index e47d8a76..cb0191cf 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -220,7 +220,7 @@ static inline size_t mi_popcount(size_t x) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsf(size_t x, size_t* idx) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); @@ -237,7 +237,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). 
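
/* ----------------------------------------------------------------------------
   Small standalone check (not from the patch) of the relation between the
   index of the highest set bit -- what a bit-scan-reverse such as `mi_bsr`
   reports -- and count-leading-zeros: for an N-bit word whose highest set bit
   is at index k, clz(x) == N-1-k. Uses the GCC/Clang builtin as the reference.
---------------------------------------------------------------------------- */
#include <assert.h>
#include <stdint.h>

int main(void) {
  for (int k = 0; k < 64; k++) {
    const uint64_t x = (uint64_t)1 << k;   // highest (and only) set bit at index k
    assert(__builtin_clzll(x) == 63 - k);
  }
  return 0;
}
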
static inline bool mi_bsr(size_t x, size_t* idx) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); diff --git a/src/libc.c b/src/libc.c index 3fdbf3e7..2b28bd25 100644 --- a/src/libc.c +++ b/src/libc.c @@ -289,7 +289,7 @@ static size_t mi_ctz_generic32(uint32_t x) { 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; if (x==0) return 32; - return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; + return debruijn[((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27]; } static size_t mi_clz_generic32(uint32_t x) { @@ -304,7 +304,7 @@ static size_t mi_clz_generic32(uint32_t x) { x |= x >> 4; x |= x >> 8; x |= x >> 16; - return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; + return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27]; } size_t _mi_clz_generic(size_t x) { From e44815ed6fa19eaf12d9141c1e202d8308dcf113 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 20:06:48 -0800 Subject: [PATCH 060/264] add bsf/bsr for compilation with older compilers (clang 7) --- include/mimalloc/bits.h | 22 ++++++++++++++++------ 1 file changed, 16 insertions(+), 6 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index cb0191cf..4f0dce71 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -153,9 +153,9 @@ size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); static inline size_t mi_ctz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 - uint64_t r; - __asm ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 + size_t r; + __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_tzcnt_u64(x); @@ -164,6 +164,11 @@ static inline size_t mi_ctz(size_t x) { return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); #elif mi_has_builtinz(ctz) return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); + #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) + if (x==0) return MI_SIZE_BITS; + size_t r; + __asm ("bsf\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return r; #else #define MI_HAS_FAST_BITSCAN 0 return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS); @@ -172,9 +177,9 @@ static inline size_t mi_ctz(size_t x) { static inline size_t mi_clz(size_t x) { #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 - uint64_t r; - __asm ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); - return r; + size_t r; + __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -182,6 +187,11 @@ static inline size_t mi_clz(size_t x) { return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS); #elif mi_has_builtinz(clz) return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS); + #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) + if (x==0) return MI_SIZE_BITS; + size_t r; + __asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return (MI_SIZE_BITS - 1 - r); #else #define MI_HAS_FAST_BITSCAN 0 return (x!=0 ? 
_mi_clz_generic(x) : MI_SIZE_BITS); From 3a92c3527045b7922e12131d248b6d57ea646de9 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 20:25:22 -0800 Subject: [PATCH 061/264] improve generic ctz/clz --- src/libc.c | 38 +++++++++++++++++++++++--------------- 1 file changed, 23 insertions(+), 15 deletions(-) diff --git a/src/libc.c b/src/libc.c index 2b28bd25..15d4d2a7 100644 --- a/src/libc.c +++ b/src/libc.c @@ -289,7 +289,7 @@ static size_t mi_ctz_generic32(uint32_t x) { 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 }; if (x==0) return 32; - return debruijn[((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27]; + return debruijn[(uint32_t)((x & -(int32_t)x) * (uint32_t)(0x077CB531U)) >> 27]; } static size_t mi_clz_generic32(uint32_t x) { @@ -307,25 +307,33 @@ static size_t mi_clz_generic32(uint32_t x) { return debruijn[(uint32_t)(x * (uint32_t)(0x07C4ACDDU)) >> 27]; } -size_t _mi_clz_generic(size_t x) { - if (x==0) return MI_SIZE_BITS; - #if (MI_SIZE_BITS <= 32) - return mi_clz_generic32((uint32_t)x); - #else - const size_t count = mi_clz_generic32((uint32_t)(x >> 32)); - if (count < 32) return count; - return (32 + mi_clz_generic32((uint32_t)x)); - #endif -} - size_t _mi_ctz_generic(size_t x) { if (x==0) return MI_SIZE_BITS; #if (MI_SIZE_BITS <= 32) return mi_ctz_generic32((uint32_t)x); #else - const size_t count = mi_ctz_generic32((uint32_t)x); - if (count < 32) return count; - return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + const uint32_t lo = (uint32_t)x; + if (lo != 0) { + return mi_ctz_generic32(lo); + } + else { + return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + } + #endif +} + +size_t _mi_clz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_clz_generic32((uint32_t)x); + #else + const uint32_t hi = (uint32_t)(x>>32); + if (hi != 0) { + return mi_clz_generic32(hi); + } + else { + return 32 + mi_clz_generic32((uint32_t)x); + } #endif } From c5a2d11193da2335741a6c66fed8d88c6dd53764 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 9 Dec 2024 20:40:26 -0800 Subject: [PATCH 062/264] add extra checks for valid pointers in the pagemap, add max_vabits and debug_commit_full_pagemap options --- ide/vs2022/mimalloc-override.vcxproj.filters | 4 ++- ide/vs2022/mimalloc.vcxproj.filters | 4 ++- include/mimalloc.h | 4 ++- include/mimalloc/internal.h | 1 + src/options.c | 2 ++ src/page-map.c | 37 +++++++++++--------- 6 files changed, 33 insertions(+), 19 deletions(-) diff --git a/ide/vs2022/mimalloc-override.vcxproj.filters b/ide/vs2022/mimalloc-override.vcxproj.filters index 0e63822c..fb48e98f 100644 --- a/ide/vs2022/mimalloc-override.vcxproj.filters +++ b/ide/vs2022/mimalloc-override.vcxproj.filters @@ -58,7 +58,9 @@ Sources - + + Sources + diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 7fc4ba9c..06b0364f 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -58,7 +58,9 @@ Sources - + + Sources + diff --git a/include/mimalloc.h b/include/mimalloc.h index 907ffadb..c11353b7 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -380,7 +380,9 @@ typedef enum mi_option_e { mi_option_target_segments_per_thread, // experimental (=0) mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_full_page_retain, // retain N full pages per size class (=2) - mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) + mi_option_max_page_candidates, // max candidate pages to consider for 
allocation (=4) + mi_option_max_vabits, // max virtual address bits to consider in user space (=48) + mi_option_debug_commit_full_pagemap, // commit the full pagemap to catch invalid pointer uses (=0) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c6d9ae36..c189a082 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -459,6 +459,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { } static inline mi_page_t* _mi_ptr_page(const void* p) { + mi_assert_internal(p==NULL || mi_is_in_heap_region(p)); #if MI_DEBUG || defined(__APPLE__) return _mi_checked_ptr_page(p); #else diff --git a/src/options.c b/src/options.c index f2e9297f..8fcee452 100644 --- a/src/options.c +++ b/src/options.c @@ -160,6 +160,8 @@ static mi_option_desc_t options[_mi_option_last] = { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(full_page_retain) }, { 4, UNINIT, MI_OPTION(max_page_candidates) }, + { 0, UNINIT, MI_OPTION(max_vabits) }, + { 0, UNINIT, MI_OPTION(debug_commit_full_pagemap) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page-map.c b/src/page-map.c index 475e8fc2..181db7f0 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -20,8 +20,11 @@ static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_C { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; bool _mi_page_map_init(void) { - size_t vbits = _mi_os_virtual_address_bits(); - if (vbits >= 48) vbits = 47; + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + if (vbits == 0) { + vbits = _mi_os_virtual_address_bits(); + if (vbits >= 48) { vbits = 47; } + } // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) // 64 KiB for 4 GiB address space (on 32-bit) mi_page_map_max_address = (void*)(MI_PU(1) << vbits); @@ -30,7 +33,7 @@ bool _mi_page_map_init(void) { mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_DEFAULT_BIT_COUNT); // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); - mi_page_map_all_committed = true; // (page_map_size <= 1*MI_MiB); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + mi_page_map_all_committed = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); @@ -52,26 +55,28 @@ bool _mi_page_map_init(void) { } static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { - // is the page map area that contains the page address committed? 
- if (!mi_page_map_all_committed) { - const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; - const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; - for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_clearN(&mi_page_map_commit, i, 1)) { - // this may race, in which case we do multiple commits (which is ok) + // is the page map area that contains the page address committed? + // we always set the commit bits so we can track what ranges are in-use. + // we only actually commit if the map wasn't committed fully already. + const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; + const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; + for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_clearN(&mi_page_map_commit, i, 1)) { + // this may race, in which case we do multiple commits (which is ok) + if (!mi_page_map_all_committed) { bool is_zero; uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); const size_t size = mi_page_map_entries_per_commit_bit; _mi_os_commit(start, size, &is_zero); - if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start,size); } - mi_bitmap_set(&mi_page_map_commit, i); + if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); } } + mi_bitmap_set(&mi_page_map_commit, i); } - #if MI_DEBUG > 0 - _mi_page_map[idx] = 0; - _mi_page_map[idx+slice_count-1] = 0; - #endif } + #if MI_DEBUG > 0 + _mi_page_map[idx] = 0; + _mi_page_map[idx+slice_count-1] = 0; + #endif } static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) { From 6798375f4734cf9d579f00c8f55313a48616633d Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 21:26:23 -0800 Subject: [PATCH 063/264] temporarily add macOS 13 and 12 for testing --- azure-pipelines.yml | 48 +++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index eb520aa0..d853db2f 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -305,3 +305,51 @@ jobs: - script: ctest --verbose --timeout 180 workingDirectory: $(BuildType) displayName: CTest + +- job: + displayName: macOS 13 (Ventura) + pool: + vmImage: + macOS-13 + strategy: + matrix: + Debug: + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + Release: + BuildType: release + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + steps: + - task: CMake@1 + inputs: + workingDirectory: $(BuildType) + cmakeArgs: .. $(cmakeExtraArgs) + - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) + displayName: Make + - script: ctest --verbose --timeout 180 + workingDirectory: $(BuildType) + displayName: CTest + +- job: + displayName: macOS 12 (Monterey) + pool: + vmImage: + macOS-12 + strategy: + matrix: + Debug: + BuildType: debug + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON + Release: + BuildType: release + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + steps: + - task: CMake@1 + inputs: + workingDirectory: $(BuildType) + cmakeArgs: .. 
$(cmakeExtraArgs) + - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) + displayName: Make + - script: ctest --verbose --timeout 180 + workingDirectory: $(BuildType) + displayName: CTest From f37aff6ee273cb149f3d103598c1c38ab673268d Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 9 Dec 2024 22:27:40 -0800 Subject: [PATCH 064/264] fix for macOS 14 and earlier --- include/mimalloc/internal.h | 2 +- src/heap.c | 6 +++--- src/init.c | 17 +++++++++++++---- 3 files changed, 17 insertions(+), 8 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c189a082..8a61a58e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -186,7 +186,7 @@ size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats // "heap.c" -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag); +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld); void _mi_heap_destroy_pages(mi_heap_t* heap); void _mi_heap_collect_abandon(mi_heap_t* heap); void _mi_heap_set_default_direct(mi_heap_t* heap); diff --git a/src/heap.c b/src/heap.c index d2914361..ee0a8ce9 100644 --- a/src/heap.c +++ b/src/heap.c @@ -182,9 +182,9 @@ mi_heap_t* mi_heap_get_backing(void) { return bheap; } -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag) { +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld) { _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = _mi_tld(); + heap->tld = (tld == NULL ? _mi_tld() : tld); // avoid reading the thread-local tld during initialization heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; @@ -216,7 +216,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? if (heap == NULL) return NULL; mi_assert(heap_tag >= 0 && heap_tag < 256); - _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */); + _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */, NULL); return heap; } diff --git a/src/init.c b/src/init.c index 2070405d..19e111d3 100644 --- a/src/init.c +++ b/src/init.c @@ -214,7 +214,7 @@ static void mi_heap_main_init(void) { if (_mi_heap_main.cookie == 0) { _mi_heap_main.thread_id = _mi_thread_id(); _mi_heap_main.cookie = 1; - #if defined(_WIN32) && !defined(MI_SHARED_LIB) + #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) _mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking #else _mi_random_init(&_mi_heap_main.random); @@ -344,8 +344,12 @@ static bool _mi_thread_heap_init(void) { //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { - // allocate heap and thread local data - mi_tld_t* tld = _mi_tld(); // allocates & initializes tld if needed + // allocates tld data + // note: we cannot access thread-locals yet as that can cause (recursive) allocation (on macOS <= 14 for + // example where the loader allocates thread-local data on demand). + mi_tld_t* tld = mi_tld_alloc(); + + // allocate and initialize the heap mi_memid_t memid; mi_heap_t* heap = (tld == NULL ? 
NULL : (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid)); if (heap==NULL || tld==NULL) { @@ -353,8 +357,13 @@ static bool _mi_thread_heap_init(void) { return false; } heap->memid = memid; - _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */); + _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */, tld); + + // associate the heap with this thread + // (this is safe, on macOS for example, the heap is set in a dedicated TLS slot and thus does not cause recursive allocation) _mi_heap_set_default_direct(heap); + // now that the heap is set for this thread, we can set the thread-local tld. + mi_tld = tld; } return false; } From 7cd8f31f30cf77bf957b29af1d6f370b0b935759 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 10 Dec 2024 14:50:55 -0800 Subject: [PATCH 065/264] improve popcount --- include/mimalloc/bits.h | 48 +++++++++++++++++++++-------------------- src/libc.c | 4 ++-- 2 files changed, 27 insertions(+), 25 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 4f0dce71..c0405d6f 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -144,11 +144,31 @@ typedef int32_t mi_ssize_t; #define mi_msc_builtinz(name) name##64 #endif - /* -------------------------------------------------------------------------------- - Count trailing/leading zero's + Popcount and count trailing/leading zero's -------------------------------------------------------------------------------- */ +size_t _mi_popcount_generic(size_t x); + +static inline size_t mi_popcount(size_t x) { + #if mi_has_builtinz(popcount) + return mi_builtinz(popcount)(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + return mi_msc_builtinz(__popcnt)(x); + #elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_mm_popcnt_u64(x); + #else + #define MI_HAS_FAST_POPCOUNT 0 + return (x<=1 ? x : _mi_popcount_generic(x)); + #endif +} + +#ifndef MI_HAS_FAST_POPCOUNT +#define MI_HAS_FAST_POPCOUNT 1 +#endif + + + size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); @@ -169,6 +189,8 @@ static inline size_t mi_ctz(size_t x) { size_t r; __asm ("bsf\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; + #elif MI_HAS_FAST_POPCOUNT + return (x!=0 ? (mi_popcount(x^(x-1))-1) : MI_SIZE_BITS); #else #define MI_HAS_FAST_BITSCAN 0 return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS); @@ -179,7 +201,7 @@ static inline size_t mi_clz(size_t x) { #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 size_t r; __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); - return r; + return r; #elif MI_ARCH_X64 && defined(__BMI1__) return (size_t)_lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) @@ -202,26 +224,6 @@ static inline size_t mi_clz(size_t x) { #define MI_HAS_FAST_BITSCAN 1 #endif -size_t _mi_popcount_generic(size_t x); - -static inline size_t mi_popcount(size_t x) { - #if mi_has_builtinz(popcount) - return mi_builtinz(popcount)(x); - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) - return mi_msc_builtinz(__popcnt)(x); - #elif MI_ARCH_X64 && defined(__BMI1__) - return (size_t)_mm_popcnt_u64(x); - #else - #define MI_HAS_FAST_POPCOUNT 0 - return (x<=1 ? 
x : _mi_popcount_generic(x)); - #endif -} - -#ifndef MI_HAS_FAST_POPCOUNT -#define MI_HAS_FAST_POPCOUNT 1 -#endif - - /* -------------------------------------------------------------------------------- find trailing/leading zero (bit scan forward/reverse) -------------------------------------------------------------------------------- */ diff --git a/src/libc.c b/src/libc.c index 15d4d2a7..eed63d87 100644 --- a/src/libc.c +++ b/src/libc.c @@ -283,7 +283,7 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) { #if !MI_HAS_FAST_BITSCAN static size_t mi_ctz_generic32(uint32_t x) { - // de Bruijn multiplication, see + // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 @@ -293,7 +293,7 @@ static size_t mi_ctz_generic32(uint32_t x) { } static size_t mi_clz_generic32(uint32_t x) { - // de Bruijn multiplication, see + // de Bruijn multiplication, see static const uint8_t debruijn[32] = { 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 From 13be5d6740f43931342ba0e59364a68e275da47a Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 10 Dec 2024 15:11:46 -0800 Subject: [PATCH 066/264] use non-null tld in heap_init --- src/heap.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/heap.c b/src/heap.c index ee0a8ce9..1b5d14b4 100644 --- a/src/heap.c +++ b/src/heap.c @@ -184,7 +184,7 @@ mi_heap_t* mi_heap_get_backing(void) { void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld) { _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = (tld == NULL ? _mi_tld() : tld); // avoid reading the thread-local tld during initialization + heap->tld = tld; // avoid reading the thread-local tld during initialization heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; @@ -216,7 +216,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? if (heap == NULL) return NULL; mi_assert(heap_tag >= 0 && heap_tag < 256); - _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */, NULL); + _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? 
*/, (uint8_t)heap_tag /* heap tag */, bheap->tld); return heap; } From c478ddaab490a5161de2a297e126b2e561c010a2 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 10 Dec 2024 19:44:54 -0800 Subject: [PATCH 067/264] fix MI_GUARDED build --- ide/vs2022/mimalloc.vcxproj | 2 +- src/alloc.c | 5 ++++- src/arena.c | 22 +++++++++++++++++++--- src/init.c | 2 +- src/libc.c | 4 ++-- test/main-override-static.c | 4 ++-- test/test-stress.c | 2 +- 7 files changed, 30 insertions(+), 11 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index d8cc25b1..3f1280ee 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -116,7 +116,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); + MI_DEBUG=3;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/src/alloc.c b/src/alloc.c index b0c89e65..25d6f62e 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -628,6 +628,9 @@ static void* mi_block_ptr_set_guarded(mi_block_t* block, size_t obj_size) { return NULL; } uint8_t* guard_page = (uint8_t*)block + block_size - os_page_size; + // note: the alignment of the guard page relies on blocks being os_page_size aligned which + // is ensured in `mi_arena_page_alloc_fresh`. + mi_assert_internal(_mi_is_aligned(block, os_page_size)); mi_assert_internal(_mi_is_aligned(guard_page, os_page_size)); if (!page->memid.is_pinned && _mi_is_aligned(guard_page, os_page_size)) { _mi_os_protect(guard_page, os_page_size); @@ -662,7 +665,7 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo const size_t req_size = _mi_align_up(bsize + os_page_size, os_page_size); mi_block_t* const block = (mi_block_t*)_mi_malloc_generic(heap, req_size, zero, 0 /* huge_alignment */); if (block==NULL) return NULL; - void* const p = mi_block_ptr_set_guarded(block, obj_size); + void* const p = mi_block_ptr_set_guarded(block, obj_size); // stats mi_track_malloc(p, size, zero); diff --git a/src/arena.c b/src/arena.c index 24835f42..9923eae1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -285,7 +285,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re } // check arena bounds - const size_t min_reserve = MI_ARENA_MIN_SIZE; + const size_t min_reserve = MI_ARENA_MIN_SIZE; const size_t max_reserve = MI_ARENA_MAX_SIZE; // 16 GiB if (arena_reserve < min_reserve) { arena_reserve = min_reserve; @@ -302,7 +302,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } // and try to reserve the arena - int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { // failed, try a smaller size? const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); @@ -624,7 +624,23 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) { _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small.\n"); }; - const size_t block_start = (os_align ? 
MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); + size_t block_start; + #if MI_GUARDED + // in a guarded build, we aling pages with blocks a multiple of an OS page size, to the OS page size + // this ensures that all blocks in such pages are OS page size aligned (which is needed for the guard pages) + const size_t os_page_size = _mi_os_page_size(); + mi_assert_internal(MI_PAGE_ALIGN >= os_page_size); + if (block_size % os_page_size == 0) { + block_start = _mi_align_up(MI_PAGE_INFO_SIZE, os_page_size); + } + else + #endif + if (os_align) { + block_start = MI_PAGE_ALIGN; + } + else { + block_start = MI_PAGE_INFO_SIZE; + } const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; diff --git a/src/init.c b/src/init.c index 19e111d3..57be59a8 100644 --- a/src/init.c +++ b/src/init.c @@ -180,7 +180,7 @@ mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t samp if (heap->guarded_sample_rate >= 1) { heap->guarded_sample_seed = heap->guarded_sample_seed % heap->guarded_sample_rate; } - heap->guarded_sample_count = heap->guarded_sample_seed; // count down samples + heap->guarded_sample_count = 1 + heap->guarded_sample_seed; // count down samples } mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max) { diff --git a/src/libc.c b/src/libc.c index eed63d87..0ec2164d 100644 --- a/src/libc.c +++ b/src/libc.c @@ -84,8 +84,8 @@ bool _mi_getenv(const char* name, char* result, size_t result_size) { // This is mostly to avoid calling these when libc is not yet // initialized (and to reduce dependencies) // -// format: d i, p x u, s -// prec: z l ll L +// format: d i, p, x, u, s +// type: z l ll L // width: 10 // align-left: - // fill: 0 diff --git a/test/main-override-static.c b/test/main-override-static.c index 2e7f1aca..410764bd 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -233,8 +233,8 @@ static void test_heap_walk(void) { } static void test_canary_leak(void) { - char* p = mi_mallocn_tp(char, 23); - for (int i = 0; i < 23; i++) { + char* p = mi_mallocn_tp(char, 22); + for (int i = 0; i < 22; i++) { p[i] = '0'+i; } puts(p); diff --git a/test/test-stress.c b/test/test-stress.c index 915c953f..0488fc2b 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -42,7 +42,7 @@ static int SCALE = 10; static int ITER = 10; #elif 0 static int THREADS = 4; -static int SCALE = 100; +static int SCALE = 10; static int ITER = 10; #define ALLOW_LARGE false #elif 0 From 64c4181ffa63e21b644f4f06d42279bfd4e82cf1 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 10 Dec 2024 20:32:48 -0800 Subject: [PATCH 068/264] better block alignment --- include/mimalloc/internal.h | 5 +++++ include/mimalloc/types.h | 7 ++++--- src/alloc-aligned.c | 4 +++- src/arena.c | 19 +++++++++++-------- 4 files changed, 23 insertions(+), 12 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 8a61a58e..5c5afca0 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -479,11 +479,16 @@ static inline uint8_t* mi_page_start(const mi_page_t* page) { return page->page_start; } + static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { if (size) { *size = mi_page_block_size(page) * page->reserved; } return mi_page_start(page); } +static inline size_t mi_page_info_size(void) { + return _mi_align_up(sizeof(mi_page_t), MI_MAX_ALIGN_SIZE); +} + static inline bool 
mi_page_contains_address(const mi_page_t* page, const void* p) { size_t psize; uint8_t* start = mi_page_area(page, &psize); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 71edb397..dc1c93fe 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -323,13 +323,14 @@ typedef struct mi_page_s { // ------------------------------------------------------ #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. -#define MI_PAGE_MIN_BLOCK_ALIGN MI_SIZE_BITS // minimal block alignment in a page (64b on 64-bit, 32b on 32-bit) +#define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b) +#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation #if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 -#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) +#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+2)*32) // 160 >= sizeof(mi_page_t) #else -#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN) // >= sizeof(mi_page_t) +#define MI_PAGE_INFO_SIZE ((MI_INTPTR_SHIFT+1)*32) // 128/96 >= sizeof(mi_page_t) #endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 38e0371d..c36ce0af 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -20,7 +20,9 @@ static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) { mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0)); if (alignment > size) return false; const size_t bsize = mi_good_size(size); - return (bsize <= MI_PAGE_MIN_BLOCK_ALIGN && (bsize & (alignment-1)) == 0); + const bool ok = (bsize <= MI_PAGE_MAX_START_BLOCK_ALIGN2 && _mi_is_power_of_two(bsize)); + if (ok) { mi_assert_internal((bsize & (alignment-1)) == 0); } // since both power of 2 and alignment <= size + return ok; } #if MI_GUARDED diff --git a/src/arena.c b/src/arena.c index 9923eae1..a05e1f5d 100644 --- a/src/arena.c +++ b/src/arena.c @@ -621,25 +621,28 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz } } #endif - if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) { - _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small.\n"); - }; + mi_assert(MI_PAGE_INFO_SIZE >= mi_page_info_size()); size_t block_start; #if MI_GUARDED - // in a guarded build, we aling pages with blocks a multiple of an OS page size, to the OS page size + // in a guarded build, we align pages with blocks a multiple of an OS page size, to the OS page size // this ensures that all blocks in such pages are OS page size aligned (which is needed for the guard pages) const size_t os_page_size = _mi_os_page_size(); mi_assert_internal(MI_PAGE_ALIGN >= os_page_size); - if (block_size % os_page_size == 0) { - block_start = _mi_align_up(MI_PAGE_INFO_SIZE, os_page_size); + if (block_size % os_page_size == 0 && block_size > os_page_size /* at least 2 or more */ ) { + block_start = _mi_align_up(_mi_page_info_size(), os_page_size); } else #endif if (os_align) { block_start = MI_PAGE_ALIGN; } + else if (_mi_is_power_of_two(block_size) && block_size <= MI_PAGE_MAX_START_BLOCK_ALIGN2) { + // naturally align all power-of-2 blocks + 
block_start = _mi_align_up(mi_page_info_size(), block_size); + } else { - block_start = MI_PAGE_INFO_SIZE; + // otherwise start after the info + block_start = mi_page_info_size(); } const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); @@ -691,7 +694,7 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si const mi_arena_id_t req_arena_id = heap->arena_id; mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); - const size_t info_size = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE); + const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); const size_t slice_count = mi_slice_count_of_size(info_size + block_size); mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld); From 24d3c1bc14b0286607f764d6dda8b1c55e2ad40d Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 09:16:28 -0800 Subject: [PATCH 069/264] heap meta data always uses mi_meta_zalloc --- include/mimalloc/internal.h | 1 + src/alloc-aligned.c | 2 ++ src/heap.c | 62 ++++++++++++++++++++++--------------- src/init.c | 16 +++------- 4 files changed, 45 insertions(+), 36 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 5c5afca0..a2e1d5d7 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -186,6 +186,7 @@ size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats // "heap.c" +mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld); void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld); void _mi_heap_destroy_pages(mi_heap_t* heap); void _mi_heap_collect_abandon(mi_heap_t* heap); diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index c36ce0af..14cbee45 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -183,6 +183,8 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t #endif // try first if there happens to be a small block available with just the right alignment + // since most small power-of-2 blocks (under MI_PAGE_MAX_BLOCK_START_ALIGN2) are already + // naturally aligned this can be often the case. if mi_likely(size <= MI_SMALL_SIZE_MAX && alignment <= size) { const uintptr_t align_mask = alignment-1; // for any x, `(x & align_mask) == (x % alignment)` const size_t padsize = size + MI_PADDING_SIZE; diff --git a/src/heap.c b/src/heap.c index 1b5d14b4..837e7cd8 100644 --- a/src/heap.c +++ b/src/heap.c @@ -182,19 +182,25 @@ mi_heap_t* mi_heap_get_backing(void) { return bheap; } -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld) { +// todo: make order of parameters consistent (but would that break compat with CPython?) 
+void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) +{ + mi_assert_internal(heap!=NULL); + mi_memid_t memid = heap->memid; _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); - heap->tld = tld; // avoid reading the thread-local tld during initialization + heap->memid = memid; + heap->tld = tld; // avoid reading the thread-local tld during initialization heap->thread_id = _mi_thread_id(); heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); - heap->tag = tag; + heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. // (but abandoning is good in this case) heap->allow_page_reclaim = false; } + if (heap->tld->heap_backing == NULL) { heap->tld->heap_backing = heap; // first heap becomes the backing heap _mi_random_init(&heap->random); @@ -206,18 +212,31 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->keys[0] = _mi_heap_random_next(heap); heap->keys[1] = _mi_heap_random_next(heap); _mi_heap_guarded_init(heap); + // push on the thread local heaps list heap->next = heap->tld->heaps; heap->tld->heaps = heap; } +mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld) { + mi_assert_internal(tld!=NULL); + mi_assert(heap_tag >= 0 && heap_tag < 256); + // allocate and initialize a heap + mi_memid_t memid; + mi_heap_t* heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); + if (heap==NULL) { + _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); + return NULL; + } + heap->memid = memid; + _mi_heap_init(heap, arena_id, allow_destroy, (uint8_t)heap_tag, tld); + return heap; +} + mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) { mi_heap_t* bheap = mi_heap_get_backing(); - mi_heap_t* heap = mi_heap_malloc_tp(bheap, mi_heap_t); // todo: OS allocate in secure mode? - if (heap == NULL) return NULL; - mi_assert(heap_tag >= 0 && heap_tag < 256); - _mi_heap_init(heap, arena_id, allow_destroy /* no reclaim? */, (uint8_t)heap_tag /* heap tag */, bheap->tld); - return heap; + mi_assert_internal(bheap != NULL); + return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld); } mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { @@ -276,7 +295,7 @@ static void mi_heap_free(mi_heap_t* heap) { mi_assert_internal(heap->tld->heaps != NULL); // and free the used memory - mi_free(heap); + _mi_meta_free(heap, sizeof(*heap), heap->memid); } // return a heap on the same thread as `heap` specialized for the specified tag (if it exists) @@ -402,13 +421,7 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { mi_assert_internal(heap!=NULL); if (from==NULL || from->page_count == 0) return; - // reduce the size of the delayed frees - // _mi_heap_delayed_free_partial(from); - // transfer all pages by appending the queues; this will set a new heap field - // so threads may do delayed frees in either heap for a while. 
- // note: appending waits for each page to not be in the `MI_DELAYED_FREEING` state - // so after this only the new heap will get delayed frees for (size_t i = 0; i <= MI_BIN_FULL; i++) { mi_page_queue_t* pq = &heap->pages[i]; mi_page_queue_t* append = &from->pages[i]; @@ -418,19 +431,17 @@ static void mi_heap_absorb(mi_heap_t* heap, mi_heap_t* from) { } mi_assert_internal(from->page_count == 0); - // and do outstanding delayed frees in the `from` heap - // note: be careful here as the `heap` field in all those pages no longer point to `from`, - // turns out to be ok as `_mi_heap_delayed_free` only visits the list and calls a - // the regular `_mi_free_delayed_block` which is safe. - //_mi_heap_delayed_free_all(from); - //#if !defined(_MSC_VER) || (_MSC_VER > 1900) // somehow the following line gives an error in VS2015, issue #353 - // mi_assert_internal(mi_atomic_load_ptr_relaxed(mi_block_t,&from->thread_delayed_free) == NULL); - //#endif - // and reset the `from` heap mi_heap_reset_pages(from); } +// are two heaps compatible with respect to heap-tag, exclusive arena etc. +static bool mi_heaps_are_compatible(mi_heap_t* heap1, mi_heap_t* heap2) { + return (heap1->tag == heap2->tag && // store same kind of objects + heap1->tld->subproc == heap2->tld->subproc && // same sub-process + heap1->arena_id == heap2->arena_id); // same arena preference +} + // Safe delete a heap without freeing any still allocated blocks in that heap. void mi_heap_delete(mi_heap_t* heap) { @@ -439,7 +450,8 @@ void mi_heap_delete(mi_heap_t* heap) mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; - if (!mi_heap_is_backing(heap)) { + mi_heap_t* bheap = heap->tld->heap_backing; + if (heap != bheap && mi_heaps_are_compatible(bheap,heap)) { // transfer still used pages to the backing heap mi_heap_absorb(heap->tld->heap_backing, heap); } diff --git a/src/init.c b/src/init.c index 57be59a8..ae1ae086 100644 --- a/src/init.c +++ b/src/init.c @@ -345,25 +345,19 @@ static bool _mi_thread_heap_init(void) { } else { // allocates tld data - // note: we cannot access thread-locals yet as that can cause (recursive) allocation (on macOS <= 14 for - // example where the loader allocates thread-local data on demand). + // note: we cannot access thread-locals yet as that can cause (recursive) allocation + // (on macOS <= 14 for example where the loader allocates thread-local data on demand). mi_tld_t* tld = mi_tld_alloc(); // allocate and initialize the heap - mi_memid_t memid; - mi_heap_t* heap = (tld == NULL ? NULL : (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid)); - if (heap==NULL || tld==NULL) { - _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); - return false; - } - heap->memid = memid; - _mi_heap_init(heap, _mi_arena_id_none(), false /* can reclaim */, 0 /* default tag */, tld); + mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld); // associate the heap with this thread // (this is safe, on macOS for example, the heap is set in a dedicated TLS slot and thus does not cause recursive allocation) _mi_heap_set_default_direct(heap); + // now that the heap is set for this thread, we can set the thread-local tld. 
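// Condensed, the per-thread initialization order that this hunk establishes looks
// roughly as follows (a sketch; the signatures are the ones used in this patch series):
//   mi_tld_t*  tld  = mi_tld_alloc();                        // no thread-local access yet
//   mi_heap_t* heap = _mi_heap_create(0, false, _mi_arena_id_none(), tld);
//   _mi_heap_set_default_direct(heap);                       // dedicated TLS slot; safe on macOS
//   mi_tld = tld;                                            // ordinary thread-local written last
// writing `mi_tld` only after a default heap exists means a lazy thread-local
// allocation by the loader (macOS <= 14) cannot recurse back into this path.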
- mi_tld = tld; + mi_tld = tld; } return false; } From 565656919ed57b9530e9c23c64bb7b5e4b0a47b3 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 13:04:37 -0800 Subject: [PATCH 070/264] fix comments in types; fix guarded alignment bug --- ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc/internal.h | 9 +++- include/mimalloc/types.h | 95 +++++++++++++++---------------------- src/arena.c | 47 ++++++++---------- src/init.c | 5 +- 5 files changed, 67 insertions(+), 91 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 3f1280ee..34bb28fe 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -116,7 +116,7 @@ true Default ../../include - MI_DEBUG=3;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=1;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index a2e1d5d7..3c5bd486 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -581,7 +581,7 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) { return (page->free != NULL); } - + // is the page not yet used up to its reserved space? static inline bool mi_page_is_expandable(const mi_page_t* page) { mi_assert_internal(page != NULL); @@ -714,6 +714,12 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { Guarded objects ------------------------------------------------------------------- */ #if MI_GUARDED + +// we always align guarded pointers in a block at an offset +// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones +#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0)) +#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED) + static inline bool mi_block_ptr_is_guarded(const mi_block_t* block, const void* p) { const ptrdiff_t offset = (uint8_t*)p - (uint8_t*)block; return (offset >= (ptrdiff_t)(sizeof(mi_block_t)) && block->next == MI_BLOCK_TAG_GUARDED); @@ -895,6 +901,7 @@ static inline mi_memid_t _mi_memid_create_meta(void* mpage, size_t block_idx, si return memid; } + // ------------------------------------------------------------------- // Fast "random" shuffle // ------------------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index dc1c93fe..cc64a400 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -97,16 +97,8 @@ terms of the MIT license. A copy of the license can be found in the file #endif -// We used to abandon huge pages in order to eagerly deallocate it if freed from another thread. -// Unfortunately, that makes it not possible to visit them during a heap walk or include them in a -// `mi_heap_destroy`. We therefore instead reset/decommit the huge blocks nowadays if freed from -// another thread so the memory becomes "virtually" available (and eventually gets properly freed by -// the owning thread). -// #define MI_HUGE_PAGE_ABANDON 1 - - // ------------------------------------------------------ -// Main internal data-structures +// Sizes of internal data-structures // ------------------------------------------------------ // Sizes are for 64-bit @@ -145,21 +137,32 @@ terms of the MIT license. 
A copy of the license can be found in the file // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX +// ------------------------------------------------------ +// Arena's are large reserved areas of memory allocated from +// the OS that are managed by mimalloc to efficiently +// allocate MI_ARENA_SLICE_SIZE slices of memory for the +// mimalloc pages. +// ------------------------------------------------------ + +// A large memory arena where pages are allocated in. +typedef struct mi_arena_s mi_arena_t; // defined in `arena.c` + // --------------------------------------------------------------- // a memory id tracks the provenance of arena/OS allocated memory // --------------------------------------------------------------- -// Memory can reside in arena's, direct OS allocated, or statically allocated. The memid keeps track of this. +// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated. +// The memid keeps track of this. typedef enum mi_memkind_e { MI_MEM_NONE, // not allocated MI_MEM_EXTERNAL, // not owned by mimalloc but provided externally (via `mi_manage_os_memory` for example) - MI_MEM_STATIC, // allocated in a static area and should not be freed (for arena meta data for example) - MI_MEM_META, // allocated with the meta data allocator + MI_MEM_STATIC, // allocated in a static area and should not be freed (the initial main heap data for example (`init.c`)) + MI_MEM_META, // allocated with the meta data allocator (`arena-meta.c`) MI_MEM_OS, // allocated from the OS MI_MEM_OS_HUGE, // allocated as huge OS pages (usually 1GiB, pinned to physical memory) MI_MEM_OS_REMAP, // allocated in a remapable area (i.e. using `mremap`) - MI_MEM_ARENA // allocated from an arena (the usual case) + MI_MEM_ARENA // allocated from an arena (the usual case) (`arena.c`) } mi_memkind_t; static inline bool mi_memkind_is_os(mi_memkind_t memkind) { @@ -178,10 +181,9 @@ typedef struct mi_memid_os_info { } mi_memid_os_info_t; typedef struct mi_memid_arena_info { - uint32_t slice_index; // base index in the arena + mi_arena_t* arena; // arena that contains this memory + uint32_t slice_index; // slice index in the arena uint32_t slice_count; // allocated slices - mi_arena_id_t id; // arena id (>= 1) - bool is_exclusive; // this arena can only be used for specific arena allocations } mi_memid_arena_info_t; typedef struct mi_memid_meta_info { @@ -196,10 +198,10 @@ typedef struct mi_memid_s { mi_memid_arena_info_t arena; // only used for MI_MEM_ARENA mi_memid_meta_info_t meta; // only used for MI_MEM_META } mem; + mi_memkind_t memkind; bool is_pinned; // `true` if we cannot decommit/reset/protect in this memory (e.g. when allocated using large (2Mib) or huge (1GiB) OS pages) bool initially_committed;// `true` if the memory was originally allocated as committed bool initially_zero; // `true` if the memory was originally zero initialized - mi_memkind_t memkind; } mi_memid_t; @@ -227,32 +229,21 @@ typedef struct mi_block_s { mi_encoded_t next; } mi_block_t; -#if MI_GUARDED -// we always align guarded pointers in a block at an offset -// the block `next` field is then used as a tag to distinguish regular offset aligned blocks from guarded ones -#define MI_BLOCK_TAG_ALIGNED ((mi_encoded_t)(0)) -#define MI_BLOCK_TAG_GUARDED (~MI_BLOCK_TAG_ALIGNED) -#endif - - -// The owned flags are used for efficient multi-threaded free-ing -// When we push on the page thread free queue of an abandoned page, -// we also atomically get to own it. 
This is needed to atomically -// abandon a page (while other threads could concurrently free blocks in it). -typedef enum mi_owned_e { - MI_OWNED = 0, // some heap owns this page - MI_ABANDONED = 1, // the page is abandoned -} mi_owned_t; - // The `in_full` and `has_aligned` page flags are put in the same field // to efficiently test if both are false (`full_aligned == 0`) in the `mi_free` routine. +// `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing) +// `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing) #define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) #define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) typedef size_t mi_page_flags_t; // Thread free list. -// We use the bottom bit of the pointer for `mi_owned_t` flags +// Points to a list of blocks that are freed by other threads. +// The low-bit is set if the page is owned by the current thread. (`mi_page_is_owned`). +// Ownership is required before we can read any non-atomic fields in the page. +// This way we can push a block on the thread free list and try to claim ownership +// atomically in `free.c:mi_free_block_mt`. typedef uintptr_t mi_thread_free_t; // Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) @@ -276,19 +267,17 @@ typedef uint8_t mi_heaptag_t; // // We don't count `freed` (as |free|) but use `used` to reduce // the number of memory accesses in the `mi_page_all_free` function(s). -// +// // Notes: -// - Access is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` +// - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). +// - If a page is not part of a heap it is called "abandoned" -- in +// that case the `xthreadid` is 0 or 1 (1 is for abandoned pages that +// are in the abandoned page lists of an arena, these are called "mapped" abandoned pages). +// - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down -// - `xthread_free` uses the bottom bits as a delayed-free flags to optimize -// concurrent frees where only the first concurrent free adds to the owning -// heap `thread_delayed_free` list (see `free.c:mi_free_block_mt`). -// The invariant is that no-delayed-free is only set if there is -// at least one block that will be added, or as already been added, to -// the owning heap `thread_delayed_free` list. This guarantees that pages -// will be freed correctly even if only other threads free blocks. + typedef struct mi_page_s { - _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. 
(= heap->thread_id, or 0 or 1 if abandoned) mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) uint16_t used; // number of blocks in use (including blocks in `thread_free`) @@ -299,7 +288,7 @@ typedef struct mi_page_s { mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - _Atomic(mi_page_flags_t) xflags; // `in_full` and `has_aligned` flags + _Atomic(mi_page_flags_t) xflags; // `in_full_queue` and `has_aligned` flags size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks @@ -355,7 +344,7 @@ typedef enum mi_page_kind_e { MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages MI_PAGE_LARGE, // larger blocks go into 4MiB pages MI_PAGE_SINGLETON // page containing a single block. - // used for blocks `> MI_LARGE_OBJ_SIZE_MAX` or an aligment `> MI_BLOCK_ALIGNMENT_MAX`. + // used for blocks `> MI_LARGE_MAX_OBJ_SIZE` or an aligment `> MI_PAGE_MAX_OVERALLOC_ALIGN`. } mi_page_kind_t; @@ -366,7 +355,7 @@ typedef enum mi_page_kind_e { // A heap just owns a set of pages for allocation and // can only be allocate/reallocate from the thread that created it. // Freeing blocks can be done from any thread though. -// Per thread, the segments are shared among its heaps. +// // Per thread, there is always a default heap that is // used for allocation; it is initialized to statically // point to an empty heap to avoid initialization checks @@ -436,16 +425,6 @@ struct mi_heap_s { mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") }; -// ------------------------------------------------------ -// Arena's -// These are large reserved areas of memory allocated from -// the OS that are managed by mimalloc to efficiently -// allocate MI_SLICE_SIZE slices of memory for the -// mimalloc pages. -// ------------------------------------------------------ - -// A large memory arena where pages are allocated in. -typedef struct mi_arena_s mi_arena_t; // ------------------------------------------------------ // Debug diff --git a/src/arena.c b/src/arena.c index a05e1f5d..c9d21c75 100644 --- a/src/arena.c +++ b/src/arena.c @@ -35,14 +35,13 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo // A memory arena descriptor typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area - mi_arena_id_t id; // arena id; 0 for non-specific - + mi_arena_id_t id; // arena id (> 0 where `arena == arenas[arena->id - 1]`) + size_t slice_count; // size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node - bool exclusive; // only allow allocations if specifically for this arena + bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) - mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. mi_bitmap_t* slices_free; // is the slice free? 
@@ -93,7 +92,8 @@ static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclus bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { if (memid.memkind == MI_MEM_ARENA) { - return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + const mi_arena_t* arena = memid.mem.arena.arena; + return mi_arena_id_is_suitable(arena->id, arena->is_exclusive, request_arena_id); } else { return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); @@ -152,34 +152,25 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { // Create an arena memid -static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t slice_index, size_t slice_count) { +static mi_memid_t mi_memid_create_arena(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(slice_index < UINT32_MAX); mi_assert_internal(slice_count < UINT32_MAX); mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); - memid.mem.arena.id = id; + memid.mem.arena.arena = arena; memid.mem.arena.slice_index = (uint32_t)slice_index; - memid.mem.arena.slice_count = (uint32_t)slice_count; - memid.mem.arena.is_exclusive = is_exclusive; + memid.mem.arena.slice_count = (uint32_t)slice_count; return memid; } -// returns if the arena is exclusive -static bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* slice_index, size_t* slice_count) { +// get the arena and slice span +static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* slice_index, size_t* slice_count) { mi_assert_internal(memid.memkind == MI_MEM_ARENA); - *arena_index = mi_arena_id_index(memid.mem.arena.id); + mi_arena_t* arena = memid.mem.arena.arena; if (slice_index) *slice_index = memid.mem.arena.slice_index; if (slice_count) *slice_count = memid.mem.arena.slice_count; - return memid.mem.arena.is_exclusive; + return arena; } -// get the arena and slice index -static mi_arena_t* mi_arena_from_memid(mi_memid_t memid, size_t* slice_index, size_t* slice_count) { - size_t arena_index; - mi_arena_memid_indices(memid, &arena_index, slice_index, slice_count); - return mi_arena_from_index(arena_index); -} - - static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* slice_index, size_t* slice_count) { // todo: maybe store the arena* directly in the page? return mi_arena_from_memid(page->memid, slice_index, slice_count); @@ -198,7 +189,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // claimed it! 
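// A minimal sketch of what an arena memid carries after this change (field names
// as in the updated mi_memid_arena_info_t; illustrative only):
//   mi_arena_t* arena       = memid.mem.arena.arena;         // direct pointer, no arenas[] lookup
//   size_t      slice_index = memid.mem.arena.slice_index;   // first 64 KiB slice of the allocation
//   size_t      slice_count = memid.mem.arena.slice_count;   // number of slices spanned
//   void*       start       = mi_arena_slice_start(arena, slice_index);
// exclusivity is now read from the arena itself (`arena->is_exclusive`) rather than
// from a per-memid flag, which is what the following hunks adjust.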
void* p = mi_arena_slice_start(arena, slice_index); - *memid = mi_memid_create_arena(arena->id, arena->exclusive, slice_index, slice_count); + *memid = mi_memid_create_arena(arena, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; // set the dirty bits @@ -323,7 +314,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_arena_id, int numa_node, bool allow_large) { if (!allow_large && arena->is_large) return false; - if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return false; + if (!mi_arena_id_is_suitable(arena->id, arena->is_exclusive, req_arena_id)) return false; if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); if (!numa_suitable) return false; @@ -628,8 +619,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz // this ensures that all blocks in such pages are OS page size aligned (which is needed for the guard pages) const size_t os_page_size = _mi_os_page_size(); mi_assert_internal(MI_PAGE_ALIGN >= os_page_size); - if (block_size % os_page_size == 0 && block_size > os_page_size /* at least 2 or more */ ) { - block_start = _mi_align_up(_mi_page_info_size(), os_page_size); + if (!os_align && block_size % os_page_size == 0 && block_size > os_page_size /* at least 2 or more */ ) { + block_start = _mi_align_up(mi_page_info_size(), os_page_size); } else #endif @@ -961,7 +952,7 @@ static void mi_arenas_unsafe_destroy(void) { for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { - mi_lock_done(&arena->abandoned_visit_lock); + // mi_lock_done(&arena->abandoned_visit_lock); if (mi_memkind_is_os(arena->memid.memkind)) { mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); @@ -1085,13 +1076,13 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // init arena->id = _mi_arena_id_none(); arena->memid = memid; - arena->exclusive = exclusive; + arena->is_exclusive = exclusive; arena->slice_count = slice_count; arena->info_slices = info_slices; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; - mi_lock_init(&arena->abandoned_visit_lock); + // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps uint8_t* base = mi_arena_start(arena) + bitmap_base; diff --git a/src/init.c b/src/init.c index ae1ae086..a5a0819e 100644 --- a/src/init.c +++ b/src/init.c @@ -11,6 +11,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // memcpy, memset #include // atexit +#define MI_MEMID_STATIC {{{NULL,0}}, MI_MEM_STATIC, true /* pinned */, true /* committed */, false /* zero */ } // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { @@ -34,7 +35,7 @@ const mi_page_t _mi_page_empty = { NULL, // xheap NULL, NULL, // next, prev NULL, // subproc - { {{ NULL, 0}}, false, false, false, MI_MEM_NONE } // memid + MI_MEMID_STATIC // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -96,8 +97,6 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -#define MI_MEMID_STATIC {{{NULL,0}}, true /* pinned */, true /* committed */, false /* zero */, MI_MEM_STATIC } - mi_decl_cache_align const mi_heap_t _mi_heap_empty = { NULL, // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free From ab53a73cbdff604c0fa8b336030bd8d4e5f706a8 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 14:29:06 -0800 Subject: [PATCH 071/264] small updates --- include/mimalloc/atomic.h | 6 ++-- include/mimalloc/internal.h | 55 +++++++++++++++++----------- include/mimalloc/prim.h | 6 ++-- include/mimalloc/track.h | 6 ++-- include/mimalloc/types.h | 72 ++++++++++++++++--------------------- 5 files changed, 74 insertions(+), 71 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 3b0ff559..95c1aefd 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_ATOMIC_H -#define MIMALLOC_ATOMIC_H +#ifndef MI_ATOMIC_H +#define MI_ATOMIC_H // include windows.h or pthreads.h #if defined(_WIN32) @@ -509,4 +509,4 @@ static inline void mi_lock_done(mi_lock_t* lock) { -#endif // __MIMALLOC_ATOMIC_H +#endif // MI_ATOMIC_H diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3c5bd486..4b211d71 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_INTERNAL_H -#define MIMALLOC_INTERNAL_H +#ifndef MI_INTERNAL_H +#define MI_INTERNAL_H // -------------------------------------------------------------------------- @@ -239,27 +239,42 @@ bool _mi_page_is_valid(mi_page_t* page); #endif +// ------------------------------------------------------ +// Debug +// ------------------------------------------------------ + +#if !defined(MI_DEBUG_UNINIT) +#define MI_DEBUG_UNINIT (0xD0) +#endif +#if !defined(MI_DEBUG_FREED) +#define MI_DEBUG_FREED (0xDF) +#endif +#if !defined(MI_DEBUG_PADDING) +#define MI_DEBUG_PADDING (0xDE) +#endif + /* ----------------------------------------------------------- - Error codes passed to `_mi_fatal_error` - All are recoverable but EFAULT is a serious error and aborts by default in secure mode. 
- For portability define undefined error codes using common Unix codes: - + Assertions ----------------------------------------------------------- */ -#include -#ifndef EAGAIN // double free -#define EAGAIN (11) + +#if (MI_DEBUG) +// use our own assertion to print without memory allocation +void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func); +#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) +#else +#define mi_assert(x) #endif -#ifndef ENOMEM // out of memory -#define ENOMEM (12) + +#if (MI_DEBUG>1) +#define mi_assert_internal mi_assert +#else +#define mi_assert_internal(x) #endif -#ifndef EFAULT // corrupted free-list or meta-data -#define EFAULT (14) -#endif -#ifndef EINVAL // trying to free an invalid pointer -#define EINVAL (22) -#endif -#ifndef EOVERFLOW // count*size overflow -#define EOVERFLOW (75) + +#if (MI_DEBUG>2) +#define mi_assert_expensive mi_assert +#else +#define mi_assert_expensive(x) #endif @@ -1023,4 +1038,4 @@ static inline void _mi_memzero_aligned(void* dst, size_t n) { } -#endif +#endif // MI_INTERNAL_H diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 65f65376..99791585 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_PRIM_H -#define MIMALLOC_PRIM_H +#ifndef MI_PRIM_H +#define MI_PRIM_H // -------------------------------------------------------------------------- @@ -370,4 +370,4 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { #endif // mi_prim_get_default_heap() -#endif // MIMALLOC_PRIM_H +#endif // MI_PRIM_H diff --git a/include/mimalloc/track.h b/include/mimalloc/track.h index 4b5709e2..199308a6 100644 --- a/include/mimalloc/track.h +++ b/include/mimalloc/track.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_TRACK_H -#define MIMALLOC_TRACK_H +#ifndef MI_TRACK_H +#define MI_TRACK_H /* ------------------------------------------------------------------------------------------------------ Track memory ranges with macros for tools like Valgrind address sanitizer, or other memory checkers. @@ -142,4 +142,4 @@ defined, undefined, or not accessible at all: } #endif -#endif +#endif // MI_TRACK_H diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index cc64a400..03d522b5 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -5,8 +5,8 @@ terms of the MIT license. A copy of the license can be found in the file "LICENSE" at the root of this distribution. -----------------------------------------------------------------------------*/ #pragma once -#ifndef MIMALLOC_TYPES_H -#define MIMALLOC_TYPES_H +#ifndef MI_TYPES_H +#define MI_TYPES_H // -------------------------------------------------------------------------- // This file contains the main type definitions for mimalloc: @@ -21,12 +21,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // ptrdiff_t #include // uintptr_t, uint16_t, etc -#include "bits.h" // bit ops, size defines -#include "atomic.h" // _Atomic - -#ifdef _MSC_VER -#pragma warning(disable:4214) // bitfield is not int -#endif +#include // error codes +#include "bits.h" // size defines (MI_INTPTR_SIZE etc), bit operations +#include "atomic.h" // _Atomic primitives // Minimal alignment necessary. On most platforms 16 bytes are needed // due to SSE registers for example. This must be at least `sizeof(void*)` @@ -351,6 +348,7 @@ typedef enum mi_page_kind_e { // ------------------------------------------------------ // Heaps +// // Provide first-class heaps to allocate from. // A heap just owns a set of pages for allocation and // can only be allocate/reallocate from the thread that created it. @@ -426,40 +424,6 @@ struct mi_heap_s { }; -// ------------------------------------------------------ -// Debug -// ------------------------------------------------------ - -#if !defined(MI_DEBUG_UNINIT) -#define MI_DEBUG_UNINIT (0xD0) -#endif -#if !defined(MI_DEBUG_FREED) -#define MI_DEBUG_FREED (0xDF) -#endif -#if !defined(MI_DEBUG_PADDING) -#define MI_DEBUG_PADDING (0xDE) -#endif - -#if (MI_DEBUG) -// use our own assertion to print without memory allocation -void _mi_assert_fail(const char* assertion, const char* fname, unsigned int line, const char* func ); -#define mi_assert(expr) ((expr) ? (void)0 : _mi_assert_fail(#expr,__FILE__,__LINE__,__func__)) -#else -#define mi_assert(x) -#endif - -#if (MI_DEBUG>1) -#define mi_assert_internal mi_assert -#else -#define mi_assert_internal(x) -#endif - -#if (MI_DEBUG>2) -#define mi_assert_expensive mi_assert -#else -#define mi_assert_expensive(x) -#endif - // ------------------------------------------------------ // Statistics // ------------------------------------------------------ @@ -575,4 +539,28 @@ struct mi_tld_s { mi_stats_t stats; // statistics }; +/* ----------------------------------------------------------- + Error codes passed to `_mi_fatal_error` + All are recoverable but EFAULT is a serious error and aborts by default in secure mode. 
+ For portability define undefined error codes using common Unix codes: + +----------------------------------------------------------- */ + +#ifndef EAGAIN // double free +#define EAGAIN (11) #endif +#ifndef ENOMEM // out of memory +#define ENOMEM (12) +#endif +#ifndef EFAULT // corrupted free-list or meta-data +#define EFAULT (14) +#endif +#ifndef EINVAL // trying to free an invalid pointer +#define EINVAL (22) +#endif +#ifndef EOVERFLOW // count*size overflow +#define EOVERFLOW (75) +#endif + + +#endif // MI_TYPES_H From 1c8d15abac1a9d269c335be7eaab4a8bc23aaf09 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 14:30:44 -0800 Subject: [PATCH 072/264] fix build error --- include/mimalloc/internal.h | 14 -------------- include/mimalloc/types.h | 14 ++++++++++++++ 2 files changed, 14 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 4b211d71..fb359763 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -239,20 +239,6 @@ bool _mi_page_is_valid(mi_page_t* page); #endif -// ------------------------------------------------------ -// Debug -// ------------------------------------------------------ - -#if !defined(MI_DEBUG_UNINIT) -#define MI_DEBUG_UNINIT (0xD0) -#endif -#if !defined(MI_DEBUG_FREED) -#define MI_DEBUG_FREED (0xDF) -#endif -#if !defined(MI_DEBUG_PADDING) -#define MI_DEBUG_PADDING (0xDE) -#endif - /* ----------------------------------------------------------- Assertions ----------------------------------------------------------- */ diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 03d522b5..77752398 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -562,5 +562,19 @@ struct mi_tld_s { #define EOVERFLOW (75) #endif +// ------------------------------------------------------ +// Debug +// ------------------------------------------------------ + +#ifndef MI_DEBUG_UNINIT +#define MI_DEBUG_UNINIT (0xD0) +#endif +#ifndef MI_DEBUG_FREED +#define MI_DEBUG_FREED (0xDF) +#endif +#ifndef MI_DEBUG_PADDING +#define MI_DEBUG_PADDING (0xDE) +#endif + #endif // MI_TYPES_H From ccf5e36e6bb273d0633b0756067d4808008b0b8a Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 16:26:39 -0800 Subject: [PATCH 073/264] use frac 8 for reclaim_on_free and reabandon; halve full_page_retain if running in a threadpool --- include/mimalloc/types.h | 1 + src/free.c | 4 ++-- src/heap.c | 5 +++++ src/init.c | 4 ++++ src/page.c | 2 +- 5 files changed, 13 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 77752398..f4bfa07a 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -409,6 +409,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread mi_memid_t memid; // provenance of the heap struct itseft (meta or os) + long full_page_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types diff --git a/src/free.c b/src/free.c index 49bf8bf6..14034593 100644 --- a/src/free.c +++ b/src/free.c @@ -219,7 +219,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { // 2. 
if the page is not too full, we can try to reclaim it for ourselves // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && - !mi_page_is_used_at_frac(page,4) + !mi_page_is_used_at_frac(page,8) // && !mi_page_is_abandoned_mapped(page) ) { @@ -250,7 +250,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { } // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!mi_page_is_used_at_frac(page,4) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!mi_page_is_used_at_frac(page,8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arena_page_try_reabandon_to_mapped(page)) { diff --git a/src/heap.c b/src/heap.c index 70162d46..1d8142f7 100644 --- a/src/heap.c +++ b/src/heap.c @@ -194,11 +194,16 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->arena_id = arena_id; heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); + heap->full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. // (but abandoning is good in this case) heap->allow_page_reclaim = false; + // and halve the full page retain (possibly to 0) + if (heap->full_page_retain >= 0) { + heap->full_page_retain = heap->full_page_retain / 2; + } } if (heap->tld->heap_backing == NULL) { diff --git a/src/init.c b/src/init.c index a5a0819e..85588970 100644 --- a/src/init.c +++ b/src/init.c @@ -109,6 +109,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { MI_BIN_FULL, 0, // page retired min/max NULL, // next MI_MEMID_STATIC, // memid + 0, // full page retain false, // can reclaim true, // can eager abandon 0, // tag @@ -155,6 +156,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { MI_BIN_FULL, 0, // page retired min/max NULL, // next heap MI_MEMID_STATIC, // memid + 2, // full page retain true, // allow page reclaim true, // allow page abandon 0, // tag @@ -224,6 +226,8 @@ static void mi_heap_main_init(void) { mi_lock_init(&mi_subproc_default.abandoned_os_lock); mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock); _mi_heap_guarded_init(&_mi_heap_main); + _mi_heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); + _mi_heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); } } diff --git a/src/page.c b/src/page.c index 98319e53..a90c1d7d 100644 --- a/src/page.c +++ b/src/page.c @@ -642,7 +642,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m size_t count = 0; #endif long candidate_limit = 0; // we reset this on the first candidate to limit the search - long full_page_retain = _mi_option_get_fast(mi_option_full_page_retain); + long full_page_retain = heap->full_page_retain; mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; From aed76f29100302d3bbb9da7f2dfa75cd78f167e7 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 20:34:23 -0800 Subject: [PATCH 074/264] wip: allow arena (re)loading --- 
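[Editor's note, not part of the patch: a hedged usage sketch of the arena unload/reload API introduced below. It assumes an exclusive arena created over caller-managed memory with `mi_manage_os_memory_ex` (as `mi_arena_unload` only accepts exclusive, externally backed arenas), and that the hypothetical buffer `mem` is slice-aligned so the arena starts at `mem` itself.]

    // create an exclusive arena over memory we manage ourselves
    mi_arena_id_t id;
    mi_manage_os_memory_ex(mem, mem_size, /*is_committed*/ true, /*is_large*/ false,
                           /*is_zero*/ true, /*numa_node*/ -1, /*exclusive*/ true, &id);
    // ... allocate from heaps bound to this arena id ...

    // detach the arena: `base` is the arena itself, `accessed` the prefix that was touched
    void* base; size_t accessed, full;
    if (mi_arena_unload(id, &base, &accessed, &full)) {
      // persist or transfer [base, base+accessed), then later reload it at the same address
      mi_arena_id_t id2;
      mi_arena_reload(base, full, /*is_committed*/ true, /*is_large*/ false, /*is_zero*/ false, &id2);
    }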
ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc.h | 11 ++ include/mimalloc/internal.h | 1 - src/arena.c | 262 ++++++++++++++++++++---------------- src/bitmap.c | 35 +++++ src/bitmap.h | 6 + src/os.c | 47 ++++++- 7 files changed, 244 insertions(+), 120 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 34bb28fe..d8cc25b1 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -116,7 +116,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=1;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc.h b/include/mimalloc.h index c11353b7..97f74c83 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -317,6 +317,17 @@ mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t samp mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, size_t max); +// experimental +//mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size); +//mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size); +//mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size); +//mi_decl_export void mi_os_free(void* p, size_t size); +//mi_decl_export void mi_os_commit(void* p, size_t size); +//mi_decl_export void mi_os_decommit(void* p, size_t size); + +mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size); +mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id); + // ------------------------------------------------------ // Convenience // ------------------------------------------------------ diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index fb359763..3be08b94 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -143,7 +143,6 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t m // arena.c mi_arena_id_t _mi_arena_id_none(void); void _mi_arena_init(void); -void _mi_arena_free(void* p, size_t size, size_t still_committed_size, mi_memid_t memid); void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); diff --git a/src/arena.c b/src/arena.c index c9d21c75..03f40932 100644 --- a/src/arena.c +++ b/src/arena.c @@ -176,6 +176,17 @@ static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* slice_index, size_t* s return mi_arena_from_memid(page->memid, slice_index, slice_count); } +static size_t mi_memid_size(mi_memid_t memid) { + if (memid.memkind == MI_MEM_ARENA) { + return memid.mem.arena.slice_count * MI_ARENA_SLICE_SIZE; + } + else if (mi_memid_is_os(memid) || memid.memkind == MI_MEM_EXTERNAL) { + return memid.mem.os.size; + } + else { + return 0; + } +} /* ----------------------------------------------------------- Arena Allocation @@ -727,7 +738,7 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block return page; } - +static void mi_arena_free(void* p, size_t size, mi_memid_t memid); 
void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); @@ -754,7 +765,7 @@ void _mi_arena_page_free(mi_page_t* page) { #endif _mi_page_map_unregister(page); - _mi_arena_free(page, 1, 1, page->memid); + mi_arena_free(page, mi_memid_size(page->memid), page->memid); } /* ----------------------------------------------------------- @@ -843,7 +854,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); } else { - // page is full (or a singleton), page is OS/externally allocated + // page is full (or a singleton), page is OS/nly allocated // nothing to do // TODO: maintain count of these as well? } @@ -863,22 +874,16 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); static void mi_arenas_try_purge(bool force, bool visit_all); -void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid) { - mi_assert_internal(size > 0); - mi_assert_internal(committed_size <= size); +static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { + mi_assert_internal(size >= 0); if (p==NULL) return; if (size==0) return; - const bool all_committed = (committed_size == size); - + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) mi_track_mem_undefined(p, size); if (mi_memkind_is_os(memid.memkind)) { // was a direct OS allocation, pass through - if (!all_committed && committed_size > 0) { - // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } _mi_os_free(p, size, memid); } else if (memid.memkind == MI_MEM_ARENA) { @@ -886,7 +891,8 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi size_t slice_count; size_t slice_index; mi_arena_t* arena = mi_arena_from_memid(memid, &slice_index, &slice_count); - mi_assert_internal(size==1); + mi_assert_internal((size%MI_ARENA_SLICE_SIZE)==0); + mi_assert_internal((slice_count*MI_ARENA_SLICE_SIZE)==size); mi_assert_internal(mi_arena_slice_start(arena,slice_index) <= (uint8_t*)p); mi_assert_internal(mi_arena_slice_start(arena,slice_index) + mi_size_of_slices(slice_count) > (uint8_t*)p); // checks @@ -902,25 +908,7 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi } // potentially decommit - if (arena->memid.is_pinned || arena->memid.initially_committed) { - mi_assert_internal(all_committed); - } - else { - /* - if (!all_committed) { - // mark the entire range as no longer committed (so we recommit the full range when re-using) - mi_bitmap_clearN(&arena->slices_committed, slice_index, slice_count); - mi_track_mem_noaccess(p, size); - if (committed_size > 0) { - // if partially committed, adjust the committed stats (is it will be recommitted when re-using) - // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. - _mi_stat_decrease(&_mi_stats_main.committed, committed_size); - } - // note: if not all committed, it may be that the purge will reset/decommit the entire range - // that contains already decommitted parts. Since purge consistently uses reset or decommit that - // works (as we should never reset decommitted parts). - } - */ + if (!arena->memid.is_pinned && !arena->memid.initially_committed) { // todo: maybe allow decommit even if initially committed? 
// (delay) purge the entire range mi_arena_schedule_purge(arena, slice_index, slice_count); } @@ -944,6 +932,29 @@ void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memi mi_arenas_try_purge(false, false); } +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */); +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_arena_get_count(); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p) { + return true; + } + } + return false; +} + + + +/* ----------------------------------------------------------- + Remove an arena. +----------------------------------------------------------- */ + // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. static void mi_arenas_unsafe_destroy(void) { @@ -953,8 +964,8 @@ static void mi_arenas_unsafe_destroy(void) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena != NULL) { // mi_lock_done(&arena->abandoned_visit_lock); - if (mi_memkind_is_os(arena->memid.memkind)) { - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + if (mi_memkind_is_os(arena->memid.memkind)) { _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); } } @@ -965,10 +976,6 @@ static void mi_arenas_unsafe_destroy(void) { mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); } -// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */); -} // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. @@ -977,18 +984,6 @@ void _mi_arena_unsafe_destroy_all(void) { _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } -// Is a pointer inside any of our arenas? -bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_arena_get_count(); - for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); - if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) > (const uint8_t*)p) { - return true; - } - } - return false; -} - /* ----------------------------------------------------------- Add an arena. 
@@ -999,7 +994,26 @@ static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* mi_assert_internal(arena->slice_count > 0); if (arena_id != NULL) { *arena_id = -1; } - size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + // first try to find a NULL entry + const size_t count = mi_arena_get_count(); + size_t i; + for (i = 0; i < count; i++) { + if (mi_arena_from_index(i) == NULL) { + arena->id = mi_arena_id_create(i); + mi_arena_t* expected = NULL; + if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &mi_arenas[i], &expected, arena)) { + // success + if (arena_id != NULL) { *arena_id = arena->id; } + return true; + } + else { + arena->id = _mi_arena_id_none(); + } + } + } + + // otherwise increase the max + i = mi_atomic_increment_acq_rel(&mi_arena_count); if (i >= MI_MAX_ARENAS) { mi_atomic_decrement_acq_rel(&mi_arena_count); return false; @@ -1076,7 +1090,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // init arena->id = _mi_arena_id_none(); arena->memid = memid; - arena->is_exclusive = exclusive; + arena->is_exclusive = exclusive; arena->slice_count = slice_count; arena->info_slices = info_slices; arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) @@ -1116,6 +1130,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.mem.os.base = start; + memid.mem.os.size = size; memid.initially_committed = is_committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; @@ -1370,74 +1386,86 @@ static void mi_arenas_try_purge(bool force, bool visit_all) { } -/* ----------------------------------------------------------- - Special static area for mimalloc internal structures - to avoid OS calls (for example, for the subproc metadata (~= 721b)) ------------------------------------------------------------ */ - -#define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit - -static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 -static mi_decl_cache_align _Atomic(size_t)mi_arena_static_top; - -static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { - *memid = _mi_memid_none(); - if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; - const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); - if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; - - // try to claim space - if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } - const size_t oversize = size + alignment - 1; - if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; - const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); - size_t top = oldtop + oversize; - if (top > MI_ARENA_STATIC_MAX) { - // try to roll back, ok if this fails - mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); - return NULL; - } - - // success - *memid = _mi_memid_create(MI_MEM_STATIC); - memid->initially_zero = true; - const size_t start = _mi_align_up(oldtop, alignment); - uint8_t* const p = &mi_arena_static[start]; - _mi_memzero_aligned(p, size); - return p; -} - -void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { - *memid = _mi_memid_none(); - - // try static - void* p = 
mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); - if (p != NULL) return p; - - // or fall back to the OS - p = _mi_os_alloc(size, memid); - if (p == NULL) return NULL; - - // zero the OS memory if needed - if (!memid->initially_zero) { - _mi_memzero_aligned(p, size); - memid->initially_zero = true; - } - return p; -} - -void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { - if (mi_memkind_is_os(memid.memkind)) { - _mi_os_free(p, size, memid); - } - else { - mi_assert(memid.memkind == MI_MEM_STATIC); - } -} - bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); _mi_error_message(EINVAL, "implement mi_abandoned_visit_blocks\n"); return false; } + +/* ----------------------------------------------------------- + Unloading and reloading an arena. +----------------------------------------------------------- */ + +mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* full_size) { + const size_t count = mi_arena_get_count(); + const size_t arena_idx = mi_arena_id_index(arena_id); + if (count <= arena_idx) { + _mi_warning_message("arena id is invalid (%zu)\n", arena_id); + return false; + } + mi_arena_t* arena = mi_arena_from_id(arena_id); + if (arena==NULL) { + return false; + } + else if (!arena->is_exclusive) { + _mi_warning_message("cannot unload a non-exclusive arena (id %zu at %p)\n", arena_id, arena); + return false; + } + else if (arena->memid.memkind != MI_MEM_EXTERNAL) { + _mi_warning_message("can only unload managed arena's for external memory (id %zu at %p)\n", arena_id, arena); + return false; + } + if (base != NULL) { *base = (void*)arena; } + if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } + if (accessed_size != NULL) { + // scan the commit map for the highest entry + size_t idx; + if (mi_bitmap_bsr(arena->slices_committed, &idx)) { + *accessed_size = (idx + 1)* MI_ARENA_SLICE_SIZE; + } + else { + *accessed_size = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; + } + } + + // set the entry to NULL + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[arena_idx], NULL); + if (arena_idx + 1 == count) { // try adjust the count? 
+ size_t expected = count; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, count-1); + } + return true; +} + +bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id) { + // assume the memory area is already containing the arena + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + if (start == NULL || size == 0) return false; + mi_arena_t* arena = (mi_arena_t*)start; + mi_memid_t memid = arena->memid; + if (memid.memkind != MI_MEM_EXTERNAL) { + _mi_warning_message("can only reload arena's from external memory (%p)\n", arena); + return false; + } + if (memid.mem.os.base != start) { + _mi_warning_message("the reloaded arena base address differs from the external memory (arena: %p, external: %p)\n", arena, start); + return false; + } + if (memid.mem.os.size != size) { + _mi_warning_message("the reloaded arena size differs from the external memory (arena size: %zu, external size: %zu)\n", arena->memid.mem.os.size, size); + return false; + } + if (!arena->is_exclusive) { + _mi_warning_message("the reloaded arena is not exclusive\n"); + return false; + } + arena->memid.is_pinned = is_large; + arena->memid.initially_committed = is_committed; + arena->memid.initially_zero = is_zero; + arena->is_exclusive = true; + arena->is_large = is_large; + arena->id = _mi_arena_id_none(); + return mi_arena_add(arena, arena_id, &_mi_stats_main); +} + diff --git a/src/bitmap.c b/src/bitmap.c index 2f563066..d16a1b24 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -796,6 +796,20 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #endif } + +static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { + for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { + i--; + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + size_t idx; + if (mi_bsr(b, &idx)) { + *pidx = (i*MI_BFIELD_BITS) + idx; + return true; + } + } + return false; +} + /* -------------------------------------------------------------------------------- bitmap chunkmap -------------------------------------------------------------------------------- */ @@ -1154,6 +1168,27 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t return false; } + +bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx) { + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = chunkmap_max; i > 0; ) { + i--; + mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + if (mi_bsr(cmap,&cmap_idx)) { + // highest chunk + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + size_t cidx; + if (mi_bchunk_bsr(&bitmap->chunks[chunk_idx], &cidx)) { + *idx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + return true; + } + } + } + return false; +} + + // Clear a bit once it is set. void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); diff --git a/src/bitmap.h b/src/bitmap.h index 191b6864..71a016ee 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -206,4 +206,10 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t // allocated from `mi_arena_try_abandoned` (and is in the `claim` function of `mi_bitmap_try_find_and_claim`). void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); + +// If a bit is set in the bitmap, return `true` and set `idx` to its index. +// Otherwise return `false` (and `*idx` is undefined). 
+bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); + + #endif // MI_BITMAP_H diff --git a/src/os.c b/src/os.c index 55f7428e..9fcd5aed 100644 --- a/src/os.c +++ b/src/os.c @@ -290,7 +290,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); alignment = _mi_align_up(alignment, _mi_os_page_size()); - + bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; @@ -671,3 +671,48 @@ int _mi_os_numa_node_get(void) { if (numa_node >= numa_count) { numa_node = numa_node % numa_count; } return (int)numa_node; } + + +/* ---------------------------------------------------------------------------- + Public API +-----------------------------------------------------------------------------*/ + +mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size) { + return mi_os_alloc_aligned(size, mi_os_mem_config.alloc_granularity, commit, NULL, full_size); +} + +static void* mi_os_alloc_aligned_ex(size_t size, size_t alignment, bool commit, bool allow_large, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) { + mi_memid_t memid = _mi_memid_none(); + void* p = _mi_os_alloc_aligned(size, alignment, commit, allow_large, &memid); + if (p == NULL) return p; + if (is_committed != NULL) { *is_committed = memid.initially_committed; } + if (is_pinned != NULL) { *is_pinned = memid.is_pinned; } + if (base != NULL) { *base = memid.mem.os.base; } + if (full_size != NULL) { *full_size = memid.mem.os.size; } + if (!memid.initially_zero && memid.initially_committed) { + _mi_memzero_aligned(memid.mem.os.base, memid.mem.os.size); + } + return p; +} + +mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size) { + return mi_os_alloc_aligned_ex(size, alignment, commit, false, NULL, NULL, base, full_size); +} + +mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size) { + return mi_os_alloc_aligned_ex(size, alignment, commit, true, is_committed, is_pinned, base, full_size); +} + +mi_decl_export void mi_os_free(void* p, size_t size) { + if (p==NULL || size == 0) return; + mi_memid_t memid = _mi_memid_create_os(p, size, true, false, false); + _mi_os_free(p, size, memid); +} + +mi_decl_export void mi_os_commit(void* p, size_t size) { + _mi_os_commit(p, size, NULL); +} + +mi_decl_export void mi_os_decommit(void* p, size_t size) { + _mi_os_decommit(p, size); +} From 94ce342ea9114c368e88eb96a485a1911d9ce5af Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 11 Dec 2024 22:06:25 -0800 Subject: [PATCH 075/264] maintain pages set for arenas; improve arena load/unload --- include/mimalloc/internal.h | 9 +++-- src/arena.c | 68 +++++++++++++++++++++++++++++-------- src/bitmap.c | 23 ++++++++++++- src/bitmap.h | 5 +++ src/os.c | 3 +- src/page-map.c | 11 ++++-- 6 files changed, 97 insertions(+), 22 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 3be08b94..ee7f1026 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -164,6 +164,7 @@ void _mi_meta_free(void* p, size_t size, mi_memid_t memid); bool _mi_page_map_init(void); void _mi_page_map_register(mi_page_t* page); void _mi_page_map_unregister(mi_page_t* page); +void _mi_page_map_unregister_range(void* start, size_t size); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t 
huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -437,14 +438,18 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si extern uint8_t* _mi_page_map; +static inline uintptr_t _mi_page_map_index(const void* p) { + return (((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT); +} + static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) { #if 1 - const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + const uintptr_t idx = _mi_page_map_index(p); const size_t ofs = _mi_page_map[idx]; if (valid != NULL) *valid = (ofs != 0); return (mi_page_t*)((idx - ofs + 1) << MI_ARENA_SLICE_SHIFT); #else - const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT; + const uintptr_t idx = _mi_page_map_index(p); const uintptr_t up = idx << MI_ARENA_SLICE_SHIFT; __builtin_prefetch((void*)up); const size_t ofs = _mi_page_map[idx]; diff --git a/src/arena.c b/src/arena.c index 03f40932..4f89a629 100644 --- a/src/arena.c +++ b/src/arena.c @@ -48,6 +48,7 @@ typedef struct mi_arena_s { mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t* slices_purge; // can the slice be purged? (slice in purge => slice in free) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? + mi_bitmap_t* pages; // all registered pages mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages // followed by the bitmaps (whose size depends on the arena size) @@ -117,7 +118,13 @@ static size_t mi_arena_info_slices(mi_arena_t* arena) { return arena->info_slices; } - +#if MI_DEBUG > 1 +static bool mi_arena_has_page(mi_arena_t* arena, mi_page_t* page) { + return (page->memid.memkind == MI_MEM_ARENA && + page->memid.mem.arena.arena == arena && + mi_bitmap_is_setN(arena->pages, page->memid.mem.arena.slice_index, 1)); +} +#endif /* ----------------------------------------------------------- Util @@ -551,10 +558,11 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mi_arena_has_page(arena,page)); mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); - + _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); @@ -588,6 +596,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, tld->tseq, &memid); + if (page != NULL) { + mi_assert_internal(mi_bitmap_is_clearN(memid.mem.arena.arena->pages, memid.mem.arena.slice_index, memid.mem.arena.slice_count)); + mi_bitmap_set(memid.mem.arena.arena->pages, memid.mem.arena.slice_index); + } } // otherwise fall back to the OS @@ -758,6 +770,7 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); 
mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); + mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may // be (temporarily) not true if the free happens while trying to reclaim // see `mi_arana_try_claim_abandoned` @@ -765,6 +778,9 @@ void _mi_arena_page_free(mi_page_t* page) { #endif _mi_page_map_unregister(page); + if (page->memid.memkind == MI_MEM_ARENA) { + mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); + } mi_arena_free(page, mi_memid_size(page->memid), page->memid); } @@ -1104,6 +1120,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); arena->slices_purge = mi_arena_bitmap_init(slice_count,&base); + arena->pages = mi_arena_bitmap_init(slice_count, &base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { arena->pages_abandoned[i] = mi_arena_bitmap_init(slice_count,&base); } @@ -1396,6 +1413,18 @@ bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool vi /* ----------------------------------------------------------- Unloading and reloading an arena. ----------------------------------------------------------- */ +static bool mi_arena_page_register(size_t slice_index, mi_arena_t* arena, void* arg) { + MI_UNUSED(arg); + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); + _mi_page_map_register(page); + mi_assert_internal(_mi_ptr_page(page)==page); + return true; +} + +static bool mi_arena_pages_reregister(mi_arena_t* arena) { + return _mi_bitmap_forall_set(arena->pages, &mi_arena_page_register, arena, NULL); +} mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* full_size) { const size_t count = mi_arena_get_count(); @@ -1416,18 +1445,23 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* _mi_warning_message("can only unload managed arena's for external memory (id %zu at %p)\n", arena_id, arena); return false; } - if (base != NULL) { *base = (void*)arena; } - if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } - if (accessed_size != NULL) { - // scan the commit map for the highest entry - size_t idx; - if (mi_bitmap_bsr(arena->slices_committed, &idx)) { - *accessed_size = (idx + 1)* MI_ARENA_SLICE_SIZE; - } - else { - *accessed_size = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; - } + + // find accessed size + size_t asize; + // scan the commit map for the highest entry + size_t idx; + if (mi_bitmap_bsr(arena->slices_committed, &idx)) { + asize = (idx + 1)* MI_ARENA_SLICE_SIZE; } + else { + asize = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; + } + if (base != NULL) { *base = (void*)arena; } + if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } + if (accessed_size != NULL) { *accessed_size = asize; } + + // unregister the pages + _mi_page_map_unregister_range(arena, asize); // set the entry to NULL mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[arena_idx], NULL); @@ -1438,7 +1472,7 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* return true; } -bool mi_arena_reload(void* start, size_t size, bool 
is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id) { +mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id) { // assume the memory area is already containing the arena if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } if (start == NULL || size == 0) return false; @@ -1466,6 +1500,10 @@ bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, arena->is_exclusive = true; arena->is_large = is_large; arena->id = _mi_arena_id_none(); - return mi_arena_add(arena, arena_id, &_mi_stats_main); + if (!mi_arena_add(arena, arena_id, &_mi_stats_main)) { + return false; + } + mi_arena_pages_reregister(arena); + return true; } diff --git a/src/bitmap.c b/src/bitmap.c index d16a1b24..f1b1a759 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1051,7 +1051,6 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n #define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ - MI_UNUSED(tseq); \ const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \ const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? */ \ const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \ @@ -1197,3 +1196,25 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); mi_bchunk_clear_once_set(&bitmap->chunks[chunk_idx], cidx); } + + +// Visit all set bits in a bitmap. +// todo: optimize further? maybe popcount to help the branch predictor for the loop, +// and keep b constant (using a mask)? or avx512 to directly get all indices using a mask_compressstore? 
+bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { + mi_bitmap_forall_chunks(bitmap, 0, chunk_idx) { + mi_bchunk_t* chunk = &bitmap->chunks[chunk_idx]; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + size_t bidx; + while (mi_bsf(b, &bidx)) { + b = b & (b-1); // clear low bit + const size_t idx = base_idx + bidx; + if (!visit(idx, arena, arg)) return false; + } + } + } + mi_bitmap_forall_chunks_end(); + return true; +} diff --git a/src/bitmap.h b/src/bitmap.h index 71a016ee..7fd09f43 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -212,4 +212,9 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); +typedef bool (mi_forall_set_fun_t)(size_t slice_index, mi_arena_t* arena, void* arg2); + +// Visit all set bits in a bitmap +bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); + #endif // MI_BITMAP_H diff --git a/src/os.c b/src/os.c index 9fcd5aed..86ecb16b 100644 --- a/src/os.c +++ b/src/os.c @@ -676,7 +676,7 @@ int _mi_os_numa_node_get(void) { /* ---------------------------------------------------------------------------- Public API -----------------------------------------------------------------------------*/ - +#if 0 mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size) { return mi_os_alloc_aligned(size, mi_os_mem_config.alloc_granularity, commit, NULL, full_size); } @@ -716,3 +716,4 @@ mi_decl_export void mi_os_commit(void* p, size_t size) { mi_decl_export void mi_os_decommit(void* p, size_t size) { _mi_os_decommit(p, size); } +#endif diff --git a/src/page-map.c b/src/page-map.c index 181db7f0..7b74c711 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -15,6 +15,7 @@ static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; static void* mi_page_map_max_address = NULL; static mi_memid_t mi_page_map_memid; + // (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; @@ -84,7 +85,7 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* *page_start = mi_page_area(page, &page_size); if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks - return ((uintptr_t)page >> MI_ARENA_SLICE_SHIFT); + return _mi_page_map_index(page); } @@ -113,16 +114,20 @@ void _mi_page_map_register(mi_page_t* page) { void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); - // get index and count uint8_t* page_start; size_t slice_count; const size_t idx = mi_page_map_get_idx(page, &page_start, &slice_count); - // unset the offsets _mi_memzero(_mi_page_map + idx, slice_count); } +void _mi_page_map_unregister_range(void* start, size_t size) { + const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); + const uintptr_t index = _mi_page_map_index(start); + mi_page_map_ensure_committed(index, slice_count); // we commit the range in total; todo: 
scan the commit bits and clear only those ranges? + _mi_memzero(&_mi_page_map[index], slice_count); +} mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { // if mi_unlikely(_mi_page_map==NULL) { // happens on macOS during loading From 118bd8c97f9b7d189d89b671b49a66f58ded2ad6 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 16:37:31 -0800 Subject: [PATCH 076/264] space out threads when searching for free pages --- CMakeLists.txt | 13 ++- src/arena.c | 19 ++-- src/bitmap.c | 236 ++++++++++++++++++++++++++++++++++--------------- 3 files changed, 183 insertions(+), 85 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index e96ff089..fa35d749 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -15,6 +15,7 @@ option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a smal option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON) +option(MI_OPT_SIMD "Use SIMD instructions (requires MI_OPT_ARCH to be enabled)" OFF) option(MI_SEE_ASM "Generate assembly files" OFF) option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON) option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) @@ -227,7 +228,7 @@ endif() if(MI_SEE_ASM) message(STATUS "Generate assembly listings (MI_SEE_ASM=ON)") list(APPEND mi_cflags -save-temps) - if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang") + if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang" AND CMAKE_C_COMPILER_VERSION VERSION_GREATER 14) message(STATUS "No GNU Line marker") list(APPEND mi_cflags -Wno-gnu-line-marker) endif() @@ -330,10 +331,10 @@ endif() # Determine architecture set(MI_OPT_ARCH_FLAGS "") set(MI_ARCH "") -if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR +if(CMAKE_SYSTEM_PROCESSOR STREQUAL "x86_64" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64") # msvc set(MI_ARCH "x64") -elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR +elseif(CMAKE_SYSTEM_PROCESSOR STREQUAL "aarch64" OR CMAKE_SYSTEM_PROCESSOR STREQUAL "arm64" OR # apple CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64") # msvc set(MI_ARCH "arm64") @@ -419,6 +420,12 @@ endif() if(MI_OPT_ARCH_FLAGS) list(APPEND mi_cflags ${MI_OPT_ARCH_FLAGS}) message(STATUS "Architecture specific optimization is enabled (with ${MI_OPT_ARCH_FLAGS}) (MI_OPT_ARCH=ON)") + if (MI_OPT_SIMD) + list(APPEND mi_defines "MI_OPT_SIMD=1") + message(STATUS "SIMD instructions are enabled (MI_OPT_SIMD=ON)") + endif() +elseif(MI_OPT_SIMD) + message(STATUS "SIMD instructions are not enabled (either MI_OPT_ARCH=OFF or this architecture has no SIMD support)") endif() # extra needed libraries diff --git a/src/arena.c b/src/arena.c index 4f89a629..32c0b32e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -36,7 +36,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area mi_arena_id_t id; // arena id (> 0 where `arena == arenas[arena->id - 1]`) - + size_t slice_count; // size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node @@ -165,7 +165,7 @@ static mi_memid_t mi_memid_create_arena(mi_arena_t* arena, size_t slice_index, s 
mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); memid.mem.arena.arena = arena; memid.mem.arena.slice_index = (uint32_t)slice_index; - memid.mem.arena.slice_count = (uint32_t)slice_count; + memid.mem.arena.slice_count = (uint32_t)slice_count; return memid; } @@ -562,7 +562,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); - + _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); @@ -770,7 +770,7 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); - mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); + mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may // be (temporarily) not true if the free happens while trying to reclaim // see `mi_arana_try_claim_abandoned` @@ -891,10 +891,9 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ static void mi_arenas_try_purge(bool force, bool visit_all); static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { - mi_assert_internal(size >= 0); if (p==NULL) return; if (size==0) return; - + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) mi_track_mem_undefined(p, size); @@ -981,7 +980,7 @@ static void mi_arenas_unsafe_destroy(void) { if (arena != NULL) { // mi_lock_done(&arena->abandoned_visit_lock); mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); - if (mi_memkind_is_os(arena->memid.memkind)) { + if (mi_memkind_is_os(arena->memid.memkind)) { _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); } } @@ -1457,12 +1456,12 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* asize = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; } if (base != NULL) { *base = (void*)arena; } - if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } + if (full_size != NULL) { *full_size = arena->memid.mem.os.size; } if (accessed_size != NULL) { *accessed_size = asize; } - // unregister the pages + // unregister the pages _mi_page_map_unregister_range(arena, asize); - + // set the entry to NULL mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[arena_idx], NULL); if (arena_idx + 1 == count) { // try adjust the count? 
diff --git a/src/bitmap.c b/src/bitmap.c index f1b1a759..4f21f68f 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -14,7 +14,9 @@ Concurrent bitmap that can set/reset sequences of bits atomically #include "mimalloc/bits.h" #include "bitmap.h" -#define MI_USE_SIMD 0 +#ifndef MI_OPT_SIMD +#define MI_OPT_SIMD 0 +#endif /* -------------------------------------------------------------------------------- bfields @@ -24,11 +26,15 @@ static inline size_t mi_bfield_ctz(mi_bfield_t x) { return mi_ctz(x); } - static inline size_t mi_bfield_popcount(mi_bfield_t x) { return mi_popcount(x); } +static inline mi_bfield_t mi_bfield_clear_least_bit(mi_bfield_t x) { + return (x & (x-1)); +} + + // find the least significant bit that is set (i.e. count trailing zero's) // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -156,16 +162,6 @@ static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b) { // ------- mi_bfield_atomic_try_xset --------------------------------------- -// Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0. -// `all_clear` is set to true if the new bfield is zero (and false otherwise) -static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { - mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = mi_bfield_one()<bfields); const __m256i vcmp = _mm256_cmpeq_epi64(vec, mi_mm256_zero()); // (elem64 == 0 ? 0xFF : 0) @@ -502,10 +509,10 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; // try again } - #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { size_t chunk_idx = 0; #if 0 @@ -534,9 +541,9 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx // mask is inverted, so each 8-bits is 0xFF iff the corresponding elem64 has a bit set (and thus can be cleared) if (mask==0) return false; mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
- chunk_idx = _tzcnt_u64(mask) / 8; + chunk_idx = mi_ctz(mask) / 8; #endif - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; // try again } #else @@ -551,12 +558,17 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx #endif } +static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + mi_assert_internal(n==1); MI_UNUSED(n); + return mi_bchunk_try_find_and_clear(chunk, pidx); +} +#if !MI_OPT_SIMD static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); if (!allow_all_set && (~b == 0)) return false; // has_set8 has low bit in each byte set if the byte in x == 0xFF - const mi_bfield_t has_set8 = + const mi_bfield_t has_set8 = ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F (b & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 >> 7; // shift high bit to low bit @@ -573,13 +585,14 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c } return false; } +#endif // find least byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find medium size pages in the free blocks. // todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, size_t* pidx) { - #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -615,6 +628,10 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s #endif } +static inline bool mi_bchunk_try_find_and_clear_8(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + mi_assert_internal(n==8); MI_UNUSED(n); + return mi_bchunk_try_find_and_clear8(chunk, pidx); +} // find least bfield in a chunk with all bits set, and try unset it atomically @@ -622,7 +639,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // Used to find large size pages in the free blocks. // todo: try neon version static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { - #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); @@ -658,6 +675,10 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, #endif } +static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, size_t* pidx) { + mi_assert_internal(n==MI_BFIELD_BITS); MI_UNUSED(n); + return mi_bchunk_try_find_and_clearX(chunk, pidx); +} // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, // and try to clear them atomically. @@ -783,10 +804,10 @@ static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { // are all bits in a bitmap chunk clear? 
static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { - #if MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) + #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); return mi_mm256_is_zero(vec); - #elif MI_USE_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) // a 64b cache-line contains the entire chunk anyway so load both at once const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); @@ -835,7 +856,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_clear(&bitmap->chunkmap, chunk_idx, NULL); // .. but a concurrent set may have happened in between our all-clear test and the clearing of the // bit in the mask. We check again to catch this situation. - if (!mi_bchunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) { mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } @@ -1043,11 +1064,129 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n +/* -------------------------------------------------------------------------------- + Iterate through a bfield +-------------------------------------------------------------------------------- */ + +// Cycle iteration through a bitfield. This is used to space out threads +// so there is less chance of contention. When searching for a free page we +// like to first search only the accessed part (so we reuse better). This +// high point is called the `cycle`. +// +// We then iterate through the bitfield as: +// first: [start, cycle> +// then : [0, start> +// then : [cycle, MI_BFIELD_BITS> +// +// The start is determined usually as `tseq % cycle` to have each thread +// start at a different spot. +// - We use `popcount` to improve branch prediction` +// - The `cycle_mask` is the part `[start, cycle>`. 
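// [Editor's illustration, not part of the patch] A concrete example of the visiting
// order described above: take MI_BFIELD_BITS==8, cycle==6 and tseq==2 (so start==2).
// For a bfield with all 8 bits set, the iteration visits
//   bits 2,3,4,5  -- the [start,cycle) part first,
//   then 0,1      -- the [0,start) part,
//   then 6,7      -- the [cycle,MI_BFIELD_BITS) tail,
// i.e. each thread starts probing the already-accessed range at its own offset,
// which spaces threads out and reduces contention on the same chunks.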
+#define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \ + mi_assert_internal(start <= cycle); \ + mi_assert_internal(start < MI_BFIELD_BITS); \ + mi_assert_internal(cycle < MI_BFIELD_BITS); \ + mi_bfield_t _cycle_mask##SUF = mi_bfield_mask(cycle - start, start); \ + size_t _bcount##SUF = mi_bfield_popcount(bfield); \ + mi_bfield_t _b##SUF = bfield & _cycle_mask##SUF; /* process [start, cycle> first*/\ + while(_bcount##SUF > 0) { \ + _bcount##SUF--;\ + if (_b##SUF==0) { _b##SUF = bfield & ~_cycle_mask##SUF; } /* process [0,start> + [cycle, MI_BFIELD_BITS> next */ \ + size_t name_idx; \ + bool _found##SUF = mi_bfield_find_least_bit(_b##SUF,&name_idx); \ + mi_assert_internal(_found##SUF); MI_UNUSED(_found##SUF); \ + { \ + +#define mi_bfield_iterate_end(SUF) \ + } \ + _b##SUF = mi_bfield_clear_least_bit(_b##SUF); \ + } \ +} + +#define mi_bfield_cycle_iterate(bfield,tseq,cycle,name_idx,SUF) { \ + const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); \ + mi_bfield_iterate(bfield,_start##SUF,cycle,name_idx,SUF) + +#define mi_bfield_cycle_iterate_end(SUF) \ + mi_bfield_iterate_end(SUF); } + + /* -------------------------------------------------------------------------------- bitmap try_find_and_clear (used to find free pages) -------------------------------------------------------------------------------- */ + +typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); + +static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) +{ + // we space out threads to reduce contention + const size_t cmap_max_count = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); + const size_t chunk_acc = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); + const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS; + const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS); + + // create a mask over the chunkmap entries to iterate over them efficiently + mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); + const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); + const size_t cmap_cycle = cmap_acc+1; + mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) + { + // and for each chunkmap entry we iterate over its bits to find the chunks + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[cmap_idx]); + size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); + mi_bfield_cycle_iterate(cmap_entry, tseq, cmap_entry_cycle, eidx, Y) + { + mi_assert_internal(eidx <= MI_BFIELD_BITS); + const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + size_t cidx; + // if we find a spot in the chunk we are done + if ((*try_find_and_clear)(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); + return true; + } + else { + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); + } + } + mi_bfield_cycle_iterate_end(Y); + } + mi_bfield_cycle_iterate_end(X); + return false; +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); +} + +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BCHUNK_BITS); + return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); +} + + +/* -------------------------------------------------------------------------------- + bitmap try_find_and_claim + (used to allocate abandoned pages) +-------------------------------------------------------------------------------- */ + #define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ { \ /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ @@ -1084,53 +1223,6 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } \ }} - -#define mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NSUF, NPAR) { \ - mi_bitmap_forall_chunks(bitmap, tseq, _chunk_idx) { \ - size_t _cidx; \ - if mi_likely(mi_bchunk_try_find_and_clear##NSUF(&bitmap->chunks[_chunk_idx] NPAR, &_cidx)) { \ - *pidx = (_chunk_idx * MI_BCHUNK_BITS) + _cidx; \ - return true; \ - } \ - else { \ - /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ \ - mi_bitmap_chunkmap_try_clear(bitmap, _chunk_idx); \ - } \ - } \ - mi_bitmap_forall_chunks_end(); \ - return false; \ -} - -#define COMMA , - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, , ); -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, 8, ); -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, X, ); -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { - mi_assert_internal(n<=MI_BFIELD_BITS); - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, NX, COMMA n); -} - -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { - mi_assert_internal(n<=MI_BCHUNK_BITS); - mi_bitmap_forall_chunks_try_find_and_clear(bitmap, tseq, pidx, N_, COMMA n); -} - - -/* -------------------------------------------------------------------------------- - bitmap try_find_and_claim - (used to allocate abandoned pages) --------------------------------------------------------------------------------- */ - // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, @@ -1177,7 +1269,7 @@ bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx) { if (mi_bsr(cmap,&cmap_idx)) { // highest chunk const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; - size_t cidx; + size_t cidx; if (mi_bchunk_bsr(&bitmap->chunks[chunk_idx], &cidx)) { *idx = (chunk_idx * MI_BCHUNK_BITS) + cidx; return true; From 98879ac8bcf545f03ec750c1172a60577f67aa63 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 17:22:00 -0800 Subject: [PATCH 077/264] use thread spacing for reclaim as well --- src/bitmap.c | 197 +++++++++++++++++++++++++++------------------------ 1 file changed, 104 insertions(+), 93 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 4f21f68f..0588858d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -42,6 +42,13 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } +// find each set bit in a bit field `x` until it becomes zero. +static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { + const bool found = mi_bfield_find_least_bit(*x, idx); + *x = mi_bfield_clear_least_bit(*x); + return found; +} + //static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { // return mi_rotr(x,r); //} @@ -1080,7 +1087,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n // // The start is determined usually as `tseq % cycle` to have each thread // start at a different spot. -// - We use `popcount` to improve branch prediction` +// - We use `popcount` to improve branch prediction (maybe not needed? can we simplify?) // - The `cycle_mask` is the part `[start, cycle>`. 
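 // - Example (illustrative, assuming MI_BFIELD_BITS==64): with start==5 and cycle==12, the set bits
 //   in positions 5..11 are visited first (in ascending order), followed by the remaining set bits
 //   in positions 0..4 and 12..63.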
#define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \ mi_assert_internal(start <= cycle); \ @@ -1112,14 +1119,15 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n /* -------------------------------------------------------------------------------- - bitmap try_find_and_clear + mi_bitmap_find (used to find free pages) -------------------------------------------------------------------------------- */ +typedef bool (mi_bitmap_visit_fun_t)(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* idx, void* arg1, void* arg2); -typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); - -static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) +// Go through the bitmap and for every sequence of `n` set bits, call the visitor function. +// If it returns `true` stop the search. +static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bitmap_visit_fun_t* on_find, void* arg1, void* arg2) { // we space out threads to reduce contention const size_t cmap_max_count = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); @@ -1141,17 +1149,9 @@ static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, siz mi_assert_internal(eidx <= MI_BFIELD_BITS); const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - size_t cidx; - // if we find a spot in the chunk we are done - if ((*try_find_and_clear)(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); + if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) { return true; } - else { - /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - } } mi_bfield_cycle_iterate_end(Y); } @@ -1159,6 +1159,36 @@ static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, siz return false; } + +/* -------------------------------------------------------------------------------- + mi_bitmap_try_find_and_clear -- used to find free pages + note: the compiler will fully inline the indirect function calls +-------------------------------------------------------------------------------- */ + + +typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); + +static bool mi_bitmap_try_find_and_clear_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) { + MI_UNUSED(arg2); + mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear = (mi_bchunk_try_find_and_clear_fun_t*)arg1; + size_t cidx; + // if we find a spot in the chunk we are done + if ((*try_find_and_clear)(&bitmap->chunks[chunk_idx], n, &cidx)) { + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); + return true; + } + else { + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. 
*/ + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); + return false; + } +} + +static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) { + return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); +} + mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); } @@ -1183,80 +1213,55 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_ /* -------------------------------------------------------------------------------- - bitmap try_find_and_claim - (used to allocate abandoned pages) + Bitmap: try_find_and_claim -- used to allocate abandoned pages + note: the compiler will fully inline the indirect function call -------------------------------------------------------------------------------- */ -#define mi_bitmap_forall_chunks(bitmap, tseq, name_chunk_idx) \ - { \ - /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \ - const size_t chunk_max_acc = 1 + mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); \ - const size_t chunk_start = tseq % chunk_max_acc; /* space out threads? */ \ - const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); \ - const size_t chunkmap_max_acc = _mi_divide_up(chunk_max_acc,MI_BFIELD_BITS); \ - const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \ - /* for each chunkmap entry `i` */ \ - for (size_t _i = 0; _i < chunkmap_max; _i++) { \ - size_t i; \ - if (_i < chunkmap_max_acc) { /* first the chunks up to chunk_max_accessed */ \ - i = _i + chunkmap_start; \ - if (i >= chunkmap_max_acc) { i -= chunkmap_max_acc; } /* rotate */ \ - } \ - else { i = _i; } /* the rest of the chunks above chunk_max_accessed */ \ - const size_t chunk_idx0 = i*MI_BFIELD_BITS; \ - mi_bfield_t cmap = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); \ - /* todo: space out threads within a chunkmap (2GiB) as well? */ \ - size_t cmap_idx_shift = 0; /* shift through the cmap */ \ - size_t cmap_idx; \ - while (mi_bfield_find_least_bit(cmap, &cmap_idx)) { \ - /* set the chunk idx */ \ - size_t name_chunk_idx = chunk_idx0 + ((cmap_idx + cmap_idx_shift) % MI_BFIELD_BITS); \ - /* try to find and clear N bits in that chunk */ \ - { +typedef struct mi_claim_fun_data_s { + mi_arena_t* arena; + mi_subproc_t* subproc; + int heap_tag; +} mi_claim_fun_data_t; -#define mi_bitmap_forall_chunks_end() \ - } \ - /* skip to the next bit */ \ - cmap_idx_shift += cmap_idx+1; \ - cmap >>= cmap_idx; /* skip scanned bits (and avoid UB for `cmap_idx+1`) */ \ - cmap >>= 1; \ - } \ - }} +static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) +{ + mi_assert_internal(n==1); MI_UNUSED(n); + mi_claim_fun_t* claim_fun = (mi_claim_fun_t*)arg1; + mi_claim_fun_data_t* claim_data = (mi_claim_fun_data_t*)arg2; + size_t cidx; + if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) { + const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); + bool keep_set = true; + if ((*claim_fun)(slice_index, claim_data->arena, claim_data->subproc, claim_data->heap_tag, &keep_set)) { + // success! 
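+        // (a successful claim means the callback took ownership of the page and left
+        //  keep_set==false, so the abandoned bit stays cleared)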
+ mi_assert_internal(!keep_set); + *pidx = slice_index; + return true; + } + else { + // failed to claim it, set abandoned mapping again (unless the page was freed) + if (keep_set) { + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + mi_assert_internal(wasclear); MI_UNUSED(wasclear); + } + } + } + else { + // we may find that all are cleared only on a second iteration but that is ok as + // the chunkmap is a conservative approximation. + mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); + } + return false; +} // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ) + mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) { - mi_bitmap_forall_chunks(bitmap, tseq, chunk_idx) - { - size_t cidx; - if mi_likely(mi_bchunk_try_find_and_clear(&bitmap->chunks[chunk_idx], &cidx)) { - const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); - bool keep_set = true; - if ((*claim)(slice_index, arena, subproc, heap_tag, &keep_set)) { - // success! - mi_assert_internal(!keep_set); - *pidx = slice_index; - return true; - } - else { - // failed to claim it, set abandoned mapping again (unless the page was freed) - if (keep_set) { - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); - mi_assert_internal(wasclear); MI_UNUSED(wasclear); - } - } - } - else { - // we may find that all are cleared only on a second iteration but that is ok as - // the chunkmap is a conservative approximation. - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - } - } - mi_bitmap_forall_chunks_end(); - return false; + mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; + return mi_bitmap_find(bitmap, tseq, 1, pidx, &mi_bitmap_try_find_and_claim_visit, (void*)claim, &claim_data); } @@ -1291,22 +1296,28 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx) { // Visit all set bits in a bitmap. -// todo: optimize further? maybe popcount to help the branch predictor for the loop, -// and keep b constant (using a mask)? or avx512 to directly get all indices using a mask_compressstore? +// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? 
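// Illustrative usage sketch (not part of this patch; `mi_count_visit` and `count` are hypothetical
// names): count all set bits by visiting each one. The visitor matches `mi_forall_set_fun_t` as it
// is defined at this point in the series, and returns true to continue the traversal.
static bool mi_count_visit(size_t slice_index, mi_arena_t* arena, void* arg) {
  MI_UNUSED(slice_index); MI_UNUSED(arena);
  size_t* count = (size_t*)arg;   // the user argument is passed through unchanged
  (*count)++;
  return true;                    // returning false would stop the traversal early
}
// e.g.:  size_t count = 0;  _mi_bitmap_forall_set(bitmap, &mi_count_visit, arena, &count);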
bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { - mi_bitmap_forall_chunks(bitmap, 0, chunk_idx) { - mi_bchunk_t* chunk = &bitmap->chunks[chunk_idx]; - for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { - const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); - mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); - size_t bidx; - while (mi_bsf(b, &bidx)) { - b = b & (b-1); // clear low bit - const size_t idx = base_idx + bidx; - if (!visit(idx, arena, arg)) return false; + // for all chunkmap entries + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for(size_t i = 0; i < chunkmap_max; i++) { + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + // for each chunk (corresponding to a set bit in a chunkmap entry) + while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + // for each chunk field + mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + size_t bidx; + while (mi_bfield_foreach_bit(&b, &bidx)) { + const size_t idx = base_idx + bidx; + if (!visit(idx, arena, arg)) return false; + } } } } - mi_bitmap_forall_chunks_end(); return true; } From df956c4a17cf6fa5e6c0bf73b50079c836ba5a37 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 17:22:41 -0800 Subject: [PATCH 078/264] use thread spacing for reclaim as well --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 0588858d..b84b42a4 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1111,7 +1111,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n } #define mi_bfield_cycle_iterate(bfield,tseq,cycle,name_idx,SUF) { \ - const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); \ + const size_t _start##SUF = (uint32_t)(tseq) % (uint32_t)(cycle); /* or: 0 to always search from the start? 
*/\ mi_bfield_iterate(bfield,_start##SUF,cycle,name_idx,SUF) #define mi_bfield_cycle_iterate_end(SUF) \ From d5c4a16e58d25c54809c65461c3abe9b075a64df Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 17:57:36 -0800 Subject: [PATCH 079/264] lower full page retain more aggressively in a threadpool --- src/heap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/heap.c b/src/heap.c index 1d8142f7..dee404d2 100644 --- a/src/heap.c +++ b/src/heap.c @@ -202,7 +202,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->allow_page_reclaim = false; // and halve the full page retain (possibly to 0) if (heap->full_page_retain >= 0) { - heap->full_page_retain = heap->full_page_retain / 2; + heap->full_page_retain = heap->full_page_retain / 4; } } From 637de624b3a012dbe8417c15bb61112712b71167 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 19:55:45 -0800 Subject: [PATCH 080/264] fix free bug for meta data --- src/arena-meta.c | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index 0fb4dfa5..401231ac 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -145,7 +145,7 @@ void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_bitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); // we zero on free (and on the initial page allocation) so we don't need a "dirty" map _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); - mi_bitmap_clearN(&mpage->blocks_free, block_idx, block_count); + mi_bitmap_setN(&mpage->blocks_free, block_idx, block_count,NULL); } else if (mi_memid_is_os(memid)) { _mi_os_free(p, size, memid); @@ -154,3 +154,14 @@ void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_memid_needs_no_free(memid)); } } + +bool _mi_meta_is_meta_page(void* p) +{ + mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t, &mi_meta_pages); + mi_meta_page_t* mpage = mpage0; + while (mpage != NULL) { + if ((void*)mpage == p) return true; + mpage = mi_meta_page_next(mpage); + } + return false; +} \ No newline at end of file From 623eaedf336fb4372b4de368c7b9292ea81d5f18 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 19:59:54 -0800 Subject: [PATCH 081/264] add debug output for page map; free tld on thread exit --- include/mimalloc/internal.h | 1 + src/arena.c | 45 +++++++++++++++++++++++++++++++------ src/init.c | 13 ++++++----- src/page.c | 4 ++-- test/test-stress.c | 10 +++++---- 5 files changed, 55 insertions(+), 18 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index ee7f1026..a5ca3e27 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -159,6 +159,7 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); // arena-meta.c void* _mi_meta_zalloc( size_t size, mi_memid_t* memid ); void _mi_meta_free(void* p, size_t size, mi_memid_t memid); +bool _mi_meta_is_meta_page(void* p); // "page-map.c" bool _mi_page_map_init(void); diff --git a/src/arena.c b/src/arena.c index 32c0b32e..a61f59b0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -162,6 +162,8 @@ void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { static mi_memid_t mi_memid_create_arena(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(slice_index < UINT32_MAX); mi_assert_internal(slice_count < UINT32_MAX); + mi_assert_internal(slice_count > 0); + mi_assert_internal(slice_index < 
arena->slice_count); mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); memid.mem.arena.arena = arena; memid.mem.arena.slice_index = (uint32_t)slice_index; @@ -663,7 +665,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; - page->memid = memid; + page->memid = memid; + mi_assert_internal(memid.mem.arena.slice_count > 0); page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)mi_ctz(block_size); @@ -1197,7 +1200,33 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) { +static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t* arena, size_t slice_index) { + size_t bit_set_count = 0; + long bit_of_page = 0; + for (int bit = 0; bit < MI_BFIELD_BITS; bit++, bit_of_page--) { + bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); + void* start = mi_arena_slice_start(arena, slice_index + bit); + if (is_set) { + bit_set_count++; + mi_page_t* page = (mi_page_t*)start; + char c = 'p'; + if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } + else if (mi_page_is_abandoned(page)) { c = 'f'; } + bit_of_page = (long)page->memid.mem.arena.slice_count - 1; + buf[bit] = c; + } + else { + char c = '.'; + if (bit_of_page > 0) { c = '-'; } + else if (_mi_meta_is_meta_page(start)) { c = 'm'; } + else if (slice_index + bit < arena->info_slices) { c = 'i'; } + buf[bit] = c; + } + } + return bit_set_count; +} + +static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { _mi_output_message("%s:\n", header); size_t bit_count = 0; size_t bit_set_count = 0; @@ -1217,7 +1246,8 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; - size_t xcount = mi_debug_show_bfield(bfield, buf + k); + size_t xcount = (arena!=NULL ? mi_debug_show_page_bfield(bfield, buf + k, arena, bit_count) + : mi_debug_show_bfield(bfield, buf + k)); if (invert) xcount = MI_BFIELD_BITS - xcount; bit_set_count += xcount; k += MI_BFIELD_BITS; @@ -1246,15 +1276,16 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; slice_total += arena->slice_count; - _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? 
", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true); + free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } - mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false); + mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); // todo: abandoned slices if (show_purge) { - purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false); + purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); } + mi_debug_show_bitmap("pages (p=page, a=abandoned, f=full-abandoned, i=info, m=meta)", arena->slice_count, arena->pages, false, arena); } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); diff --git a/src/init.c b/src/init.c index 85588970..5c5186b9 100644 --- a/src/init.c +++ b/src/init.c @@ -398,11 +398,10 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // merge stats _mi_stats_done(&heap->tld->stats); - // free if not the main thread - if (heap != &_mi_heap_main) { - _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); - } - else { + // free heap meta data + _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); + + if (heap == &_mi_heap_main) { #if 0 // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, // there may still be delete/free calls after the mi_fls_done is called. Issue #207 @@ -410,6 +409,10 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); #endif } + + // free the tld + mi_tld_t* tld = _mi_tld(); + _mi_meta_free(_mi_tld(), sizeof(mi_tld_t), tld->memid); return false; } diff --git a/src/page.c b/src/page.c index a90c1d7d..a30db6c9 100644 --- a/src/page.c +++ b/src/page.c @@ -82,7 +82,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - + // const size_t bsize = mi_page_block_size(page); // uint8_t* start = mi_page_start(page); //mi_assert_internal(start + page->capacity*page->block_size == page->top); @@ -623,7 +623,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); - + // initialize an initial free list mi_page_extend_free(heap,page); mi_assert(mi_page_immediate_available(page)); diff --git a/test/test-stress.c b/test/test-stress.c index 0488fc2b..df535e6e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -344,12 +344,14 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG //mi_debug_show_arenas(true, true, false); + // mi_debug_show_arenas(true, false, false); mi_collect(true); - mi_debug_show_arenas(true,true,false); - #endif - // mi_collect(true); - // mi_debug_show_arenas(true, true, false); + mi_debug_show_arenas(true, false, false); + #else + mi_collect(false); + mi_debug_show_arenas(true, true, false); // mi_stats_print(NULL); + #endif #else mi_stats_print(NULL); // so we see rss/commit/elapsed #endif From b53ac835f117076b83f985943e1e0b436f54f755 Mon 
Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 20:01:37 -0800 Subject: [PATCH 082/264] comment --- src/arena-meta.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index 401231ac..bc98d3f9 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -155,6 +155,7 @@ void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { } } +// used for debug output bool _mi_meta_is_meta_page(void* p) { mi_meta_page_t* mpage0 = mi_atomic_load_ptr_acquire(mi_meta_page_t, &mi_meta_pages); @@ -164,4 +165,4 @@ bool _mi_meta_is_meta_page(void* p) mpage = mi_meta_page_next(mpage); } return false; -} \ No newline at end of file +} From e43eb1f19167b01ec89d91fe05e8e6c2a2d2828b Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 12 Dec 2024 20:22:24 -0800 Subject: [PATCH 083/264] nicer debug output --- include/mimalloc.h | 2 +- src/arena.c | 23 +++++++++++++---------- src/bitmap.c | 2 +- 3 files changed, 15 insertions(+), 12 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 97f74c83..710e5d67 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef int mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index a61f59b0..c9a61291 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1212,7 +1212,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t char c = 'p'; if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } else if (mi_page_is_abandoned(page)) { c = 'f'; } - bit_of_page = (long)page->memid.mem.arena.slice_count - 1; + bit_of_page = (long)page->memid.mem.arena.slice_count; buf[bit] = c; } else { @@ -1265,13 +1265,12 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi return bit_set_count; } -void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { - MI_UNUSED(show_abandoned); +void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) mi_attr_noexcept { size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; - size_t purge_total = 0; + size_t page_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); if (arena == NULL) break; @@ -1280,16 +1279,20 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) if (show_inuse) { free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } - mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); - // todo: abandoned slices - if (show_purge) { - purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); + if (show_committed) { + mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); + 
} + // todo: abandoned slices + //if (show_purge) { + // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); + //} + if (show_pages) { + page_total += mi_debug_show_bitmap("pages (p=page, a=abandoned, f=full-abandoned, i=info, m=meta)", arena->slice_count, arena->pages, false, arena); } - mi_debug_show_bitmap("pages (p=page, a=abandoned, f=full-abandoned, i=info, m=meta)", arena->slice_count, arena->pages, false, arena); } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); - if (show_purge) _mi_output_message("total purgeable slices: %zu\n", purge_total); + if (show_pages) _mi_output_message("total pages in areanas: %zu\n", page_total); } diff --git a/src/bitmap.c b/src/bitmap.c index b84b42a4..649a7046 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1220,7 +1220,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_ typedef struct mi_claim_fun_data_s { mi_arena_t* arena; mi_subproc_t* subproc; - int heap_tag; + mi_heaptag_t heap_tag; } mi_claim_fun_data_t; static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) From 3010d5890f0c305ef9b54dc8a138f81fe496fdac Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 12 Dec 2024 20:27:46 -0800 Subject: [PATCH 084/264] fix assertion --- src/arena.c | 1 - 1 file changed, 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index c9a61291..5f996b89 100644 --- a/src/arena.c +++ b/src/arena.c @@ -666,7 +666,6 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; page->memid = memid; - mi_assert_internal(memid.mem.arena.slice_count > 0); page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)mi_ctz(block_size); From ba39e4d65b87c7a7a5a6bdb09174d35adcb058ed Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 13 Dec 2024 09:03:17 -0800 Subject: [PATCH 085/264] wip: start on purge --- include/mimalloc.h | 2 +- src/arena.c | 60 ++++++++++++++++++++++++++++------------------ test/test-stress.c | 8 +++---- 3 files changed, 42 insertions(+), 28 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 710e5d67..24217fae 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef int mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index c9a61291..07239d25 100644 --- a/src/arena.c +++ b/src/arena.c @@ -37,23 +37,31 @@ typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area mi_arena_id_t id; // arena id (> 0 where `arena == arenas[arena->id - 1]`) - size_t slice_count; // size of the area in arena slices (of 
`MI_ARENA_SLICE_SIZE`) + size_t slice_count; // total size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node - bool is_exclusive; // only allow allocations if specifically for this arena + bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) - _Atomic(mi_msecs_t) purge_expire; // expiration time when slices should be decommitted from `slices_decommit`. + _Atomic(mi_msecs_t) purge_expire; // expiration time when pages can be purged from `pages_purge`. mi_bitmap_t* slices_free; // is the slice free? mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) - mi_bitmap_t* slices_purge; // can the slice be purged? (slice in purge => slice in free) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? - mi_bitmap_t* pages; // all registered pages + mi_bitmap_t* pages; // all registered pages (abandoned and owned) + mi_bitmap_t* pages_purge; // pages that are scheduled to be purged mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages - // followed by the bitmaps (whose size depends on the arena size) + // followed by the bitmaps (whose sizes depend on the arena size) } mi_arena_t; +// Every "page" in `pages_purge` points to purge info +// (since we use it for any free'd range and not just for pages) +typedef struct mi_purge_info_s { + mi_msecs_t expire; + size_t slice_count; +} mi_purge_info_t; + + #define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) // 160 arenas is enough for ~2 TiB memory @@ -262,8 +270,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (commit) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); } mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); - // mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - + return p; } @@ -569,7 +576,6 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); @@ -665,8 +671,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; - page->memid = memid; - mi_assert_internal(memid.mem.arena.slice_count > 0); + page->memid = memid; page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { page->block_size_shift = (uint8_t)mi_ctz(block_size); @@ -771,7 +776,6 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); 
mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may @@ -809,7 +813,6 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { mi_assert_internal(!mi_page_is_singleton(page)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); @@ -865,8 +868,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_purge, slice_index, slice_count)); - + // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); @@ -926,8 +928,8 @@ static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { } // potentially decommit - if (!arena->memid.is_pinned && !arena->memid.initially_committed) { // todo: maybe allow decommit even if initially committed? - // (delay) purge the entire range + if (!arena->memid.is_pinned /* && !arena->memid.initially_committed */) { // todo: allow decommit even if initially committed? + // (delay) purge the page mi_arena_schedule_purge(arena, slice_index, slice_count); } @@ -1121,8 +1123,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->slices_free = mi_arena_bitmap_init(slice_count,&base); arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); - arena->slices_purge = mi_arena_bitmap_init(slice_count,&base); arena->pages = mi_arena_bitmap_init(slice_count, &base); + arena->pages_purge = mi_arena_bitmap_init(slice_count, &base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { arena->pages_abandoned[i] = mi_arena_bitmap_init(slice_count,&base); } @@ -1207,21 +1209,33 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); void* start = mi_arena_slice_start(arena, slice_index + bit); if (is_set) { + mi_assert_internal(bit_of_page <= 0); bit_set_count++; mi_page_t* page = (mi_page_t*)start; char c = 'p'; if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } - else if (mi_page_is_abandoned(page)) { c = 'f'; } + else if (mi_page_is_abandoned(page)) { c = (mi_page_is_singleton(page) ? 
's' : 'f'); } bit_of_page = (long)page->memid.mem.arena.slice_count; buf[bit] = c; } else { - char c = '.'; + char c = '?'; if (bit_of_page > 0) { c = '-'; } else if (_mi_meta_is_meta_page(start)) { c = 'm'; } else if (slice_index + bit < arena->info_slices) { c = 'i'; } + // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } + else if (mi_bitmap_is_setN(arena->slices_free, slice_index+bit, 1)) { + if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, 1)) { + mi_assert_internal(bit_of_page <= 0); + mi_purge_info_t* pinfo = (mi_purge_info_t*)start; + c = '!'; + bit_of_page = (long)pinfo->slice_count; + } + if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } + else { c = '.'; } + } buf[bit] = c; - } + } } return bit_set_count; } @@ -1265,7 +1279,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi return bit_set_count; } -void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) mi_attr_noexcept { +void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; size_t slice_total = 0; @@ -1287,7 +1301,7 @@ void mi_debug_show_arenas(bool show_inuse, bool show_committed, bool show_pages) // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap("pages (p=page, a=abandoned, f=full-abandoned, i=info, m=meta)", arena->slice_count, arena->pages, false, arena); + page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, !:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); diff --git a/test/test-stress.c b/test/test-stress.c index df535e6e..4fe6e0c6 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,7 +40,7 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 0 +#elif 1 static int THREADS = 4; static int SCALE = 10; static int ITER = 10; @@ -345,11 +345,11 @@ int main(int argc, char** argv) { #ifndef NDEBUG //mi_debug_show_arenas(true, true, false); // mi_debug_show_arenas(true, false, false); - mi_collect(true); - mi_debug_show_arenas(true, false, false); + // mi_collect(true); + mi_debug_show_arenas(true,false,false); #else mi_collect(false); - mi_debug_show_arenas(true, true, false); + mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif #else From 4c81c3cf90135fd2d3e00be19faf3c5fd7d53f71 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 13 Dec 2024 13:17:00 -0800 Subject: [PATCH 086/264] enable purging of free committed slices from arenas --- include/mimalloc/types.h | 2 +- src/arena.c | 162 +++++++++++++++++++++++++++++---------- src/bitmap.c | 59 +++++++++++++- src/bitmap.h | 23 +++++- src/options.c | 10 ++- src/prim/unix/prim.c | 6 +- test/test-stress.c | 16 ++-- 7 files changed, 222 insertions(+), 56 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f4bfa07a..bf1cb5c8 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -321,7 +321,7 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 8 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB #define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index 8cf61b74..9f95a699 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,13 +42,13 @@ typedef struct mi_arena_s { int numa_node; // associated NUMA node bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) - _Atomic(mi_msecs_t) purge_expire; // expiration time when pages can be purged from `pages_purge`. + _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. mi_bitmap_t* slices_free; // is the slice free? mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? + mi_bitmap_t* slices_purge; // slices that can be purged mi_bitmap_t* pages; // all registered pages (abandoned and owned) - mi_bitmap_t* pages_purge; // pages that are scheduled to be purged mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains abandoned full pages // followed by the bitmaps (whose sizes depend on the arena size) @@ -57,8 +57,8 @@ typedef struct mi_arena_s { // Every "page" in `pages_purge` points to purge info // (since we use it for any free'd range and not just for pages) typedef struct mi_purge_info_s { - mi_msecs_t expire; - size_t slice_count; + _Atomic(mi_msecs_t) expire; + _Atomic(size_t) slice_count; } mi_purge_info_t; @@ -1123,8 +1123,8 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->slices_free = mi_arena_bitmap_init(slice_count,&base); arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); + arena->slices_purge = mi_arena_bitmap_init(slice_count, &base); arena->pages = mi_arena_bitmap_init(slice_count, &base); - arena->pages_purge = mi_arena_bitmap_init(slice_count, &base); for( size_t i = 0; i < MI_ARENA_BIN_COUNT; i++) { arena->pages_abandoned[i] = mi_arena_bitmap_init(slice_count,&base); } @@ -1224,16 +1224,12 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t else if (_mi_meta_is_meta_page(start)) { c = 'm'; } else if (slice_index + bit < arena->info_slices) { c = 'i'; } // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } - else if (mi_bitmap_is_setN(arena->slices_free, slice_index+bit, 1)) { - if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, 1)) { - mi_assert_internal(bit_of_page <= 0); - mi_purge_info_t* pinfo = (mi_purge_info_t*)start; - c = '!'; - bit_of_page = (long)pinfo->slice_count; - } - if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } + else if (mi_bitmap_is_set(arena->slices_free, slice_index+bit)) { + if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '!'; } + else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } else { c = '.'; } } + if (bit==MI_BFIELD_BITS-1 && 
bit_of_page > 1) { c = '>'; } buf[bit] = c; } } @@ -1390,53 +1386,121 @@ static long mi_arena_purge_delay(void) { return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); } -// reset or decommit in an arena and update the committed/decommit bitmaps +// reset or decommit in an arena and update the commit bitmap // assumes we own the area (i.e. slices_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slices) { +static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(!arena->memid.is_pinned); - const size_t size = mi_size_of_slices(slices); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + + const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); - bool needs_recommit; - if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slices)) { - // all slices are committed, we can purge freely + bool needs_recommit = false; // reset needs no recommit, decommit does need it + if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)) { + // all slices are committed, we can purge the entire range needs_recommit = _mi_os_purge(p, size); } else { - // some slices are not committed -- this can happen when a partially committed slice is freed - // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge - // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), - // and also undo the decommit stats (as it was already adjusted) - mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); - needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */); - if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + mi_assert_internal(false); // ? } - // clear the purged slices - mi_bitmap_clearN(arena->slices_purge, slices, slice_index); - // update committed bitmap if (needs_recommit) { - mi_bitmap_clearN(arena->slices_committed, slices, slice_index); + mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } } // Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. // Note: assumes we (still) own the area as we may purge immediately -static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices) { +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { const long delay = mi_arena_purge_delay(); if (delay < 0 || _mi_preloading()) return; // is purging allowed at all? 
+ mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (delay == 0) { - // decommit directly - mi_arena_purge(arena, slice_index, slices); + // purge directly + mi_arena_purge(arena, slice_index, slice_count); } else { - // schedule decommit - _mi_error_message(EFAULT, "purging not yet implemented\n"); + // schedule purge + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire == 0) { + mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + } + //else { + // mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay + //} + mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL); } } +typedef struct mi_purge_visit_info_s { + mi_msecs_t now; + mi_msecs_t delay; + bool all_purged; + bool any_purged; +} mi_purge_visit_info_t; + +static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size_t slice_count) { + if (mi_bitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { + // purge + mi_arena_purge(arena, slice_index, slice_count); + // and reset the free range + mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); + return true; + } + else { + return false; + } +} + +static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) { + mi_purge_visit_info_t* vinfo = (mi_purge_visit_info_t*)arg; + // try to purge: first claim the free blocks + if (mi_arena_try_purge_range(arena, slice_index, slice_count)) { + vinfo->any_purged = true; + } + else { + // failed to claim the full range, try per slice instead + for (size_t i = 0; i < slice_count; i++) { + vinfo->any_purged = vinfo->any_purged || mi_arena_try_purge_range(arena, slice_index + i, 1); + } + } + // done: clear the purge bits + mi_bitmap_clearN(arena->slices_purge, slice_index, slice_count); + return true; // continue +} + + + +// returns true if anything was purged +static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) +{ + // check pre-conditions + if (arena->memid.is_pinned) return false; + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire == 0) return false; + + // expired yet? 
+ if (!force && expire > now) return false; + + // reset expire (if not already set concurrently) + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + + // go through all purge info's + // todo: instead of visiting per-bit, we should visit per range of bits + mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; + _mi_bitmap_forall_set(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); + + // if not fully purged, make sure to purge again in the future + if (!vinfo.all_purged) { + const long delay = mi_arena_purge_delay(); + mi_msecs_t expected = 0; + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expected, _mi_clock_now() + delay); + } + return vinfo.any_purged; +} + static void mi_arenas_try_purge(bool force, bool visit_all) { if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled @@ -1444,12 +1508,27 @@ static void mi_arenas_try_purge(bool force, bool visit_all) { const size_t max_arena = mi_arena_get_count(); if (max_arena == 0) return; - // _mi_error_message(EFAULT, "purging not yet implemented\n"); - MI_UNUSED(visit_all); - MI_UNUSED(force); + // allow only one thread to purge at a time + static mi_atomic_guard_t purge_guard; + mi_atomic_guard(&purge_guard) + { + const mi_msecs_t now = _mi_clock_now(); + const size_t arena_start = _mi_tld()->tseq % max_arena; + size_t max_purge_count = (visit_all ? max_arena : 1); + for (size_t _i = 0; _i < max_arena; _i++) { + size_t i = _i + arena_start; + if (i >= max_arena) { i -= max_arena; } + mi_arena_t* arena = mi_arena_from_index(i); + if (arena != NULL) { + if (mi_arena_try_purge(arena, now, force)) { + if (max_purge_count <= 1) break; + max_purge_count--; + } + } + } + } } - bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); _mi_error_message(EINVAL, "implement mi_abandoned_visit_blocks\n"); @@ -1460,8 +1539,9 @@ bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool vi /* ----------------------------------------------------------- Unloading and reloading an arena. 
----------------------------------------------------------- */ -static bool mi_arena_page_register(size_t slice_index, mi_arena_t* arena, void* arg) { - MI_UNUSED(arg); +static bool mi_arena_page_register(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) { + MI_UNUSED(arg); MI_UNUSED(slice_count); + mi_assert_internal(slice_count == 1); mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); _mi_page_map_register(page); diff --git a/src/bitmap.c b/src/bitmap.c index 649a7046..88b45a5e 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1051,6 +1051,23 @@ bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, s return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); } +// ------- mi_bitmap_try_clearN --------------------------------------- + +bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (cidx + n > MI_BCHUNK_BITS) return false; + bool maybe_all_clear; + const bool cleared = mi_bchunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (cleared && maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return cleared; +} // ------- mi_bitmap_is_xset --------------------------------------- @@ -1071,6 +1088,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n + /* -------------------------------------------------------------------------------- Iterate through a bfield -------------------------------------------------------------------------------- */ @@ -1144,7 +1162,7 @@ static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, si // and for each chunkmap entry we iterate over its bits to find the chunks mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[cmap_idx]); size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); - mi_bfield_cycle_iterate(cmap_entry, tseq, cmap_entry_cycle, eidx, Y) + mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) { mi_assert_internal(eidx <= MI_BFIELD_BITS); const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; @@ -1314,10 +1332,47 @@ bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_a size_t bidx; while (mi_bfield_foreach_bit(&b, &bidx)) { const size_t idx = base_idx + bidx; - if (!visit(idx, arena, arg)) return false; + if (!visit(idx, 1, arena, arg)) return false; } } } } return true; } + +// Visit all set bits in a bitmap but try to return ranges (within bfields) if possible. +// used by purging to purge larger ranges if possible +// todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? 
+bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { + // for all chunkmap entries + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = 0; i < chunkmap_max; i++) { + mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t cmap_idx; + // for each chunk (corresponding to a set bit in a chunkmap entry) + while (mi_bfield_foreach_bit(&cmap_entry, &cmap_idx)) { + const size_t chunk_idx = i*MI_BFIELD_BITS + cmap_idx; + // for each chunk field + mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { + const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + size_t bshift = 0; + size_t bidx; + while (mi_bfield_find_least_bit(b, &bidx)) { + b >>= bidx; + bshift += bidx; + const size_t rng = mi_ctz(~b); // all the set bits from bidx + mi_assert_internal(rng>=1); + const size_t idx = base_idx + bshift + bidx; + if (!visit(idx, rng, arena, arg)) return false; + // skip rng + b >>= rng; + bshift += rng; + } + } + } + } + return true; +} + diff --git a/src/bitmap.h b/src/bitmap.h index 7fd09f43..72ba69c1 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -171,6 +171,22 @@ static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n); } +static inline bool mi_bitmap_is_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_is_setN(bitmap, idx, 1); +} + +static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_is_clearN(bitmap, idx, 1); +} + + +bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n); + +static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_try_clearN(bitmap, idx, 1); +} + + // Specialized versions for common bit sequence sizes mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit @@ -212,9 +228,12 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); -typedef bool (mi_forall_set_fun_t)(size_t slice_index, mi_arena_t* arena, void* arg2); +typedef bool (mi_forall_set_fun_t)(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg2); -// Visit all set bits in a bitmap +// Visit all set bits in a bitmap (`slice_count == 1`) bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); +// Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) +bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); + #endif // MI_BITMAP_H diff --git a/src/options.c b/src/options.c index 8fcee452..4f1a00b8 100644 --- a/src/options.c +++ b/src/options.c @@ -79,8 +79,12 @@ typedef struct mi_option_desc_s { #endif #ifndef MI_DEFAULT_ALLOW_LARGE_OS_PAGES +#if defined(__linux__) && !defined(__ANDROID__) +#define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 1 +#else #define MI_DEFAULT_ALLOW_LARGE_OS_PAGES 0 #endif +#endif #ifndef MI_DEFAULT_RESERVE_HUGE_OS_PAGES #define MI_DEFAULT_RESERVE_HUGE_OS_PAGES 0 @@ -132,7 +136,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - 
{ -1, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 1000,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -141,7 +145,7 @@ static mi_option_desc_t options[_mi_option_last] = { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) - { 10, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's + { 1, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. @@ -192,7 +196,7 @@ void _mi_options_init(void) { } } _mi_verbose_message("guarded build: %s\n", mi_option_get(mi_option_guarded_sample_rate) != 0 ? "enabled" : "disabled"); - #endif + #endif } long _mi_option_get_fast(mi_option_t option) { diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index e1ca3964..eb351f69 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -61,6 +61,7 @@ terms of the MIT license. A copy of the license can be found in the file #include #endif +#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this? //------------------------------------------------------------------------------------ // Use syscalls for some primitives to allow for libraries that override open/read/close etc. @@ -146,7 +147,7 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) } #endif } - config->large_page_size = 2*MI_MiB; // TODO: can we query the OS for this? + config->large_page_size = MI_UNIX_LARGE_PAGE_SIZE; config->has_overcommit = unix_detect_overcommit(); config->has_partial_free = true; // mmap can free in parts config->has_virtual_reserve = true; // todo: check if this true for NetBSD? (for anonymous mmap with PROT_NONE) @@ -361,6 +362,9 @@ int _mi_prim_alloc(void* hint_addr, size_t size, size_t try_alignment, bool comm mi_assert_internal(size > 0 && (size % _mi_os_page_size()) == 0); mi_assert_internal(commit || !allow_large); mi_assert_internal(try_alignment > 0); + if (hint_addr == NULL && size >= 8*MI_UNIX_LARGE_PAGE_SIZE && try_alignment > 1 && _mi_is_power_of_two(try_alignment) && try_alignment < MI_UNIX_LARGE_PAGE_SIZE) { + try_alignment = MI_UNIX_LARGE_PAGE_SIZE; // try to align along large page size for larger allocations + } *is_zero = true; int protect_flags = (commit ? 
(PROT_WRITE | PROT_READ) : PROT_NONE); diff --git a/test/test-stress.c b/test/test-stress.c index 4fe6e0c6..126a7601 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -40,10 +40,10 @@ static int ITER = 20; static int THREADS = 8; static int SCALE = 10; static int ITER = 10; -#elif 1 +#elif 0 static int THREADS = 4; static int SCALE = 10; -static int ITER = 10; +static int ITER = 20; #define ALLOW_LARGE false #elif 0 static int THREADS = 32; @@ -260,8 +260,12 @@ static void test_stress(void) { //mi_debug_show_arenas(); #endif #if !defined(NDEBUG) || defined(MI_TSAN) - if ((n + 1) % 10 == 0) - { printf("- iterations left: %3d\n", ITER - (n + 1)); } + if ((n + 1) % 10 == 0) { + printf("- iterations left: %3d\n", ITER - (n + 1)); + //mi_debug_show_arenas(true, false, false); + //mi_collect(true); + //mi_debug_show_arenas(true, false, false); + } #endif } // clean up @@ -344,8 +348,8 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG //mi_debug_show_arenas(true, true, false); - // mi_debug_show_arenas(true, false, false); - // mi_collect(true); + mi_debug_show_arenas(true, false, false); + mi_collect(true); mi_debug_show_arenas(true,false,false); #else mi_collect(false); From 216c04f8d91cd433897e5c2e46a4a24554558c5d Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 13 Dec 2024 18:39:03 -0800 Subject: [PATCH 087/264] clean up bitmap api --- include/mimalloc/types.h | 3 +- src/arena.c | 7 +- src/bitmap.c | 363 ++++++++++++++++++--------------------- src/bitmap.h | 54 +++--- src/init.c | 3 +- src/stats.c | 11 +- test/test-stress.c | 2 +- 7 files changed, 202 insertions(+), 241 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index bf1cb5c8..bf91a58a 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -471,13 +471,12 @@ typedef struct mi_stats_s { mi_stat_counter_t commit_calls; mi_stat_counter_t reset_calls; mi_stat_counter_t purge_calls; + mi_stat_counter_t arena_purges; mi_stat_counter_t page_no_retire; mi_stat_counter_t searches; mi_stat_counter_t normal_count; mi_stat_counter_t huge_count; mi_stat_counter_t arena_count; - mi_stat_counter_t arena_crossover_count; - mi_stat_counter_t arena_rollback_count; mi_stat_counter_t guarded_alloc_count; #if MI_STAT>1 mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; diff --git a/src/arena.c b/src/arena.c index 9f95a699..7aec429e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1225,7 +1225,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t else if (slice_index + bit < arena->info_slices) { c = 'i'; } // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } else if (mi_bitmap_is_set(arena->slices_free, slice_index+bit)) { - if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '!'; } + if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; } else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } else { c = '.'; } } @@ -1297,7 +1297,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, !:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); + page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, 
f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); @@ -1470,8 +1470,6 @@ static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, m mi_bitmap_clearN(arena->slices_purge, slice_index, slice_count); return true; // continue } - - // returns true if anything was purged static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) @@ -1486,6 +1484,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) // reset expire (if not already set concurrently) mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); // go through all purge info's // todo: instead of visiting per-bit, we should visit per range of bits diff --git a/src/bitmap.c b/src/bitmap.c index 88b45a5e..f689ee58 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -34,7 +34,6 @@ static inline mi_bfield_t mi_bfield_clear_least_bit(mi_bfield_t x) { return (x & (x-1)); } - // find the least significant bit that is set (i.e. count trailing zero's) // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). @@ -42,17 +41,13 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } -// find each set bit in a bit field `x` until it becomes zero. +// find each set bit in a bit field `x` and clear it, until it becomes zero. static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { const bool found = mi_bfield_find_least_bit(*x, idx); *x = mi_bfield_clear_least_bit(*x); return found; } -//static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { -// return mi_rotr(x,r); -//} - static inline mi_bfield_t mi_bfield_zero(void) { return 0; } @@ -65,6 +60,7 @@ static inline mi_bfield_t mi_bfield_all_set(void) { return ~((mi_bfield_t)0); } +// mask of `bit_count` bits set shifted to the left by `shiftl` static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { mi_assert_internal(bit_count > 0); mi_assert_internal(bit_count + shiftl <= MI_BFIELD_BITS); @@ -72,7 +68,10 @@ static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { return (mask0 << shiftl); } + // ------- mi_bfield_atomic_set --------------------------------------- +// the `_set` functions return also the count of bits that were already set (for commit statistics) +// the `_clear` functions return also whether the new bfield is all clear or not (for the chunk_map) // Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { @@ -93,7 +92,8 @@ static inline bool mi_bfield_atomic_clear(_Atomic(mi_bfield_t)*b, size_t idx, bo } // Clear a bit but only when/once it is set. This is used by concurrent free's while -// the page is abandoned and mapped. +// the page is abandoned and mapped. 
This can incure a busy wait :-( but it should +// happen almost never (and is accounted for in the stats) static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx); } -static inline bool mi_bchunk_clear(mi_bchunk_t* chunk, size_t cidx, bool* maybe_all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); +static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - return mi_bfield_atomic_clear(&chunk->bfields[i], idx, maybe_all_clear); + const size_t idx = cidx % MI_BFIELD_BITS; + const mi_bfield_t mask = mi_bfield_mask(n, idx); + return mi_bfield_atomic_set_mask(&chunk->bfields[i], mask, already_set); } -static inline bool mi_bchunk_set8(mi_bchunk_t* chunk, size_t byte_idx) { - mi_assert_internal(byte_idx < MI_BCHUNK_SIZE); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t bidx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_set8(&chunk->bfields[i], bidx); +static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t cidx, size_t* already_set) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal((cidx%MI_BFIELD_BITS)==0); + const size_t i = cidx / MI_BFIELD_BITS; + return mi_bfield_atomic_setX(&chunk->bfields[i], already_set); } -static inline bool mi_bchunk_clear8(mi_bchunk_t* chunk, size_t byte_idx, bool* maybe_all_clear) { - mi_assert_internal(byte_idx < MI_BCHUNK_SIZE); - const size_t i = byte_idx / MI_BFIELD_SIZE; - const size_t bidx = byte_idx % MI_BFIELD_SIZE; - return mi_bfield_atomic_clear8(&chunk->bfields[i], bidx, maybe_all_clear); -} - -static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t field_idx) { - mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); - return mi_bfield_atomic_setX(&chunk->bfields[field_idx]); -} - -static inline bool mi_bchunk_clearX(mi_bchunk_t* chunk, size_t field_idx, bool* maybe_all_clear) { - mi_assert_internal(field_idx < MI_BCHUNK_FIELDS); - if (maybe_all_clear != NULL) { *maybe_all_clear = true; } - return mi_bfield_atomic_clearX(&chunk->bfields[field_idx]); -} - -// Set/clear a sequence of `n` bits within a chunk. +// Set a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0). -static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_xset) { +mi_decl_noinline static bool mi_bchunk_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* palready_set, bool* pmaybe_all_clear) { mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); bool all_transition = true; - size_t total_already_xset = 0; + bool maybe_all_clear = true; + size_t total_already_set = 0; size_t idx = cidx % MI_BFIELD_BITS; size_t field = cidx / MI_BFIELD_BITS; while (n > 0) { @@ -326,28 +272,67 @@ static bool mi_bchunk_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size mi_assert_internal(idx + m <= MI_BFIELD_BITS); mi_assert_internal(field < MI_BCHUNK_FIELDS); const mi_bfield_t mask = mi_bfield_mask(m, idx); - size_t already_xset = 0; - const bool transition = mi_bfield_atomic_xset_mask(set, &chunk->bfields[field], mask, &already_xset); - mi_assert_internal((transition && already_xset == 0) || (!transition && already_xset > 0)); + size_t already_set = 0; + bool all_clear = false; + const bool transition = (set ? 
mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, &already_set) + : mi_bfield_atomic_clear_mask(&chunk->bfields[field], mask, &all_clear)); + mi_assert_internal((transition && already_set == 0) || (!transition && already_set > 0)); all_transition = all_transition && transition; - total_already_xset += already_xset; + total_already_set += already_set; + maybe_all_clear = maybe_all_clear && all_clear; // next field field++; idx = 0; n -= m; } - if (palready_xset!=NULL) { *palready_xset = total_already_xset; } + if (palready_set!=NULL) { *palready_set = total_already_set; } + if (pmaybe_all_clear!=NULL) { *pmaybe_all_clear = maybe_all_clear; } return all_transition; } static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { - return mi_bchunk_xsetN(MI_BIT_SET, chunk, cidx, n, already_set); + mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); + if (n==1) { + bool was_clear = mi_bchunk_set(chunk, cidx); + if (already_set != NULL) { *already_set = !was_clear; } + return was_clear; + } + if (n==MI_BFIELD_BITS) return mi_bchunk_setX(chunk, cidx, already_set); + if (n bfields[i], idx, all_clear); } +static inline bool mi_bchunk_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + const mi_bfield_t mask = mi_bfield_mask(n, idx); + return mi_bfield_atomic_clear_mask(&chunk->bfields[i], mask, all_clear); +} + +static inline bool mi_bchunk_clearX(mi_bchunk_t* chunk, size_t cidx, bool* all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal((cidx%MI_BFIELD_BITS)==0); + const size_t i = cidx / MI_BFIELD_BITS; + return mi_bfield_atomic_clearX(&chunk->bfields[i], all_clear); +} + +static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { + mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); + if (n==1) return mi_bchunk_clear(chunk, cidx, maybe_all_clear); + if (n==MI_BFIELD_BITS) return mi_bchunk_clearX(chunk, cidx, maybe_all_clear); + if (n 0); if (n==0) return true; - size_t field = cidx / MI_BFIELD_BITS; - size_t idx = cidx % MI_BFIELD_BITS; + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; if mi_likely(n<=MI_BFIELD_BITS) { - return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[field], mi_bfield_mask(n, idx)); + return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } else { - return mi_bchunk_is_xsetN_(set, chunk, field, idx, n); + return mi_bchunk_is_xsetN_(set, chunk, i, idx, n); } } -// ------- mi_bchunk_try_xset --------------------------------------- +// ------- mi_bchunk_try_clear --------------------------------------- + +static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal(n <= MI_BFIELD_BITS); + const size_t i = cidx / MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; + mi_assert_internal(idx + n <= MI_BFIELD_BITS); + const size_t mask = mi_bfield_mask(n, idx); + return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mask, pmaybe_all_clear); +} + +static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { + mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); + const size_t i = cidx / MI_BFIELD_BITS; + return 
mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); +} // Try to atomically set/clear a sequence of `n` bits within a chunk. // Returns true if all bits transitioned from 0 to 1 (or 1 to 0), // and false otherwise leaving all bit fields as is. -static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { +// Note: this is a hard one as we need to unwind partial atomic operations +// if we fail halfway.. +mi_decl_noinline static bool mi_bchunk_try_clearN_(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); if (n==0) return true; @@ -414,7 +418,7 @@ static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); mi_assert_internal(start_field < MI_BCHUNK_FIELDS); const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_start, &field_is_clear)) return false; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &field_is_clear)) return false; maybe_all_clear = maybe_all_clear && field_is_clear; // done? @@ -431,7 +435,7 @@ static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, field++; mi_assert_internal(field < MI_BCHUNK_FIELDS); mask_mid = mi_bfield_all_set(); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid, &field_is_clear)) goto restore; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_mid, &field_is_clear)) goto restore; maybe_all_clear = maybe_all_clear && field_is_clear; n -= MI_BFIELD_BITS; } @@ -443,7 +447,7 @@ static bool mi_bchunk_try_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, mi_assert_internal(field < MI_BCHUNK_FIELDS); end_field = field; mask_end = mi_bfield_mask(n, 0); - if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_end, &field_is_clear)) goto restore; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_end, &field_is_clear)) goto restore; maybe_all_clear = maybe_all_clear && field_is_clear; } @@ -456,17 +460,17 @@ restore: while( field > start_field) { field--; const size_t mask = (field == start_field ? mask_start : (field == end_field ? 
mask_end : mask_mid)); - mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, NULL); + mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, NULL); } return false; } -// static inline bool mi_bchunk_try_setN(mi_bchunk_t* chunk, size_t cidx, size_t n) { -// return mi_bchunk_try_xsetN(MI_BIT_SET, chunk, cidx, n, NULL); -// } static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { - return mi_bchunk_try_xsetN(MI_BIT_CLEAR, chunk, cidx, n, maybe_all_clear); + mi_assert_internal(n>0); + if (n==MI_BFIELD_BITS) return mi_bchunk_try_clearX(chunk, cidx, maybe_all_clear); + if (nbfields[chunk_idx], byte_idx, NULL)) { // unset the byte atomically + mi_assert_internal((idx%8)==0); + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // unset the byte atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; @@ -614,9 +617,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s if (mask==0) return false; const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk const size_t chunk_idx = bidx / 8; - const size_t byte_idx = bidx % 8; // byte index of the byte in the bfield + const size_t idx = (bidx % 8)*8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], byte_idx, NULL)) { // clear it atomically + if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; @@ -672,7 +675,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); - if (~b==0 && mi_bfield_atomic_try_clearX(&chunk->bfields[i])) { + if (~b==0 && mi_bfield_atomic_try_clearX(&chunk->bfields[i], NULL)) { *pidx = i*MI_BFIELD_BITS; mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); return true; @@ -691,7 +694,7 @@ static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, // and try to clear them atomically. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. 
// (We do not cross bfield boundaries) -static mi_decl_noinline bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { +mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -955,69 +958,31 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // ------- mi_bitmap_xset --------------------------------------- // Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { +bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards - return wasclear; - } - else { - bool maybe_all_clear; - const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; - } + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return wasclear; } -// Set/clear aligned 8-bits in the bitmap (with `(idx%8)==0`). -// Returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -static bool mi_bitmap_xset8(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { +bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - mi_assert_internal((idx%8)==0); const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t byte_idx = (idx % MI_BCHUNK_BITS)/8; + const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool wasclear = mi_bchunk_set8(&bitmap->chunks[chunk_idx], byte_idx); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards - return wasclear; - } - else { - bool maybe_all_clear; - const bool wasset = mi_bchunk_clear8(&bitmap->chunks[chunk_idx], byte_idx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; - } + bool maybe_all_clear; + const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return wasset; } -// Set/clear a field of bits. 
-// Returns `true` if atomically transitioned from 0 to ~0 (or ~0 to 0) -static bool mi_bitmap_xsetX(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - mi_assert_internal((idx%MI_BFIELD_BITS)==0); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t field_idx = (idx % MI_BCHUNK_BITS)/MI_BFIELD_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (set) { - const bool wasclear = mi_bchunk_setX(&bitmap->chunks[chunk_idx],field_idx); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards - return wasclear; - } - else { - bool maybe_all_clear; - const bool wasset = mi_bchunk_clearX(&bitmap->chunks[chunk_idx], field_idx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; - } -} -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset ) { +bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { mi_assert_internal(n>0); mi_assert_internal(n<=MI_BCHUNK_BITS); @@ -1027,30 +992,30 @@ static bool mi_bitmap_xsetN_(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, siz mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia - if (set) { - const bool allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_xset); - mi_bitmap_chunkmap_set(bitmap,chunk_idx); // set afterwards - return allclear; - } - else { - size_t already_clear = 0; - const bool allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &already_clear ); - if (already_xset != NULL) { *already_xset = already_clear; } - if (already_clear < n) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return allset; - } + const bool were_allclear = mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, n, already_set); + mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards + return were_allclear; } -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 1's to 0's. // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! 
-bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset) { - mi_assert_internal(n>0 && n<=MI_BCHUNK_BITS); - if (n==1) return mi_bitmap_xset(set, bitmap, idx); - if (n==8) return mi_bitmap_xset8(set, bitmap, idx); - if (n==MI_BFIELD_BITS) return mi_bitmap_xsetX(set, bitmap, idx); - return mi_bitmap_xsetN_(set, bitmap, idx, n, already_xset); +bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia + + bool maybe_all_clear; + const bool were_allset = mi_bchunk_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } + return were_allset; } + // ------- mi_bitmap_try_clearN --------------------------------------- bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { diff --git a/src/bitmap.h b/src/bitmap.h index 72ba69c1..4afcdaf1 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -13,7 +13,7 @@ Concurrent bitmap that can set/reset sequences of bits atomically #define MI_BITMAP_H /* -------------------------------------------------------------------------------- - Atomic bitmaps: + Atomic bitmaps with release/acquire guarantees: `mi_bfield_t`: is a single machine word that can efficiently be bit counted (usually `size_t`) each bit usually represents a single MI_ARENA_SLICE_SIZE in an arena (64 KiB). @@ -25,19 +25,25 @@ Concurrent bitmap that can set/reset sequences of bits atomically These chunks are cache-aligned and we can use AVX2/AVX512/NEON/SVE/SVE2/etc. instructions to scan for bits (perhaps) more efficiently. - `mi_bchunkmap_t` == `mi_bchunk_t`: for each chunk we track if it has (potentially) any bit set. + We allocate byte-sized ranges aligned to bytes in the bfield, and bfield-sized + ranges aligned to a bfield. + + Searching linearly through the chunks would be too slow (16K bits per GiB). + Instead we add a "chunkmap" to do a two-level search (more or less a btree of depth 2). + + `mi_bchunkmap_t` (== `mi_bchunk_t`): for each chunk we track if it has (potentially) any bit set. The chunkmap has 1 bit per chunk that is set if the chunk potentially has a bit set. This is used to avoid scanning every chunk. (and thus strictly an optimization) - It is conservative: it is fine to a bit in the chunk map even if the chunk turns out + It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out to have no bits set. It is also allowed to briefly have a clear bit even if the - chunk has bits set, as long as we guarantee that we set the bit later on -- this - allows us to set the chunkmap bit after we set a bit in the corresponding chunk. + chunk has bits set -- as long as we guarantee that the bit will be set later on; + (this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk). However, when we clear a bit in a chunk, and the chunk is indeed all clear, we cannot safely clear the bit corresponding to the chunk in the chunkmap since it may race with another thread setting a bit in the same chunk. 
Therefore, when clearing, we first test if a chunk is clear, then clear the chunkmap bit, and - then test again to catch any set bits that we missed. + then test again to catch any set bits that we may have missed. Since the chunkmap may thus be briefly out-of-sync, this means that we may sometimes not find a free page even though it's there (but we accept this as we avoid taking @@ -130,32 +136,22 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); // Not atomic so only use if still local to a thread. void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); +// Set a bit in the bitmap; returns `true` if it atomically transitioned from 0 to 1 +bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx); -// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -bool mi_bitmap_xset(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx); - -static inline bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_xset(MI_BIT_SET, bitmap, idx); -} - -static inline bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_xset(MI_BIT_CLEAR, bitmap, idx); -} +// Clear a bit in the bitmap; returns `true` if it atomically transitioned from 1 to 0 +bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx); -// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's). +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -// If `already_xset` is not NULL, it is set to count of bits were already all set/cleared. +// If `already_set` is not NULL, it is set to count of bits were already all set. // (this is used for correct statistics if commiting over a partially committed area) -bool mi_bitmap_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_xset); +bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set); -static inline bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { - return mi_bitmap_xsetN(MI_BIT_SET, bitmap, idx, n, already_set); -} - -static inline bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - return mi_bitmap_xsetN(MI_BIT_CLEAR, bitmap, idx, n, NULL); -} +// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n); // Is a sequence of n bits already all set/cleared? @@ -167,6 +163,7 @@ static inline bool mi_bitmap_is_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) return mi_bitmap_is_xsetN(MI_BIT_SET, bitmap, idx, n); } +// Is a sequence of n bits already clear? static inline bool mi_bitmap_is_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { return mi_bitmap_is_xsetN(MI_BIT_CLEAR, bitmap, idx, n); } @@ -180,8 +177,11 @@ static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) { } +// Try to atomically transition `n` bits from all set to all clear. Returns `true` on succes. +// `n` cannot cross chunk boundaries, where `n <= MI_CHUNK_BITS`. bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n); +// Try to atomically transition a bit from set to clear. Returns `true` on succes. 
static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { return mi_bitmap_try_clearN(bitmap, idx, 1); } @@ -223,7 +223,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); -// If a bit is set in the bitmap, return `true` and set `idx` to its index. +// If a bit is set in the bitmap, return `true` and set `idx` to the index of the highest bit. // Otherwise return `false` (and `*idx` is undefined). bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); diff --git a/src/init.c b/src/init.c index 5c5186b9..8f1449a3 100644 --- a/src/init.c +++ b/src/init.c @@ -84,8 +84,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 }, \ - { 0, 0 } \ + { 0, 0 }, { 0, 0 }, { 0, 0 }, { 0, 0 } \ MI_STAT_COUNT_END_NULL() // -------------------------------------------------------- diff --git a/src/stats.c b/src/stats.c index 2a793b59..860a69ef 100644 --- a/src/stats.c +++ b/src/stats.c @@ -338,12 +338,11 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_counter_print(&stats->pages_extended, "-extended", out, arg); mi_stat_counter_print(&stats->page_no_retire, "-noretire", out, arg); mi_stat_counter_print(&stats->arena_count, "arenas", out, arg); - mi_stat_counter_print(&stats->arena_crossover_count, "-crossover", out, arg); - mi_stat_counter_print(&stats->arena_rollback_count, "-rollback", out, arg); - mi_stat_counter_print(&stats->mmap_calls, "mmaps", out, arg); - mi_stat_counter_print(&stats->commit_calls, "commits", out, arg); - mi_stat_counter_print(&stats->reset_calls, "resets", out, arg); - mi_stat_counter_print(&stats->purge_calls, "purges", out, arg); + mi_stat_counter_print(&stats->arena_purges, "-purges", out, arg); + mi_stat_counter_print(&stats->mmap_calls, "mmap calls", out, arg); + mi_stat_counter_print(&stats->commit_calls, " -commit", out, arg); + mi_stat_counter_print(&stats->reset_calls, "-reset", out, arg); + mi_stat_counter_print(&stats->purge_calls, "-purge", out, arg); mi_stat_counter_print(&stats->guarded_alloc_count, "guarded", out, arg); mi_stat_print(&stats->threads, "threads", -1, out, arg); mi_stat_counter_print_avg(&stats->searches, "searches", out, arg); diff --git a/test/test-stress.c b/test/test-stress.c index 126a7601..1996e52e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -352,7 +352,7 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,false,false); #else - mi_collect(false); + //mi_collect(true); mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif From b5dfd233e943855a381b7c36750fc20e54e154bc Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 13 Dec 2024 19:59:08 -0800 Subject: [PATCH 088/264] fix avx2 bug with atomics --- CMakeLists.txt | 4 +-- src/bitmap.c | 63 +++++++++++++++++++--------------------------- test/test-stress.c | 2 +- 3 files changed, 29 insertions(+), 40 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index fa35d749..344b72a5 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -117,8 +117,8 @@ if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo") if (NOT MI_OPT_ARCH) message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)") endif() -else() - set(MI_OPT_ARCH OFF) +#else() +# set(MI_OPT_ARCH OFF) endif() if(MI_OVERRIDE) diff --git a/src/bitmap.c b/src/bitmap.c index 
f689ee58..d8e207e3 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -143,20 +143,9 @@ static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_cle return (~old==0); } -// ------- mi_bfield_atomic_try_set/clear --------------------------------------- +// ------- mi_bfield_atomic_try_clear --------------------------------------- -// Tries to set a mask atomically, and returns true if the mask bits atomically transitioned from 0 to mask -// and false otherwise (leaving the bit field as is). -static inline bool mi_bfield_atomic_try_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { - mi_assert_internal(mask != 0); - mi_bfield_t old = mi_atomic_load_relaxed(b); - do { - if ((old&mask) != 0) return false; // the mask bits are no longer 0 - } while (!mi_atomic_cas_weak_acq_rel(b, &old, old|mask)); // try to atomically set the mask bits - return true; -} - // Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0 // and false otherwise (leaving the bit field as is). static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { @@ -242,16 +231,16 @@ static inline bool mi_bchunk_set(mi_bchunk_t* chunk, size_t cidx) { } static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal(cidx < MI_BCHUNK_BITS); const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; const mi_bfield_t mask = mi_bfield_mask(n, idx); return mi_bfield_atomic_set_mask(&chunk->bfields[i], mask, already_set); } static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t cidx, size_t* already_set) { mi_assert_internal(cidx < MI_BCHUNK_BITS); - mi_assert_internal((cidx%MI_BFIELD_BITS)==0); + mi_assert_internal((cidx%MI_BFIELD_BITS)==0); const size_t i = cidx / MI_BFIELD_BITS; return mi_bfield_atomic_setX(&chunk->bfields[i], already_set); } @@ -380,9 +369,9 @@ static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t mi_assert_internal(cidx < MI_BCHUNK_BITS); mi_assert_internal(n <= MI_BFIELD_BITS); const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; + const size_t idx = cidx % MI_BFIELD_BITS; mi_assert_internal(idx + n <= MI_BFIELD_BITS); - const size_t mask = mi_bfield_mask(n, idx); + const size_t mask = mi_bfield_mask(n, idx); return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mask, pmaybe_all_clear); } @@ -493,12 +482,14 @@ static inline bool mi_mm256_is_zero( __m256i vec) { static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_allset) { mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - size_t cidx; + // note: this must be acquire (and not relaxed), or otherwise the AVX code below can loop forever + // as the compiler won't reload the registers vec1 and vec2 from memory again. 
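+  // (the release half of that required release/acquire pairing comes from the
+  //  `mi_atomic_cas_weak_acq_rel` in the atomic try-clear helpers above, so after a
+  //  failed claim the bfields are re-read on the next iteration of the retry loop)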
+ const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]); + size_t idx; if (!allow_allset && (~b == 0)) return false; - if (mi_bfield_find_least_bit(b, &cidx)) { // find the least bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], cidx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit + if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; } @@ -522,6 +513,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx const size_t chunk_idx = _tzcnt_u32(mask) / 8; if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { @@ -555,7 +547,8 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx chunk_idx = mi_ctz(mask) / 8; #endif if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; - // try again + // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else // try first to find a field that is not all set (to reduce fragmentation) @@ -586,7 +579,7 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c size_t idx; if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); - mi_assert_internal((idx%8)==0); + mi_assert_internal((idx%8)==0); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // unset the byte atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); @@ -617,10 +610,10 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s if (mask==0) return false; const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk const size_t chunk_idx = bidx / 8; - const size_t idx = (bidx % 8)*8; + const size_t idx = (bidx % 8)*8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically - *pidx = (chunk_idx*MI_BFIELD_BITS) + 8*byte_idx; + *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; } @@ -665,7 +658,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. const size_t chunk_idx = _tzcnt_u64(mask) / 8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx])) { + if mi_likely(mi_bfield_atomic_try_clearX(&chunk->bfields[chunk_idx],NULL)) { *pidx = chunk_idx*MI_BFIELD_BITS; mi_assert_internal(*pidx + MI_BFIELD_BITS <= MI_BCHUNK_BITS); return true; @@ -804,13 +797,6 @@ static inline void mi_bchunk_clear_once_set(mi_bchunk_t* chunk, size_t cidx) { // ------- mi_bitmap_all_are_clear --------------------------------------- -// are all bits in a bitmap chunk clear? 
(this uses guaranteed atomic reads) -static inline bool mi_bchunk_all_are_clear(mi_bchunk_t* chunk) { - for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; - } - return true; -} // are all bits in a bitmap chunk clear? static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { @@ -823,7 +809,10 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2))); #else - return mi_bchunk_all_are_clear(chunk); + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; + } + return true; #endif } @@ -976,7 +965,7 @@ bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { bool maybe_all_clear; const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; + return wasset; } @@ -1169,7 +1158,7 @@ static bool mi_bitmap_try_find_and_clear_visit(mi_bitmap_t* bitmap, size_t chunk } static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) { - return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); + return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); } mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { diff --git a/test/test-stress.c b/test/test-stress.c index 1996e52e..277f9e6e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -353,7 +353,7 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,false,false); #else //mi_collect(true); - mi_debug_show_arenas(true,false,false); + //mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif #else From 4aeb2e1005c41114844175a27985df483120daff Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 13:21:13 -0800 Subject: [PATCH 089/264] flexible clearN_ that can start at any index --- src/bitmap.c | 95 +++++++++++++++++++++++++++++++++++----------------- 1 file changed, 64 insertions(+), 31 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index d8e207e3..b7b228c1 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -26,6 +26,10 @@ static inline size_t mi_bfield_ctz(mi_bfield_t x) { return mi_ctz(x); } +static inline size_t mi_bfield_clz(mi_bfield_t x) { + return mi_clz(x); +} + static inline size_t mi_bfield_popcount(mi_bfield_t x) { return mi_popcount(x); } @@ -41,6 +45,15 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } + +// find the most significant bit that is set. +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) { + return mi_bsr(x, idx); +} + + // find each set bit in a bit field `x` and clear it, until it becomes zero. 
static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { const bool found = mi_bfield_find_least_bit(*x, idx); @@ -598,9 +611,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { // since a cache-line is 64b, load all at once - const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); - const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); - const __m256i cmpv = mi_mm256_ones(); + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256((const __m256i*)chunk->bfields+1); + const __m256i cmpv = mi_mm256_ones(); const __m256i vcmp1 = _mm256_cmpeq_epi8(vec1, cmpv); // (byte == ~0 ? 0xFF : 0) const __m256i vcmp2 = _mm256_cmpeq_epi8(vec2, cmpv); // (byte == ~0 ? 0xFF : 0) const uint32_t mask1 = _mm256_movemask_epi8(vcmp1); // mask of most significant bit of each byte @@ -610,7 +623,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s if (mask==0) return false; const size_t bidx = _tzcnt_u64(mask); // byte-idx of the byte in the chunk const size_t chunk_idx = bidx / 8; - const size_t idx = (bidx % 8)*8; + const size_t idx = (bidx % 8)*8; mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; @@ -618,6 +631,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s return true; } // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } } #else // first skip allset fields to reduce fragmentation @@ -664,6 +678,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, return true; } // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -684,7 +699,8 @@ static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, } // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, -// and try to clear them atomically. +// and try to clear them atomically. +// Currently does not cross bfield boundaries. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. 
// (We do not cross bfield boundaries) mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { @@ -732,35 +748,51 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - // we align at a bfield, and scan `field_count` fields - // n >= MI_BFIELD_BITS; find a first field that is 0 - const size_t field_count = _mi_divide_up(n, MI_BFIELD_BITS); // we need this many fields - for (size_t i = 0; i <= MI_BCHUNK_FIELDS - field_count; i++) + const size_t skip_count = n/MI_BFIELD_BITS; + size_t cidx; + for (size_t i = 0; i <= MI_BCHUNK_FIELDS - skip_count; i++) { - // first pre-scan for a range of fields that are all set (up to the last one) - bool allset = true; - size_t j = 0; - size_t m = n; - do { - mi_assert_internal(i + j < MI_BCHUNK_FIELDS); - mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); - size_t idx; - if (mi_bfield_find_least_bit(~b,&idx)) { - if (m > idx) { - allset = false; - i += j; // no need to look again at the previous fields - break; - } + size_t j = 1; // field count from i + size_t m = n; // bits to go + + // first field + mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + size_t ones = mi_bfield_clz(~b); + cidx = i*MI_BFIELD_BITS + (MI_BFIELD_BITS - ones); // start index + if (ones >= m) { + // we found enough bits! + m = 0; + } + else { + m -= ones; + mi_assert_internal(m>0); + } + + // keep scanning further fields? + while (i+j < MI_BCHUNK_FIELDS) { + mi_assert_internal(m > 0); + b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); + ones = mi_bfield_ctz(~b); + if (ones >= m) { + // we found enough bits + m = 0; + break; + } + else if (ones == MI_BFIELD_BITS) { + // not enough yet, proceed to the next field + j++; + m -= MI_BFIELD_BITS; } else { - // all bits in b were set - m -= MI_BFIELD_BITS; // note: can underflow + // the range was not enough, start from scratch + i = i + j - 1; // no need to re-scan previous fields, except the last one (with clz this time) + mi_assert_internal(m>0); + break; } - } while (++j < field_count); - - // if all set, we can try to atomically clear them - if (allset) { - const size_t cidx = i*MI_BFIELD_BITS; + } + + // did we find a range? 
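+    // (each further field must contribute set bits starting at its bit 0, so we count its
+    //  trailing ones with `ctz(~b)`; an all-ones field contributes MI_BFIELD_BITS bits and
+    //  the scan simply moves on to the next field)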
+ if (m==0) { if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) { // we cleared all atomically *pidx = cidx; @@ -768,8 +800,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); return true; } + // note: if we fail for a small `n` on the first field, we don't rescan that field (as `i` is incremented) } - // continue + // otherwise continue searching } return false; } From 13ee94cef6900539ad5f4abf322efdfc3650cfc9 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 13:22:00 -0800 Subject: [PATCH 090/264] fix concurrent mi_tld access bug --- src/arena-meta.c | 4 ++-- src/arena.c | 1 + src/init.c | 28 ++++++++++++++++++---------- 3 files changed, 21 insertions(+), 12 deletions(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index bc98d3f9..ceda06ba 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -93,7 +93,7 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate meta-data -void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) +mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) { mi_assert_internal(pmemid != NULL); size = _mi_align_up(size,MI_META_BLOCK_SIZE); @@ -133,7 +133,7 @@ void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) } // free meta-data -void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { +mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { if (p==NULL) return; if (memid.memkind == MI_MEM_META) { mi_assert_internal(_mi_divide_up(size, MI_META_BLOCK_SIZE) == memid.mem.meta.block_count); diff --git a/src/arena.c b/src/arena.c index 7aec429e..d8b882d3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -551,6 +551,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl // any abandoned in our size class? 
mi_subproc_t* const subproc = tld->subproc; + mi_assert_internal(subproc != NULL); if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) return NULL; // search arena's diff --git a/src/init.c b/src/init.c index 8f1449a3..c103f521 100644 --- a/src/init.c +++ b/src/init.c @@ -134,7 +134,8 @@ static mi_decl_cache_align mi_subproc_t mi_subproc_default; static mi_decl_cache_align mi_tld_t tld_main = { 0, - &_mi_heap_main, &_mi_heap_main, + &_mi_heap_main, // heap_backing + &_mi_heap_main, // heaps list &mi_subproc_default, // subproc 0, // tseq MI_MEMID_STATIC, // memid @@ -271,10 +272,23 @@ static mi_tld_t* mi_tld_alloc(void) { } } -mi_tld_t* _mi_tld(void) { +#define MI_TLD_INVALID ((mi_tld_t*)1) + +static mi_decl_noinline void mi_tld_free(void) { + mi_tld_t* tld = _mi_tld(); + mi_tld = MI_TLD_INVALID; + _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); +} + +mi_tld_t* mi_decl_noinline _mi_tld(void) { + if (mi_tld == MI_TLD_INVALID) { + _mi_error_message(EFAULT, "internal error: tld accessed after the thread terminated\n"); + abort(); + mi_tld = NULL; + } if (mi_tld==NULL) { mi_tld = mi_tld_alloc(); - } + } return mi_tld; } @@ -409,9 +423,6 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { #endif } - // free the tld - mi_tld_t* tld = _mi_tld(); - _mi_meta_free(_mi_tld(), sizeof(mi_tld_t), tld->memid); return false; } @@ -497,10 +508,7 @@ void _mi_thread_done(mi_heap_t* heap) _mi_thread_heap_done(heap); // returns true if already ran // free thread local data - if (mi_tld != NULL) { - _mi_meta_free(mi_tld, sizeof(mi_tld_t), mi_tld->memid); - mi_tld = NULL; - } + mi_tld_free(); } void _mi_heap_set_default_direct(mi_heap_t* heap) { From 3153e5a4c5136006a8ea9a8500a578d06f486170 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 13:47:33 -0800 Subject: [PATCH 091/264] small fixes --- src/bitmap.c | 58 +++++++++++++++++++++------------------------------- src/init.c | 5 ++--- 2 files changed, 25 insertions(+), 38 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index b7b228c1..2734e2b2 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -46,14 +46,6 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { } -// find the most significant bit that is set. -// return false if `x==0` (with `*idx` undefined) and true otherwise, -// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). -static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) { - return mi_bsr(x, idx); -} - - // find each set bit in a bit field `x` and clear it, until it becomes zero. static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { const bool found = mi_bfield_find_least_bit(*x, idx); @@ -497,7 +489,7 @@ static inline bool mi_bchunk_try_find_and_clear_at(mi_bchunk_t* chunk, size_t ch mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); // note: this must be acquire (and not relaxed), or otherwise the AVX code below can loop forever // as the compiler won't reload the registers vec1 and vec2 from memory again. 
- const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]); + const mi_bfield_t b = mi_atomic_load_acquire(&chunk->bfields[chunk_idx]); size_t idx; if (!allow_allset && (~b == 0)) return false; if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit @@ -560,7 +552,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx chunk_idx = mi_ctz(mask) / 8; #endif if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; - // try again + // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else @@ -699,42 +691,38 @@ static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, } // find a sequence of `n` bits in a chunk with `n < MI_BFIELD_BITS` with all bits set, -// and try to clear them atomically. +// and try to clear them atomically. // Currently does not cross bfield boundaries. // set `*pidx` to its bit index (0 <= *pidx <= MI_BCHUNK_BITS - n) on success. // (We do not cross bfield boundaries) mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); - for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); - size_t bshift = 0; size_t idx; while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - b >>= idx; - bshift += idx; - if (bshift + n > MI_BFIELD_BITS) break; + if (idx + n > MI_BFIELD_BITS) break; - if ((b&mask) == mask) { // found a match - mi_assert_internal( ((mask << bshift) >> bshift) == mask ); - if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i],mask<>idx == mask); + if ((b&bmask) == bmask) { // found a match + if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { + *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); return true; } else { // if failed to atomically commit, reload b and try again from this position - bshift -= idx; - b = mi_atomic_load_relaxed(&chunk->bfields[i]) >> bshift; + b = mi_atomic_load_acquire(&chunk->bfields[i]); } } else { // advance - const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) + const size_t ones = mi_bfield_ctz(~(b>>idx)); // skip all ones (since it didn't fit the mask) mi_assert_internal(ones>0); - b >>= ones; - bshift += ones; + b = b & ~mi_bfield_mask(ones, idx); // clear the ones } } } @@ -748,11 +736,10 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - const size_t skip_count = n/MI_BFIELD_BITS; + const size_t skip_count = n/MI_BFIELD_BITS; size_t cidx; for (size_t i = 0; i <= MI_BCHUNK_FIELDS - skip_count; i++) { - size_t j = 1; // field count from i size_t m = n; // bits to go // first field @@ -761,7 +748,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, cidx = i*MI_BFIELD_BITS + (MI_BFIELD_BITS - ones); // start index if (ones >= m) { // we found enough bits! - m = 0; + m = 0; } else { m -= ones; @@ -769,6 +756,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, } // keep scanning further fields? 
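/* --------------------------------------------------------------------------------
   Illustrative sketch only (not mi_bchunk_try_find_and_clearNX above, which works
   per bfield inside a chunk): claim `n` consecutive set bits inside a single 64-bit
   word in the same way as the rewritten loop above does: take the lowest set bit,
   test the n-bit mask shifted to that position, CAS-clear it on a match, and
   otherwise skip past the (too short) run of ones.
-------------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

static bool word_try_claim_run(_Atomic(uint64_t)* word, size_t n, size_t* start_idx) {
  if (n == 0 || n > 64) return false;
  const uint64_t mask = (n == 64 ? ~UINT64_C(0) : (UINT64_C(1) << n) - 1);
  uint64_t b = atomic_load_explicit(word, memory_order_acquire);
  while (b != 0) {
    const size_t idx = (size_t)__builtin_ctzll(b);   // lowest set bit
    if (idx + n > 64) return false;                  // a fitting run is no longer possible
    const uint64_t bmask = mask << idx;
    if ((b & bmask) == bmask) {                      // n consecutive 1-bits starting at idx
      if (atomic_compare_exchange_weak_explicit(word, &b, b & ~bmask,
                                                memory_order_acq_rel, memory_order_acquire)) {
        *start_idx = idx;
        return true;
      }
      // CAS failure reloaded `b`; retry from the fresh value
    }
    else {
      // the run of ones starting at idx is too short; skip past it (1 <= ones < 64 here)
      const size_t ones = (size_t)__builtin_ctzll(~(b >> idx));
      b &= ~(((UINT64_C(1) << ones) - 1) << idx);
    }
  }
  return false;
}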
+ size_t j = 1; // field count from i while (i+j < MI_BCHUNK_FIELDS) { mi_assert_internal(m > 0); b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); @@ -790,8 +778,8 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, break; } } - - // did we find a range? + + // did we find a range? if (m==0) { if (mi_bchunk_try_clearN(chunk, cidx, n, NULL)) { // we cleared all atomically @@ -1194,24 +1182,24 @@ static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, siz return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { +bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { mi_assert_internal(n<=MI_BFIELD_BITS); return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); } -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { +bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { mi_assert_internal(n<=MI_BCHUNK_BITS); return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); } diff --git a/src/init.c b/src/init.c index c103f521..9a26d56f 100644 --- a/src/init.c +++ b/src/init.c @@ -274,16 +274,15 @@ static mi_tld_t* mi_tld_alloc(void) { #define MI_TLD_INVALID ((mi_tld_t*)1) -static mi_decl_noinline void mi_tld_free(void) { +mi_decl_noinline static void mi_tld_free(void) { mi_tld_t* tld = _mi_tld(); mi_tld = MI_TLD_INVALID; _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); } -mi_tld_t* mi_decl_noinline _mi_tld(void) { +mi_decl_noinline mi_tld_t* _mi_tld(void) { if (mi_tld == MI_TLD_INVALID) { _mi_error_message(EFAULT, "internal error: tld accessed after the thread terminated\n"); - abort(); mi_tld = NULL; } if (mi_tld==NULL) { From df9009a06051ba763cae6700d49c7e5318934940 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 17:15:56 -0800 Subject: [PATCH 092/264] wip: binned bitmap for the free slices --- src/arena.c | 57 +++++---- src/bitmap.c | 324 ++++++++++++++++++++++++++++++++++++++++++++++----- src/bitmap.h | 95 ++++++++++++++- 3 files changed, 423 insertions(+), 53 deletions(-) diff --git a/src/arena.c b/src/arena.c index d8b882d3..84db2fb0 100644 --- a/src/arena.c +++ b/src/arena.c @@ -44,7 +44,7 @@ typedef struct mi_arena_s { bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(mi_msecs_t) purge_expire; // expiration time 
when slices can be purged from `slices_purge`. - mi_bitmap_t* slices_free; // is the slice free? + mi_bbitmap_t* slices_free; // is the slice free? (a binned bitmap with size classes) mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) mi_bitmap_t* slices_dirty; // is the slice potentially non-zero? mi_bitmap_t* slices_purge; // slices that can be purged @@ -213,7 +213,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) { size_t slice_index; - if (!mi_bitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; + if (!mi_bbitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; // claimed it! void* p = mi_arena_slice_start(arena, slice_index); @@ -267,7 +267,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); } - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (commit) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); } mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); @@ -574,7 +574,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); @@ -775,7 +775,7 @@ void _mi_arena_page_free(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); @@ -812,7 +812,7 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(!mi_page_is_singleton(page)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); @@ -867,7 +867,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { size_t slice_count; mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); 
mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done @@ -935,7 +935,7 @@ static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { } // and make it available to others again - bool all_inuse = mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); + bool all_inuse = mi_bbitmap_setN(arena->slices_free, slice_index, slice_count); if (!all_inuse) { _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", mi_arena_slice_start(arena,slice_index), mi_size_of_slices(slice_count)); return; @@ -1051,8 +1051,8 @@ static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_bas if (slice_count == 0) slice_count = MI_BCHUNK_BITS; mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0); const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BCHUNK_SIZE); - const size_t bitmaps_count = 4 + MI_BIN_COUNT; // free, commit, dirty, purge, and abandonded - const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count,NULL); + const size_t bitmaps_count = 4 + MI_BIN_COUNT; // commit, dirty, purge, and abandonded + const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count, NULL) + mi_bbitmap_size(slice_count, NULL); // + free const size_t size = base_size + bitmaps_size; const size_t os_page_size = _mi_os_page_size(); @@ -1069,6 +1069,12 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { return bitmap; } +static mi_bbitmap_t* mi_arena_bbitmap_init(size_t slice_count, uint8_t** base) { + mi_bbitmap_t* bbitmap = (mi_bbitmap_t*)(*base); + *base = (*base) + mi_bbitmap_init(bbitmap, slice_count, true /* already zero */); + return bbitmap; +} + static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { @@ -1121,7 +1127,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int // init bitmaps uint8_t* base = mi_arena_start(arena) + bitmap_base; - arena->slices_free = mi_arena_bitmap_init(slice_count,&base); + arena->slices_free = mi_arena_bbitmap_init(slice_count,&base); arena->slices_committed = mi_arena_bitmap_init(slice_count,&base); arena->slices_dirty = mi_arena_bitmap_init(slice_count,&base); arena->slices_purge = mi_arena_bitmap_init(slice_count, &base); @@ -1132,7 +1138,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_assert_internal(mi_size_of_slices(info_slices) >= (size_t)(base - mi_arena_start(arena))); // reserve our meta info (and reserve slices outside the memory area) - mi_bitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); + mi_bbitmap_unsafe_setN(arena->slices_free, info_slices /* start */, arena->slice_count - info_slices); if (memid.initially_committed) { mi_bitmap_unsafe_setN(arena->slices_committed, 0, arena->slice_count); } @@ -1225,7 +1231,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t else if (_mi_meta_is_meta_page(start)) { c = 'm'; } else if (slice_index + bit < arena->info_slices) { c = 'i'; } // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } - else if (mi_bitmap_is_set(arena->slices_free, slice_index+bit)) { + else if (mi_bbitmap_is_setN(arena->slices_free, slice_index+bit, 1)) { if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; } 
else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } else { c = '.'; } @@ -1237,14 +1243,14 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t return bit_set_count; } -static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { +static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, bool invert, mi_arena_t* arena) { _mi_output_message("%s:\n", header); size_t bit_count = 0; size_t bit_set_count = 0; - for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { + for (size_t i = 0; i < chunk_count && bit_count < slice_count; i++) { char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); size_t k = 0; - mi_bchunk_t* chunk = &bitmap->chunks[i]; + mi_bchunk_t* chunk = &chunks[i]; if (i<10) { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; } else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } @@ -1276,6 +1282,15 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi return bit_set_count; } +static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { + return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], invert, arena); +} + +static size_t mi_debug_show_bbitmap(const char* header, size_t slice_count, mi_bbitmap_t* bbitmap, bool invert, mi_arena_t* arena) { + return mi_debug_show_chunks(header, slice_count, mi_bbitmap_chunk_count(bbitmap), &bbitmap->chunks[0], invert, arena); +} + + void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { size_t max_arenas = mi_arena_get_count(); size_t free_total = 0; @@ -1288,7 +1303,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) slice_total += arena->slice_count; _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); if (show_inuse) { - free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); + free_total += mi_debug_show_bbitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } if (show_committed) { mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); @@ -1391,7 +1406,7 @@ static long mi_arena_purge_delay(void) { // assumes we own the area (i.e. slices_free is claimed by us) static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(!arena->memid.is_pinned); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); @@ -1417,7 +1432,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ const long delay = mi_arena_purge_delay(); if (delay < 0 || _mi_preloading()) return; // is purging allowed at all? 
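/* --------------------------------------------------------------------------------
   Sketch of the purge hand-shake that mi_arena_try_purge_range performs further
   below, reduced to a single free-mask word with illustrative names: temporarily
   claim the free range (clear it in the free map so no allocation can take it),
   decommit/reset the memory, then mark the range as free again.
-------------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

static void os_purge_stub(void* start, size_t size) {  // stand-in for the actual decommit/reset
  (void)start; (void)size;
}

static bool try_purge_range(_Atomic(uint64_t)* free_mask, uint64_t range_mask,
                            void* start, size_t size) {
  uint64_t expected = atomic_load_explicit(free_mask, memory_order_relaxed);
  do {
    if ((expected & range_mask) != range_mask) return false;  // range is no longer entirely free
  } while (!atomic_compare_exchange_weak_explicit(free_mask, &expected, expected & ~range_mask,
                                                  memory_order_acq_rel, memory_order_acquire));
  os_purge_stub(start, size);                                  // safe: we own the range now
  atomic_fetch_or_explicit(free_mask, range_mask, memory_order_release);  // make it free again
  return true;
}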
- mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (delay == 0) { // purge directly mi_arena_purge(arena, slice_index, slice_count); @@ -1443,11 +1458,11 @@ typedef struct mi_purge_visit_info_s { } mi_purge_visit_info_t; static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size_t slice_count) { - if (mi_bitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { + if (mi_bbitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { // purge mi_arena_purge(arena, slice_index, slice_count); // and reset the free range - mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); + mi_bbitmap_setN(arena->slices_free, slice_index, slice_count); return true; } else { diff --git a/src/bitmap.c b/src/bitmap.c index 2734e2b2..4a0c4a60 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -477,9 +477,9 @@ static inline __m256i mi_mm256_zero(void) { static inline __m256i mi_mm256_ones(void) { return _mm256_set1_epi64x(~0); } -//static inline bool mi_mm256_is_ones(__m256i vec) { -// return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); -//} +static inline bool mi_mm256_is_ones(__m256i vec) { + return _mm256_testc_si256(vec, _mm256_cmpeq_epi32(vec, vec)); +} static inline bool mi_mm256_is_zero( __m256i vec) { return _mm256_testz_si256(vec,vec); } @@ -706,7 +706,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const size_t bmask = mask<>idx == mask); - if ((b&bmask) == bmask) { // found a match + if ((b&bmask) == bmask) { // found a match if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); @@ -837,6 +837,24 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #endif } +// are all bits in a bitmap chunk set? +static inline bool mi_bchunk_all_are_set_relaxed(mi_bchunk_t* chunk) { +#if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return mi_mm256_is_ones(vec); +#elif MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) + // a 64b cache-line contains the entire chunk anyway so load both at once + const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); + return (mi_mm256_is_ones(_mm256_and_si256(vec1, vec2))); +#else + for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { + if (~mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; + } + return true; +#endif +} + static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { @@ -902,6 +920,7 @@ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { return size; } + // initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true // returns the size of the bitmap size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) { @@ -915,38 +934,33 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) return size; } -// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
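/* --------------------------------------------------------------------------------
   Sketch of the non-atomic bulk set that the new mi_bchunks_unsafe_setN helper
   below performs at chunk granularity: set a partial head word, memset the whole
   words in the middle, then set a partial tail word. Assumes the range is still
   local to one thread (e.g. during arena initialization); names are illustrative.
-------------------------------------------------------------------------------- */
#include <stddef.h>
#include <stdint.h>
#include <string.h>

static void words_set_range(uint64_t* words, size_t idx, size_t n) {
  size_t wi = idx / 64;
  const size_t bi = idx % 64;
  // head: the upper part of the first word
  size_t m = 64 - bi; if (m > n) { m = n; }
  words[wi] |= ((m == 64 ? ~UINT64_C(0) : (UINT64_C(1) << m) - 1) << bi);
  wi++; n -= m;
  // middle: whole words at once
  const size_t mid = n / 64;
  if (mid > 0) { memset(&words[wi], 0xFF, mid * sizeof(uint64_t)); wi += mid; n -= mid*64; }
  // tail: the low bits of the last word
  if (n > 0) { words[wi] |= (UINT64_C(1) << n) - 1; }
}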
-void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); - // first chunk +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + const size_t total = n; + + + // start chunk and index size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; + const size_t ccount = _mi_divide_up(n, MI_BCHUNK_BITS); + + // first update the chunkmap + mi_bchunk_setN(cmap, chunk_idx, ccount, NULL); + + // first chunk size_t m = MI_BCHUNK_BITS - cidx; if (m > n) { m = n; } - mi_bchunk_setN(&bitmap->chunks[chunk_idx], cidx, m, NULL); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); + mi_bchunk_setN(&chunks[chunk_idx], cidx, m, NULL); // n can be large so use memset for efficiency for all in-between chunks chunk_idx++; n -= m; const size_t mid_chunks = n / MI_BCHUNK_BITS; if (mid_chunks > 0) { - _mi_memset(&bitmap->chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); - const size_t end_chunk = chunk_idx + mid_chunks; - while (chunk_idx < end_chunk) { - if ((chunk_idx % MI_BFIELD_BITS) == 0 && (chunk_idx + MI_BFIELD_BITS <= end_chunk)) { - // optimize: we can set a full bfield in the chunkmap - mi_atomic_store_relaxed( &bitmap->chunkmap.bfields[chunk_idx/MI_BFIELD_BITS], mi_bfield_all_set()); - mi_bitmap_chunkmap_set(bitmap, chunk_idx + MI_BFIELD_BITS - 1); // track the max set - chunk_idx += MI_BFIELD_BITS; - } - else { - mi_bitmap_chunkmap_set(bitmap, chunk_idx); - chunk_idx++; - } - } + _mi_memset(&chunks[chunk_idx], ~0, mid_chunks * MI_BCHUNK_SIZE); + chunk_idx += mid_chunks; n -= (mid_chunks * MI_BCHUNK_BITS); } @@ -954,12 +968,15 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { if (n > 0) { mi_assert_internal(n < MI_BCHUNK_BITS); mi_assert_internal(chunk_idx < MI_BCHUNK_FIELDS); - mi_bchunk_setN(&bitmap->chunks[chunk_idx], 0, n, NULL); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); + mi_bchunk_setN(&chunks[chunk_idx], 0, n, NULL); } +} - // reset max_accessed - mi_atomic_store_relaxed(&bitmap->chunk_max_accessed, 0); +// Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); + mi_bchunks_unsafe_setN(&bitmap->chunks[0], &bitmap->chunkmap, idx, n); } @@ -1085,7 +1102,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n #define mi_bfield_iterate(bfield,start,cycle,name_idx,SUF) { \ mi_assert_internal(start <= cycle); \ mi_assert_internal(start < MI_BFIELD_BITS); \ - mi_assert_internal(cycle < MI_BFIELD_BITS); \ + mi_assert_internal(cycle <= MI_BFIELD_BITS); \ mi_bfield_t _cycle_mask##SUF = mi_bfield_mask(cycle - start, start); \ size_t _bcount##SUF = mi_bfield_popcount(bfield); \ mi_bfield_t _b##SUF = bfield & _cycle_mask##SUF; /* process [start, cycle> first*/\ @@ -1250,7 +1267,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) 
-mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, +bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) { mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; @@ -1351,3 +1368,248 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi return true; } + + +/* -------------------------------------------------------------------------------- + binned bitmap's +-------------------------------------------------------------------------------- */ + + +size_t mi_bbitmap_size(size_t bit_count, size_t* pchunk_count) { + mi_assert_internal((bit_count % MI_BCHUNK_BITS) == 0); + bit_count = _mi_align_up(bit_count, MI_BCHUNK_BITS); + mi_assert_internal(bit_count <= MI_BITMAP_MAX_BIT_COUNT); + mi_assert_internal(bit_count > 0); + const size_t chunk_count = bit_count / MI_BCHUNK_BITS; + mi_assert_internal(chunk_count >= 1); + const size_t size = offsetof(mi_bbitmap_t,chunks) + (chunk_count * MI_BCHUNK_SIZE); + mi_assert_internal( (size%MI_BCHUNK_SIZE) == 0 ); + if (pchunk_count != NULL) { *pchunk_count = chunk_count; } + return size; +} + +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap +size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero) { + size_t chunk_count; + const size_t size = mi_bbitmap_size(bit_count, &chunk_count); + if (!already_zero) { + _mi_memzero_aligned(bbitmap, size); + } + mi_atomic_store_release(&bbitmap->chunk_count, chunk_count); + mi_assert_internal(mi_atomic_load_relaxed(&bbitmap->chunk_count) <= MI_BITMAP_MAX_CHUNK_COUNT); + return size; +} + +void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + mi_bchunks_unsafe_setN(&bbitmap->chunks[0], &bbitmap->chunkmap, idx, n); +} + + + +/* -------------------------------------------------------------------------------- + binned bitmap chunkmap +-------------------------------------------------------------------------------- */ + +static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) { + size_t oldmax = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); + if mi_unlikely(chunk_idx > oldmax) { + mi_atomic_cas_strong_relaxed(&bbitmap->chunk_max_accessed, &oldmax, chunk_idx); + } +} + +static void mi_bbitmap_chunkmap_set(mi_bbitmap_t* bbitmap, size_t chunk_idx, bool check_all_set) { + mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (check_all_set) { + if (mi_bchunk_all_are_set_relaxed(&bbitmap->chunks[chunk_idx])) { + // all slices are free in this chunk: return back to the NONE bin + mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], MI_BBIN_NONE); + } + } + mi_bchunk_set(&bbitmap->chunkmap, chunk_idx); + mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); +} + +static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_idx) { + mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + // check if the corresponding chunk is all clear + if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) return false; + // clear the chunkmap bit + mi_bchunk_clear(&bbitmap->chunkmap, chunk_idx, NULL); + // .. but a concurrent set may have happened in between our all-clear test and the clearing of the + // bit in the mask. We check again to catch this situation. 
+ if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) { + mi_bchunk_set(&bbitmap->chunkmap, chunk_idx); + return false; + } + mi_bbitmap_chunkmap_set_max(bbitmap, chunk_idx); + return true; +} + +// Assign from the NONE bin to a specific size bin +static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) { + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin); +} + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_setN, try_clearN, and is_xsetN + (used to find free pages) +-------------------------------------------------------------------------------- */ + +// Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia + + const bool were_allclear = mi_bchunk_setN(&bbitmap->chunks[chunk_idx], cidx, n, NULL); + mi_bbitmap_chunkmap_set(bbitmap, chunk_idx, true); // set after + return were_allclear; +} + + +// ------- mi_bbitmap_try_clearN --------------------------------------- + +bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) return false; + bool maybe_all_clear; + const bool cleared = mi_bchunk_try_clearN(&bbitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); + if (cleared && maybe_all_clear) { mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } + // note: we don't set the size class for an explicit try_clearN (only used by purging) + return cleared; +} + + +// ------- mi_bbitmap_is_xset --------------------------------------- + +// Is a sequence of n bits already all set/cleared? 
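/* --------------------------------------------------------------------------------
   Sketch of the conservative chunkmap protocol used by mi_bbitmap_chunkmap_set and
   mi_bbitmap_chunkmap_try_clear above, simplified to one summary word over an array
   of detail words (illustrative types, not the mimalloc ones). A summary bit may be
   1 while its detail word is already empty, but never the reverse for long: after
   clearing the summary bit we re-check the detail word and restore the bit if a
   concurrent set slipped in between.
-------------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct two_level_s {
  _Atomic(uint64_t) summary;             // bit i set => detail[i] may contain set bits
  _Atomic(uint64_t) detail[64];
} two_level_t;

static void tl_set(two_level_t* tl, size_t chunk, uint64_t bits) {
  atomic_fetch_or_explicit(&tl->detail[chunk], bits, memory_order_release);
  atomic_fetch_or_explicit(&tl->summary, UINT64_C(1) << chunk, memory_order_release); // set afterwards
}

static bool tl_summary_try_clear(two_level_t* tl, size_t chunk) {
  if (atomic_load_explicit(&tl->detail[chunk], memory_order_relaxed) != 0) return false;
  atomic_fetch_and_explicit(&tl->summary, ~(UINT64_C(1) << chunk), memory_order_release);
  // a concurrent tl_set may have set detail bits just before we cleared the summary
  // bit; re-check and restore so the summary stays a conservative over-approximation
  if (atomic_load_explicit(&tl->detail[chunk], memory_order_relaxed) != 0) {
    atomic_fetch_or_explicit(&tl->summary, UINT64_C(1) << chunk, memory_order_release);
    return false;
  }
  return true;
}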
+bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BCHUNK_BITS); + mi_assert_internal(idx + n <= mi_bbitmap_max_bits(bbitmap)); + + const size_t chunk_idx = idx / MI_BCHUNK_BITS; + const size_t cidx = idx % MI_BCHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + if (cidx + n > MI_BCHUNK_BITS) { n = MI_BCHUNK_BITS - cidx; } // paranoia + + return mi_bchunk_is_xsetN(set, &bbitmap->chunks[chunk_idx], cidx, n); +} + + + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_find + (used to find free pages) +-------------------------------------------------------------------------------- */ + +typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); + +// Go through the bbitmap and for every sequence of `n` set bits, call the visitor function. +// If it returns `true` stop the search. +static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* on_find) +{ + // we space out threads to reduce contention + const size_t cmap_max_count = _mi_divide_up(mi_bbitmap_chunk_count(bbitmap),MI_BFIELD_BITS); + const size_t chunk_acc = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); + const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS; + const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS); + + // create a mask over the chunkmap entries to iterate over them efficiently + mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); + const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); + const size_t cmap_cycle = cmap_acc+1; + const mi_bbin_t bbin = mi_bbin_of(n); + // visit bins from largest size bin up to the NONE bin + // for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL + const mi_bbin_t bin = bbin; + { + mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) + { + // don't search into non-accessed memory until we tried other size bins as well + //if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { + // break; + //} + + // and for each chunkmap entry we iterate over its bits to find the chunks + const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); + const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits); + mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) + { + mi_assert_internal(eidx <= MI_BFIELD_BITS); + const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + // only in the current size class! 
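/* --------------------------------------------------------------------------------
   Sketch of the per-chunk size-class check applied at this point of the search
   (illustrative names and a simplified matching rule): every chunk carries a bin
   byte; a request only searches chunks whose bin matches its own bin or is still
   unassigned, and the first allocation out of an unassigned chunk assigns the bin.
   mimalloc keys the assignment off claiming the chunk's first slice; this sketch
   uses a CAS instead so it stays race-free on its own.
-------------------------------------------------------------------------------- */
#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef enum { BIN_NONE, BIN_SMALL, BIN_MEDIUM, BIN_OTHER } bin_t;

static bin_t bin_of(size_t n) {
  return (n == 1 ? BIN_SMALL : (n == 8 ? BIN_MEDIUM : BIN_OTHER));
}

static bool chunk_bin_allows(_Atomic(uint8_t)* chunk_bin, size_t n) {
  const bin_t want = bin_of(n);
  const bin_t have = (bin_t)atomic_load_explicit(chunk_bin, memory_order_acquire);
  return (have == want || have == BIN_NONE);
}

static void chunk_bin_claim(_Atomic(uint8_t)* chunk_bin, size_t n) {
  uint8_t expected = (uint8_t)BIN_NONE;  // only a still-unassigned chunk can be claimed
  atomic_compare_exchange_strong_explicit(chunk_bin, &expected, (uint8_t)bin_of(n),
                                          memory_order_release, memory_order_relaxed);
}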
+ const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_acquire(&bbitmap->chunk_bins[chunk_idx]); + if (bin >= chunk_bin) { // || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { + mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; + size_t cidx; + if ((*on_find)(chunk, n, &cidx)) { + if (cidx==0 && chunk_bin == MI_BBIN_NONE) { // only the first determines the size bin + // this chunk is now reserved for the `bbin` size class + mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin); + } + *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; + mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap)); + return true; + } + else { + /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ + mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); + } + } + } + mi_bfield_cycle_iterate_end(Y); + } + mi_bfield_cycle_iterate_end(X); + } + return false; +} + + +/* -------------------------------------------------------------------------------- + mi_bbitmap_try_find_and_clear -- used to find free pages + note: the compiler will fully inline the indirect function calls +-------------------------------------------------------------------------------- */ + +bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); +} + +bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); +} + +bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx) { + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); +} + +bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BFIELD_BITS); + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); +} + +bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx) { + mi_assert_internal(n<=MI_BCHUNK_BITS); + return mi_bbitmap_try_find_and_clear_generic(bbitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); +} diff --git a/src/bitmap.h b/src/bitmap.h index 4afcdaf1..b28a09e4 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -36,7 +36,7 @@ Concurrent bitmap that can set/reset sequences of bits atomically This is used to avoid scanning every chunk. (and thus strictly an optimization) It is conservative: it is fine to set a bit in the chunk map even if the chunk turns out to have no bits set. It is also allowed to briefly have a clear bit even if the - chunk has bits set -- as long as we guarantee that the bit will be set later on; + chunk has bits set -- as long as we guarantee that the bit will be set later on; (this allows us to set the chunkmap bit right after we set a bit in the corresponding chunk). 
However, when we clear a bit in a chunk, and the chunk is indeed all clear, we @@ -236,4 +236,97 @@ bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_a // Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); +// +typedef enum mi_bbin_e { + MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free) + MI_BBIN_SMALL, // slice_count == 1 + MI_BBIN_MEDIUM, // slice_count == 8 + MI_BBIN_OTHER, // slice_count > 1, and not 8 + MI_BBIN_COUNT +} mi_bbin_t; + +static inline mi_bbin_t mi_bbin_of(size_t n) { + return (n==1 ? MI_BBIN_SMALL : (n==8 ? MI_BBIN_MEDIUM : MI_BBIN_OTHER)); +} + +// An atomic "binned" bitmap for the free slices where we keep chunks reserved for particalar size classes +typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s { + _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) + _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc + mi_bchunkmap_t chunkmap; + _Atomic(uint8_t) chunk_bins[MI_BITMAP_MAX_CHUNK_COUNT]; // 512b + mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT +} mi_bbitmap_t; + + +static inline size_t mi_bbitmap_chunk_count(const mi_bbitmap_t* bbitmap) { + return mi_atomic_load_relaxed(&((mi_bbitmap_t*)bbitmap)->chunk_count); +} + +static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) { + return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS); +} + +size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count); + + +// Initialize a bitmap to all clear; avoid a mem_zero if `already_zero` is true +// returns the size of the bitmap. +size_t mi_bbitmap_init(mi_bbitmap_t* bbitmap, size_t bit_count, bool already_zero); + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). +// Not atomic so only use if still local to a thread. +void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + +// Set a sequence of `n` bits in the bbitmap; returns `true` if atomically transitioned from all 0's to 1's +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + +// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's +// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! +bool mi_bbitmap_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + +// Is a sequence of n bits already all set/cleared? +bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n); + +// Is a sequence of n bits already set? +// (Used to check if a memory range is already committed) +static inline bool mi_bbitmap_is_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + return mi_bbitmap_is_xsetN(MI_BIT_SET, bbitmap, idx, n); +} + +// Is a sequence of n bits already clear? +static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { + return mi_bbitmap_is_xsetN(MI_BIT_CLEAR, bbitmap, idx, n); +} + + +// Try to atomically transition `n` bits from all set to all clear. Returns `true` on succes. +// `n` cannot cross chunk boundaries, where `n <= MI_CHUNK_BITS`. 
+bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); + + + +// Specialized versions for common bit sequence sizes +bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 1-bit +bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits +bool mi_bbitmap_try_find_and_clearX(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +bool mi_bbitmap_try_find_and_clearNX(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS +bool mi_bbitmap_try_find_and_clearN_(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS + +// Find a sequence of `n` bits in the bbitmap with all bits set, and try to atomically clear all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard static inline bool mi_bbitmap_try_find_and_clearN(mi_bbitmap_t* bbitmap, size_t n, size_t tseq, size_t* pidx) { + if (n==1) return mi_bbitmap_try_find_and_clear(bbitmap, tseq, pidx); // small pages + if (n==8) return mi_bbitmap_try_find_and_clear8(bbitmap, tseq, pidx); // medium pages + if (n==MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearX(bbitmap, tseq, pidx); // large pages + if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n < MI_BFIELD_BITS) return mi_bbitmap_try_find_and_clearNX(bbitmap, tseq, n, pidx); + return mi_bbitmap_try_find_and_clearN_(bbitmap, tseq, n, pidx); +} + + #endif // MI_BITMAP_H From e24217e69cb37d3b2087b7489ca4b2c6bc40f7d7 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 18:35:12 -0800 Subject: [PATCH 093/264] more bbin size classes, bug fixes --- src/arena-meta.c | 12 ++--- src/arena.c | 27 ++++++++-- src/bitmap.c | 119 +++++++-------------------------------------- src/bitmap.h | 48 +++++------------- test/test-stress.c | 2 +- 5 files changed, 59 insertions(+), 149 deletions(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index ceda06ba..86a89755 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -33,7 +33,7 @@ terms of the MIT license. A copy of the license can be found in the file typedef struct mi_meta_page_s { _Atomic(struct mi_meta_page_s*) next; // a linked list of meta-data pages (never released) mi_memid_t memid; // provenance of the meta-page memory itself - mi_bitmap_t blocks_free; // a small bitmap with 1 bit per block. + mi_bbitmap_t blocks_free; // a small bitmap with 1 bit per block. 
} mi_meta_page_t; static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_VAR_INIT(NULL); @@ -76,11 +76,11 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // initialize the page mpage->memid = memid; - mi_bitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); + mi_bbitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); mi_assert_internal(info_blocks < MI_META_BLOCKS_PER_PAGE); - mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); + mi_bbitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); // push atomically in front of the meta page list // (note: there is no ABA issue since we never free meta-pages) @@ -104,7 +104,7 @@ mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) mi_meta_page_t* mpage = mpage0; while (mpage != NULL) { size_t block_idx; - if (mi_bitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { // found and claimed `block_count` blocks *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); return mi_meta_block_start(mpage,block_idx); @@ -122,7 +122,7 @@ mi_decl_noinline void* _mi_meta_zalloc( size_t size, mi_memid_t* pmemid ) mpage = mi_meta_page_zalloc(); if (mpage != NULL) { size_t block_idx; - if (mi_bitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { + if (mi_bbitmap_try_find_and_clearN(&mpage->blocks_free, block_count, 0, &block_idx)) { // found and claimed `block_count` blocks *pmemid = _mi_memid_create_meta(mpage, block_idx, block_count); return mi_meta_block_start(mpage,block_idx); @@ -145,7 +145,7 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_bitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); // we zero on free (and on the initial page allocation) so we don't need a "dirty" map _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); - mi_bitmap_setN(&mpage->blocks_free, block_idx, block_count,NULL); + mi_bbitmap_setN(&mpage->blocks_free, block_idx, block_count); } else if (mi_memid_is_os(memid)) { _mi_os_free(p, size, memid); diff --git a/src/arena.c b/src/arena.c index 84db2fb0..1547c9b2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1243,7 +1243,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t return bit_set_count; } -static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, bool invert, mi_arena_t* arena) { +static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena) { _mi_output_message("%s:\n", header); size_t bit_count = 0; size_t bit_set_count = 0; @@ -1256,9 +1256,22 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } + char chunk_kind = ' '; + if (chunk_bins != NULL) { + 
switch (chunk_bins[i]) { + // case MI_BBIN_SMALL: chunk_kind = 'S'; break; + case MI_BBIN_MEDIUM: chunk_kind = 'M'; break; + case MI_BBIN_LARGE: chunk_kind = 'L'; break; + case MI_BBIN_OTHER: chunk_kind = 'O'; break; + // case MI_BBIN_NONE: chunk_kind = 'N'; break; + } + } + buf[k++] = chunk_kind; + buf[k++] = ' '; + for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % 4) == 0) { - buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5; + buf[k++] = '\n'; _mi_memset(buf+k,' ',7); k += 7; } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; @@ -1283,11 +1296,15 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ } static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { - return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], invert, arena); + return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], NULL, invert, arena); +} + +static size_t mi_debug_show_bitmap_binned(const char* header, size_t slice_count, mi_bitmap_t* bitmap, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena) { + return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena); } static size_t mi_debug_show_bbitmap(const char* header, size_t slice_count, mi_bbitmap_t* bbitmap, bool invert, mi_arena_t* arena) { - return mi_debug_show_chunks(header, slice_count, mi_bbitmap_chunk_count(bbitmap), &bbitmap->chunks[0], invert, arena); + return mi_debug_show_chunks(header, slice_count, mi_bbitmap_chunk_count(bbitmap), &bbitmap->chunks[0], &bbitmap->chunk_bins[0], invert, arena); } @@ -1313,7 +1330,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); + page_total += mi_debug_show_bitmap_binned("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena); } } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); diff --git a/src/bitmap.c b/src/bitmap.c index 4a0c4a60..a847740b 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -938,9 +938,7 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) // Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) { mi_assert_internal(n>0); - const size_t total = n; - - + // start chunk and index size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; @@ -984,29 +982,6 @@ void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n) { // ------- mi_bitmap_xset --------------------------------------- -// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) -bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); - mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards - return wasclear; -} - -bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { - mi_assert_internal(idx < mi_bitmap_max_bits(bitmap)); - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - bool maybe_all_clear; - const bool wasset = mi_bchunk_clear(&bitmap->chunks[chunk_idx], cidx, &maybe_all_clear); - if (maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return wasset; -} - - // Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bitmap_setN(mi_bitmap_t* bitmap, size_t idx, size_t n, size_t* already_set) { @@ -1043,24 +1018,17 @@ bool mi_bitmap_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { } -// ------- mi_bitmap_try_clearN --------------------------------------- - -bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n) { - mi_assert_internal(n>0); - mi_assert_internal(n<=MI_BCHUNK_BITS); - mi_assert_internal(idx + n <= mi_bitmap_max_bits(bitmap)); - - const size_t chunk_idx = idx / MI_BCHUNK_BITS; - const size_t cidx = idx % MI_BCHUNK_BITS; - mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); // don't cross chunks (for now) - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if (cidx + n > MI_BCHUNK_BITS) return false; - bool maybe_all_clear; - const bool cleared = mi_bchunk_try_clearN(&bitmap->chunks[chunk_idx], cidx, n, &maybe_all_clear); - if (cleared && maybe_all_clear) { mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); } - return cleared; +// Set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0) +bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_setN(bitmap, idx, 1, NULL); } +bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx) { + return mi_bitmap_clearN(bitmap, idx, 1); +} + + + // ------- mi_bitmap_is_xset --------------------------------------- // Is a sequence of n bits already all set/cleared? 
@@ -1170,58 +1138,6 @@ static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, si } -/* -------------------------------------------------------------------------------- - mi_bitmap_try_find_and_clear -- used to find free pages - note: the compiler will fully inline the indirect function calls --------------------------------------------------------------------------------- */ - - -typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, size_t* idx); - -static bool mi_bitmap_try_find_and_clear_visit(mi_bitmap_t* bitmap, size_t chunk_idx, size_t n, size_t* pidx, void* arg1, void* arg2) { - MI_UNUSED(arg2); - mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear = (mi_bchunk_try_find_and_clear_fun_t*)arg1; - size_t cidx; - // if we find a spot in the chunk we are done - if ((*try_find_and_clear)(&bitmap->chunks[chunk_idx], n, &cidx)) { - *pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx; - mi_assert_internal(*pidx + n <= mi_bitmap_max_bits(bitmap)); - return true; - } - else { - /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ - mi_bitmap_chunkmap_try_clear(bitmap, chunk_idx); - return false; - } -} - -static inline bool mi_bitmap_try_find_and_clear_generic(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* try_find_and_clear) { - return mi_bitmap_find(bitmap, tseq, n, pidx, &mi_bitmap_try_find_and_clear_visit, (void*)try_find_and_clear, NULL); -} - -bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 1, pidx, &mi_bchunk_try_find_and_clear_1); -} - -bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); -} - -bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); -} - -bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { - mi_assert_internal(n<=MI_BFIELD_BITS); - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearNX); -} - -bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { - mi_assert_internal(n<=MI_BCHUNK_BITS); - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, n, pidx, &mi_bchunk_try_find_and_clearN_); -} - - /* -------------------------------------------------------------------------------- Bitmap: try_find_and_claim -- used to allocate abandoned pages note: the compiler will fully inline the indirect function call @@ -1267,7 +1183,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) 
-bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, +mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) { mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; @@ -1541,15 +1457,15 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const size_t cmap_cycle = cmap_acc+1; const mi_bbin_t bbin = mi_bbin_of(n); // visit bins from largest size bin up to the NONE bin - // for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL - const mi_bbin_t bin = bbin; + for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL + // const mi_bbin_t bin = bbin; { mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { // don't search into non-accessed memory until we tried other size bins as well - //if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { - // break; - //} + if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { + break; + } // and for each chunkmap entry we iterate over its bits to find the chunks const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]); @@ -1561,7 +1477,8 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); // only in the current size class! const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_acquire(&bbitmap->chunk_bins[chunk_idx]); - if (bin >= chunk_bin) { // || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { + if // (bin >= chunk_bin) { + (bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; size_t cidx; if ((*on_find)(chunk, n, &cidx)) { diff --git a/src/bitmap.h b/src/bitmap.h index b28a09e4..f221f2cd 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -136,13 +136,13 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero); // Not atomic so only use if still local to a thread. void mi_bitmap_unsafe_setN(mi_bitmap_t* bitmap, size_t idx, size_t n); + // Set a bit in the bitmap; returns `true` if it atomically transitioned from 0 to 1 bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx); // Clear a bit in the bitmap; returns `true` if it atomically transitioned from 1 to 0 bool mi_bitmap_clear(mi_bitmap_t* bitmap, size_t idx); - // Set a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! // If `already_set` is not NULL, it is set to count of bits were already all set. @@ -177,36 +177,6 @@ static inline bool mi_bitmap_is_clear(mi_bitmap_t* bitmap, size_t idx) { } -// Try to atomically transition `n` bits from all set to all clear. Returns `true` on succes. -// `n` cannot cross chunk boundaries, where `n <= MI_CHUNK_BITS`. -bool mi_bitmap_try_clearN(mi_bitmap_t* bitmap, size_t idx, size_t n); - -// Try to atomically transition a bit from set to clear. Returns `true` on succes. 
-static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { - return mi_bitmap_try_clearN(bitmap, idx, 1); -} - - - -// Specialized versions for common bit sequence sizes -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit -mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS - -// Find a sequence of `n` bits in the bitmap with all bits set, and try to atomically clear all. -// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. -mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { - if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages - if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages - if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); - return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); -} - - // Called once a bit is cleared to see if the memory slice can be claimed. typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_set); @@ -225,6 +195,7 @@ void mi_bitmap_clear_once_set(mi_bitmap_t* bitmap, size_t idx); // If a bit is set in the bitmap, return `true` and set `idx` to the index of the highest bit. // Otherwise return `false` (and `*idx` is undefined). +// Used for unloading arena's bool mi_bitmap_bsr(mi_bitmap_t* bitmap, size_t* idx); @@ -236,17 +207,24 @@ bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_a // Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); -// + +/* ---------------------------------------------------------------------------- + Binned concurrent bitmap + Assigns a size class to each chunk such that small blocks don't cause too + much fragmentation by keeping chunks for larger blocks separate. +---------------------------------------------------------------------------- */ + typedef enum mi_bbin_e { MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free) MI_BBIN_SMALL, // slice_count == 1 MI_BBIN_MEDIUM, // slice_count == 8 - MI_BBIN_OTHER, // slice_count > 1, and not 8 + MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS + MI_BBIN_OTHER, // slice_count > 1, and not 8 or MI_BFIELD_BITS MI_BBIN_COUNT } mi_bbin_t; static inline mi_bbin_t mi_bbin_of(size_t n) { - return (n==1 ? MI_BBIN_SMALL : (n==8 ? MI_BBIN_MEDIUM : MI_BBIN_OTHER)); + return (n==1 ? MI_BBIN_SMALL : (n==8 ? MI_BBIN_MEDIUM : (n==64 ? 
MI_BBIN_LARGE : MI_BBIN_OTHER))); } // An atomic "binned" bitmap for the free slices where we keep chunks reserved for particalar size classes @@ -308,8 +286,6 @@ static inline bool mi_bbitmap_is_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_ // `n` cannot cross chunk boundaries, where `n <= MI_CHUNK_BITS`. bool mi_bbitmap_try_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); - - // Specialized versions for common bit sequence sizes bool mi_bbitmap_try_find_and_clear(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 1-bit bool mi_bbitmap_try_find_and_clear8(mi_bbitmap_t* bbitmap, size_t tseq, size_t* pidx); // 8-bits diff --git a/test/test-stress.c b/test/test-stress.c index 277f9e6e..1996e52e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -353,7 +353,7 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,false,false); #else //mi_collect(true); - //mi_debug_show_arenas(true,false,false); + mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif #else From 3330d4353aeb5e5785d3a16a3b70d738a7c0c696 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 19:15:00 -0800 Subject: [PATCH 094/264] remove maxaccessed from general bitmaps --- src/arena-meta.c | 4 ++-- src/arena.c | 2 +- src/bitmap.c | 60 +++++++++++++++++++--------------------------- src/bitmap.h | 3 +-- src/heap.c | 4 +++- src/page-map.c | 2 +- src/page.c | 14 +++++++---- test/test-stress.c | 4 ++-- 8 files changed, 44 insertions(+), 49 deletions(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index 86a89755..49195e22 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -77,7 +77,7 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // initialize the page mpage->memid = memid; mi_bbitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); - const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); + const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bbitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); mi_assert_internal(info_blocks < MI_META_BLOCKS_PER_PAGE); mi_bbitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); @@ -142,7 +142,7 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page; mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage); mi_assert_internal(block_idx + block_count < MI_META_BLOCKS_PER_PAGE); - mi_assert_internal(mi_bitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); + mi_assert_internal(mi_bbitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); // we zero on free (and on the initial page allocation) so we don't need a "dirty" map _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); mi_bbitmap_setN(&mpage->blocks_free, block_idx, block_count); diff --git a/src/arena.c b/src/arena.c index 1547c9b2..9bc12272 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1258,7 +1258,7 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ char chunk_kind = ' '; if (chunk_bins != NULL) { - switch (chunk_bins[i]) { + switch (mi_atomic_load_relaxed(&chunk_bins[i])) { // case MI_BBIN_SMALL: chunk_kind = 'S'; break; case MI_BBIN_MEDIUM: chunk_kind = 'M'; break; case MI_BBIN_LARGE: chunk_kind = 'L'; break; diff --git a/src/bitmap.c b/src/bitmap.c index a847740b..ccc17514 100644 --- a/src/bitmap.c +++ 
b/src/bitmap.c @@ -45,6 +45,14 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { return mi_bsf(x,idx); } +// find the most significant bit that is set. +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) { + return mi_bsf(x, idx); +} + + // find each set bit in a bit field `x` and clear it, until it becomes zero. static inline bool mi_bfield_foreach_bit(mi_bfield_t* x, size_t* idx) { @@ -873,17 +881,9 @@ static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { bitmap chunkmap -------------------------------------------------------------------------------- */ -static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) { - size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); - if mi_unlikely(chunk_idx > oldmax) { - mi_atomic_cas_strong_relaxed(&bitmap->chunk_max_accessed, &oldmax, chunk_idx); - } -} - static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - mi_bchunk_set(&bitmap->chunkmap, chunk_idx); - mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx); } static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { @@ -898,13 +898,12 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) mi_bchunk_set(&bitmap->chunkmap, chunk_idx); return false; } - mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); return true; } /* -------------------------------------------------------------------------------- - bitmap + bitmap -------------------------------------------------------------------------------- */ size_t mi_bitmap_size(size_t bit_count, size_t* pchunk_count) { @@ -1107,33 +1106,24 @@ typedef bool (mi_bitmap_visit_fun_t)(mi_bitmap_t* bitmap, size_t chunk_idx, size // If it returns `true` stop the search. static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx, mi_bitmap_visit_fun_t* on_find, void* arg1, void* arg2) { - // we space out threads to reduce contention - const size_t cmap_max_count = _mi_divide_up(mi_bitmap_chunk_count(bitmap),MI_BFIELD_BITS); - const size_t chunk_acc = mi_atomic_load_relaxed(&bitmap->chunk_max_accessed); - const size_t cmap_acc = chunk_acc / MI_BFIELD_BITS; - const size_t cmap_acc_bits = 1 + (chunk_acc % MI_BFIELD_BITS); - - // create a mask over the chunkmap entries to iterate over them efficiently - mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); - const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); - const size_t cmap_cycle = cmap_acc+1; - mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) - { + const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); + for (size_t i = 0; i < chunkmap_max; i++) { // and for each chunkmap entry we iterate over its bits to find the chunks - mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[cmap_idx]); - size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? 
MI_BFIELD_BITS : cmap_acc_bits); - mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) - { - mi_assert_internal(eidx <= MI_BFIELD_BITS); - const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; - mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) { - return true; + const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[i]); + size_t hi; + if (mi_bfield_find_highest_bit(cmap_entry, &hi)) { + mi_bfield_cycle_iterate(cmap_entry, tseq%8, hi+1, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) + { + mi_assert_internal(eidx <= MI_BFIELD_BITS); + const size_t chunk_idx = i*MI_BFIELD_BITS + eidx; + mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); + if ((*on_find)(bitmap, chunk_idx, n, pidx, arg1, arg2)) { + return true; + } } + mi_bfield_cycle_iterate_end(Y); } - mi_bfield_cycle_iterate_end(Y); } - mi_bfield_cycle_iterate_end(X); return false; } @@ -1478,7 +1468,7 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, // only in the current size class! const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_acquire(&bbitmap->chunk_bins[chunk_idx]); if // (bin >= chunk_bin) { - (bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { + ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; size_t cidx; if ((*on_find)(chunk, n, &cidx)) { diff --git a/src/bitmap.h b/src/bitmap.h index f221f2cd..62c42129 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -98,8 +98,7 @@ typedef mi_bchunk_t mi_bchunkmap_t; // An atomic bitmap typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bitmap_s { _Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS) - _Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set - size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc + size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 1]; // suppress warning on msvc mi_bchunkmap_t chunkmap; mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT } mi_bitmap_t; diff --git a/src/heap.c b/src/heap.c index dee404d2..1c2b017b 100644 --- a/src/heap.c +++ b/src/heap.c @@ -138,7 +138,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect retired pages _mi_heap_collect_retired(heap, force); - + + // if (_mi_is_main_thread()) { mi_debug_show_arenas(true, false, false); } + // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); diff --git a/src/page-map.c b/src/page-map.c index 7b74c711..64f4bbbb 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -17,7 +17,7 @@ static mi_memid_t mi_page_map_memid; // (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) -static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), +static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; bool _mi_page_map_init(void) { diff --git a/src/page.c b/src/page.c index a30db6c9..b3fdb78f 100644 --- a/src/page.c +++ b/src/page.c @@ -82,7 +82,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { 
mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - + // const size_t bsize = mi_page_block_size(page); // uint8_t* start = mi_page_start(page); //mi_assert_internal(start + page->capacity*page->block_size == page->top); @@ -623,7 +623,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); - + // initialize an initial free list mi_page_extend_free(heap,page); mi_assert(mi_page_immediate_available(page)); @@ -872,10 +872,14 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_heap_is_initialized(heap)); // call potential deferred free routines - // _mi_deferred_free(heap, false); + _mi_deferred_free(heap, false); - // free delayed frees from other threads (but skip contended ones) - // _mi_heap_delayed_free_partial(heap); + // collect every N generic mallocs + /*static long count = 0; + if (count++ > 100000) { + count = 0; + _mi_heap_collect_retired(heap,false); + }*/ // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); diff --git a/test/test-stress.c b/test/test-stress.c index 1996e52e..ae1e83b6 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -262,7 +262,7 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); - //mi_debug_show_arenas(true, false, false); + mi_debug_show_arenas(true, false, false); //mi_collect(true); //mi_debug_show_arenas(true, false, false); } @@ -352,7 +352,7 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,false,false); #else - //mi_collect(true); + mi_collect(false); mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif From d2f670e6e50d8bba25b8471e0793da070c4251d2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 15 Dec 2024 19:54:01 -0800 Subject: [PATCH 095/264] add delay to purg'ing; call collect_retired every N generic allocs --- include/mimalloc/types.h | 1 + src/arena.c | 33 ++++++++++++++------------------- src/init.c | 2 ++ src/page.c | 15 +++++++-------- 4 files changed, 24 insertions(+), 27 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index bf91a58a..057195a1 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -409,6 +409,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread mi_memid_t memid; // provenance of the heap struct itseft (meta or os) + long generic_count; long full_page_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint diff --git a/src/arena.c b/src/arena.c index 9bc12272..8feb165b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -43,6 +43,7 @@ typedef struct mi_arena_s { bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. 
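/* Illustrative sketch, not the patch's code: the expiration policy behind the extra
   `purge_expire_delay` (later `purge_expire_extend`) field added below. The first
   scheduled purge sets a base deadline of `now + delay`; subsequent frees extend it
   by `delay/10` each, capped at `10*delay` of total extension, and the purge runs
   once `now >= base + extension`. Plain non-atomic fields and invented names
   (`purge_clock_t`, `schedule_purge`, `purge_is_due`) are used just to show the
   policy. */
#include <stdbool.h>

typedef long long msecs_t;

typedef struct purge_clock_s {
  msecs_t expire;          // base expiration; 0 means no purge is scheduled
  msecs_t expire_extend;   // extra delay accumulated by later frees
} purge_clock_t;

static void schedule_purge(purge_clock_t* pc, msecs_t now, msecs_t delay) {
  if (pc->expire == 0) {
    pc->expire = now + delay;          // first free after a purge: set the base deadline
    pc->expire_extend = 0;
  }
  else if (pc->expire_extend < 10*delay) {
    pc->expire_extend += delay/10;     // later frees push the deadline out a little
  }
}

static bool purge_is_due(const purge_clock_t* pc, msecs_t now, bool force) {
  if (pc->expire == 0) return false;   // nothing scheduled
  return (force || now >= pc->expire + pc->expire_extend);
}

int main(void) {
  purge_clock_t pc = { 0, 0 };
  schedule_purge(&pc, 1000, 10);       // schedules the purge for t=1010
  schedule_purge(&pc, 1002, 10);       // extends the deadline by 1 to t=1011
  return (purge_is_due(&pc, 1011, false) ? 0 : 1);
}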
+ _Atomic(mi_msecs_t) purge_expire_delay; // mi_bbitmap_t* slices_free; // is the slice free? (a binned bitmap with size classes) mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) @@ -54,13 +55,6 @@ typedef struct mi_arena_s { // followed by the bitmaps (whose sizes depend on the arena size) } mi_arena_t; -// Every "page" in `pages_purge` points to purge info -// (since we use it for any free'd range and not just for pages) -typedef struct mi_purge_info_s { - _Atomic(mi_msecs_t) expire; - _Atomic(size_t) slice_count; -} mi_purge_info_t; - #define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) // 160 arenas is enough for ~2 TiB memory @@ -208,13 +202,17 @@ static size_t mi_memid_size(mi_memid_t memid) { /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} static mi_decl_noinline void* mi_arena_try_alloc_at( mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) { size_t slice_index; if (!mi_bbitmap_try_find_and_clearN(arena->slices_free, slice_count, tseq, &slice_index)) return NULL; - + // claimed it! void* p = mi_arena_slice_start(arena, slice_index); *memid = mi_memid_create_arena(arena, slice_index, slice_count); @@ -422,7 +420,7 @@ static mi_decl_noinline void* mi_arena_try_alloc( mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); - mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); + mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; again: // try to find free slices in the arena's @@ -949,7 +947,7 @@ static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_memid_needs_no_free(memid)); } - // purge expired decommits + // try to purge expired decommits mi_arenas_try_purge(false, false); } @@ -1123,6 +1121,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; + arena->purge_expire_delay = 0; // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps @@ -1414,11 +1413,6 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv Arena purge ----------------------------------------------------------- */ -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} - // reset or decommit in an arena and update the commit bitmap // assumes we own the area (i.e. 
slices_free is claimed by us) static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { @@ -1459,10 +1453,11 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (expire == 0) { mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + mi_atomic_storei64_release(&arena->purge_expire_delay, 0); + } + else if (mi_atomic_loadi64_acquire(&arena->purge_expire_delay) < 10*delay) { + mi_atomic_addi64_acq_rel(&arena->purge_expire_delay, (mi_msecs_t)(delay/10)); // add smallish extra delay } - //else { - // mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay - //} mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL); } } @@ -1509,7 +1504,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) { // check pre-conditions if (arena->memid.is_pinned) return false; - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire) + mi_atomic_loadi64_relaxed(&arena->purge_expire_delay); if (expire == 0) return false; // expired yet? diff --git a/src/init.c b/src/init.c index 9a26d56f..4465d603 100644 --- a/src/init.c +++ b/src/init.c @@ -108,6 +108,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { MI_BIN_FULL, 0, // page retired min/max NULL, // next MI_MEMID_STATIC, // memid + 0, 0, // full page retain false, // can reclaim true, // can eager abandon @@ -156,6 +157,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { MI_BIN_FULL, 0, // page retired min/max NULL, // next heap MI_MEMID_STATIC, // memid + 0, 2, // full page retain true, // allow page reclaim true, // allow page abandon diff --git a/src/page.c b/src/page.c index b3fdb78f..53773aae 100644 --- a/src/page.c +++ b/src/page.c @@ -870,16 +870,15 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); - - // call potential deferred free routines - _mi_deferred_free(heap, false); - + // collect every N generic mallocs - /*static long count = 0; - if (count++ > 100000) { - count = 0; + if (heap->generic_count++ > 10000) { + heap->generic_count = 0; + // call potential deferred free routines + _mi_deferred_free(heap, false); + // collect retired pages _mi_heap_collect_retired(heap,false); - }*/ + } // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); From 037cb167f8d49aa903a950ee38b494ff8bd563a4 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 16 Dec 2024 09:51:54 -0800 Subject: [PATCH 096/264] comments --- include/mimalloc/types.h | 12 ++++++------ src/bitmap.c | 19 +++++++++++-------- 2 files changed, 17 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 057195a1..f8615d1c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -100,7 +100,7 @@ terms of the MIT license. 
A copy of the license can be found in the file // Sizes are for 64-bit #ifndef MI_ARENA_SLICE_SHIFT -#ifdef MI_SMALL_PAGE_SHIFT // compatibility +#ifdef MI_SMALL_PAGE_SHIFT // backward compatibility #define MI_ARENA_SLICE_SHIFT MI_SMALL_PAGE_SHIFT #else #define MI_ARENA_SLICE_SHIFT (13 + MI_SIZE_SHIFT) // 64 KiB (32 KiB on 32-bit) @@ -149,7 +149,7 @@ typedef struct mi_arena_s mi_arena_t; // defined in `arena.c` // a memory id tracks the provenance of arena/OS allocated memory // --------------------------------------------------------------- -// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated. +// Memory can reside in arena's, direct OS allocated, meta-data pages, or statically allocated. // The memid keeps track of this. typedef enum mi_memkind_e { MI_MEM_NONE, // not allocated @@ -264,7 +264,7 @@ typedef uint8_t mi_heaptag_t; // // We don't count `freed` (as |free|) but use `used` to reduce // the number of memory accesses in the `mi_page_all_free` function(s). -// +// // Notes: // - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). // - If a page is not part of a heap it is called "abandoned" -- in @@ -310,7 +310,7 @@ typedef struct mi_page_s { #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. #define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b) -#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks +#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation #if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 @@ -348,12 +348,12 @@ typedef enum mi_page_kind_e { // ------------------------------------------------------ // Heaps -// +// // Provide first-class heaps to allocate from. // A heap just owns a set of pages for allocation and // can only be allocate/reallocate from the thread that created it. // Freeing blocks can be done from any thread though. -// +// // Per thread, there is always a default heap that is // used for allocation; it is initialized to statically // point to an empty heap to avoid initialization checks diff --git a/src/bitmap.c b/src/bitmap.c index ccc17514..be4f8d76 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -883,7 +883,7 @@ static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx); } static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) { @@ -937,12 +937,12 @@ size_t mi_bitmap_init(mi_bitmap_t* bitmap, size_t bit_count, bool already_zero) // Set a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. 
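/* Illustrative sketch, not the patch's code: how a run of `n` bits starting at `idx`
   is split across fixed-size chunks, mirroring the first/middle/last split that the
   unsafe setN path below performs. `CHUNK_BITS` stands in for MI_BCHUNK_BITS
   (512 on 64-bit); the printf replaces the actual per-chunk set operations. */
#include <stddef.h>
#include <stdio.h>

#define CHUNK_BITS 512

static void set_range(size_t idx, size_t n) {
  size_t chunk_idx = idx / CHUNK_BITS;
  size_t cidx      = idx % CHUNK_BITS;    // offset within the first chunk
  while (n > 0) {
    size_t count = CHUNK_BITS - cidx;     // bits that still fit in this chunk
    if (count > n) count = n;
    printf("chunk %zu: set bits [%zu, %zu)\n", chunk_idx, cidx, cidx + count);
    n -= count;
    chunk_idx++;
    cidx = 0;                             // later chunks start at bit 0
  }
}

int main(void) {
  set_range(500, 600);   // crosses from chunk 0 into chunks 1 and 2
  return 0;
}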
static void mi_bchunks_unsafe_setN(mi_bchunk_t* chunks, mi_bchunkmap_t* cmap, size_t idx, size_t n) { mi_assert_internal(n>0); - + // start chunk and index size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; const size_t ccount = _mi_divide_up(n, MI_BCHUNK_BITS); - + // first update the chunkmap mi_bchunk_setN(cmap, chunk_idx, ccount, NULL); @@ -1433,6 +1433,9 @@ typedef bool (mi_bchunk_try_find_and_clear_fun_t)(mi_bchunk_t* chunk, size_t n, // Go through the bbitmap and for every sequence of `n` set bits, call the visitor function. // If it returns `true` stop the search. +// +// This is used for finding free blocks and it is important to be efficient (with 2-level bitscan) +// but also reduce fragmentation (through size bins). static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, size_t tseq, size_t n, size_t* pidx, mi_bchunk_try_find_and_clear_fun_t* on_find) { // we space out threads to reduce contention @@ -1453,8 +1456,8 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { // don't search into non-accessed memory until we tried other size bins as well - if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { - break; + if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { + break; } // and for each chunkmap entry we iterate over its bits to find the chunks @@ -1466,8 +1469,8 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx; mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); // only in the current size class! - const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_acquire(&bbitmap->chunk_bins[chunk_idx]); - if // (bin >= chunk_bin) { + const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_relaxed(&bbitmap->chunk_bins[chunk_idx]); + if // (bin >= chunk_bin) { ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; size_t cidx; @@ -1482,7 +1485,7 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, } else { /* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */ - mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); + mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx); } } } From d9397be17803e0408944329d63151ae57827e582 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 16 Dec 2024 10:00:32 -0800 Subject: [PATCH 097/264] comments --- include/mimalloc/types.h | 4 ++-- src/bitmap.h | 9 +++++++-- 2 files changed, 9 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f8615d1c..920a8e2c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -321,9 +321,9 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 8 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/6) // < 11 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 1 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/bitmap.h b/src/bitmap.h index 62c42129..4faaa3a1 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -210,15 +210,20 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi /* ---------------------------------------------------------------------------- Binned concurrent bitmap Assigns a size class to each chunk such that small blocks don't cause too - much fragmentation by keeping chunks for larger blocks separate. + much fragmentation since we keep chunks for larger blocks separate. ---------------------------------------------------------------------------- */ +// Size bins; larger bins are allowed to go into smaller bins. +// Since LARGE and MEDIUM are aligned (on word and byte boundaries respectively), +// they are larger than OTHER even though those can contain very large objects (but we +// don't want those in the MEDIUM or LARGE bins as these are variable size). +// SMALL can only be in small (and NONE), so they cannot fragment the larger bins. typedef enum mi_bbin_e { MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free) MI_BBIN_SMALL, // slice_count == 1 + MI_BBIN_OTHER, // slice_count: any other from the other bins, and 1 <= slice_count <= MI_BCHUNK_BITS MI_BBIN_MEDIUM, // slice_count == 8 MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS - MI_BBIN_OTHER, // slice_count > 1, and not 8 or MI_BFIELD_BITS MI_BBIN_COUNT } mi_bbin_t; From 98171fd80a34dfb5433a6efc5aab3537c6efa6a7 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 17 Dec 2024 00:24:32 -0800 Subject: [PATCH 098/264] testing on arm64 --- test/test-stress.c | 7 +++++-- 1 file changed, 5 insertions(+), 2 deletions(-) diff --git a/test/test-stress.c b/test/test-stress.c index 61aeabf5..e138ffb0 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -306,6 +306,9 @@ int main(int argc, char** argv) { #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); #endif + #if defined(NDEBUG) && !defined(USE_STD_MALLOC) + // mi_option_set(mi_option_purge_delay,-1); + #endif #ifndef USE_STD_MALLOC mi_stats_reset(); #endif @@ -352,8 +355,8 @@ int main(int argc, char** argv) { mi_collect(true); mi_debug_show_arenas(true,false,false); #else - mi_collect(false); - mi_debug_show_arenas(true,false,false); + // mi_collect(false); + // mi_debug_show_arenas(true,false,false); // mi_stats_print(NULL); #endif #else From fdad1a0d4f2d04eeed0b10f46afd2c0609e3f204 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 09:49:09 -0800 Subject: [PATCH 099/264] fix infoslices needed calculation --- src/arena.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index d8b882d3..c5846329 100644 --- a/src/arena.c +++ b/src/arena.c @@ -52,6 +52,7 @@ typedef struct mi_arena_s { mi_bitmap_t* pages_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) // the full queue contains 
abandoned full pages // followed by the bitmaps (whose sizes depend on the arena size) + // note: when adding bitmaps revise `mi_arena_info_slices_needed` } mi_arena_t; // Every "page" in `pages_purge` points to purge info @@ -1051,7 +1052,7 @@ static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_bas if (slice_count == 0) slice_count = MI_BCHUNK_BITS; mi_assert_internal((slice_count % MI_BCHUNK_BITS) == 0); const size_t base_size = _mi_align_up(sizeof(mi_arena_t), MI_BCHUNK_SIZE); - const size_t bitmaps_count = 4 + MI_BIN_COUNT; // free, commit, dirty, purge, and abandonded + const size_t bitmaps_count = 5 + MI_ARENA_BIN_COUNT; // free, commit, dirty, purge, pages, and abandoned const size_t bitmaps_size = bitmaps_count * mi_bitmap_size(slice_count,NULL); const size_t size = base_size + bitmaps_size; @@ -1303,7 +1304,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) } if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); - if (show_pages) _mi_output_message("total pages in areanas: %zu\n", page_total); + if (show_pages) _mi_output_message("total pages in arenas: %zu\n", page_total); } From adfeb1f6f296ec23950fea90d7946e71ac0cf6de Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 10:43:31 -0800 Subject: [PATCH 100/264] fix bug in bitmap_forall_ranges --- src/arena.c | 30 +++++++++++++++++++----------- src/bitmap.c | 18 ++++++++---------- test/test-stress.c | 5 +++-- 3 files changed, 30 insertions(+), 23 deletions(-) diff --git a/src/arena.c b/src/arena.c index c5846329..962e1898 100644 --- a/src/arena.c +++ b/src/arena.c @@ -43,6 +43,7 @@ typedef struct mi_arena_s { bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. + _Atomic(mi_msecs_t) purge_expire_extend; // the purge expiration may be extended by a bit mi_bitmap_t* slices_free; // is the slice free? mi_bitmap_t* slices_committed; // is the slice committed? (i.e. accessible) @@ -950,7 +951,7 @@ static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { mi_assert_internal(mi_memid_needs_no_free(memid)); } - // purge expired decommits + // try to purge expired decommits mi_arenas_try_purge(false, false); } @@ -1118,6 +1119,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; + arena->purge_expire_extend = 0; // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps @@ -1402,7 +1404,7 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c needs_recommit = _mi_os_purge(p, size); } else { - mi_assert_internal(false); // ? + mi_assert_internal(false); // can this happen? 
} // update committed bitmap @@ -1428,10 +1430,11 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (expire == 0) { mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + mi_atomic_storei64_release(&arena->purge_expire_extend, 0); + } + else if (mi_atomic_loadi64_acquire(&arena->purge_expire_extend) < 10*delay) { // limit max extension time + mi_atomic_addi64_acq_rel(&arena->purge_expire_extend, (mi_msecs_t)(delay/10)); // add smallish extra delay } - //else { - // mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay - //} mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL); } } @@ -1478,26 +1481,31 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) { // check pre-conditions if (arena->memid.is_pinned) return false; - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + mi_msecs_t expire_base = mi_atomic_loadi64_relaxed(&arena->purge_expire); + mi_msecs_t expire_extend = mi_atomic_loadi64_relaxed(&arena->purge_expire_extend); + const mi_msecs_t expire = expire_base + expire_extend; if (expire == 0) return false; // expired yet? if (!force && expire > now) return false; // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire_base, (mi_msecs_t)0)) { + mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); // and also reset the extend + } _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); - // go through all purge info's - // todo: instead of visiting per-bit, we should visit per range of bits + // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; - _mi_bitmap_forall_set(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); + _mi_bitmap_forall_set_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); // if not fully purged, make sure to purge again in the future if (!vinfo.all_purged) { const long delay = mi_arena_purge_delay(); mi_msecs_t expected = 0; - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expected, _mi_clock_now() + delay); + if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expected, _mi_clock_now() + delay)) { + mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); + } } return vinfo.any_purged; } diff --git a/src/bitmap.c b/src/bitmap.c index 2734e2b2..9440df31 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1332,18 +1332,16 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); - size_t bshift = 0; size_t bidx; - while (mi_bfield_find_least_bit(b, &bidx)) { - b >>= bidx; - bshift += bidx; - const size_t rng = mi_ctz(~b); // all the set bits from bidx - mi_assert_internal(rng>=1); - const size_t idx = base_idx + bshift + bidx; + while (mi_bfield_find_least_bit(b, &bidx)) { + const size_t rng = mi_ctz(~(b>>bidx)); // all the set bits from bidx + mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS); + const size_t idx = base_idx + bidx; + mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= 
MI_BFIELD_BITS); + mi_assert_internal((idx / MI_BCHUNK_BITS) < mi_bitmap_chunk_count(bitmap)); if (!visit(idx, rng, arena, arg)) return false; - // skip rng - b >>= rng; - bshift += rng; + // clear rng bits in b + b = b & ~mi_bfield_mask(rng, bidx); } } } diff --git a/test/test-stress.c b/test/test-stress.c index 9122d70e..1591b38e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -305,6 +305,7 @@ int main(int argc, char** argv) { #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + mi_option_set(mi_option_purge_delay,10); #endif #ifndef USE_STD_MALLOC mi_stats_reset(); @@ -353,8 +354,8 @@ int main(int argc, char** argv) { mi_debug_show_arenas(true,false,false); #else //mi_collect(true); - //mi_debug_show_arenas(true,false,false); - // mi_stats_print(NULL); + mi_debug_show_arenas(true,false,false); + mi_stats_print(NULL); #endif #else mi_stats_print(NULL); // so we see rss/commit/elapsed From c585753dcef498ebc1934b062dfb4d0a69306d05 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 11:54:26 -0800 Subject: [PATCH 101/264] fix purging with ranges --- src/arena.c | 19 +++++++++---------- src/bitmap.c | 16 ++++++++++++---- test/test-stress.c | 2 +- 3 files changed, 22 insertions(+), 15 deletions(-) diff --git a/src/arena.c b/src/arena.c index 962e1898..cb4936d4 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1398,14 +1398,8 @@ static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); - bool needs_recommit = false; // reset needs no recommit, decommit does need it - if (mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)) { - // all slices are committed, we can purge the entire range - needs_recommit = _mi_os_purge(p, size); - } - else { - mi_assert_internal(false); // can this happen? 
- } + const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); + const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed); // update committed bitmap if (needs_recommit) { @@ -1450,11 +1444,13 @@ static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size if (mi_bitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { // purge mi_arena_purge(arena, slice_index, slice_count); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, slice_index, slice_count)); // and reset the free range mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); return true; } else { + // was allocated again already return false; } } @@ -1463,12 +1459,15 @@ static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, m mi_purge_visit_info_t* vinfo = (mi_purge_visit_info_t*)arg; // try to purge: first claim the free blocks if (mi_arena_try_purge_range(arena, slice_index, slice_count)) { - vinfo->any_purged = true; + vinfo->any_purged = true; + vinfo->all_purged = true; } else { // failed to claim the full range, try per slice instead for (size_t i = 0; i < slice_count; i++) { - vinfo->any_purged = vinfo->any_purged || mi_arena_try_purge_range(arena, slice_index + i, 1); + const bool purged = mi_arena_try_purge_range(arena, slice_index + i, 1); + vinfo->any_purged = vinfo->any_purged || purged; + vinfo->all_purged = vinfo->all_purged && purged; } } // done: clear the purge bits diff --git a/src/bitmap.c b/src/bitmap.c index 9440df31..fb334d8a 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -81,7 +81,7 @@ static inline mi_bfield_t mi_bfield_mask(size_t bit_count, size_t shiftl) { // Set a bit atomically. Returns `true` if the bit transitioned from 0 to 1 static inline bool mi_bfield_atomic_set(_Atomic(mi_bfield_t)*b, size_t idx) { mi_assert_internal(idx < MI_BFIELD_BITS); - const mi_bfield_t mask = mi_bfield_one()< first*/\ @@ -1332,9 +1332,16 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + #if MI_DEBUG > 1 + const size_t bpopcount = mi_popcount(b); + size_t rngcount = 0; + #endif size_t bidx; while (mi_bfield_find_least_bit(b, &bidx)) { const size_t rng = mi_ctz(~(b>>bidx)); // all the set bits from bidx + #if MI_DEBUG > 1 + rngcount += rng; + #endif mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS); const size_t idx = base_idx + bidx; mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= MI_BFIELD_BITS); @@ -1343,6 +1350,7 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi // clear rng bits in b b = b & ~mi_bfield_mask(rng, bidx); } + mi_assert_internal(rngcount == bpopcount); } } } diff --git a/test/test-stress.c b/test/test-stress.c index 1591b38e..1f8df226 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -262,7 +262,7 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); - //mi_debug_show_arenas(true, false, false); + mi_debug_show_arenas(true, false, false); //mi_collect(true); //mi_debug_show_arenas(true, false, false); } From 34d03f3981670cca6620fc1d7b4e97bf691ac893 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 12:32:18 -0800 Subject: [PATCH 102/264] atomically clear purge bits when visiting --- src/arena.c | 33 
+++++++++++++++------------------ src/bitmap.c | 7 ++++--- src/bitmap.h | 2 +- 3 files changed, 20 insertions(+), 22 deletions(-) diff --git a/src/arena.c b/src/arena.c index cb4936d4..b6e98863 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1392,19 +1392,21 @@ static long mi_arena_purge_delay(void) { // reset or decommit in an arena and update the commit bitmap // assumes we own the area (i.e. slices_free is claimed by us) -static void mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { +// returns if the memory is no longer committed (versus reset which keeps the commit) +static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(!arena->memid.is_pinned); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); - const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed); + const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */); // update committed bitmap if (needs_recommit) { mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } + return needs_recommit; } @@ -1421,12 +1423,13 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ } else { // schedule purge - mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); - if (expire == 0) { - mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + mi_msecs_t expire0 = 0; + if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire0, _mi_clock_now() + delay)) { + // expiration was not yet set mi_atomic_storei64_release(&arena->purge_expire_extend, 0); } else if (mi_atomic_loadi64_acquire(&arena->purge_expire_extend) < 10*delay) { // limit max extension time + // already an expiration was set mi_atomic_addi64_acq_rel(&arena->purge_expire_extend, (mi_msecs_t)(delay/10)); // add smallish extra delay } mi_bitmap_setN(arena->slices_purge, slice_index, slice_count, NULL); @@ -1443,8 +1446,8 @@ typedef struct mi_purge_visit_info_s { static bool mi_arena_try_purge_range(mi_arena_t* arena, size_t slice_index, size_t slice_count) { if (mi_bitmap_try_clearN(arena->slices_free, slice_index, slice_count)) { // purge - mi_arena_purge(arena, slice_index, slice_count); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, slice_index, slice_count)); + bool decommitted = mi_arena_purge(arena, slice_index, slice_count); MI_UNUSED(decommitted); + mi_assert_internal(!decommitted || mi_bitmap_is_clearN(arena->slices_committed, slice_index, slice_count)); // and reset the free range mi_bitmap_setN(arena->slices_free, slice_index, slice_count, NULL); return true; @@ -1470,8 +1473,8 @@ static bool mi_arena_try_purge_visitor(size_t slice_index, size_t slice_count, m vinfo->all_purged = vinfo->all_purged && purged; } } - // done: clear the purge bits - mi_bitmap_clearN(arena->slices_purge, slice_index, slice_count); + // don't clear the purge bits as that is done atomically be the _bitmap_forall_set_ranges + // mi_bitmap_clearN(arena->slices_purge, slice_index, slice_count); return true; // continue } @@ -1495,17 +1498,11 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); // go through all purge info's (with max MI_BFIELD_BITS ranges at a 
time) + // this also clears those ranges atomically (so any newly freed blocks will get purged next + // time around) mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; - _mi_bitmap_forall_set_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); + _mi_bitmap_forall_setc_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); - // if not fully purged, make sure to purge again in the future - if (!vinfo.all_purged) { - const long delay = mi_arena_purge_delay(); - mi_msecs_t expected = 0; - if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expected, _mi_clock_now() + delay)) { - mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); - } - } return vinfo.any_purged; } diff --git a/src/bitmap.c b/src/bitmap.c index fb334d8a..a534bba5 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1316,9 +1316,10 @@ bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_a } // Visit all set bits in a bitmap but try to return ranges (within bfields) if possible. -// used by purging to purge larger ranges if possible +// Also clear those ranges atomically. +// Used by purging to purge larger ranges when possible // todo: optimize further? maybe use avx512 to directly get all indices using a mask_compressstore? -bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { +bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg) { // for all chunkmap entries const size_t chunkmap_max = _mi_divide_up(mi_bitmap_chunk_count(bitmap), MI_BFIELD_BITS); for (size_t i = 0; i < chunkmap_max; i++) { @@ -1331,7 +1332,7 @@ bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visi mi_bchunk_t* const chunk = &bitmap->chunks[chunk_idx]; for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { const size_t base_idx = (chunk_idx*MI_BCHUNK_BITS) + (j*MI_BFIELD_BITS); - mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[j]); + mi_bfield_t b = mi_atomic_exchange_acq_rel(&chunk->bfields[j], 0); // can be relaxed? 
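/* Illustrative sketch, not the patch's code: how one 64-bit bitfield snapshot is
   decomposed into maximal runs of set bits, as the loop above does per bfield when
   purging ranges. Compiler builtins (__builtin_ctzll, GCC/Clang) replace the mi_*
   bit helpers, and the snapshot is taken non-atomically here. */
#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

static void visit_set_ranges(uint64_t b) {
  while (b != 0) {
    const size_t idx = (size_t)__builtin_ctzll(b);          // start of the next run
    const uint64_t shifted = b >> idx;
    const size_t rng = (~shifted == 0) ? (64 - idx)         // run reaches the top bit
                                       : (size_t)__builtin_ctzll(~shifted);
    printf("range: start %zu, length %zu\n", idx, rng);
    if (idx + rng >= 64) break;                             // nothing left above this run
    b &= ~(((UINT64_C(1) << rng) - 1) << idx);              // clear the visited run
  }
}

int main(void) {
  visit_set_ranges(0xF00F0u);   // expect ranges (4,4) and (16,4)
  return 0;
}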
#if MI_DEBUG > 1 const size_t bpopcount = mi_popcount(b); size_t rngcount = 0; diff --git a/src/bitmap.h b/src/bitmap.h index 4afcdaf1..47c22025 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -234,6 +234,6 @@ typedef bool (mi_forall_set_fun_t)(size_t slice_index, size_t slice_count, mi_ar bool _mi_bitmap_forall_set(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); // Visit all set bits in a bitmap with larger ranges if possible (`slice_count >= 1`) -bool _mi_bitmap_forall_set_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); +bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* visit, mi_arena_t* arena, void* arg); #endif // MI_BITMAP_H From 84bb1c2712d1b01f828597e1620ef2280db1a8b7 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 18:10:28 -0800 Subject: [PATCH 103/264] adjust stats more clearly to avoid double counting commits --- src/arena.c | 33 ++++++++++++++++++--------------- 1 file changed, 18 insertions(+), 15 deletions(-) diff --git a/src/arena.c b/src/arena.c index b6e98863..e7564cd6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -234,12 +234,24 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // commit requested, but the range may not be committed as a whole: ensure it is committed now if (!mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)) { // not fully committed: commit the full range and set the commit bits - // (this may race and we may double-commit which is fine) + // we set the bits first since we own these slices (they are no longer free) + size_t already_committed_count = 0; + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); + // adjust the stats so we don't double count the commits + if (already_committed_count > 0) { + _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + } + // now actually commit bool commit_zero = false; if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) { + // failed to commit (todo: give warning?) 
+ if (already_committed_count > 0) { + _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + } memid->initially_committed = false; } else { + // committed if (commit_zero) { memid->initially_zero = true; } #if MI_DEBUG > 1 if (memid->initially_zero) { @@ -248,13 +260,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( memid->initially_zero = false; } } - #endif - size_t already_committed_count = 0; - mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); - if (already_committed_count < slice_count) { - // todo: also decrease total - mi_stat_decrease(_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); - } + #endif } } if (memid->initially_zero) { @@ -798,7 +804,7 @@ void _mi_arena_page_free(mi_page_t* page) { Arena abandon ----------------------------------------------------------- */ -static void mi_arena_page_abandon_no_stat(mi_page_t* page) { +void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -827,12 +833,8 @@ static void mi_arena_page_abandon_no_stat(mi_page_t* page) { // page is full (or a singleton), page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page } - _mi_page_unown(page); -} - -void _mi_arena_page_abandon(mi_page_t* page) { - mi_arena_page_abandon_no_stat(page); _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1); + _mi_page_unown(page); } bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { @@ -849,7 +851,8 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { } else { _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1); - mi_arena_page_abandon_no_stat(page); + _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1); // adjust as we are not abandoning fresh + _mi_arena_page_abandon(page); return true; } } From 58b726be6f814906870738ac225e119350c7e243 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 17 Dec 2024 18:57:00 -0800 Subject: [PATCH 104/264] better stats for commit on overcommit systems (by not counting on-demand commit upfront) --- include/mimalloc/types.h | 12 ++++++------ src/arena.c | 39 +++++++++++++++++++++++++++++++++------ src/stats.c | 22 ++++++++++++---------- 3 files changed, 51 insertions(+), 22 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 4c998f90..0cf909d0 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -488,8 +488,8 @@ typedef struct mi_stats_s { void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); // adjust stat in special cases to compensate for double counting -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount); -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount); +void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc); +void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free); // counters can just be increased void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); @@ -497,14 +497,14 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); #define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) #define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) #define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( 
&(stat), amount) -#define mi_stat_adjust_increase(stat,amount) _mi_stat_adjust_increase( &(stat), amount) -#define mi_stat_adjust_decrease(stat,amount) _mi_stat_adjust_decrease( &(stat), amount) +#define mi_stat_adjust_increase(stat,amnt,b) _mi_stat_adjust_increase( &(stat), amnt, b) +#define mi_stat_adjust_decrease(stat,amnt,b) _mi_stat_adjust_decrease( &(stat), amnt, b) #else #define mi_stat_increase(stat,amount) ((void)0) #define mi_stat_decrease(stat,amount) ((void)0) #define mi_stat_counter_increase(stat,amount) ((void)0) -#define mi_stat_adjuct_increase(stat,amount) ((void)0) -#define mi_stat_adjust_decrease(stat,amount) ((void)0) +#define mi_stat_adjuct_increase(stat,amnt,b) ((void)0) +#define mi_stat_adjust_decrease(stat,amnt,b) ((void)0) #endif #define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount) diff --git a/src/arena.c b/src/arena.c index e7564cd6..29279b86 100644 --- a/src/arena.c +++ b/src/arena.c @@ -222,9 +222,13 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( *memid = mi_memid_create_arena(arena, slice_index, slice_count); memid->is_pinned = arena->memid.is_pinned; - // set the dirty bits + // set the dirty bits and track which slices become accessible + size_t touched_slices = slice_count; if (arena->memid.initially_zero) { - memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, NULL); + size_t already_dirty = 0; + memid->initially_zero = mi_bitmap_setN(arena->slices_dirty, slice_index, slice_count, &already_dirty); + mi_assert_internal(already_dirty <= touched_slices); + touched_slices -= already_dirty; } // set commit state @@ -239,7 +243,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); // adjust the stats so we don't double count the commits if (already_committed_count > 0) { - _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count), true /* on alloc */); } // now actually commit bool commit_zero = false; @@ -263,6 +267,15 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( #endif } } + else { + // already fully commited. + // if the OS has overcommit, and this is the first time we access these pages, then + // count the commit now (as at arena reserve we didn't count those commits as these are on-demand) + if (_mi_os_has_overcommit() && touched_slices > 0) { + _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(touched_slices)); + } + } + // tool support if (memid->initially_zero) { mi_track_mem_defined(p, slice_count * MI_ARENA_SLICE_SIZE); } @@ -324,17 +337,25 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re // commit eagerly? bool arena_commit = false; - if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + const bool overcommit = _mi_os_has_overcommit(); + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = overcommit; } else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } + // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice + // is actually allocated for the first time it will be counted. 
+ const bool adjust = (overcommit && arena_commit); + if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true /* on alloc */); } // and try to reserve the arena int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { + if (adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back // failed, try a smaller size? const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); + if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true); } if (arena_reserve > small_arena_reserve) { // try again err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + if (err != 0 && adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back } } return (err==0); @@ -851,7 +872,7 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { } else { _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1); - _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1); // adjust as we are not abandoning fresh + _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh _mi_arena_page_abandon(page); return true; } @@ -1402,7 +1423,13 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); - const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); + //const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); + size_t already_committed; + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); + const bool all_committed = (already_committed == slice_count); + if (mi_option_is_enabled(mi_option_purge_decommits)) { + _mi_stat_adjust_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed), false /* on freed */); + } const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */); // update committed bitmap diff --git a/src/stats.c b/src/stats.c index 53937330..bb17b936 100644 --- a/src/stats.c +++ b/src/stats.c @@ -54,21 +54,23 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { // Adjust stats to compensate; for example before committing a range, // first adjust downwards with parts that were already committed so // we avoid double counting. -static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount) { +static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { if (amount == 0) return; if mi_unlikely(mi_is_in_main(stat)) { // adjust atomically mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_addi64_relaxed(&stat->allocated, amount); - mi_atomic_addi64_relaxed(&stat->freed, amount); + mi_atomic_addi64_relaxed((on_alloc ? 
&stat->allocated : &stat->freed), amount); } else { // don't affect the peak stat->current += amount; - // add to both - stat->allocated += amount; - stat->freed += amount; + if (on_alloc) { + stat->allocated += amount; + } + else { + stat->freed += amount; + } } } @@ -91,12 +93,12 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, -((int64_t)amount)); } -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount) { - mi_stat_adjust(stat, (int64_t)amount); +void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust(stat, (int64_t)amount, on_alloc); } -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) { - mi_stat_adjust(stat, -((int64_t)amount)); +void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust(stat, -((int64_t)amount), on_alloc); } // must be thread safe as it is called from stats_merge From fb9093840897749cc567c798a628d9e8f6399956 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 19:11:23 -0800 Subject: [PATCH 105/264] adjust stats more clearly to avoid double counting commits --- src/arena.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index d02d4760..044d3f39 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1451,13 +1451,11 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c size_t already_committed; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); const bool all_committed = (already_committed == slice_count); - if (mi_option_is_enabled(mi_option_purge_decommits)) { - _mi_stat_adjust_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed), false /* on freed */); - } const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */); // update committed bitmap if (needs_recommit) { + _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } return needs_recommit; From 264d5a67049e346eb27bb0b6009c44a14de48509 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 17 Dec 2024 19:13:03 -0800 Subject: [PATCH 106/264] update stat adjustment for purging --- src/arena.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 29279b86..44c909c1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1427,13 +1427,11 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c size_t already_committed; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); const bool all_committed = (already_committed == slice_count); - if (mi_option_is_enabled(mi_option_purge_decommits)) { - _mi_stat_adjust_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed), false /* on freed */); - } const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? 
*/); // update committed bitmap if (needs_recommit) { + _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } return needs_recommit; From de8001c107a18c3139d0e024095f7206ccf1b1e6 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 19 Dec 2024 19:18:04 -0800 Subject: [PATCH 107/264] add specialized is_set for 1 bit --- src/bitmap.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index a534bba5..6fae1ed6 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -200,21 +200,40 @@ static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b, bool* all // ------- mi_bfield_atomic_is_set --------------------------------------- +// Check if a bit is set +static inline bool mi_bfield_atomic_is_set(_Atomic(mi_bfield_t)*b, const size_t idx) { + const mi_bfield_t x = mi_atomic_load_relaxed(b); + return ((x & mi_bfield_mask(1,idx)) != 0); +} + +// Check if a bit is clear +static inline bool mi_bfield_atomic_is_clear(_Atomic(mi_bfield_t)*b, const size_t idx) { + const mi_bfield_t x = mi_atomic_load_relaxed(b); + return ((x & mi_bfield_mask(1, idx)) == 0); +} + +// Check if a bit is xset +static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, const size_t idx) { + if (set) return mi_bfield_atomic_is_set(b, idx); + else return mi_bfield_atomic_is_clear(b, idx); +} // Check if all bits corresponding to a mask are set. -static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) { mi_assert_internal(mask != 0); - return ((*b & mask) == mask); + const mi_bfield_t x = mi_atomic_load_relaxed(b); + return ((x & mask) == mask); } // Check if all bits corresponding to a mask are clear. -static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) { mi_assert_internal(mask != 0); - return ((*b & mask) == 0); + const mi_bfield_t x = mi_atomic_load_relaxed(b); + return ((x & mask) == 0); } // Check if all bits corresponding to a mask are set/cleared. 
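The specialized single-bit and mask-based tests added above boil down to one relaxed atomic load followed by an ordinary mask comparison. The following stand-alone sketch (C11 atomics; the bfield_* helpers are illustrative, not the mimalloc ones) shows the shape of both variants:

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef uint64_t bfield_t;

// mask of `n` consecutive bits starting at bit `idx` (1 <= n, idx + n <= 64)
static inline bfield_t bfield_mask(size_t n, size_t idx) {
  return ((n >= 64 ? ~(bfield_t)0 : (((bfield_t)1 << n) - 1)) << idx);
}

// true if all bits in `mask` are set: a single relaxed load, then a mask test
static inline bool bfield_is_set_mask(const _Atomic(bfield_t)* b, bfield_t mask) {
  const bfield_t x = atomic_load_explicit(b, memory_order_relaxed);
  return ((x & mask) == mask);
}

// single-bit variant: no general mask needs to be constructed
static inline bool bfield_is_set(const _Atomic(bfield_t)* b, size_t idx) {
  const bfield_t x = atomic_load_explicit(b, memory_order_relaxed);
  return (((x >> idx) & 1) != 0);
}

Checking a whole run of bits costs the same single load as checking one bit, which is why only the mask construction is worth specializing for the common 1-bit case.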
-static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)*b, mi_bfield_t mask) { +static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask) { mi_assert_internal(mask != 0); if (set) return mi_bfield_atomic_is_set_mask(b, mask); else return mi_bfield_atomic_is_clear_mask(b, mask); @@ -359,12 +378,9 @@ static inline bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t if (n==0) return true; const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - if mi_likely(n<=MI_BFIELD_BITS) { - return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); - } - else { - return mi_bchunk_is_xsetN_(set, chunk, i, idx, n); - } + if mi_likely(n==1) { return mi_bfield_atomic_is_xset(set, &chunk->bfields[i], idx); } + if mi_likely(n<=MI_BFIELD_BITS) { return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } + return mi_bchunk_is_xsetN_(set, chunk, i, idx, n); } From 3746bf79edb5f09283b336d4b556a5a519cfea75 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 19 Dec 2024 21:30:03 -0800 Subject: [PATCH 108/264] small fixes; max object size 1/8th of a pages --- include/mimalloc/types.h | 4 ++-- src/arena.c | 2 +- src/bitmap.c | 10 +++++----- src/heap.c | 8 ++++++-- src/init.c | 12 ++++++------ src/page.c | 2 +- 6 files changed, 21 insertions(+), 17 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 085879bd..61681138 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -321,8 +321,8 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. // (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/6) // < 11 KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 11 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 128 KiB #define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 1 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index 044d3f39..e0044392 100644 --- a/src/arena.c +++ b/src/arena.c @@ -313,7 +313,7 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re if (arena_count >= 1 && arena_count <= 128) { // scale up the arena sizes exponentially every 4 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16); + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/2, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; diff --git a/src/bitmap.c b/src/bitmap.c index c64f227b..a04762af 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -352,7 +352,7 @@ static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, b if (n==1) return mi_bchunk_clear(chunk, cidx, pmaybe_all_clear); if (n==MI_BFIELD_BITS) return mi_bchunk_clearX(chunk, cidx, pmaybe_all_clear); if (n bfields[chunk_idx]); if (!allow_all_set && (~b == 0)) return false; @@ -1277,18 +1277,18 @@ bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* vis size_t rngcount = 0; #endif size_t bidx; - while (mi_bfield_find_least_bit(b, &bidx)) { + while (mi_bfield_find_least_bit(b, &bidx)) { const 
size_t rng = mi_ctz(~(b>>bidx)); // all the set bits from bidx #if MI_DEBUG > 1 rngcount += rng; - #endif + #endif mi_assert_internal(rng>=1 && rng<=MI_BFIELD_BITS); const size_t idx = base_idx + bidx; mi_assert_internal((idx % MI_BFIELD_BITS) + rng <= MI_BFIELD_BITS); mi_assert_internal((idx / MI_BCHUNK_BITS) < mi_bitmap_chunk_count(bitmap)); if (!visit(idx, rng, arena, arg)) return false; // clear rng bits in b - b = b & ~mi_bfield_mask(rng, bidx); + b = b & ~mi_bfield_mask(rng, bidx); } mi_assert_internal(rngcount == bpopcount); } diff --git a/src/heap.c b/src/heap.c index 1c2b017b..a24b8356 100644 --- a/src/heap.c +++ b/src/heap.c @@ -166,8 +166,12 @@ void mi_collect(bool force) mi_attr_noexcept { ----------------------------------------------------------- */ mi_heap_t* mi_heap_get_default(void) { - mi_thread_init(); - return mi_prim_get_default_heap(); + mi_heap_t* heap = mi_prim_get_default_heap(); + if mi_unlikely(!mi_heap_is_initialized(heap)) { + mi_thread_init(); + heap = mi_prim_get_default_heap(); + } + return heap; } static bool mi_heap_is_default(const mi_heap_t* heap) { diff --git a/src/init.c b/src/init.c index 4465d603..241a3826 100644 --- a/src/init.c +++ b/src/init.c @@ -157,7 +157,7 @@ mi_decl_cache_align mi_heap_t _mi_heap_main = { MI_BIN_FULL, 0, // page retired min/max NULL, // next heap MI_MEMID_STATIC, // memid - 0, + 0, 2, // full page retain true, // allow page reclaim true, // allow page abandon @@ -289,7 +289,7 @@ mi_decl_noinline mi_tld_t* _mi_tld(void) { } if (mi_tld==NULL) { mi_tld = mi_tld_alloc(); - } + } return mi_tld; } @@ -361,11 +361,11 @@ static bool _mi_thread_heap_init(void) { //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { - // allocates tld data - // note: we cannot access thread-locals yet as that can cause (recursive) allocation + // allocates tld data + // note: we cannot access thread-locals yet as that can cause (recursive) allocation // (on macOS <= 14 for example where the loader allocates thread-local data on demand). - mi_tld_t* tld = mi_tld_alloc(); - + mi_tld_t* tld = mi_tld_alloc(); + // allocate and initialize the heap mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld); diff --git a/src/page.c b/src/page.c index 53773aae..0e3e9bb5 100644 --- a/src/page.c +++ b/src/page.c @@ -870,7 +870,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al if mi_unlikely(!mi_heap_is_initialized(heap)) { return NULL; } } mi_assert_internal(mi_heap_is_initialized(heap)); - + // collect every N generic mallocs if (heap->generic_count++ > 10000) { heap->generic_count = 0; From 2db407d1e9ce00bc8e6363e66d845fc9ec78628b Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 20 Dec 2024 11:54:39 -0800 Subject: [PATCH 109/264] revert back to generating mimalloc.dll instead of mimalloc-override.dll --- CMakeLists.txt | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 6e7a6aca..30a6b3e8 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -673,8 +673,8 @@ if (MI_OVERRIDE) target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE) if (WIN32) # on windows we should generate mimalloc-override.dll. 
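Backing up to the bitmap range visitor above: it walks a bit field as maximal runs of set bits by finding the least set bit, measuring the run with a count-trailing-zeros of the inverted, shifted word, visiting that range, and then clearing it. A stand-alone sketch of that loop on a plain uint64_t (GCC/Clang __builtin_ctzll assumed; these are not the mimalloc helpers):

#include <stdint.h>
#include <stdio.h>

// visit every maximal run of set bits in `b` as (start, length)
static void visit_set_runs(uint64_t b) {
  while (b != 0) {
    const unsigned start = (unsigned)__builtin_ctzll(b);       // least set bit
    const uint64_t shifted = b >> start;                       // run now begins at bit 0
    const unsigned len = (~shifted == 0) ? (64 - start)        // run reaches the top bit
                                         : (unsigned)__builtin_ctzll(~shifted);
    printf("run: start %u, length %u\n", start, len);
    if (start + len >= 64) break;                              // nothing left above the run
    b &= ~(((UINT64_C(1) << len) - 1) << start);               // clear the visited run
  }
}

int main(void) {
  visit_set_runs(UINT64_C(0xF0F0000000000F01));   // runs at 0, 8..11, 52..55, 60..63
  return 0;
}

Visiting runs instead of single bits is what lets each callback receive a whole slice range in one call rather than one slice at a time.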
- string(REPLACE "mimalloc" "mimalloc-override" mi_override_output_name ${mi_basename}) - set_target_properties(mimalloc PROPERTIES OUTPUT_NAME ${mi_override_output_name}) + # string(REPLACE "mimalloc" "mimalloc-override" mi_override_output_name ${mi_basename}) + # set_target_properties(mimalloc PROPERTIES OUTPUT_NAME ${mi_override_output_name}) endif() endif() if(NOT WIN32) From 5614c5052ec5d7490391da3b98dc4bdcc0e1ed7c Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 20 Dec 2024 11:56:04 -0800 Subject: [PATCH 110/264] don't prefer high used candidate if it is too full --- src/page.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/page.c b/src/page.c index a30db6c9..0de56752 100644 --- a/src/page.c +++ b/src/page.c @@ -682,7 +682,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m _mi_page_free(page_candidate, pq); page_candidate = page; } - else if (page->used >= page_candidate->used) { // && !mi_page_is_mostly_used(page)) { + else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate From 7141d9f1642ff24f5d94e5ae3767f3212153f25f Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 17:31:48 -0800 Subject: [PATCH 111/264] remove busy wait for arena reservation --- src/arena.c | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/arena.c b/src/arena.c index 44c909c1..74cd4977 100644 --- a/src/arena.c +++ b/src/arena.c @@ -453,7 +453,7 @@ static mi_decl_noinline void* mi_arena_try_alloc( mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; -again: + // try to find free slices in the arena's p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; @@ -465,22 +465,25 @@ again: if (_mi_preloading()) return NULL; // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) - if (mi_lock_try_acquire(&mi_arena_reserve_lock)) { - mi_arena_id_t arena_id = 0; - bool ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + const size_t arena_count = mi_arena_get_count(); + if (mi_lock_acquire(&mi_arena_reserve_lock)) { + bool ok = true; + if (arena_count == mi_arena_get_count()) { + // we are the first to enter the lock, reserve a fresh arena + mi_arena_id_t arena_id = 0; + ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + } + else { + // another thread already reserved a new arena + } mi_lock_release(&mi_arena_reserve_lock); if (ok) { - // and try allocate in there + // try once more to allocate in the new arena mi_assert_internal(req_arena_id == _mi_arena_id_none()); p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); if (p != NULL) return p; } } - else { - // if we are racing with another thread wait until the new arena is reserved (todo: a better yield?) 
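Patch 111 replaces that busy wait with a blocking lock plus a re-check of the arena count, so a thread that loses the race simply retries allocation in whatever arena the winner reserved. A compressed, self-contained sketch of that shape using C11 threads (the toy_* names and fixed capacities are invented for illustration):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdio.h>
#include <threads.h>

// toy "arena": a capacity we carve allocations from with an atomic bump counter
typedef struct toy_arena_s { atomic_size_t used; size_t capacity; } toy_arena_t;

#define MAX_ARENAS 8
static toy_arena_t   arenas[MAX_ARENAS];
static atomic_size_t arena_count;
static mtx_t         reserve_lock;

static void* toy_try_alloc(size_t size) {
  const size_t n = atomic_load(&arena_count);
  for (size_t i = 0; i < n; i++) {
    const size_t old = atomic_fetch_add(&arenas[i].used, size);
    if (old + size <= arenas[i].capacity) return &arenas[i];  // "allocated" in arena i
    atomic_fetch_sub(&arenas[i].used, size);                  // roll back: arena i is full
  }
  return NULL;
}

// let only one thread reserve at a time; losers re-check instead of spinning
static void* toy_alloc(size_t size) {
  void* p = toy_try_alloc(size);
  if (p != NULL) return p;
  const size_t seen = atomic_load(&arena_count);  // snapshot before taking the lock
  bool ok = true;
  mtx_lock(&reserve_lock);
  if (seen == atomic_load(&arena_count)) {        // we are first: reserve one fresh arena
    const size_t i = atomic_load(&arena_count);
    ok = (i < MAX_ARENAS);
    if (ok) { arenas[i].capacity = 1024*1024; atomic_store(&arena_count, i + 1); }
  }                                               // else: someone reserved while we waited
  mtx_unlock(&reserve_lock);
  return (ok ? toy_try_alloc(size) : NULL);
}

int main(void) {
  mtx_init(&reserve_lock, mtx_plain);
  printf("%p\n", toy_alloc(4096));                // triggers the first reservation
  mtx_destroy(&reserve_lock);
  return 0;
}

The snapshot-before-lock, compare-after-lock step is what keeps a burst of contending threads from reserving more than one new arena, without any spinning.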
- mi_atomic_yield(); - goto again; - } return NULL; } From a5b7d7f26461d0d241b6de41f215d63dbfa642cb Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 21:38:31 -0800 Subject: [PATCH 112/264] subprocesses own arena's --- include/mimalloc.h | 2 +- include/mimalloc/atomic.h | 2 +- include/mimalloc/internal.h | 15 +- include/mimalloc/types.h | 56 +++---- src/alloc.c | 4 +- src/arena-meta.c | 6 +- src/arena.c | 315 +++++++++++++++++------------------- src/bitmap.c | 7 +- src/bitmap.h | 4 +- src/free.c | 6 +- src/heap.c | 7 +- src/init.c | 259 ++++++++++++++++------------- src/page.c | 2 +- 13 files changed, 351 insertions(+), 334 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 24217fae..7a58e54c 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -279,7 +279,7 @@ mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_commit mi_decl_export void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's -typedef int mi_arena_id_t; +typedef void* mi_arena_id_t; mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 7dc492f6..ddb5a9a3 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -434,7 +434,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { InitializeSRWLock(lock); } static inline void mi_lock_done(mi_lock_t* lock) { - // nothing + (void)(lock); } diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index a5ca3e27..24792f8c 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -101,8 +101,10 @@ bool _mi_is_main_thread(void); size_t _mi_current_thread_count(void); bool _mi_preloading(void); // true while the C runtime is not initialized yet void _mi_thread_done(mi_heap_t* heap); -mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` +mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` +mi_subproc_t* _mi_subproc(void); +mi_subproc_t* _mi_subproc_main(void); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; size_t _mi_thread_seq_id(void) mi_attr_noexcept; @@ -142,10 +144,11 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t m // arena.c mi_arena_id_t _mi_arena_id_none(void); -void _mi_arena_init(void); -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); -void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid); -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id); +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); + +void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void* _mi_arena_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +bool 
_mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge); void _mi_arena_unsafe_destroy_all(void); @@ -524,7 +527,7 @@ static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { if (heap != NULL) { page->heap = heap; page->heap_tag = heap->tag; - mi_atomic_store_release(&page->xthread_id, heap->thread_id); + mi_atomic_store_release(&page->xthread_id, heap->tld->thread_id); } else { page->heap = NULL; diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 0cf909d0..4d43e887 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -243,9 +243,6 @@ typedef size_t mi_page_flags_t; // atomically in `free.c:mi_free_block_mt`. typedef uintptr_t mi_thread_free_t; -// Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython) -typedef struct mi_subproc_s mi_subproc_t; - // A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython) typedef uint8_t mi_heaptag_t; @@ -299,7 +296,6 @@ typedef struct mi_page_s { mi_heap_t* heap; // heap this threads belong to. struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - mi_subproc_t* subproc; // sub-process of this heap mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -380,7 +376,7 @@ typedef struct mi_random_cxt_s { // In debug mode there is a padding structure at the end of the blocks to check for buffer overflows -#if (MI_PADDING) +#if MI_PADDING typedef struct mi_padding_s { uint32_t canary; // encoded block value to check validity of the padding (in case of overflow) uint32_t delta; // padding bytes before the block. (mi_usable_size(p) - delta == exact allocated bytes) @@ -397,10 +393,8 @@ typedef struct mi_padding_s { // A heap owns a set of pages. struct mi_heap_s { - mi_tld_t* tld; - // _Atomic(mi_block_t*) thread_delayed_free; - mi_threadid_t thread_id; // thread this heap belongs too - mi_arena_id_t arena_id; // arena id if the heap belongs to a specific arena (or 0) + mi_tld_t* tld; // thread-local data + mi_arena_t* exclusive_arena; // if the heap belongs to a specific arena (or NULL) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -408,7 +402,6 @@ struct mi_heap_s { size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. mi_heap_t* next; // list of heaps per thread - mi_memid_t memid; // provenance of the heap struct itseft (meta or os) long full_page_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint @@ -421,7 +414,8 @@ struct mi_heap_s { size_t guarded_sample_count; // current sample count (counting down to 0) #endif mi_page_t* pages_free_direct[MI_PAGES_DIRECT]; // optimize: array where every entry points a page with possibly free blocks in the corresponding queue for that size. 
- mi_page_queue_t pages[MI_BIN_FULL + 1]; // queue of pages for each size class (or "bin") + mi_page_queue_t pages[MI_BIN_COUNT]; // queue of pages for each size class (or "bin") + mi_memid_t memid; // provenance of the heap struct itself (meta or os) }; @@ -479,7 +473,7 @@ typedef struct mi_stats_s { mi_stat_counter_t arena_count; mi_stat_counter_t guarded_alloc_count; #if MI_STAT>1 - mi_stat_count_t normal_bins[MI_BIN_HUGE+1]; + mi_stat_count_t normal_bins[MI_BIN_COUNT]; #endif } mi_stats_t; @@ -513,19 +507,24 @@ void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); // ------------------------------------------------------ -// Sub processes do not reclaim or visit segments -// from other sub processes +// Sub processes use separate arena's and no heaps/pages/blocks +// are shared between sub processes. +// Each thread should also belong to one sub-process only // ------------------------------------------------------ -struct mi_subproc_s { - _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // count of abandoned pages for this sub-process - _Atomic(size_t) abandoned_os_list_count; // count of abandoned pages in the os-list - mi_lock_t abandoned_os_lock; // lock for the abandoned os pages list (outside of arena's) (this lock protect list operations) - mi_lock_t abandoned_os_visit_lock; // ensure only one thread per subproc visits the abandoned os list - mi_page_t* abandoned_os_list; // doubly-linked list of abandoned pages outside of arena's (in OS allocated memory) - mi_page_t* abandoned_os_list_tail; // the tail-end of the list - mi_memid_t memid; // provenance of this memory block -}; +#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) + // 160 arenas is enough for ~2 TiB memory + +typedef struct mi_subproc_s { + _Atomic(size_t) arena_count; // current count of arena's + _Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS]; // arena's of this sub-process + mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process + mi_page_queue_t os_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) + mi_lock_t os_pages_lock; // lock for the os pages list (this lock protects list operations) + mi_memid_t memid; // provenance of this memory block (meta or OS) +} mi_subproc_t; + // ------------------------------------------------------ // Thread Local data @@ -534,20 +533,21 @@ struct mi_subproc_s { // Milliseconds as in `int64_t` to avoid overflows typedef int64_t mi_msecs_t; - // Thread local data struct mi_tld_s { - unsigned long long heartbeat; // monotonic heartbeat count + mi_threadid_t thread_id; // thread id of this thread + size_t thread_seq; // thread sequence id (linear count of created threads) + mi_subproc_t* subproc; // sub-process this thread belongs to. mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - mi_subproc_t* subproc; // sub-process this thread belongs to. - size_t tseq; // thread sequence id - mi_memid_t memid; // provenance of the tld memory itself (meta or OS) + unsigned long long heartbeat; // monotonic heartbeat count bool recurse; // true if deferred was called; used to prevent infinite recursion. 
bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) mi_stats_t stats; // statistics + mi_memid_t memid; // provenance of the tld memory itself (meta or OS) }; + /* ----------------------------------------------------------- Error codes passed to `_mi_fatal_error` All are recoverable but EFAULT is a serious error and aborts by default in secure mode. diff --git a/src/alloc.c b/src/alloc.c index 25d6f62e..e5f2b8ae 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -134,7 +134,7 @@ static inline mi_decl_restrict void* mi_heap_malloc_small_zero(mi_heap_t* heap, mi_assert(size <= MI_SMALL_SIZE_MAX); #if MI_DEBUG const uintptr_t tid = _mi_thread_id(); - mi_assert(heap->thread_id == 0 || heap->thread_id == tid); // heaps are thread local + mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == tid); // heaps are thread local #endif #if (MI_PADDING || MI_GUARDED) if (size == 0) { size = sizeof(void*); } @@ -188,7 +188,7 @@ extern inline void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool z else { // regular allocation mi_assert(heap!=NULL); - mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local + mi_assert(heap->tld->thread_id == 0 || heap->tld->thread_id == _mi_thread_id()); // heaps are thread local void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE, zero, huge_alignment); // note: size can overflow but it is detected in malloc_generic mi_track_malloc(p,size,zero); diff --git a/src/arena-meta.c b/src/arena-meta.c index ceda06ba..f28c50e9 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -64,10 +64,12 @@ static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { // allocate a fresh meta page and add it to the global list. static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice + // note: we always use subproc_main directly for the meta-data since at thread start the metadata for the + // tld and heap need to be (meta) allocated and at that time we cannot read the tld pointer (yet). mi_memid_t memid; - mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, + mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc_main(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, true /* commit*/, true /* allow large */, - _mi_arena_id_none(), 0 /* tseq */, &memid ); + NULL, 0 /* tseq */, &memid ); if (mpage == NULL) return NULL; mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { diff --git a/src/arena.c b/src/arena.c index 74cd4977..bb846da9 100644 --- a/src/arena.c +++ b/src/arena.c @@ -35,7 +35,7 @@ The arena allocation needs to be thread safe and we use an atomic bitmap to allo // A memory arena descriptor typedef struct mi_arena_s { mi_memid_t memid; // memid of the memory area - mi_arena_id_t id; // arena id (> 0 where `arena == arenas[arena->id - 1]`) + mi_subproc_t* subproc; // subprocess this arena belongs to (`this 'in' this->subproc->arenas`) size_t slice_count; // total size of the area in arena slices (of `MI_ARENA_SLICE_SIZE`) size_t info_slices; // initial slices reserved for the arena bitmaps @@ -64,64 +64,45 @@ typedef struct mi_purge_info_s { } mi_purge_info_t; -#define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. 
but arena's scale up exponentially (see `mi_arena_reserve`) - // 160 arenas is enough for ~2 TiB memory - -// The available arenas -static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; -static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 - - -static mi_lock_t mi_arena_reserve_lock; - -void _mi_arena_init(void) { - mi_lock_init(&mi_arena_reserve_lock); -} /* ----------------------------------------------------------- Arena id's - id = arena_index + 1 ----------------------------------------------------------- */ -size_t mi_arena_id_index(mi_arena_id_t id) { - return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); -} - -static mi_arena_id_t mi_arena_id_create(size_t arena_index) { - mi_assert_internal(arena_index < MI_MAX_ARENAS); - return (int)arena_index + 1; +static mi_arena_id_t mi_arena_id_create(mi_arena_t* arena) { + return arena; } mi_arena_id_t _mi_arena_id_none(void) { - return 0; + return NULL; } -static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { - return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || - (arena_id == req_arena_id)); +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id) { + return (mi_arena_t*)id; } -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + +static bool mi_arena_id_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena) { + return ((arena == req_arena) || // they match, + (req_arena == NULL && !arena->is_exclusive)); // or the arena is not exclusive, and we didn't request a specific one +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena) { if (memid.memkind == MI_MEM_ARENA) { - const mi_arena_t* arena = memid.mem.arena.arena; - return mi_arena_id_is_suitable(arena->id, arena->is_exclusive, request_arena_id); + return mi_arena_id_is_suitable(memid.mem.arena.arena, request_arena); } else { - return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + return mi_arena_id_is_suitable(NULL, request_arena); } } -size_t mi_arena_get_count(void) { - return mi_atomic_load_relaxed(&mi_arena_count); +size_t mi_arenas_get_count(mi_subproc_t* subproc) { + return mi_atomic_load_relaxed(&subproc->arena_count); } -mi_arena_t* mi_arena_from_index(size_t idx) { - mi_assert_internal(idx < mi_arena_get_count()); - return mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[idx]); -} - -mi_arena_t* mi_arena_from_id(mi_arena_id_t id) { - return mi_arena_from_index(mi_arena_id_index(id)); +mi_arena_t* mi_arena_from_index(mi_subproc_t* subproc, size_t idx) { + mi_assert_internal(idx < mi_arenas_get_count(subproc)); + return mi_atomic_load_ptr_relaxed(mi_arena_t, &subproc->arenas[idx]); } static size_t mi_arena_info_slices(mi_arena_t* arena) { @@ -159,9 +140,7 @@ uint8_t* mi_arena_slice_start(mi_arena_t* arena, size_t slice_index) { // Arena area void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { if (size != NULL) *size = 0; - const size_t arena_index = mi_arena_id_index(arena_id); - if (arena_index >= MI_MAX_ARENAS) return NULL; - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + mi_arena_t* arena = _mi_arena_from_id(arena_id); if (arena == NULL) return NULL; if (size != NULL) { *size = mi_size_of_slices(arena->slice_count); } return mi_arena_start(arena); @@ -297,12 +276,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // try to reserve a fresh arena space -static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t 
req_arena_id, mi_arena_id_t* arena_id) +static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { // if (_mi_preloading()) return false; // use OS only while pre loading if (req_arena_id != _mi_arena_id_none()) return false; - const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + const size_t arena_count = mi_arenas_get_count(subproc); if (arena_count > (MI_MAX_ARENAS - 4)) return false; // calc reserve @@ -368,32 +347,27 @@ static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t re Arena iteration ----------------------------------------------------------- */ -static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_arena_id, int numa_node, bool allow_large) { +static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_large) { if (!allow_large && arena->is_large) return false; - if (!mi_arena_id_is_suitable(arena->id, arena->is_exclusive, req_arena_id)) return false; - if (req_arena_id == _mi_arena_id_none()) { // if not specific, check numa affinity + if (!mi_arena_id_is_suitable(arena, req_arena)) return false; + if (req_arena == NULL) { // if not specific, check numa affinity const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); if (!numa_suitable) return false; } return true; } - -#define mi_forall_arenas(req_arena_id, tseq, name_arena) \ - { \ - const size_t _arena_count = mi_arena_get_count(); \ - if (_arena_count > 0) { \ - const size_t _arena_cycle = _arena_count - 1; /* first search the arenas below the last one */ \ - size_t _start; \ - if (req_arena_id == _mi_arena_id_none()) { \ - /* always start searching in the arena's below the max */ \ - _start = (_arena_cycle <= 1 ? 0 : (tseq % _arena_cycle)); \ +#define mi_forall_arenas(subproc, req_arena, tseq, name_arena) { \ + const size_t _arena_count = mi_arenas_get_count(subproc); \ + const size_t _arena_cycle = (_arena_count == 0 ? 0 : _arena_count - 1); /* first search the arenas below the last one */ \ + /* always start searching in the arena's below the max */ \ + size_t _start = (_arena_cycle <= 1 ? 
0 : (tseq % _arena_cycle)); \ + for (size_t _i = 0; _i < _arena_count; _i++) { \ + mi_arena_t* name_arena; \ + if (req_arena != NULL) { \ + name_arena = req_arena; /* if there is a specific req_arena, only search that one */\ } \ else { \ - _start = mi_arena_id_index(req_arena_id); \ - mi_assert_internal(_start < _arena_count); \ - } \ - for (size_t _i = 0; _i < _arena_count; _i++) { \ size_t _idx; \ if (_i < _arena_cycle) { \ _idx = _i + _start; \ @@ -402,19 +376,20 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are else { \ _idx = _i; /* remaining arena's */ \ } \ - mi_arena_t* const name_arena = mi_arena_from_index(_idx); \ - if (name_arena != NULL) \ - { + name_arena = mi_arena_from_index(subproc,_idx); \ + } \ + if (name_arena != NULL) \ + { #define mi_forall_arenas_end() \ - } \ - if (req_arena_id != _mi_arena_id_none()) break; \ } \ - }} + if (req_arena != NULL) break; \ + } \ + } -#define mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, name_arena) \ - mi_forall_arenas(req_arena_id,tseq,name_arena) { \ - if (mi_arena_is_suitable(name_arena, req_arena_id, -1 /* todo: numa node */, allow_large)) { \ +#define mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, name_arena) \ + mi_forall_arenas(subproc, req_arena,tseq,name_arena) { \ + if (mi_arena_is_suitable(name_arena, req_arena, -1 /* todo: numa node */, allow_large)) { \ #define mi_forall_suitable_arenas_end() \ }} \ @@ -425,17 +400,16 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_id_t req_are ----------------------------------------------------------- */ // allocate slices from the arenas -static mi_decl_noinline void* mi_arena_try_find_free( - size_t slice_count, size_t alignment, - bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) +static mi_decl_noinline void* mi_arenas_try_find_free( + mi_subproc_t* subproc, size_t slice_count, size_t alignment, + bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE)); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); if (alignment > MI_ARENA_SLICE_ALIGN) return NULL; // search arena's - mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) + mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena) { void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid); if (p != NULL) return p; @@ -445,42 +419,43 @@ static mi_decl_noinline void* mi_arena_try_find_free( } // Allocate slices from the arena's -- potentially allocating a fresh arena -static mi_decl_noinline void* mi_arena_try_alloc( +static mi_decl_noinline void* mi_arenas_try_alloc( + mi_subproc_t* subproc, size_t slice_count, size_t alignment, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) + mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES); mi_assert(alignment <= MI_ARENA_SLICE_ALIGN); void* p; // try to find free slices in the arena's - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); if (p != NULL) return p; // did we need a specific arena? - if (req_arena_id != _mi_arena_id_none()) return NULL; + if (req_arena != NULL) return NULL; // don't create arena's while preloading (todo: or should we?) 
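The rewritten mi_forall_arenas macro above starts each thread's search at an offset derived from its thread sequence number, so concurrent threads tend to probe different arenas first, while the most recently added arena is always visited last. A small sketch of that visiting order as a plain function (illustrative names, not the macro itself):

#include <stddef.h>
#include <stdio.h>

// visit `count` arenas: rotate the first `count-1` indices by `tseq`, keep the newest last
static void forall_arena_indices(size_t count, size_t tseq,
                                 void (*visit)(size_t idx, void* arg), void* arg) {
  if (count == 0) return;
  const size_t cycle = count - 1;                  // all but the most recently added arena
  const size_t start = (cycle <= 1 ? 0 : tseq % cycle);
  for (size_t i = 0; i < count; i++) {
    size_t idx;
    if (i < cycle) {
      idx = i + start;
      if (idx >= cycle) idx -= cycle;              // wrap around within the cycle
    }
    else {
      idx = i;                                     // the remaining (newest) arena
    }
    visit(idx, arg);
  }
}

static void print_idx(size_t idx, void* arg) { (void)arg; printf("%zu ", idx); }

int main(void) {
  forall_arena_indices(5, 7, &print_idx, NULL);    // prints: 3 0 1 2 4
  printf("\n");
  return 0;
}

Visiting the newest arena last appears intended to keep threads from piling onto a freshly reserved arena while older arenas still have free slices.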
if (_mi_preloading()) return NULL; // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) - const size_t arena_count = mi_arena_get_count(); - if (mi_lock_acquire(&mi_arena_reserve_lock)) { + const size_t arena_count = mi_arenas_get_count(subproc); + if (mi_lock_acquire(&subproc->arena_reserve_lock)) { bool ok = true; - if (arena_count == mi_arena_get_count()) { + if (arena_count == mi_arenas_get_count(subproc)) { // we are the first to enter the lock, reserve a fresh arena mi_arena_id_t arena_id = 0; - ok = mi_arena_reserve(mi_size_of_slices(slice_count), allow_large, req_arena_id, &arena_id); + ok = mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); } else { // another thread already reserved a new arena } - mi_lock_release(&mi_arena_reserve_lock); + mi_lock_release(&subproc->arena_reserve_lock); if (ok) { // try once more to allocate in the new arena - mi_assert_internal(req_arena_id == _mi_arena_id_none()); - p = mi_arena_try_find_free(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); + mi_assert_internal(req_arena == NULL); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); if (p != NULL) return p; } } @@ -510,10 +485,10 @@ static void* mi_arena_os_alloc_aligned( // Allocate large sized memory -void* _mi_arena_alloc_aligned( +void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, - mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) + mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { mi_assert_internal(memid != NULL); mi_assert_internal(size > 0); @@ -522,24 +497,24 @@ void* _mi_arena_alloc_aligned( // const int numa_node = _mi_os_numa_node(&tld->os); // current numa node // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) - if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? - req_arena_id == _mi_arena_id_none() && // not a specific arena? + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? + req_arena == NULL && // not a specific arena? 
size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { const size_t slice_count = mi_slice_count_of_size(size); - void* p = mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, tseq, memid); + void* p = mi_arenas_try_alloc(subproc,slice_count, alignment, commit, allow_large, req_arena, tseq, memid); if (p != NULL) return p; } // fall back to the OS - void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena_id, memid); + void* p = mi_arena_os_alloc_aligned(size, alignment, align_offset, commit, allow_large, req_arena, memid); return p; } -void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid) +void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { - return _mi_arena_alloc_aligned(size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena_id, tseq, memid); + return _mi_arena_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid); } @@ -548,7 +523,7 @@ void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t Arena page allocation ----------------------------------------------------------- */ -static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_abandoned) { +static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, mi_heaptag_t heap_tag, bool* keep_abandoned) { // found an abandoned page of the right size mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); // can we claim ownership? @@ -560,9 +535,9 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, *keep_abandoned = true; return false; } - if (subproc != page->subproc || heap_tag != page->heap_tag) { - // wrong sub-process or heap_tag.. we need to unown again - // note: this normally never happens unless subprocesses/heaptags are actually used. + if (heap_tag != page->heap_tag) { + // wrong heap_tag.. we need to unown again + // note: this normally never happens unless heaptags are actually used. // (an unown might free the page, and depending on that we can keep it in the abandoned map or not) // note: a minor wrinkle: the page will still be mapped but the abandoned map entry is (temporarily) clear at this point. // so we cannot check in `mi_arena_free` for this invariant to hold. @@ -570,31 +545,31 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, *keep_abandoned = !freed; return false; } - // yes, we can reclaim it, keep the abandaned map entry clear + // yes, we can reclaim it, keep the abandoned map entry clear *keep_abandoned = false; return true; } -static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_heaptag_t heaptag, mi_tld_t* tld) +static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t slice_count, size_t block_size, mi_arena_t* req_arena, mi_heaptag_t heaptag, size_t tseq) { MI_UNUSED(slice_count); const size_t bin = _mi_bin(block_size); mi_assert_internal(bin < MI_BIN_COUNT); // any abandoned in our size class? 
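Reclaiming abandoned pages begins with a cheap filter: a per-size-class atomic counter is consulted before any bitmap is searched, so the common case of nothing being abandoned in a bin costs one relaxed load. A minimal sketch of that counter discipline (illustrative names; the counter is only a hint and the bitmap scan stays authoritative):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define BIN_COUNT 64
static _Atomic size_t abandoned_count[BIN_COUNT];   // one counter per size class

// bookkeeping when a page of size class `bin` is abandoned or reclaimed
static void on_abandon(size_t bin) { atomic_fetch_add_explicit(&abandoned_count[bin], 1, memory_order_relaxed); }
static void on_reclaim(size_t bin) { atomic_fetch_sub_explicit(&abandoned_count[bin], 1, memory_order_relaxed); }

// stand-in for the expensive scan over the per-arena pages_abandoned bitmaps
static bool search_abandoned_bitmaps(size_t bin) { (void)bin; return false; }

static bool try_reclaim(size_t bin) {
  if (atomic_load_explicit(&abandoned_count[bin], memory_order_relaxed) == 0) {
    return false;                        // fast path: nothing abandoned in this size class
  }
  return search_abandoned_bitmaps(bin);  // slow path: scan the bitmaps of suitable arenas
}

int main(void) {
  on_abandon(3);
  (void)try_reclaim(3);   // counter says "maybe", so the scan runs
  on_reclaim(3);
  return 0;
}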
- mi_subproc_t* const subproc = tld->subproc; mi_assert_internal(subproc != NULL); - if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) return NULL; + if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) == 0) { + return NULL; + } // search arena's const bool allow_large = true; - size_t tseq = tld->tseq; - mi_forall_suitable_arenas(req_arena_id, tseq, allow_large, arena) + mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena) { size_t slice_index; mi_bitmap_t* const bitmap = arena->pages_abandoned[bin]; - if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, subproc, heaptag)) { + if (mi_bitmap_try_find_and_claim(bitmap, tseq, &slice_index, &mi_arena_try_claim_abandoned, arena, heaptag)) { // found an abandoned page of the right size // and claimed ownership. mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); @@ -621,8 +596,8 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl return NULL; } -static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, size_t block_alignment, - mi_arena_id_t req_arena_id, mi_tld_t* tld) +static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, + mi_arena_t* req_arena, size_t tseq) { const bool allow_large = true; const bool commit = true; @@ -636,7 +611,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large { - page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, tld->tseq, &memid); + page = (mi_page_t*)mi_arenas_try_alloc(subproc, slice_count, page_alignment, commit, allow_large, req_arena, tseq, &memid); if (page != NULL) { mi_assert_internal(mi_bitmap_is_clearN(memid.mem.arena.arena->pages, memid.mem.arena.slice_index, memid.mem.arena.slice_count)); mi_bitmap_set(memid.mem.arena.arena->pages, memid.mem.arena.slice_index); @@ -648,10 +623,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz if (os_align) { // note: slice_count already includes the page mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena, &memid); } else { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena, &memid); } } @@ -724,17 +699,17 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz } static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size_t block_size) { - const mi_arena_id_t req_arena_id = heap->arena_id; + mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; // 1. 
look for an abandoned page - mi_page_t* page = mi_arena_page_try_find_abandoned(slice_count, block_size, req_arena_id, heap->tag, tld); + mi_page_t* page = mi_arena_page_try_find_abandoned(tld->subproc, slice_count, block_size, req_arena, heap->tag, tld->thread_seq); if (page != NULL) { return page; // return as abandoned } // 2. find a free block, potentially allocating a new arena - page = mi_arena_page_alloc_fresh(slice_count, block_size, 1, req_arena_id, tld); + page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -746,13 +721,13 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { - const mi_arena_id_t req_arena_id = heap->arena_id; + mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); const size_t slice_count = mi_slice_count_of_size(info_size + block_size); - mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld); + mi_page_t* page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); if (page == NULL) return NULL; mi_assert(page != NULL); @@ -836,7 +811,6 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(!mi_page_all_free(page)); mi_assert_internal(page->next==NULL); - mi_subproc_t* subproc = page->subproc; if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { // make available for allocations size_t bin = _mi_bin(mi_page_block_size(page)); @@ -851,7 +825,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_page_set_abandoned_mapped(page); const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); MI_UNUSED(wasclear); mi_assert_internal(wasclear); - mi_atomic_increment_relaxed(&subproc->abandoned_count[bin]); + mi_atomic_increment_relaxed(&arena->subproc->abandoned_count[bin]); } else { // page is full (or a singleton), page is OS/externally allocated @@ -902,7 +876,7 @@ void _mi_arena_page_unabandon(mi_page_t* page) { // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); - mi_atomic_decrement_relaxed(&page->subproc->abandoned_count[bin]); + mi_atomic_decrement_relaxed(&arena->subproc->abandoned_count[bin]); } else { // page is full (or a singleton), page is OS/nly allocated @@ -989,9 +963,10 @@ void _mi_arenas_collect(bool force_purge) { // Is a pointer inside any of our arenas? 
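The containment test that follows is a plain address-range check over the arenas of the current sub-process. A stand-alone sketch of the same check (illustrative toy_arena_t type; the real code walks the atomic arena array with acquire loads so the check stays lock-free against concurrent arena registration):

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

typedef struct toy_arena_s {
  uint8_t* start;   // first byte of the arena
  size_t   size;    // arena size in bytes
} toy_arena_t;

// true if `p` points into one of the `count` arenas: start <= p < start + size
bool arenas_contain(const toy_arena_t* arenas, size_t count, const void* p) {
  const uint8_t* q = (const uint8_t*)p;
  for (size_t i = 0; i < count; i++) {
    if (arenas[i].start != NULL && q >= arenas[i].start && q < arenas[i].start + arenas[i].size) {
      return true;
    }
  }
  return false;
}

int main(void) {
  static uint8_t block[4096];
  toy_arena_t arenas[1] = { { block, sizeof(block) } };
  return (arenas_contain(arenas, 1, &block[100]) ? 0 : 1);
}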
bool _mi_arena_contains(const void* p) { - const size_t max_arena = mi_arena_get_count(); + mi_subproc_t* subproc = _mi_subproc(); + const size_t max_arena = mi_arenas_get_count(subproc); for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p) { return true; } @@ -1007,14 +982,14 @@ bool _mi_arena_contains(const void* p) { // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. -static void mi_arenas_unsafe_destroy(void) { - const size_t max_arena = mi_arena_get_count(); +static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { + const size_t max_arena = mi_arenas_get_count(subproc); size_t new_max_arena = 0; for (size_t i = 0; i < max_arena; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena != NULL) { // mi_lock_done(&arena->abandoned_visit_lock); - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL); if (mi_memkind_is_os(arena->memid.memkind)) { _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid); } @@ -1023,14 +998,14 @@ static void mi_arenas_unsafe_destroy(void) { // try to lower the max arena. size_t expected = max_arena; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); + mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, new_max_arena); } // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. void _mi_arena_unsafe_destroy_all(void) { - mi_arenas_unsafe_destroy(); + mi_arenas_unsafe_destroy(_mi_subproc()); _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } @@ -1039,40 +1014,36 @@ void _mi_arena_unsafe_destroy_all(void) { Add an arena. 
----------------------------------------------------------- */ -static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { +static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { mi_assert_internal(arena != NULL); mi_assert_internal(arena->slice_count > 0); - if (arena_id != NULL) { *arena_id = -1; } + if (arena_id != NULL) { *arena_id = NULL; } // first try to find a NULL entry - const size_t count = mi_arena_get_count(); + const size_t count = mi_arenas_get_count(subproc); size_t i; for (i = 0; i < count; i++) { - if (mi_arena_from_index(i) == NULL) { - arena->id = mi_arena_id_create(i); + if (mi_arena_from_index(subproc,i) == NULL) { mi_arena_t* expected = NULL; - if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &mi_arenas[i], &expected, arena)) { + if (mi_atomic_cas_ptr_strong_release(mi_arena_t, &subproc->arenas[i], &expected, arena)) { // success - if (arena_id != NULL) { *arena_id = arena->id; } + if (arena_id != NULL) { *arena_id = arena; } return true; - } - else { - arena->id = _mi_arena_id_none(); - } + } } } // otherwise increase the max - i = mi_atomic_increment_acq_rel(&mi_arena_count); + i = mi_atomic_increment_acq_rel(&subproc->arena_count); if (i >= MI_MAX_ARENAS) { - mi_atomic_decrement_acq_rel(&mi_arena_count); + mi_atomic_decrement_acq_rel(&subproc->arena_count); + arena->subproc = NULL; return false; } _mi_stat_counter_increase(&stats->arena_count,1); - arena->id = mi_arena_id_create(i); - mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); - if (arena_id != NULL) { *arena_id = arena->id; } + mi_atomic_store_ptr_release(mi_arena_t,&subproc->arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena; } return true; } @@ -1099,7 +1070,7 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { } -static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); @@ -1138,7 +1109,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int } // init - arena->id = _mi_arena_id_none(); + arena->subproc = subproc; arena->memid = memid; arena->is_exclusive = exclusive; arena->slice_count = slice_count; @@ -1176,7 +1147,7 @@ static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } - return mi_arena_add(arena, arena_id, &_mi_stats_main); + return mi_arena_add(subproc, arena, arena_id, &_mi_stats_main); } @@ -1187,7 +1158,7 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is memid.initially_committed = is_committed; memid.initially_zero = is_zero; memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); + return mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, numa_node, exclusive, memid, arena_id); } // Reserve a range of regular OS memory @@ -1198,7 +1169,7 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, 
&memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; @@ -1307,16 +1278,18 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi } void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { - size_t max_arenas = mi_arena_get_count(); + mi_subproc_t* subproc = _mi_subproc(); + size_t max_arenas = mi_arenas_get_count(subproc); size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; size_t page_total = 0; for (size_t i = 0; i < max_arenas; i++) { - mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena == NULL) break; + mi_assert(arena->subproc == subproc); slice_total += arena->slice_count; - _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "")); + _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "", arena->subproc)); if (show_inuse) { free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } @@ -1342,7 +1315,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) ----------------------------------------------------------- */ // reserve at a specific numa node int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { - if (arena_id != NULL) *arena_id = -1; + if (arena_id != NULL) *arena_id = NULL; if (pages==0) return 0; if (numa_node < -1) numa_node = -1; if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); @@ -1356,7 +1329,7 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m } _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(_mi_subproc(), p, hsize, true, numa_node, exclusive, memid, arena_id)) { _mi_os_free(p, hsize, memid); return ENOMEM; } @@ -1538,10 +1511,13 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) } -static void mi_arenas_try_purge(bool force, bool visit_all) { +static void mi_arenas_try_purge(bool force, bool visit_all) +{ if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled - const size_t max_arena = mi_arena_get_count(); + mi_tld_t* tld = _mi_tld(); + mi_subproc_t* subproc = tld->subproc; + const size_t max_arena = mi_arenas_get_count(subproc); if (max_arena == 0) return; // allow only one thread to purge at a time @@ -1549,12 +1525,12 @@ static void mi_arenas_try_purge(bool force, bool visit_all) { mi_atomic_guard(&purge_guard) { const mi_msecs_t now = _mi_clock_now(); - const size_t arena_start = 
_mi_tld()->tseq % max_arena; + const size_t arena_start = tld->thread_seq % max_arena; size_t max_purge_count = (visit_all ? max_arena : 1); for (size_t _i = 0; _i < max_arena; _i++) { size_t i = _i + arena_start; if (i >= max_arena) { i -= max_arena; } - mi_arena_t* arena = mi_arena_from_index(i); + mi_arena_t* arena = mi_arena_from_index(subproc,i); if (arena != NULL) { if (mi_arena_try_purge(arena, now, force)) { if (max_purge_count <= 1) break; @@ -1590,13 +1566,7 @@ static bool mi_arena_pages_reregister(mi_arena_t* arena) { } mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* full_size) { - const size_t count = mi_arena_get_count(); - const size_t arena_idx = mi_arena_id_index(arena_id); - if (count <= arena_idx) { - _mi_warning_message("arena id is invalid (%zu)\n", arena_id); - return false; - } - mi_arena_t* arena = mi_arena_from_id(arena_id); + mi_arena_t* arena = _mi_arena_from_id(arena_id); if (arena==NULL) { return false; } @@ -1627,10 +1597,17 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* _mi_page_map_unregister_range(arena, asize); // set the entry to NULL - mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[arena_idx], NULL); - if (arena_idx + 1 == count) { // try adjust the count? - size_t expected = count; - mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, count-1); + mi_subproc_t* subproc = arena->subproc; + const size_t count = mi_arenas_get_count(subproc); + for(size_t i = 0; i < count; i++) { + if (mi_arena_from_index(subproc, i) == arena) { + mi_atomic_store_ptr_release(mi_arena_t, &subproc->arenas[i], NULL); + if (i + 1 == count) { // try adjust the count? + size_t expected = count; + mi_atomic_cas_strong_acq_rel(&subproc->arena_count, &expected, count-1); + } + break; + } } return true; } @@ -1662,8 +1639,8 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, arena->memid.initially_zero = is_zero; arena->is_exclusive = true; arena->is_large = is_large; - arena->id = _mi_arena_id_none(); - if (!mi_arena_add(arena, arena_id, &_mi_stats_main)) { + arena->subproc = NULL; + if (!mi_arena_add(_mi_subproc(), arena, arena_id, &_mi_stats_main)) { return false; } mi_arena_pages_reregister(arena); diff --git a/src/bitmap.c b/src/bitmap.c index 6fae1ed6..6352e4ea 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1228,7 +1228,6 @@ bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t tseq, size_t n, typedef struct mi_claim_fun_data_s { mi_arena_t* arena; - mi_subproc_t* subproc; mi_heaptag_t heap_tag; } mi_claim_fun_data_t; @@ -1242,7 +1241,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk const size_t slice_index = (chunk_idx * MI_BCHUNK_BITS) + cidx; mi_assert_internal(slice_index < mi_bitmap_max_bits(bitmap)); bool keep_set = true; - if ((*claim_fun)(slice_index, claim_data->arena, claim_data->subproc, claim_data->heap_tag, &keep_set)) { + if ((*claim_fun)(slice_index, claim_data->arena, claim_data->heap_tag, &keep_set)) { // success! mi_assert_internal(!keep_set); *pidx = slice_index; @@ -1267,9 +1266,9 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk // Find a set bit in the bitmap and try to atomically clear it and claim it. // (Used to find pages in the pages_abandoned bitmaps.) 
mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag) + mi_claim_fun_t* claim, mi_arena_t* arena, mi_heaptag_t heap_tag) { - mi_claim_fun_data_t claim_data = { arena, subproc, heap_tag }; + mi_claim_fun_data_t claim_data = { arena, heap_tag }; return mi_bitmap_find(bitmap, tseq, 1, pidx, &mi_bitmap_try_find_and_claim_visit, (void*)claim, &claim_data); } diff --git a/src/bitmap.h b/src/bitmap.h index 47c22025..16ecea07 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -208,13 +208,13 @@ mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* // Called once a bit is cleared to see if the memory slice can be claimed. -typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag, bool* keep_set); +typedef bool (mi_claim_fun_t)(size_t slice_index, mi_arena_t* arena, mi_heaptag_t heap_tag, bool* keep_set); // Find a set bits in the bitmap, atomically clear it, and check if `claim` returns true. // If not claimed, continue on (potentially setting the bit again depending on `keep_set`). // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. mi_decl_nodiscard bool mi_bitmap_try_find_and_claim(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx, - mi_claim_fun_t* claim, mi_arena_t* arena, mi_subproc_t* subproc, mi_heaptag_t heap_tag ); + mi_claim_fun_t* claim, mi_arena_t* arena, mi_heaptag_t heap_tag ); // Atomically clear a bit but only if it is set. Will block otherwise until the bit is set. diff --git a/src/free.c b/src/free.c index 14034593..770856da 100644 --- a/src/free.c +++ b/src/free.c @@ -210,7 +210,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) - _mi_arena_page_unabandon(page); + _mi_arena_page_unabandon(page); // we can free the page directly _mi_arena_page_free(page); return; @@ -234,8 +234,8 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); if ((tagheap != NULL) && // don't reclaim across heap object types (tagheap->allow_page_reclaim) && // we are allowed to reclaim abandoned pages - (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->arena_id)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + // (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) + (_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) 
) { if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use diff --git a/src/heap.c b/src/heap.c index dee404d2..e8743691 100644 --- a/src/heap.c +++ b/src/heap.c @@ -178,7 +178,7 @@ mi_heap_t* mi_heap_get_backing(void) { mi_assert_internal(heap!=NULL); mi_heap_t* bheap = heap->tld->heap_backing; mi_assert_internal(bheap!=NULL); - mi_assert_internal(bheap->thread_id == _mi_thread_id()); + mi_assert_internal(bheap->tld->thread_id == _mi_thread_id()); return bheap; } @@ -190,8 +190,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); heap->memid = memid; heap->tld = tld; // avoid reading the thread-local tld during initialization - heap->thread_id = _mi_thread_id(); - heap->arena_id = arena_id; + heap->exclusive_arena = _mi_arena_from_id(arena_id); heap->allow_page_reclaim = !noreclaim; heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); heap->full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); @@ -254,7 +253,7 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { } bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { - return _mi_arena_memid_is_suitable(memid, heap->arena_id); + return _mi_arena_memid_is_suitable(memid, heap->exclusive_arena); } uintptr_t _mi_heap_random_next(mi_heap_t* heap) { diff --git a/src/init.c b/src/init.c index 9a26d56f..a15a9c6c 100644 --- a/src/init.c +++ b/src/init.c @@ -33,8 +33,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, #endif NULL, // xheap - NULL, NULL, // next, prev - NULL, // subproc + NULL, NULL, // next, prev MI_MEMID_STATIC // memid }; @@ -96,27 +95,76 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- +static mi_decl_cache_align mi_subproc_t subproc_main; + +static mi_decl_cache_align mi_tld_t tld_empty = { + 0, // thread_id + 0, // thread_seq + &subproc_main, // subproc + NULL, // heap_backing + NULL, // heaps list + 0, // heartbeat + false, // recurse + false, // is_in_threadpool + { MI_STATS_NULL }, // stats + MI_MEMID_STATIC // memid +}; + mi_decl_cache_align const mi_heap_t _mi_heap_empty = { - NULL, - // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free - 0, // thread_id - 0, // arena_id - 0, // cookie - { 0, 0 }, // keys - { {0}, {0}, 0, true }, // random - 0, // page count - MI_BIN_FULL, 0, // page retired min/max - NULL, // next - MI_MEMID_STATIC, // memid - 0, // full page retain - false, // can reclaim - true, // can eager abandon - 0, // tag + &tld_empty, // tld + NULL, // exclusive_arena + 0, // cookie + { 0, 0 }, // keys + { {0}, {0}, 0, true }, // random + 0, // page count + MI_BIN_FULL, 0, // page retired min/max + NULL, // next + 0, // full page retain + false, // can reclaim + true, // can eager abandon + 0, // tag #if MI_GUARDED - 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) + 0, 0, 0, 0, 1, // count is 1 so we never write to it (see `internal.h:mi_heap_malloc_use_guarded`) #endif MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY + MI_PAGE_QUEUES_EMPTY, + MI_MEMID_STATIC +}; + +extern mi_heap_t heap_main; + +static mi_decl_cache_align mi_tld_t tld_main = { + 0, // thread_id + 0, // thread_seq + &subproc_main, // subproc + &heap_main, // heap_backing + &heap_main, // heaps list + 0, // heartbeat + false, // recurse + false, // is_in_threadpool + { MI_STATS_NULL 
}, // stats + MI_MEMID_STATIC // memid +}; + +mi_decl_cache_align mi_heap_t heap_main = { + &tld_main, // thread local data + 0, // initial cookie + 0, // arena id + { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) + { {0x846ca68b}, {0}, 0, true }, // random + 0, // page count + MI_BIN_FULL, 0, // page retired min/max + NULL, // next heap + 2, // full page retain + true, // allow page reclaim + true, // allow page abandon + 0, // tag + #if MI_GUARDED + 0, 0, 0, 0, 0, + #endif + MI_SMALL_PAGES_EMPTY, + MI_PAGE_QUEUES_EMPTY, + MI_MEMID_STATIC }; @@ -124,49 +172,9 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } - // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; -extern mi_heap_t _mi_heap_main; - -static mi_decl_cache_align mi_subproc_t mi_subproc_default; - -static mi_decl_cache_align mi_tld_t tld_main = { - 0, - &_mi_heap_main, // heap_backing - &_mi_heap_main, // heaps list - &mi_subproc_default, // subproc - 0, // tseq - MI_MEMID_STATIC, // memid - false, // recurse - false, // is_in_threadpool - { MI_STATS_NULL } // stats -}; - -mi_decl_cache_align mi_heap_t _mi_heap_main = { - &tld_main, - // MI_ATOMIC_VAR_INIT(NULL), // thread delayed free list - 0, // thread id - 0, // initial cookie - 0, // arena id - { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) - { {0x846ca68b}, {0}, 0, true }, // random - 0, // page count - MI_BIN_FULL, 0, // page retired min/max - NULL, // next heap - MI_MEMID_STATIC, // memid - 2, // full page retain - true, // allow page reclaim - true, // allow page abandon - 0, // tag - #if MI_GUARDED - 0, 0, 0, 0, 0, - #endif - MI_SMALL_PAGES_EMPTY, - MI_PAGE_QUEUES_EMPTY -}; - bool _mi_process_is_initialized = false; // set to `true` in `mi_process_init`. mi_stats_t _mi_stats_main = { MI_STATS_NULL }; @@ -210,30 +218,46 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { } #endif - -static void mi_heap_main_init(void) { - if (_mi_heap_main.cookie == 0) { - _mi_heap_main.thread_id = _mi_thread_id(); - _mi_heap_main.cookie = 1; - #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) - _mi_random_init_weak(&_mi_heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking - #else - _mi_random_init(&_mi_heap_main.random); - #endif - _mi_heap_main.cookie = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[0] = _mi_heap_random_next(&_mi_heap_main); - _mi_heap_main.keys[1] = _mi_heap_random_next(&_mi_heap_main); - mi_lock_init(&mi_subproc_default.abandoned_os_lock); - mi_lock_init(&mi_subproc_default.abandoned_os_visit_lock); - _mi_heap_guarded_init(&_mi_heap_main); - _mi_heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); - _mi_heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); +// Initialize main subproc +static void mi_subproc_main_init(void) { + if (subproc_main.memid.memkind != MI_MEM_STATIC) { + subproc_main.memid = _mi_memid_create(MI_MEM_STATIC); + mi_lock_init(&subproc_main.os_pages_lock); + mi_lock_init(&subproc_main.arena_reserve_lock); } } -mi_heap_t* _mi_heap_main_get(void) { +// Initialize main tld +static void mi_tld_main_init(void) { + if (tld_main.thread_id == 0) { + tld_main.thread_id = _mi_prim_thread_id(); + } +} + +// Initialization of the (statically allocated) main heap, and the main tld and subproc. 
+static void mi_heap_main_init(void) { + if (heap_main.cookie == 0) { + mi_subproc_main_init(); + mi_tld_main_init(); + // heap + heap_main.cookie = 1; + #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) + _mi_random_init_weak(&heap_main.random); // prevent allocation failure during bcrypt dll initialization with static linking + #else + _mi_random_init(&heap_main.random); + #endif + heap_main.cookie = _mi_heap_random_next(&heap_main); + heap_main.keys[0] = _mi_heap_random_next(&heap_main); + heap_main.keys[1] = _mi_heap_random_next(&heap_main); + _mi_heap_guarded_init(&heap_main); + heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); + heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); + } +} + +mi_heap_t* heap_main_get(void) { mi_heap_main_init(); - return &_mi_heap_main; + return &heap_main; } @@ -265,8 +289,9 @@ static mi_tld_t* mi_tld_alloc(void) { tld->memid = memid; tld->heap_backing = NULL; tld->heaps = NULL; - tld->subproc = &mi_subproc_default; - tld->tseq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->subproc = &subproc_main; + tld->thread_id = _mi_prim_thread_id(); + tld->thread_seq = mi_atomic_add_acq_rel(&mi_tcount, 1); tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); return tld; } @@ -291,12 +316,24 @@ mi_decl_noinline mi_tld_t* _mi_tld(void) { return mi_tld; } +mi_subproc_t* _mi_subproc(void) { + if (_mi_is_main_thread()) { // during initialization we should not recurse over reading the _mi_tld + return &subproc_main; + } + else { + return _mi_tld()->subproc; + } +} /* ----------------------------------------------------------- Sub process ----------------------------------------------------------- */ +mi_subproc_t* _mi_subproc_main(void) { + return &subproc_main; +} + mi_subproc_id_t mi_subproc_main(void) { return NULL; } @@ -305,42 +342,41 @@ mi_subproc_id_t mi_subproc_new(void) { mi_memid_t memid; mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid); if (subproc == NULL) return NULL; - subproc->abandoned_os_list = NULL; subproc->memid = memid; - mi_lock_init(&subproc->abandoned_os_lock); - mi_lock_init(&subproc->abandoned_os_visit_lock); + mi_lock_init(&subproc->os_pages_lock); + mi_lock_init(&subproc->arena_reserve_lock); return subproc; } mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id) { - return (subproc_id == NULL ? &mi_subproc_default : (mi_subproc_t*)subproc_id); + return (subproc_id == NULL ? &subproc_main : (mi_subproc_t*)subproc_id); } void mi_subproc_delete(mi_subproc_id_t subproc_id) { if (subproc_id == NULL) return; mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); - // check if there are no abandoned segments still.. + // check if there are os pages still.. bool safe_to_delete = false; - if (mi_lock_acquire(&subproc->abandoned_os_lock)) { - if (subproc->abandoned_os_list == NULL) { + if (mi_lock_acquire(&subproc->os_pages_lock)) { + if (subproc->os_pages.first == NULL) { safe_to_delete = true; } - mi_lock_release(&subproc->abandoned_os_lock); + mi_lock_release(&subproc->os_pages_lock); } if (!safe_to_delete) return; // safe to release // todo: should we refcount subprocesses? 
- mi_lock_done(&subproc->abandoned_os_lock); - mi_lock_done(&subproc->abandoned_os_visit_lock); + mi_lock_done(&subproc->os_pages_lock); + mi_lock_done(&subproc->arena_reserve_lock); _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid); } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { - mi_heap_t* heap = mi_heap_get_default(); - if (heap == NULL) return; - mi_assert(heap->tld->subproc == &mi_subproc_default); - if (heap->tld->subproc != &mi_subproc_default) return; - heap->tld->subproc = _mi_subproc_from_id(subproc_id); + mi_tld_t* tld = _mi_tld(); + if (tld == NULL) return; + mi_assert(tld->subproc == &subproc_main); + if (tld->subproc != &subproc_main) return; + tld->subproc = _mi_subproc_from_id(subproc_id); } @@ -352,10 +388,10 @@ void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { static bool _mi_thread_heap_init(void) { if (mi_heap_is_initialized(mi_prim_get_default_heap())) return true; if (_mi_is_main_thread()) { - // mi_assert_internal(_mi_heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization + // mi_assert_internal(heap_main.thread_id != 0); // can happen on freeBSD where alloc is called before any initialization // the main heap is statically allocated mi_heap_main_init(); - _mi_heap_set_default_direct(&_mi_heap_main); + _mi_heap_set_default_direct(&heap_main); //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { @@ -383,7 +419,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { if (!mi_heap_is_initialized(heap)) return true; // reset default heap - _mi_heap_set_default_direct(_mi_is_main_thread() ? &_mi_heap_main : (mi_heap_t*)&_mi_heap_empty); + _mi_heap_set_default_direct(_mi_is_main_thread() ? &heap_main : (mi_heap_t*)&_mi_heap_empty); // switch to backing heap heap = heap->tld->heap_backing; @@ -403,7 +439,7 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { mi_assert_internal(mi_heap_is_backing(heap)); // collect if not the main thread - if (heap != &_mi_heap_main) { + if (heap != &heap_main) { _mi_heap_collect_abandon(heap); } @@ -413,12 +449,12 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { // free heap meta data _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); - if (heap == &_mi_heap_main) { + if (heap == &heap_main) { #if 0 // never free the main thread even in debug mode; if a dll is linked statically with mimalloc, // there may still be delete/free calls after the mi_fls_done is called. Issue #207 _mi_heap_destroy_pages(heap); - mi_assert_internal(heap->tld->heap_backing == &_mi_heap_main); + mi_assert_internal(heap->tld->heap_backing == &heap_main); #endif } @@ -449,12 +485,12 @@ static void mi_process_setup_auto_thread_done(void) { if (tls_initialized) return; tls_initialized = true; _mi_prim_thread_init_auto_done(); - _mi_heap_set_default_direct(&_mi_heap_main); + _mi_heap_set_default_direct(&heap_main); } bool _mi_is_main_thread(void) { - return (_mi_heap_main.thread_id==0 || _mi_heap_main.thread_id == _mi_thread_id()); + return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id()); } static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); @@ -501,7 +537,7 @@ void _mi_thread_done(mi_heap_t* heap) _mi_stat_decrease(&_mi_stats_main.threads, 1); // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... 
- if (heap->thread_id != _mi_thread_id()) return; + if (heap->tld->thread_id != _mi_prim_thread_id()) return; // abandon the thread local heap _mi_thread_heap_done(heap); // returns true if already ran @@ -560,7 +596,7 @@ void _mi_process_load(void) { } // reseed random - _mi_random_reinit_if_weak(&_mi_heap_main.random); + _mi_random_reinit_if_weak(&heap_main.random); } #if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64)) @@ -587,7 +623,7 @@ void mi_process_init(void) mi_attr_noexcept { // ensure we are called once static mi_atomic_once_t process_init; #if _MSC_VER < 1920 - mi_heap_main_init(); // vs2017 can dynamically re-initialize _mi_heap_main + mi_heap_main_init(); // vs2017 can dynamically re-initialize heap_main #endif if (!mi_atomic_once(&process_init)) return; _mi_process_is_initialized = true; @@ -595,10 +631,11 @@ void mi_process_init(void) mi_attr_noexcept { mi_process_setup_auto_thread_done(); mi_detect_cpu_features(); + mi_subproc_main_init(); + mi_tld_main_init(); + mi_heap_main_init(); _mi_os_init(); _mi_page_map_init(); - _mi_arena_init(); - mi_heap_main_init(); #if MI_DEBUG _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif @@ -609,7 +646,7 @@ void mi_process_init(void) mi_attr_noexcept { #endif mi_thread_init(); - #if defined(_WIN32) + #if defined(_WIN32) && defined(MI_WIN_USE_FLS) // On windows, when building as a static lib the FLS cleanup happens to early for the main thread. // To avoid this, set the FLS value for the main thread to NULL so the fls cleanup // will not call _mi_thread_done on the (still executing) main thread. See issue #508. @@ -670,7 +707,7 @@ void mi_cdecl _mi_process_done(void) { mi_stats_print(NULL); } _mi_allocator_done(); - _mi_verbose_message("process done: 0x%zx\n", _mi_heap_main.thread_id); + _mi_verbose_message("process done: 0x%zx\n", tld_main.thread_id); os_preloading = true; // don't call the C runtime anymore } diff --git a/src/page.c b/src/page.c index d97537d1..0444b47e 100644 --- a/src/page.c +++ b/src/page.c @@ -591,7 +591,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); mi_page_set_heap(page, heap); - page->subproc = heap->tld->subproc; + size_t page_size; uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start); mi_track_mem_noaccess(page_start,page_size); From daac75af3611710b9631434a25fbe9f30fd11414 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 20 Dec 2024 22:13:58 -0800 Subject: [PATCH 113/264] fix lock recursion --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 +- include/mimalloc/atomic.h | 27 +++++++++++-- src/arena.c | 15 ++++++-- src/init.c | 51 +++++++++++++------------ 4 files changed, 62 insertions(+), 35 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index fd88cd8e..672cbb87 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea6} + + {abb5eae7-b3e6-432e-b636-333449892ea7} diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index ddb5a9a3..ab1e161d 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -408,9 +408,8 @@ static inline void mi_atomic_yield(void) { // ---------------------------------------------------------------------- // Locks -// These do not have to be recursive and should be light-weight -// in-process only locks. 
Only used for reserving arena's and to -// maintain the abandoned list. +// These should be light-weight in-process only locks. +// Only used for reserving arena's and to maintain the abandoned list. // ---------------------------------------------------------------------- #if _MSC_VER #pragma warning(disable:26110) // unlock with holding lock @@ -418,6 +417,26 @@ static inline void mi_atomic_yield(void) { #if defined(_WIN32) +#define mi_lock_t CRITICAL_SECTION + +static inline bool mi_lock_try_acquire(mi_lock_t* lock) { + return TryEnterCriticalSection(lock); +} +static inline bool mi_lock_acquire(mi_lock_t* lock) { + EnterCriticalSection(lock); + return true; +} +static inline void mi_lock_release(mi_lock_t* lock) { + LeaveCriticalSection(lock); +} +static inline void mi_lock_init(mi_lock_t* lock) { + InitializeCriticalSection(lock); +} +static inline void mi_lock_done(mi_lock_t* lock) { + DeleteCriticalSection(lock); +} + +#if 0 #define mi_lock_t SRWLOCK // slim reader-writer lock static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -436,7 +455,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } - +#endif #elif defined(MI_USE_PTHREADS) diff --git a/src/arena.c b/src/arena.c index bb846da9..fd914f43 100644 --- a/src/arena.c +++ b/src/arena.c @@ -275,6 +275,8 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( } +static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id); + // try to reserve a fresh arena space static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) { @@ -325,7 +327,7 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ const bool adjust = (overcommit && arena_commit); if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true /* on alloc */); } // and try to reserve the arena - int err = mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); + int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { if (adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back // failed, try a smaller size? @@ -1162,14 +1164,14 @@ bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is } // Reserve a range of regular OS memory -int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { +static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) { if (arena_id != NULL) *arena_id = _mi_arena_id_none(); size = _mi_align_up(size, MI_ARENA_SLICE_SIZE); // at least one slice mi_memid_t memid; void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
- if (!mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(subproc, start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; @@ -1180,6 +1182,11 @@ int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exc return 0; } +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + return mi_reserve_os_memory_ex2(_mi_subproc(), size, commit, allow_large, exclusive, arena_id); +} + // Manage a range of regular OS memory bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); @@ -1289,7 +1296,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) if (arena == NULL) break; mi_assert(arena->subproc == subproc); slice_total += arena->slice_count; - _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : "", arena->subproc)); + _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : ""), arena->subproc); if (show_inuse) { free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); } diff --git a/src/init.c b/src/init.c index a15a9c6c..177ca2bd 100644 --- a/src/init.c +++ b/src/init.c @@ -11,30 +11,31 @@ terms of the MIT license. 
A copy of the license can be found in the file #include // memcpy, memset #include // atexit -#define MI_MEMID_STATIC {{{NULL,0}}, MI_MEM_STATIC, true /* pinned */, true /* committed */, false /* zero */ } +#define MI_MEMID_INIT(kind) {{{NULL,0}}, kind, true /* pinned */, true /* committed */, false /* zero */ } +#define MI_MEMID_STATIC MI_MEMID_INIT(MI_MEM_STATIC) // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - MI_ATOMIC_VAR_INIT(0), // xthread_id - NULL, // free - 0, // used - 0, // capacity - 0, // reserved capacity - 0, // block size shift - 0, // retire_expire - NULL, // local_free - MI_ATOMIC_VAR_INIT(0), // xthread_free - MI_ATOMIC_VAR_INIT(0), // xflags - 0, // block_size - NULL, // page_start - 0, // heap tag - false, // is_zero + MI_ATOMIC_VAR_INIT(0), // xthread_id + NULL, // free + 0, // used + 0, // capacity + 0, // reserved capacity + 0, // block size shift + 0, // retire_expire + NULL, // local_free + MI_ATOMIC_VAR_INIT(0), // xthread_free + MI_ATOMIC_VAR_INIT(0), // xflags + 0, // block_size + NULL, // page_start + 0, // heap tag + false, // is_zero #if (MI_PADDING || MI_ENCODE_FREELIST) - { 0, 0 }, + { 0, 0 }, // keys #endif - NULL, // xheap - NULL, NULL, // next, prev - MI_MEMID_STATIC // memid + NULL, // xheap + NULL, NULL, // next, prev + MI_MEMID_STATIC // memid }; #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty) @@ -100,7 +101,7 @@ static mi_decl_cache_align mi_subproc_t subproc_main; static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id 0, // thread_seq - &subproc_main, // subproc + &subproc_main, // subproc NULL, // heap_backing NULL, // heaps list 0, // heartbeat @@ -111,7 +112,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = { }; mi_decl_cache_align const mi_heap_t _mi_heap_empty = { - &tld_empty, // tld + &tld_empty, // tld NULL, // exclusive_arena 0, // cookie { 0, 0 }, // keys @@ -136,9 +137,9 @@ extern mi_heap_t heap_main; static mi_decl_cache_align mi_tld_t tld_main = { 0, // thread_id 0, // thread_seq - &subproc_main, // subproc - &heap_main, // heap_backing - &heap_main, // heaps list + &subproc_main, // subproc + &heap_main, // heap_backing + &heap_main, // heaps list 0, // heartbeat false, // recurse false, // is_in_threadpool @@ -147,7 +148,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { }; mi_decl_cache_align mi_heap_t heap_main = { - &tld_main, // thread local data + &tld_main, // thread local data 0, // initial cookie 0, // arena id { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) 
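The Windows lock in the hunk above now maps to a CRITICAL_SECTION, which the owning thread may enter recursively, while the previous SRWLOCK definitions (for which recursive acquisition is not allowed) are parked behind #if 0; the non-Windows definitions appear untouched in this patch. Every caller goes through the same small wrapper API: mi_lock_init, mi_lock_try_acquire, mi_lock_acquire, mi_lock_release and mi_lock_done. Below is a minimal sketch of the acquire/inspect/release pattern used for the sub-process locks, modeled on mi_subproc_delete earlier in the series; the helper name is illustrative only and assumes the internal mimalloc headers:

  // illustrative helper (not in the patch): inspect the protected os_pages list of a sub-process
  static bool subproc_os_pages_empty(mi_subproc_t* subproc) {
    bool empty = false;
    if (mi_lock_acquire(&subproc->os_pages_lock)) {   // EnterCriticalSection; wrapper returns true once held
      empty = (subproc->os_pages.first == NULL);      // read the list head while holding the lock
      mi_lock_release(&subproc->os_pages_lock);       // release on the same thread
    }
    return empty;
  }

Because a critical section is re-entrant for its owner, a nested acquire on the same thread (which appears to be the recursion the subject line refers to) no longer deadlocks.
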
From dece8a587b5cb8642c28e0aa40c850da9c30ceb4 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 10:43:08 -0800 Subject: [PATCH 114/264] make stats part of a subproc --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 +- include/mimalloc/atomic.h | 6 +- include/mimalloc/internal.h | 1 - include/mimalloc/types.h | 126 ++++++++++++++-------- src/alloc-aligned.c | 4 +- src/arena.c | 51 +++++---- src/bitmap.c | 4 +- src/free.c | 2 +- src/heap.c | 20 ++-- src/init.c | 89 +++++++++------- src/os.c | 30 +++--- src/page.c | 12 +-- src/stats.c | 136 +++++++++++++----------- test/test-stress.c | 8 +- 14 files changed, 274 insertions(+), 219 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index 672cbb87..fd88cd8e 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea7} + + {abb5eae7-b3e6-432e-b636-333449892ea6} diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index ab1e161d..0c7fafe3 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -417,6 +417,8 @@ static inline void mi_atomic_yield(void) { #if defined(_WIN32) +#if 0 + #define mi_lock_t CRITICAL_SECTION static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -436,7 +438,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { DeleteCriticalSection(lock); } -#if 0 +#else + #define mi_lock_t SRWLOCK // slim reader-writer lock static inline bool mi_lock_try_acquire(mi_lock_t* lock) { @@ -455,6 +458,7 @@ static inline void mi_lock_init(mi_lock_t* lock) { static inline void mi_lock_done(mi_lock_t* lock) { (void)(lock); } + #endif #elif defined(MI_USE_PTHREADS) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 24792f8c..7774b378 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -90,7 +90,6 @@ uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c -extern mi_decl_cache_align mi_stats_t _mi_stats_main; extern mi_decl_cache_align const mi_page_t _mi_page_empty; void _mi_process_load(void); void mi_cdecl _mi_process_done(void); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 4d43e887..ca3913ad 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -293,7 +293,7 @@ typedef struct mi_page_s { uintptr_t keys[2]; // two random keys to encode the free lists (see `_mi_block_next`) or padding canary #endif - mi_heap_t* heap; // heap this threads belong to. + mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` mi_memid_t memid; // provenance of the page memory @@ -394,7 +394,7 @@ typedef struct mi_padding_s { // A heap owns a set of pages. 
struct mi_heap_s { mi_tld_t* tld; // thread-local data - mi_arena_t* exclusive_arena; // if the heap belongs to a specific arena (or NULL) + mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation @@ -444,18 +444,18 @@ typedef struct mi_stat_counter_s { } mi_stat_counter_t; typedef struct mi_stats_s { - mi_stat_count_t pages; - mi_stat_count_t reserved; - mi_stat_count_t committed; - mi_stat_count_t reset; - mi_stat_count_t purged; - mi_stat_count_t page_committed; - mi_stat_count_t pages_abandoned; - mi_stat_count_t threads; - mi_stat_count_t normal; - mi_stat_count_t huge; - mi_stat_count_t giant; - mi_stat_count_t malloc; + mi_stat_count_t pages; + mi_stat_count_t reserved; + mi_stat_count_t committed; + mi_stat_count_t reset; + mi_stat_count_t purged; + mi_stat_count_t page_committed; + mi_stat_count_t pages_abandoned; + mi_stat_count_t threads; + mi_stat_count_t normal; + mi_stat_count_t huge; + mi_stat_count_t giant; + mi_stat_count_t malloc; mi_stat_counter_t pages_extended; mi_stat_counter_t pages_reclaim_on_alloc; mi_stat_counter_t pages_reclaim_on_free; @@ -479,37 +479,72 @@ typedef struct mi_stats_s { // add to stat keeping track of the peak -void _mi_stat_increase(mi_stat_count_t* stat, size_t amount); -void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +void __mi_stat_increase(mi_stat_count_t* stat, size_t amount); +void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount); +void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount); +void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount); // adjust stat in special cases to compensate for double counting -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc); -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free); +void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc); +void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_free); +void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc); +void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_free); // counters can just be increased -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); +void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount); +void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount); #if (MI_STAT) -#define mi_stat_increase(stat,amount) _mi_stat_increase( &(stat), amount) -#define mi_stat_decrease(stat,amount) _mi_stat_decrease( &(stat), amount) -#define mi_stat_counter_increase(stat,amount) _mi_stat_counter_increase( &(stat), amount) -#define mi_stat_adjust_increase(stat,amnt,b) _mi_stat_adjust_increase( &(stat), amnt, b) -#define mi_stat_adjust_decrease(stat,amnt,b) _mi_stat_adjust_decrease( &(stat), amnt, b) +#define mi_debug_stat_increase(stat,amount) __mi_stat_increase( &(stat), amount) +#define mi_debug_stat_decrease(stat,amount) __mi_stat_decrease( &(stat), amount) +#define mi_debug_stat_counter_increase(stat,amount) __mi_stat_counter_increase( &(stat), amount) +#define mi_debug_stat_increase_mt(stat,amount) __mi_stat_increase_mt( &(stat), amount) +#define mi_debug_stat_decrease_mt(stat,amount) __mi_stat_decrease_mt( &(stat), amount) 
+#define mi_debug_stat_counter_increase_mt(stat,amount) __mi_stat_counter_increase_mt( &(stat), amount) +#define mi_debug_stat_adjust_increase_mt(stat,amnt,b) __mi_stat_adjust_increase_mt( &(stat), amnt, b) +#define mi_debug_stat_adjust_decrease_mt(stat,amnt,b) __mi_stat_adjust_decrease_mt( &(stat), amnt, b) #else -#define mi_stat_increase(stat,amount) ((void)0) -#define mi_stat_decrease(stat,amount) ((void)0) -#define mi_stat_counter_increase(stat,amount) ((void)0) -#define mi_stat_adjuct_increase(stat,amnt,b) ((void)0) -#define mi_stat_adjust_decrease(stat,amnt,b) ((void)0) +#define mi_debug_stat_increase(stat,amount) ((void)0) +#define mi_debug_stat_decrease(stat,amount) ((void)0) +#define mi_debug_stat_counter_increase(stat,amount) ((void)0) +#define mi_debug_stat_increase_mt(stat,amount) ((void)0) +#define mi_debug_stat_decrease_mt(stat,amount) ((void)0) +#define mi_debug_stat_counter_increase_mt(stat,amount) ((void)0) +#define mi_debug_stat_adjust_increase(stat,amnt,b) ((void)0) +#define mi_debug_stat_adjust_decrease(stat,amnt,b) ((void)0) #endif -#define mi_heap_stat_counter_increase(heap,stat,amount) mi_stat_counter_increase( (heap)->tld->stats.stat, amount) -#define mi_heap_stat_increase(heap,stat,amount) mi_stat_increase( (heap)->tld->stats.stat, amount) -#define mi_heap_stat_decrease(heap,stat,amount) mi_stat_decrease( (heap)->tld->stats.stat, amount) +#define mi_subproc_stat_counter_increase(subproc,stat,amount) __mi_stat_counter_increase_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_increase(subproc,stat,amount) __mi_stat_increase_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_decrease(subproc,stat,amount) __mi_stat_decrease_mt( &(subproc)->stats.stat, amount) +#define mi_subproc_stat_adjust_increase(subproc,stat,amnt,b) __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amnt, b) +#define mi_subproc_stat_adjust_decrease(subproc,stat,amnt,b) __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amnt, b) + +#define mi_os_stat_counter_increase(stat,amount) mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount) +#define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount) +#define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount) + +#define mi_tld_stat_counter_increase(tld,stat,amount) __mi_stat_counter_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_increase(tld,stat,amount) __mi_stat_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_decrease(tld,stat,amount) __mi_stat_decrease( &(tld)->stats.stat, amount) + +#define mi_debug_tld_stat_counter_increase(tld,stat,amount) mi_debug_stat_counter_increase( (tld)->stats.stat, amount) +#define mi_debug_tld_stat_increase(tld,stat,amount) mi_debug_stat_increase( (tld)->stats.stat, amount) +#define mi_debug_tld_stat_decrease(tld,stat,amount) mi_debug_stat_decrease( (tld)->stats.stat, amount) + +#define mi_heap_stat_counter_increase(heap,stat,amount) mi_tld_stat_counter_increase((heap)->tld, stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) mi_tld_stat_increase( (heap)->tld, stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) mi_tld_stat_decrease( (heap)->tld, stat, amount) + +#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_tld_stat_counter_increase((heap)->tld, stat, amount) +#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_tld_stat_increase( (heap)->tld, stat, amount) +#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_tld_stat_decrease( 
(heap)->tld, stat, amount) // ------------------------------------------------------ // Sub processes use separate arena's and no heaps/pages/blocks // are shared between sub processes. -// Each thread should also belong to one sub-process only +// The subprocess structure contains essentially all static variables (except per subprocess :-)) +// +// Each thread should belong to one sub-process only // ------------------------------------------------------ #define MI_MAX_ARENAS (160) // Limited for now (and takes up .bss).. but arena's scale up exponentially (see `mi_arena_reserve`) @@ -519,10 +554,13 @@ typedef struct mi_subproc_s { _Atomic(size_t) arena_count; // current count of arena's _Atomic(mi_arena_t*) arenas[MI_MAX_ARENAS]; // arena's of this sub-process mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time - _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process + + _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process mi_page_queue_t os_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) mi_lock_t os_pages_lock; // lock for the os pages list (this lock protects list operations) + mi_memid_t memid; // provenance of this memory block (meta or OS) + mi_stats_t stats; // sub-process statistics (tld stats are merged in on thread termination) } mi_subproc_t; @@ -535,16 +573,16 @@ typedef int64_t mi_msecs_t; // Thread local data struct mi_tld_s { - mi_threadid_t thread_id; // thread id of this thread - size_t thread_seq; // thread sequence id (linear count of created threads) - mi_subproc_t* subproc; // sub-process this thread belongs to. - mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) - mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) - unsigned long long heartbeat; // monotonic heartbeat count - bool recurse; // true if deferred was called; used to prevent infinite recursion. - bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) - mi_stats_t stats; // statistics - mi_memid_t memid; // provenance of the tld memory itself (meta or OS) + mi_threadid_t thread_id; // thread id of this thread + size_t thread_seq; // thread sequence id (linear count of created threads) + mi_subproc_t* subproc; // sub-process this thread belongs to. + mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted) + mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates) + unsigned long long heartbeat; // monotonic heartbeat count + bool recurse; // true if deferred was called; used to prevent infinite recursion. + bool is_in_threadpool; // true if this thread is part of a threadpool (and can run arbitrary tasks) + mi_stats_t stats; // statistics + mi_memid_t memid; // provenance of the tld memory itself (meta or OS) }; diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c index 14cbee45..5da9fc0c 100644 --- a/src/alloc-aligned.c +++ b/src/alloc-aligned.c @@ -193,9 +193,7 @@ static void* mi_heap_malloc_zero_aligned_at(mi_heap_t* const heap, const size_t const bool is_aligned = (((uintptr_t)page->free + offset) & align_mask)==0; if mi_likely(is_aligned) { - #if MI_STAT>1 - mi_heap_stat_increase(heap, malloc, size); - #endif + mi_debug_heap_stat_increase(heap, malloc, size); void* p = (zero ? 
_mi_page_malloc_zeroed(heap,page,padsize) : _mi_page_malloc(heap,page,padsize)); // call specific page malloc for better codegen mi_assert_internal(p != NULL); mi_assert_internal(((uintptr_t)p + offset) % alignment == 0); diff --git a/src/arena.c b/src/arena.c index fd914f43..dcff8920 100644 --- a/src/arena.c +++ b/src/arena.c @@ -69,10 +69,6 @@ typedef struct mi_purge_info_s { Arena id's ----------------------------------------------------------- */ -static mi_arena_id_t mi_arena_id_create(mi_arena_t* arena) { - return arena; -} - mi_arena_id_t _mi_arena_id_none(void) { return NULL; } @@ -222,14 +218,14 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); // adjust the stats so we don't double count the commits if (already_committed_count > 0) { - _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count), true /* on alloc */); + mi_subproc_stat_adjust_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count), true /* on alloc */); } // now actually commit bool commit_zero = false; if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) { // failed to commit (todo: give warning?) if (already_committed_count > 0) { - _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(already_committed_count)); + mi_subproc_stat_increase(arena->subproc, committed, mi_size_of_slices(already_committed_count)); } memid->initially_committed = false; } @@ -251,7 +247,7 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // if the OS has overcommit, and this is the first time we access these pages, then // count the commit now (as at arena reserve we didn't count those commits as these are on-demand) if (_mi_os_has_overcommit() && touched_slices > 0) { - _mi_stat_increase(&_mi_stats_main.committed, mi_size_of_slices(touched_slices)); + mi_subproc_stat_increase( arena->subproc, committed, mi_size_of_slices(touched_slices)); } } // tool support @@ -325,18 +321,18 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice // is actually allocated for the first time it will be counted. const bool adjust = (overcommit && arena_commit); - if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true /* on alloc */); } + if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true /* on alloc */); } // and try to reserve the arena int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { - if (adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back + if (adjust) { mi_subproc_stat_adjust_increase( subproc, committed, arena_reserve, true); } // roll back // failed, try a smaller size? const size_t small_arena_reserve = (MI_SIZE_BITS == 32 ? 128*MI_MiB : 1*MI_GiB); - if (adjust) { _mi_stat_adjust_decrease(&_mi_stats_main.committed, arena_reserve, true); } + if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true); } if (arena_reserve > small_arena_reserve) { // try again err = mi_reserve_os_memory_ex(small_arena_reserve, arena_commit, allow_large, false /* exclusive? 
*/, arena_id); - if (err != 0 && adjust) { _mi_stat_adjust_increase(&_mi_stats_main.committed, arena_reserve, true); } // roll back + if (err != 0 && adjust) { mi_subproc_stat_adjust_increase( subproc, committed, arena_reserve, true); } // roll back } } return (err==0); @@ -579,8 +575,8 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_arena_has_page(arena,page)); mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_alloc, 1); + mi_subproc_stat_decrease( arena->subproc, pages_abandoned, 1); + mi_subproc_stat_counter_increase(arena->subproc, pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); @@ -828,12 +824,13 @@ void _mi_arena_page_abandon(mi_page_t* page) { const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); MI_UNUSED(wasclear); mi_assert_internal(wasclear); mi_atomic_increment_relaxed(&arena->subproc->abandoned_count[bin]); + mi_subproc_stat_increase(arena->subproc, pages_abandoned, 1); } else { // page is full (or a singleton), page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page - } - _mi_stat_increase(&_mi_stats_main.pages_abandoned, 1); + mi_subproc_stat_increase(_mi_subproc(), pages_abandoned, 1); + } _mi_page_unown(page); } @@ -850,8 +847,9 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { return false; } else { - _mi_stat_counter_increase(&_mi_stats_main.pages_reabandon_full, 1); - _mi_stat_adjust_decrease(&_mi_stats_main.pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh + mi_subproc_t* subproc = _mi_subproc(); + mi_subproc_stat_counter_increase( subproc, pages_reabandon_full, 1); + mi_subproc_stat_adjust_decrease( subproc, pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh _mi_arena_page_abandon(page); return true; } @@ -879,13 +877,14 @@ void _mi_arena_page_unabandon(mi_page_t* page) { mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&arena->subproc->abandoned_count[bin]); + mi_subproc_stat_decrease(arena->subproc, pages_abandoned, 1); } else { - // page is full (or a singleton), page is OS/nly allocated + // page is full (or a singleton), page is OS allocated // nothing to do // TODO: maintain count of these as well? - } - _mi_stat_decrease(&_mi_stats_main.pages_abandoned, 1); + mi_subproc_stat_decrease(_mi_subproc(), pages_abandoned, 1); + } } void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { @@ -1016,7 +1015,7 @@ void _mi_arena_unsafe_destroy_all(void) { Add an arena. 
----------------------------------------------------------- */ -static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { +static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id) { mi_assert_internal(arena != NULL); mi_assert_internal(arena->slice_count > 0); if (arena_id != NULL) { *arena_id = NULL; } @@ -1043,7 +1042,7 @@ static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t return false; } - _mi_stat_counter_increase(&stats->arena_count,1); + mi_subproc_stat_counter_increase(arena->subproc, arena_count, 1); mi_atomic_store_ptr_release(mi_arena_t,&subproc->arenas[i], arena); if (arena_id != NULL) { *arena_id = arena; } return true; @@ -1149,7 +1148,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } - return mi_arena_add(subproc, arena, arena_id, &_mi_stats_main); + return mi_arena_add(subproc, arena, arena_id); } @@ -1414,7 +1413,7 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c // update committed bitmap if (needs_recommit) { - _mi_stat_adjust_decrease(&_mi_stats_main.committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); + mi_subproc_stat_adjust_decrease( arena->subproc, committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); } return needs_recommit; @@ -1506,7 +1505,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire_base, (mi_msecs_t)0)) { mi_atomic_storei64_release(&arena->purge_expire_extend, (mi_msecs_t)0); // and also reset the extend } - _mi_stat_counter_increase(&_mi_stats_main.arena_purges, 1); + mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1); // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) // this also clears those ranges atomically (so any newly freed blocks will get purged next @@ -1647,7 +1646,7 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, arena->is_exclusive = true; arena->is_large = is_large; arena->subproc = NULL; - if (!mi_arena_add(_mi_subproc(), arena, arena_id, &_mi_stats_main)) { + if (!mi_arena_add(_mi_subproc(), arena, arena_id)) { return false; } mi_arena_pages_reregister(arena); diff --git a/src/bitmap.c b/src/bitmap.c index 6352e4ea..e4a4cc2d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -106,7 +106,9 @@ static inline void mi_bfield_atomic_clear_once_set(_Atomic(mi_bfield_t)*b, size_ do { if mi_unlikely((old&mask) == 0) { old = mi_atomic_load_acquire(b); - if ((old&mask)==0) { _mi_stat_counter_increase(&_mi_stats_main.pages_unabandon_busy_wait, 1); } + if ((old&mask)==0) { + mi_subproc_stat_counter_increase(_mi_subproc(), pages_unabandon_busy_wait, 1); + } while ((old&mask)==0) { // busy wait mi_atomic_yield(); old = mi_atomic_load_acquire(b); diff --git a/src/free.c b/src/free.c index 770856da..88f784c7 100644 --- a/src/free.c +++ b/src/free.c @@ -242,7 +242,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { // first remove it from the abandoned pages in the arena -- this waits for any readers to finish _mi_arena_page_unabandon(page); _mi_heap_page_reclaim(tagheap, page); - _mi_stat_counter_increase(&_mi_stats_main.pages_reclaim_on_free, 1); + mi_heap_stat_counter_increase(tagheap, 
pages_reclaim_on_free, 1); return; } } diff --git a/src/heap.c b/src/heap.c index e8743691..d82b383f 100644 --- a/src/heap.c +++ b/src/heap.c @@ -141,7 +141,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - + // collect arenas (this is program wide so don't force purges on abandonment of threads) _mi_arenas_collect(collect == MI_FORCE /* force purge? */); } @@ -183,9 +183,9 @@ mi_heap_t* mi_heap_get_backing(void) { } // todo: make order of parameters consistent (but would that break compat with CPython?) -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) { - mi_assert_internal(heap!=NULL); + mi_assert_internal(heap!=NULL); mi_memid_t memid = heap->memid; _mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t)); heap->memid = memid; @@ -204,7 +204,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->full_page_retain = heap->full_page_retain / 4; } } - + if (heap->tld->heap_backing == NULL) { heap->tld->heap_backing = heap; // first heap becomes the backing heap _mi_random_init(&heap->random); @@ -240,7 +240,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id) { mi_heap_t* bheap = mi_heap_get_backing(); mi_assert_internal(bheap != NULL); - return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld); + return _mi_heap_create(heap_tag, allow_destroy, arena_id, bheap->tld); } mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { @@ -333,17 +333,17 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ if (bsize > MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, huge, bsize); } -#if (MI_STAT) + #if (MI_STAT) _mi_page_free_collect(page, false); // update used count const size_t inuse = page->used; if (bsize <= MI_LARGE_MAX_OBJ_SIZE) { mi_heap_stat_decrease(heap, normal, bsize * inuse); -#if (MI_STAT>1) + #if (MI_STAT>1) mi_heap_stat_decrease(heap, normal_bins[_mi_bin(bsize)], inuse); -#endif + #endif } mi_heap_stat_decrease(heap, malloc, bsize * inuse); // todo: off for aligned blocks... 
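Note on the statistics changes above: the direct updates of the global `_mi_stats_main` are replaced by `mi_heap_stat_*`, `mi_subproc_stat_*`, and `mi_os_stat_*` macros that select a named counter in either the thread-local tld stats or the shared sub-process stats (parts of that macro layer are visible in the `types.h` and `stats.c` hunks further below). A simplified standalone sketch of the pattern, with illustrative names rather than mimalloc's actual definitions:

#include <stdatomic.h>
#include <stdint.h>

// illustrative stat sets; mimalloc keeps one mi_stats_t type for both cases
typedef struct thread_stats_s { int64_t searches; int64_t pages_reclaim_on_free; } thread_stats_t;
typedef struct shared_stats_s { _Atomic(int64_t) arena_count; _Atomic(int64_t) threads; } shared_stats_t;

typedef struct heap_s    { thread_stats_t stats; } heap_t;     // owned by a single thread: plain adds
typedef struct subproc_s { shared_stats_t stats; } subproc_t;  // shared by all threads: atomic adds

// the macros pick the counter by name, in the spirit of
// mi_heap_stat_counter_increase(heap,stat,n) and mi_subproc_stat_counter_increase(subproc,stat,n)
#define heap_stat_increase(heap,stat,n)    ((heap)->stats.stat += (int64_t)(n))
#define subproc_stat_increase(sp,stat,n)   atomic_fetch_add_explicit(&(sp)->stats.stat, (int64_t)(n), memory_order_relaxed)

int main(void) {
  heap_t heap = {{0}};
  subproc_t subproc = {{0}};
  heap_stat_increase(&heap, searches, 1);
  subproc_stat_increase(&subproc, arena_count, 1);
  return 0;
}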
-#endif + #endif /// pretend it is all free now mi_assert_internal(mi_page_thread_free(page) == NULL); @@ -460,7 +460,7 @@ void mi_heap_delete(mi_heap_t* heap) // transfer still used pages to the backing heap mi_heap_absorb(bheap, heap); } - else + else */ { // abandon all pages diff --git a/src/init.c b/src/init.c index 177ca2bd..5159941a 100644 --- a/src/init.c +++ b/src/init.c @@ -34,7 +34,7 @@ const mi_page_t _mi_page_empty = { { 0, 0 }, // keys #endif NULL, // xheap - NULL, NULL, // next, prev + NULL, NULL, // next, prev MI_MEMID_STATIC // memid }; @@ -103,7 +103,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_seq &subproc_main, // subproc NULL, // heap_backing - NULL, // heaps list + NULL, // heaps list 0, // heartbeat false, // recurse false, // is_in_threadpool @@ -139,7 +139,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { 0, // thread_seq &subproc_main, // subproc &heap_main, // heap_backing - &heap_main, // heaps list + &heap_main, // heaps list 0, // heartbeat false, // recurse false, // is_in_threadpool @@ -165,7 +165,7 @@ mi_decl_cache_align mi_heap_t heap_main = { #endif MI_SMALL_PAGES_EMPTY, MI_PAGE_QUEUES_EMPTY, - MI_MEMID_STATIC + MI_MEMID_STATIC }; @@ -237,7 +237,7 @@ static void mi_tld_main_init(void) { // Initialization of the (statically allocated) main heap, and the main tld and subproc. static void mi_heap_main_init(void) { - if (heap_main.cookie == 0) { + if (heap_main.cookie == 0) { mi_subproc_main_init(); mi_tld_main_init(); // heap @@ -249,7 +249,7 @@ static void mi_heap_main_init(void) { #endif heap_main.cookie = _mi_heap_random_next(&heap_main); heap_main.keys[0] = _mi_heap_random_next(&heap_main); - heap_main.keys[1] = _mi_heap_random_next(&heap_main); + heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); @@ -266,14 +266,21 @@ mi_heap_t* heap_main_get(void) { Thread local data ----------------------------------------------------------- */ -// Thread sequence number -static _Atomic(size_t) mi_tcount; +// Count current and total created threads +static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); +static _Atomic(size_t) thread_total_count; + +size_t _mi_current_thread_count(void) { + return mi_atomic_load_relaxed(&thread_count); +} + // The mimalloc thread local data -mi_decl_thread mi_tld_t* mi_tld; +mi_decl_thread mi_tld_t* thread_tld = &tld_empty; // Allocate fresh tld static mi_tld_t* mi_tld_alloc(void) { + mi_atomic_increment_relaxed(&thread_count); if (_mi_is_main_thread()) { return &tld_main; } @@ -292,7 +299,7 @@ static mi_tld_t* mi_tld_alloc(void) { tld->heaps = NULL; tld->subproc = &subproc_main; tld->thread_id = _mi_prim_thread_id(); - tld->thread_seq = mi_atomic_add_acq_rel(&mi_tcount, 1); + tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1); tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool(); return tld; } @@ -301,28 +308,38 @@ static mi_tld_t* mi_tld_alloc(void) { #define MI_TLD_INVALID ((mi_tld_t*)1) mi_decl_noinline static void mi_tld_free(void) { - mi_tld_t* tld = _mi_tld(); - mi_tld = MI_TLD_INVALID; - _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); + mi_tld_t* tld = _mi_tld(); + if (tld != NULL && tld != MI_TLD_INVALID) { + _mi_stats_done(&tld->stats); + _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); + } + tld = MI_TLD_INVALID; + mi_atomic_decrement_relaxed(&thread_count); } 
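The `mi_tld_free`/`_mi_tld` pair above manages the thread-local `thread_tld` pointer as a small state machine: it starts at `&tld_empty`, is allocated lazily on first use, and is poisoned with `MI_TLD_INVALID` once the thread has terminated so that late accesses are reported instead of crashing. (Note that `tld = MI_TLD_INVALID;` in `mi_tld_free` only overwrites the local variable; presumably the intent is to poison `thread_tld` itself, as the sketch below does.) A minimal standalone sketch of the pattern with illustrative names:

#include <stdio.h>
#include <stdlib.h>

typedef struct tld_s { int thread_seq; } tld_t;

static tld_t tld_empty;                               // "not initialized yet"
#define TLD_INVALID ((tld_t*)1)                       // "thread already terminated"
static _Thread_local tld_t* thread_tld = &tld_empty;  // per-thread pointer

static tld_t* tld_get(void) {
  tld_t* tld = thread_tld;
  if (tld == TLD_INVALID) {                 // late access after thread exit: warn and recover
    fprintf(stderr, "tld accessed after the thread terminated\n");
    thread_tld = &tld_empty;
    tld = &tld_empty;
  }
  if (tld == &tld_empty) {                  // first access: allocate lazily
    thread_tld = tld = calloc(1, sizeof(tld_t));
  }
  return tld;
}

static void tld_done(void) {                // called at thread exit
  tld_t* tld = thread_tld;
  if (tld != NULL && tld != TLD_INVALID && tld != &tld_empty) { free(tld); }
  thread_tld = TLD_INVALID;                 // poison so a later access is detected
}

int main(void) {
  tld_get()->thread_seq = 1;
  tld_done();
  return 0;
}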
mi_decl_noinline mi_tld_t* _mi_tld(void) { - if (mi_tld == MI_TLD_INVALID) { - _mi_error_message(EFAULT, "internal error: tld accessed after the thread terminated\n"); - mi_tld = NULL; + mi_tld_t* tld = thread_tld; + if (tld == MI_TLD_INVALID) { + _mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n"); + thread_tld = &tld_empty; } - if (mi_tld==NULL) { - mi_tld = mi_tld_alloc(); + if (tld==&tld_empty) { + thread_tld = tld = mi_tld_alloc(); } - return mi_tld; + return tld; } mi_subproc_t* _mi_subproc(void) { - if (_mi_is_main_thread()) { // during initialization we should not recurse over reading the _mi_tld - return &subproc_main; + // should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()` + // todo: this will still fail on OS systems where the first access to a thread-local causes allocation. + // on such systems we can check for this with the _mi_prim_get_default_heap as those are protected (by being + // stored in a TLS slot for example) + mi_heap_t* heap = mi_prim_get_default_heap(); + if (heap == NULL || heap == &_mi_heap_empty) { + return _mi_subproc_main(); } else { - return _mi_tld()->subproc; + return thread_tld->subproc; // don't call `_mi_tld()` } } @@ -396,11 +413,11 @@ static bool _mi_thread_heap_init(void) { //mi_assert_internal(_mi_heap_default->tld->heap_backing == mi_prim_get_default_heap()); } else { - // allocates tld data - // note: we cannot access thread-locals yet as that can cause (recursive) allocation + // allocates tld data + // note: we cannot access thread-locals yet as that can cause (recursive) allocation // (on macOS <= 14 for example where the loader allocates thread-local data on demand). - mi_tld_t* tld = mi_tld_alloc(); - + mi_tld_t* tld = mi_tld_alloc(); + // allocate and initialize the heap mi_heap_t* heap = _mi_heap_create(0 /* default tag */, false /* allow destroy? */, _mi_arena_id_none(), tld); @@ -409,7 +426,7 @@ static bool _mi_thread_heap_init(void) { _mi_heap_set_default_direct(heap); // now that the heap is set for this thread, we can set the thread-local tld. 
- mi_tld = tld; + thread_tld = tld; } return false; } @@ -444,9 +461,6 @@ static bool _mi_thread_heap_done(mi_heap_t* heap) { _mi_heap_collect_abandon(heap); } - // merge stats - _mi_stats_done(&heap->tld->stats); - // free heap meta data _mi_meta_free(heap, sizeof(mi_heap_t), heap->memid); @@ -494,11 +508,6 @@ bool _mi_is_main_thread(void) { return (tld_main.thread_id==0 || tld_main.thread_id == _mi_thread_id()); } -static _Atomic(size_t) thread_count = MI_ATOMIC_VAR_INIT(1); - -size_t _mi_current_thread_count(void) { - return mi_atomic_load_relaxed(&thread_count); -} // This is called from the `mi_malloc_generic` void mi_thread_init(void) mi_attr_noexcept @@ -511,8 +520,7 @@ void mi_thread_init(void) mi_attr_noexcept // fiber/pthread key to a non-zero value, ensuring `_mi_thread_done` is called) if (_mi_thread_heap_init()) return; // returns true if already initialized - _mi_stat_increase(&_mi_stats_main.threads, 1); - mi_atomic_increment_relaxed(&thread_count); + mi_subproc_stat_increase(_mi_subproc_main(), threads, 1); //_mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id()); } @@ -534,15 +542,14 @@ void _mi_thread_done(mi_heap_t* heap) } // adjust stats - mi_atomic_decrement_relaxed(&thread_count); - _mi_stat_decrease(&_mi_stats_main.threads, 1); + mi_subproc_stat_decrease(_mi_subproc_main(), threads, 1); // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps... if (heap->tld->thread_id != _mi_prim_thread_id()) return; // abandon the thread local heap _mi_thread_heap_done(heap); // returns true if already ran - + // free thread local data mi_tld_free(); } @@ -654,7 +661,7 @@ void mi_process_init(void) mi_attr_noexcept { _mi_prim_thread_associate_default_heap(NULL); #endif - mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) + mi_stats_reset(); // only call stat reset *after* thread init (or the heap tld == NULL) mi_track_init(); if (mi_option_is_enabled(mi_option_reserve_huge_os_pages)) { diff --git a/src/os.c b/src/os.c index 86ecb16b..53e8f571 100644 --- a/src/os.c +++ b/src/os.c @@ -114,9 +114,9 @@ static void mi_os_prim_free(void* addr, size_t size, bool still_committed) { _mi_warning_message("unable to free OS memory (error: %d (0x%x), size: 0x%zx bytes, address: %p)\n", err, err, size, addr); } if (still_committed) { - _mi_stat_decrease(&os_stats->committed, size); + mi_os_stat_decrease(committed, size); } - _mi_stat_decrease(&os_stats->reserved, size); + mi_os_stat_decrease(reserved, size); } void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t memid) { @@ -171,11 +171,11 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm _mi_warning_message("unable to allocate OS memory (error: %d (0x%x), addr: %p, size: 0x%zx bytes, align: 0x%zx, commit: %d, allow large: %d)\n", err, err, hint_addr, size, try_alignment, commit, allow_large); } - _mi_stat_counter_increase(&os_stats->mmap_calls, 1); + mi_os_stat_counter_increase(mmap_calls, 1); if (p != NULL) { - _mi_stat_increase(&os_stats->reserved, size); + mi_os_stat_increase(reserved, size); if (commit) { - _mi_stat_increase(&os_stats->committed, size); + mi_os_stat_increase(committed, size); // seems needed for asan (or `mimalloc-test-api` fails) #ifdef MI_TRACK_ASAN if (*is_zero) { mi_track_mem_defined(p,size); } @@ -290,7 +290,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo if (size == 0) return NULL; size = _mi_os_good_alloc_size(size); 
alignment = _mi_align_up(alignment, _mi_os_page_size()); - + bool os_is_large = false; bool os_is_zero = false; void* os_base = NULL; @@ -379,8 +379,8 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { if (is_zero != NULL) { *is_zero = false; } - _mi_stat_increase(&os_stats->committed, size); // use size for precise commit vs. decommit - _mi_stat_counter_increase(&os_stats->commit_calls, 1); + mi_os_stat_increase(committed, size); // use size for precise commit vs. decommit + mi_os_stat_counter_increase(commit_calls, 1); // page align range size_t csize; @@ -408,7 +408,7 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) { mi_assert_internal(needs_recommit!=NULL); - _mi_stat_decrease(&os_stats->committed, size); + mi_os_stat_decrease(committed, size); // page align size_t csize; @@ -440,8 +440,8 @@ bool _mi_os_reset(void* addr, size_t size) { size_t csize; void* start = mi_os_page_align_area_conservative(addr, size, &csize); if (csize == 0) return true; // || _mi_os_is_huge_reserved(addr) - _mi_stat_increase(&os_stats->reset, csize); - _mi_stat_counter_increase(&os_stats->reset_calls, 1); + mi_os_stat_increase(reset, csize); + mi_os_stat_counter_increase(reset_calls, 1); #if (MI_DEBUG>1) && !MI_SECURE && !MI_TRACK_ENABLED // && !MI_TSAN memset(start, 0, csize); // pretend it is eagerly reset @@ -460,8 +460,8 @@ bool _mi_os_reset(void* addr, size_t size) { bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) { if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? - _mi_stat_counter_increase(&os_stats->purge_calls, 1); - _mi_stat_increase(&os_stats->purged, size); + mi_os_stat_counter_increase(purge_calls, 1); + mi_os_stat_increase(purged, size); if (mi_option_is_enabled(mi_option_purge_decommits) && // should decommit? !_mi_preloading()) // don't decommit during preloading (unsafe) @@ -595,8 +595,8 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse // success, record it page++; // increase before timeout check (see issue #711) - _mi_stat_increase(&os_stats->committed, MI_HUGE_OS_PAGE_SIZE); - _mi_stat_increase(&os_stats->reserved, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(committed, MI_HUGE_OS_PAGE_SIZE); + mi_os_stat_increase(reserved, MI_HUGE_OS_PAGE_SIZE); // check for timeout if (max_msecs > 0) { diff --git a/src/page.c b/src/page.c index 0444b47e..31dbcc7d 100644 --- a/src/page.c +++ b/src/page.c @@ -387,9 +387,9 @@ void _mi_page_retire(mi_page_t* page) mi_attr_noexcept { const size_t bsize = mi_page_block_size(page); if mi_likely( /* bsize < MI_MAX_RETIRE_SIZE && */ !mi_page_queue_is_special(pq)) { // not full or huge queue? if (pq->last==page && pq->first==page) { // the only page in the queue? - mi_stat_counter_increase(_mi_stats_main.page_no_retire,1); - page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_heap_t* heap = mi_page_heap(page); + mi_debug_heap_stat_counter_increase(heap, page_no_retire, 1); + page->retire_expire = (bsize <= MI_SMALL_MAX_OBJ_SIZE ? 
MI_RETIRE_CYCLES : MI_RETIRE_CYCLES/4); mi_assert_internal(pq >= heap->pages); const size_t index = pq - heap->pages; mi_assert_internal(index < MI_BIN_FULL && index < MI_BIN_HUGE); @@ -554,7 +554,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { size_t page_size; //uint8_t* page_start = mi_page_area(page, &page_size); - mi_heap_stat_counter_increase(heap, pages_extended, 1); + mi_debug_heap_stat_counter_increase(heap, pages_extended, 1); // calculate the extend count const size_t bsize = mi_page_block_size(page); @@ -583,7 +583,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { } // enable the new free list page->capacity += (uint16_t)extend; - mi_heap_stat_increase(heap, page_committed, extend * bsize); + mi_debug_heap_stat_increase(heap, page_committed, extend * bsize); mi_assert_expensive(mi_page_is_valid_init(page)); } @@ -709,8 +709,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page = next; } // for each page - mi_heap_stat_counter_increase(heap, searches, count); - + mi_debug_heap_stat_counter_increase(heap, searches, count); + // set the page to the best candidate if (page_candidate != NULL) { page = page_candidate; diff --git a/src/stats.c b/src/stats.c index bb17b936..2a395ed5 100644 --- a/src/stats.c +++ b/src/stats.c @@ -19,88 +19,93 @@ terms of the MIT license. A copy of the license can be found in the file Statistics operations ----------------------------------------------------------- */ -static bool mi_is_in_main(void* stat) { - return ((uint8_t*)stat >= (uint8_t*)&_mi_stats_main - && (uint8_t*)stat < ((uint8_t*)&_mi_stats_main + sizeof(mi_stats_t))); +static void mi_stat_update_mt(mi_stat_count_t* stat, int64_t amount) { + if (amount == 0) return; + // add atomically + int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_maxi64_relaxed(&stat->peak, current + amount); + if (amount > 0) { + mi_atomic_addi64_relaxed(&stat->allocated, amount); + } + else { + mi_atomic_addi64_relaxed(&stat->freed, -amount); + } } static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { if (amount == 0) return; - if mi_unlikely(mi_is_in_main(stat)) - { - // add atomically (for abandoned pages) - int64_t current = mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_maxi64_relaxed(&stat->peak, current + amount); - if (amount > 0) { - mi_atomic_addi64_relaxed(&stat->allocated,amount); - } - else { - mi_atomic_addi64_relaxed(&stat->freed, -amount); - } + // add thread local + stat->current += amount; + if (stat->current > stat->peak) stat->peak = stat->current; + if (amount > 0) { + stat->allocated += amount; } else { - // add thread local - stat->current += amount; - if (stat->current > stat->peak) stat->peak = stat->current; - if (amount > 0) { - stat->allocated += amount; - } - else { - stat->freed += -amount; - } + stat->freed += -amount; } } + // Adjust stats to compensate; for example before committing a range, // first adjust downwards with parts that were already committed so // we avoid double counting. +static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { + if (amount == 0) return; + // adjust atomically + mi_atomic_addi64_relaxed(&stat->current, amount); + mi_atomic_addi64_relaxed((on_alloc ? 
&stat->allocated : &stat->freed), amount); +} + static void mi_stat_adjust(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { if (amount == 0) return; - if mi_unlikely(mi_is_in_main(stat)) - { - // adjust atomically - mi_atomic_addi64_relaxed(&stat->current, amount); - mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount); + stat->current += amount; + if (on_alloc) { + stat->allocated += amount; } else { - // don't affect the peak - stat->current += amount; - if (on_alloc) { - stat->allocated += amount; - } - else { - stat->freed += amount; - } + stat->freed += amount; } } -void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { - if (mi_is_in_main(stat)) { - mi_atomic_addi64_relaxed( &stat->count, 1 ); - mi_atomic_addi64_relaxed( &stat->total, (int64_t)amount ); - } - else { - stat->count++; - stat->total += amount; - } +void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) { + mi_atomic_addi64_relaxed(&stat->count, 1); + mi_atomic_addi64_relaxed(&stat->total, (int64_t)amount); } -void _mi_stat_increase(mi_stat_count_t* stat, size_t amount) { +void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { + stat->count++; + stat->total += amount; +} + +void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_update_mt(stat, (int64_t)amount); +} +void __mi_stat_increase(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, (int64_t)amount); } -void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { +void __mi_stat_decrease_mt(mi_stat_count_t* stat, size_t amount) { + mi_stat_update_mt(stat, -((int64_t)amount)); +} +void __mi_stat_decrease(mi_stat_count_t* stat, size_t amount) { mi_stat_update(stat, -((int64_t)amount)); } -void _mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) { +void __mi_stat_adjust_increase_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust_mt(stat, (int64_t)amount, on_alloc); +} +void __mi_stat_adjust_increase(mi_stat_count_t* stat, size_t amount, bool on_alloc) { mi_stat_adjust(stat, (int64_t)amount, on_alloc); } -void _mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) { +void __mi_stat_adjust_decrease_mt(mi_stat_count_t* stat, size_t amount, bool on_alloc) { + mi_stat_adjust_mt(stat, -((int64_t)amount), on_alloc); +} +void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount, bool on_alloc) { mi_stat_adjust(stat, -((int64_t)amount), on_alloc); } + // must be thread safe as it is called from stats_merge static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) { if (stat==src) return; @@ -401,27 +406,29 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) static mi_msecs_t mi_process_start; // = 0 -static mi_stats_t* mi_stats_get_default(void) { - mi_heap_t* heap = mi_heap_get_default(); - return &heap->tld->stats; +// return thread local stats +static mi_stats_t* mi_get_tld_stats(void) { + return &_mi_tld()->stats; } static void mi_stats_merge_from(mi_stats_t* stats) { - if (stats != &_mi_stats_main) { - mi_stats_add(&_mi_stats_main, stats); - memset(stats, 0, sizeof(mi_stats_t)); + mi_subproc_t* subproc = _mi_subproc(); + if (stats != &subproc->stats) { + mi_stats_add(&subproc->stats, stats); + _mi_memzero(stats, sizeof(mi_stats_t)); } } void mi_stats_reset(void) mi_attr_noexcept { - mi_stats_t* stats = mi_stats_get_default(); - if (stats != &_mi_stats_main) { memset(stats, 0, sizeof(mi_stats_t)); } - 
memset(&_mi_stats_main, 0, sizeof(mi_stats_t)); + mi_stats_t* stats = mi_get_tld_stats(); + mi_subproc_t* subproc = _mi_subproc(); + if (stats != &subproc->stats) { _mi_memzero(stats, sizeof(mi_stats_t)); } + _mi_memzero(&subproc->stats, sizeof(mi_stats_t)); if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }; } void mi_stats_merge(void) mi_attr_noexcept { - mi_stats_merge_from( mi_stats_get_default() ); + mi_stats_merge_from( mi_get_tld_stats() ); } void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` @@ -429,8 +436,8 @@ void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` } void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - mi_stats_merge_from(mi_stats_get_default()); - _mi_stats_print(&_mi_stats_main, out, arg); + mi_stats_merge_from(mi_get_tld_stats()); + _mi_stats_print(&_mi_subproc()->stats, out, arg); } void mi_stats_print(void* out) mi_attr_noexcept { @@ -439,7 +446,7 @@ void mi_stats_print(void* out) mi_attr_noexcept { } void mi_thread_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - _mi_stats_print(mi_stats_get_default(), out, arg); + _mi_stats_print(mi_get_tld_stats(), out, arg); } @@ -473,11 +480,12 @@ mi_msecs_t _mi_clock_end(mi_msecs_t start) { mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults) mi_attr_noexcept { + mi_subproc_t* subproc = _mi_subproc(); mi_process_info_t pinfo; _mi_memzero_var(pinfo); pinfo.elapsed = _mi_clock_end(mi_process_start); - pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.current)); - pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)&_mi_stats_main.committed.peak)); + pinfo.current_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.current))); + pinfo.peak_commit = (size_t)(mi_atomic_loadi64_relaxed((_Atomic(int64_t)*)(&subproc->stats.committed.peak))); pinfo.current_rss = pinfo.current_commit; pinfo.peak_rss = pinfo.peak_commit; pinfo.utime = 0; diff --git a/test/test-stress.c b/test/test-stress.c index b35743df..0920a02e 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -48,10 +48,10 @@ static int ITER = 20; static int THREADS = 32; static int SCALE = 50; static int ITER = 50; -#elif 0 -static int THREADS = 64; -static int SCALE = 400; -static int ITER = 10; +#elif 1 +static int THREADS = 32; +static int SCALE = 25; +static int ITER = 50; #define ALLOW_LARGE true #else static int THREADS = 32; // more repeatable if THREADS <= #processors From 95aeda4cdda2431c20ed9fa3facb241b142ae773 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 10:53:34 -0800 Subject: [PATCH 115/264] merge subproc stats on delete --- include/mimalloc/internal.h | 1 + src/init.c | 4 ++++ src/stats.c | 23 +++++++++++------------ 3 files changed, 16 insertions(+), 12 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 7774b378..e316de94 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -203,6 +203,7 @@ void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); // "stats.c" void _mi_stats_done(mi_stats_t* stats); +void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from); mi_msecs_t _mi_clock_now(void); mi_msecs_t _mi_clock_end(mi_msecs_t start); mi_msecs_t _mi_clock_start(void); diff --git a/src/init.c b/src/init.c index 
5159941a..3af4f4ef 100644 --- a/src/init.c +++ b/src/init.c @@ -382,6 +382,10 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { mi_lock_release(&subproc->os_pages_lock); } if (!safe_to_delete) return; + + // merge stats back into the main subproc? + _mi_stats_merge_from(&_mi_subproc_main()->stats, &subproc->stats); + // safe to release // todo: should we refcount subprocesses? mi_lock_done(&subproc->os_pages_lock); diff --git a/src/stats.c b/src/stats.c index 2a395ed5..102373ec 100644 --- a/src/stats.c +++ b/src/stats.c @@ -411,14 +411,6 @@ static mi_stats_t* mi_get_tld_stats(void) { return &_mi_tld()->stats; } -static void mi_stats_merge_from(mi_stats_t* stats) { - mi_subproc_t* subproc = _mi_subproc(); - if (stats != &subproc->stats) { - mi_stats_add(&subproc->stats, stats); - _mi_memzero(stats, sizeof(mi_stats_t)); - } -} - void mi_stats_reset(void) mi_attr_noexcept { mi_stats_t* stats = mi_get_tld_stats(); mi_subproc_t* subproc = _mi_subproc(); @@ -427,16 +419,23 @@ void mi_stats_reset(void) mi_attr_noexcept { if (mi_process_start == 0) { mi_process_start = _mi_clock_start(); }; } -void mi_stats_merge(void) mi_attr_noexcept { - mi_stats_merge_from( mi_get_tld_stats() ); +void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from) { + if (to != from) { + mi_stats_add(to, from); + _mi_memzero(from, sizeof(mi_stats_t)); + } } void _mi_stats_done(mi_stats_t* stats) { // called from `mi_thread_done` - mi_stats_merge_from(stats); + _mi_stats_merge_from(&_mi_subproc()->stats, stats); +} + +void mi_stats_merge(void) mi_attr_noexcept { + _mi_stats_done( mi_get_tld_stats() ); } void mi_stats_print_out(mi_output_fun* out, void* arg) mi_attr_noexcept { - mi_stats_merge_from(mi_get_tld_stats()); + mi_stats_merge(); _mi_stats_print(&_mi_subproc()->stats, out, arg); } From 4ad7fedd25e0869aa6fbca2aa24fe08dd4eebc39 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 11:35:30 -0800 Subject: [PATCH 116/264] track os abandoned pages in a list --- include/mimalloc/atomic.h | 25 ++++++++--------- include/mimalloc/types.h | 4 +-- src/arena-meta.c | 7 +++-- src/arena.c | 56 ++++++++++++++++++++++++++------------- src/init.c | 11 ++++---- 5 files changed, 61 insertions(+), 42 deletions(-) diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 0c7fafe3..fcd9efba 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -415,6 +415,8 @@ static inline void mi_atomic_yield(void) { #pragma warning(disable:26110) // unlock with holding lock #endif +#define mi_lock(lock) for(bool _go = (mi_lock_acquire(lock),true); _go; (mi_lock_release(lock), _go=false) ) + #if defined(_WIN32) #if 0 @@ -424,9 +426,8 @@ static inline void mi_atomic_yield(void) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryEnterCriticalSection(lock); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { EnterCriticalSection(lock); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { LeaveCriticalSection(lock); @@ -445,9 +446,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return TryAcquireSRWLockExclusive(lock); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { AcquireSRWLockExclusive(lock); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { ReleaseSRWLockExclusive(lock); @@ -468,8 +468,11 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool 
mi_lock_try_acquire(mi_lock_t* lock) { return (pthread_mutex_trylock(lock) == 0); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { - return (pthread_mutex_lock(lock) == 0); +static inline void mi_lock_acquire(mi_lock_t* lock) { + const int err = pthread_mutex_lock(lock); + if (err != 0) { + mi_error_message(EFAULT, "internal error: lock cannot be acquired\n"); + } } static inline void mi_lock_release(mi_lock_t* lock) { pthread_mutex_unlock(lock); @@ -489,9 +492,8 @@ static inline void mi_lock_done(mi_lock_t* lock) { static inline bool mi_lock_try_acquire(mi_lock_t* lock) { return lock->try_lock(); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { lock->lock(); - return true; } static inline void mi_lock_release(mi_lock_t* lock) { lock->unlock(); @@ -514,12 +516,11 @@ static inline bool mi_lock_try_acquire(mi_lock_t* lock) { uintptr_t expected = 0; return mi_atomic_cas_strong_acq_rel(lock, &expected, (uintptr_t)1); } -static inline bool mi_lock_acquire(mi_lock_t* lock) { +static inline void mi_lock_acquire(mi_lock_t* lock) { for (int i = 0; i < 1000; i++) { // for at most 1000 tries? - if (mi_lock_try_acquire(lock)) return true; + if (mi_lock_try_acquire(lock)) return; mi_atomic_yield(); - } - return true; + } } static inline void mi_lock_release(mi_lock_t* lock) { mi_atomic_store_release(lock, (uintptr_t)0); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ca3913ad..59393848 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -556,8 +556,8 @@ typedef struct mi_subproc_s { mi_lock_t arena_reserve_lock; // lock to ensure arena's get reserved one at a time _Atomic(size_t) abandoned_count[MI_BIN_COUNT]; // total count of abandoned pages for this sub-process - mi_page_queue_t os_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) - mi_lock_t os_pages_lock; // lock for the os pages list (this lock protects list operations) + mi_page_t* os_abandoned_pages; // list of pages that OS allocated and not in an arena (only used if `mi_option_visit_abandoned` is on) + mi_lock_t os_abandoned_pages_lock; // lock for the os abandoned pages list (this lock protects list operations) mi_memid_t memid; // provenance of this memory block (meta or OS) mi_stats_t stats; // sub-process statistics (tld stats are merged in on thread termination) diff --git a/src/arena-meta.c b/src/arena-meta.c index f28c50e9..a5dc8e75 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -64,12 +64,11 @@ static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { // allocate a fresh meta page and add it to the global list. static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice - // note: we always use subproc_main directly for the meta-data since at thread start the metadata for the - // tld and heap need to be (meta) allocated and at that time we cannot read the tld pointer (yet). + // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. 
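The `mi_lock(lock) { ... }` statement macro added in `atomic.h` above wraps acquire/release into a scoped block by running a one-iteration `for` loop: the init clause acquires, the body runs once, and the increment clause releases. A standalone sketch of the same trick using a plain pthread mutex (illustrative only; note that a `break` or `return` inside the block skips the release, which holds for the mimalloc macro as well):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

// one-iteration for-loop: acquire in the init clause, release in the increment clause
#define with_lock(m) \
  for (bool _go = (pthread_mutex_lock(m), true); _go; (pthread_mutex_unlock(m), _go = false))

static pthread_mutex_t count_lock = PTHREAD_MUTEX_INITIALIZER;
static int count = 0;

int main(void) {
  with_lock(&count_lock) {   // reads like a scoped critical section
    count++;
  }                          // the unlock has run by the time we get here
  printf("count=%d\n", count);
  return 0;
}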
mi_memid_t memid; - mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc_main(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, + mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, true /* commit*/, true /* allow large */, - NULL, 0 /* tseq */, &memid ); + NULL /* req arena */, 0 /* thread_seq */, &memid); if (mpage == NULL) return NULL; mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { diff --git a/src/arena.c b/src/arena.c index dcff8920..c4b02cf6 100644 --- a/src/arena.c +++ b/src/arena.c @@ -439,24 +439,20 @@ static mi_decl_noinline void* mi_arenas_try_alloc( // otherwise, try to reserve a new arena -- but one thread at a time.. (todo: allow 2 or 4 to reduce contention?) const size_t arena_count = mi_arenas_get_count(subproc); - if (mi_lock_acquire(&subproc->arena_reserve_lock)) { - bool ok = true; + mi_lock(&subproc->arena_reserve_lock) { if (arena_count == mi_arenas_get_count(subproc)) { // we are the first to enter the lock, reserve a fresh arena mi_arena_id_t arena_id = 0; - ok = mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); + mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); } else { // another thread already reserved a new arena } - mi_lock_release(&subproc->arena_reserve_lock); - if (ok) { - // try once more to allocate in the new arena - mi_assert_internal(req_arena == NULL); - p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); - if (p != NULL) return p; - } - } + } + // try once more to allocate in the new arena + mi_assert_internal(req_arena == NULL); + p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid); + if (p != NULL) return p; return NULL; } @@ -685,11 +681,13 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ else { page->block_size_shift = 0; } + // and own it + mi_page_try_claim_ownership(page); + + // register in the page map _mi_page_map_register(page); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); - - mi_page_try_claim_ownership(page); mi_assert_internal(mi_page_block_size(page) == block_size); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_page_is_owned(page)); @@ -771,7 +769,8 @@ void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_all_free(page)); - mi_assert_internal(page->next==NULL); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(page->next==NULL && page->prev==NULL); #if MI_DEBUG>1 if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { @@ -790,6 +789,7 @@ void _mi_arena_page_free(mi_page_t* page) { } #endif + // unregister page _mi_page_map_unregister(page); if (page->memid.memkind == MI_MEM_ARENA) { mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); @@ -807,7 +807,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(!mi_page_all_free(page)); - mi_assert_internal(page->next==NULL); + mi_assert_internal(page->next==NULL && page->prev == NULL); if (page->memid.memkind==MI_MEM_ARENA && !mi_page_is_full(page)) { // make 
available for allocations @@ -827,8 +827,19 @@ void _mi_arena_page_abandon(mi_page_t* page) { mi_subproc_stat_increase(arena->subproc, pages_abandoned, 1); } else { - // page is full (or a singleton), page is OS/externally allocated + // page is full (or a singleton), or the page is OS/externally allocated // leave as is; it will be reclaimed when an object is free'd in the page + mi_subproc_t* subproc = _mi_subproc(); + // but for non-arena pages, add to the subproc list so these can be visited + if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_lock(&subproc->os_abandoned_pages_lock) { + // push in front + page->prev = NULL; + page->next = subproc->os_abandoned_pages; + if (page->next != NULL) { page->next->prev = page; } + subproc->os_abandoned_pages = page; + } + } mi_subproc_stat_increase(_mi_subproc(), pages_abandoned, 1); } _mi_page_unown(page); @@ -881,9 +892,18 @@ void _mi_arena_page_unabandon(mi_page_t* page) { } else { // page is full (or a singleton), page is OS allocated - // nothing to do - // TODO: maintain count of these as well? + mi_subproc_t* subproc = _mi_subproc(); mi_subproc_stat_decrease(_mi_subproc(), pages_abandoned, 1); + // if not an arena page, remove from the subproc os pages list + if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_lock(&subproc->os_abandoned_pages_lock) { + if (page->prev != NULL) { page->prev->next = page->next; } + if (page->next != NULL) { page->next->prev = page->prev; } + if (subproc->os_abandoned_pages == page) { subproc->os_abandoned_pages = page->next; } + page->next = NULL; + page->prev = NULL; + } + } } } diff --git a/src/init.c b/src/init.c index 3af4f4ef..1968ef68 100644 --- a/src/init.c +++ b/src/init.c @@ -223,7 +223,7 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { static void mi_subproc_main_init(void) { if (subproc_main.memid.memkind != MI_MEM_STATIC) { subproc_main.memid = _mi_memid_create(MI_MEM_STATIC); - mi_lock_init(&subproc_main.os_pages_lock); + mi_lock_init(&subproc_main.os_abandoned_pages_lock); mi_lock_init(&subproc_main.arena_reserve_lock); } } @@ -361,7 +361,7 @@ mi_subproc_id_t mi_subproc_new(void) { mi_subproc_t* subproc = (mi_subproc_t*)_mi_meta_zalloc(sizeof(mi_subproc_t),&memid); if (subproc == NULL) return NULL; subproc->memid = memid; - mi_lock_init(&subproc->os_pages_lock); + mi_lock_init(&subproc->os_abandoned_pages_lock); mi_lock_init(&subproc->arena_reserve_lock); return subproc; } @@ -375,11 +375,10 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); // check if there are os pages still.. bool safe_to_delete = false; - if (mi_lock_acquire(&subproc->os_pages_lock)) { - if (subproc->os_pages.first == NULL) { + mi_lock(&subproc->os_abandoned_pages_lock) { + if (subproc->os_abandoned_pages == NULL) { safe_to_delete = true; } - mi_lock_release(&subproc->os_pages_lock); } if (!safe_to_delete) return; @@ -388,7 +387,7 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { // safe to release // todo: should we refcount subprocesses? 
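`_mi_arena_page_abandon` and `_mi_arena_page_unabandon` above keep abandoned pages that are OS-allocated (not in any arena) on the doubly-linked `os_abandoned_pages` list, guarded by `os_abandoned_pages_lock`, and `mi_subproc_delete` refuses to tear the sub-process down while that list is non-empty. A minimal sketch of the same push-front/unlink pattern, using illustrative types and a pthread mutex in place of `mi_lock_t`:

#include <pthread.h>
#include <stddef.h>

typedef struct page_s {
  struct page_s* next;
  struct page_s* prev;
} page_t;

typedef struct subproc_s {
  page_t*         os_abandoned_pages;       // head of the abandoned OS pages
  pthread_mutex_t os_abandoned_pages_lock;  // protects list operations only
} subproc_t;

// push in front (as _mi_arena_page_abandon does)
static void os_abandoned_push(subproc_t* sp, page_t* page) {
  pthread_mutex_lock(&sp->os_abandoned_pages_lock);
  page->prev = NULL;
  page->next = sp->os_abandoned_pages;
  if (page->next != NULL) { page->next->prev = page; }
  sp->os_abandoned_pages = page;
  pthread_mutex_unlock(&sp->os_abandoned_pages_lock);
}

// unlink from anywhere in the list (as _mi_arena_page_unabandon does)
static void os_abandoned_remove(subproc_t* sp, page_t* page) {
  pthread_mutex_lock(&sp->os_abandoned_pages_lock);
  if (page->prev != NULL) { page->prev->next = page->next; }
  if (page->next != NULL) { page->next->prev = page->prev; }
  if (sp->os_abandoned_pages == page) { sp->os_abandoned_pages = page->next; }
  page->next = NULL; page->prev = NULL;
  pthread_mutex_unlock(&sp->os_abandoned_pages_lock);
}

int main(void) {
  subproc_t sp = { NULL, PTHREAD_MUTEX_INITIALIZER };
  page_t p1 = { NULL, NULL }, p2 = { NULL, NULL };
  os_abandoned_push(&sp, &p1);
  os_abandoned_push(&sp, &p2);     // list: p2 -> p1
  os_abandoned_remove(&sp, &p1);   // list: p2
  return (sp.os_abandoned_pages == &p2 && p2.next == NULL) ? 0 : 1;
}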
- mi_lock_done(&subproc->os_pages_lock); + mi_lock_done(&subproc->os_abandoned_pages_lock); mi_lock_done(&subproc->arena_reserve_lock); _mi_meta_free(subproc, sizeof(mi_subproc_t), subproc->memid); } From 89b0d5a357af02809509544f83c92e7f5be11a3f Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 11:53:29 -0800 Subject: [PATCH 117/264] allocate heaps associated with an arena in that arena --- include/mimalloc/internal.h | 11 ++++++----- include/mimalloc/types.h | 21 ++++++--------------- src/arena-meta.c | 5 +---- src/arena.c | 6 ++---- src/heap.c | 14 +++++++++++--- src/init.c | 10 +++++----- 6 files changed, 31 insertions(+), 36 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e316de94..208989e3 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -147,6 +147,7 @@ mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); void* _mi_arena_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void _mi_arena_free(void* p, size_t size, mi_memid_t memid); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); bool _mi_arena_contains(const void* p); void _mi_arenas_collect(bool force_purge); @@ -421,11 +422,11 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) { return (heap != &_mi_heap_empty); } -static inline uintptr_t _mi_ptr_cookie(const void* p) { - extern mi_heap_t _mi_heap_main; - mi_assert_internal(_mi_heap_main.cookie != 0); - return ((uintptr_t)p ^ _mi_heap_main.cookie); -} +//static inline uintptr_t _mi_ptr_cookie(const void* p) { +// extern mi_heap_t _mi_heap_main; +// mi_assert_internal(_mi_heap_main.cookie != 0); +// return ((uintptr_t)p ^ _mi_heap_main.cookie); +//} /* ----------------------------------------------------------- diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 59393848..461b5393 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -396,7 +396,6 @@ struct mi_heap_s { mi_tld_t* tld; // thread-local data mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL) uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`) - uintptr_t keys[2]; // two random keys used to encode the `thread_delayed_free` list mi_random_ctx_t random; // random number context used for secure allocation size_t page_count; // total number of pages in the `pages` queues. 
size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) @@ -522,21 +521,13 @@ void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount); #define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount) #define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount) -#define mi_tld_stat_counter_increase(tld,stat,amount) __mi_stat_counter_increase( &(tld)->stats.stat, amount) -#define mi_tld_stat_increase(tld,stat,amount) __mi_stat_increase( &(tld)->stats.stat, amount) -#define mi_tld_stat_decrease(tld,stat,amount) __mi_stat_decrease( &(tld)->stats.stat, amount) +#define mi_heap_stat_counter_increase(heap,stat,amount) __mi_stat_counter_increase( &(heap)->tld->stats.stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) __mi_stat_increase( &(heap)->tld->stats.stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) __mi_stat_decrease( &(heap)->tld->stats.stat, amount) -#define mi_debug_tld_stat_counter_increase(tld,stat,amount) mi_debug_stat_counter_increase( (tld)->stats.stat, amount) -#define mi_debug_tld_stat_increase(tld,stat,amount) mi_debug_stat_increase( (tld)->stats.stat, amount) -#define mi_debug_tld_stat_decrease(tld,stat,amount) mi_debug_stat_decrease( (tld)->stats.stat, amount) - -#define mi_heap_stat_counter_increase(heap,stat,amount) mi_tld_stat_counter_increase((heap)->tld, stat, amount) -#define mi_heap_stat_increase(heap,stat,amount) mi_tld_stat_increase( (heap)->tld, stat, amount) -#define mi_heap_stat_decrease(heap,stat,amount) mi_tld_stat_decrease( (heap)->tld, stat, amount) - -#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_tld_stat_counter_increase((heap)->tld, stat, amount) -#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_tld_stat_increase( (heap)->tld, stat, amount) -#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_tld_stat_decrease( (heap)->tld, stat, amount) +#define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_stat_counter_increase( (heap)->tld->stats.stat, amount) +#define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_stat_increase( (heap)->tld->stats.stat, amount) +#define mi_debug_heap_stat_decrease(heap,stat,amount) mi_debug_stat_decrease( (heap)->tld->stats.stat, amount) // ------------------------------------------------------ diff --git a/src/arena-meta.c b/src/arena-meta.c index a5dc8e75..065a1331 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -148,11 +148,8 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); mi_bitmap_setN(&mpage->blocks_free, block_idx, block_count,NULL); } - else if (mi_memid_is_os(memid)) { - _mi_os_free(p, size, memid); - } else { - mi_assert_internal(mi_memid_needs_no_free(memid)); + _mi_arena_free(p,size,memid); } } diff --git a/src/arena.c b/src/arena.c index c4b02cf6..869cba49 100644 --- a/src/arena.c +++ b/src/arena.c @@ -762,8 +762,6 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block return page; } -static void mi_arena_free(void* p, size_t size, mi_memid_t memid); - void _mi_arena_page_free(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); @@ -794,7 +792,7 @@ void _mi_arena_page_free(mi_page_t* page) { if (page->memid.memkind == MI_MEM_ARENA) { 
mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); } - mi_arena_free(page, mi_memid_size(page->memid), page->memid); + _mi_arena_free(page, mi_memid_size(page->memid), page->memid); } /* ----------------------------------------------------------- @@ -920,7 +918,7 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); static void mi_arenas_try_purge(bool force, bool visit_all); -static void mi_arena_free(void* p, size_t size, mi_memid_t memid) { +void _mi_arena_free(void* p, size_t size, mi_memid_t memid) { if (p==NULL) return; if (size==0) return; diff --git a/src/heap.c b/src/heap.c index d82b383f..f47aaad9 100644 --- a/src/heap.c +++ b/src/heap.c @@ -213,8 +213,8 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint _mi_random_split(&heap->tld->heap_backing->random, &heap->random); } heap->cookie = _mi_heap_random_next(heap) | 1; - heap->keys[0] = _mi_heap_random_next(heap); - heap->keys[1] = _mi_heap_random_next(heap); + //heap->keys[0] = _mi_heap_random_next(heap); + //heap->keys[1] = _mi_heap_random_next(heap);*/ _mi_heap_guarded_init(heap); // push on the thread local heaps list @@ -227,7 +227,15 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena mi_assert(heap_tag >= 0 && heap_tag < 256); // allocate and initialize a heap mi_memid_t memid; - mi_heap_t* heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); + mi_heap_t* heap; + if (arena_id == _mi_arena_id_none()) { + heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); + } + else { + // heaps associated wita a specific arena are allocated in that arena + // note: takes up at least one slice which is quite wasteful... + heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), sizeof(mi_heap_t), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); + } if (heap==NULL) { _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); return NULL; diff --git a/src/init.c b/src/init.c index 1968ef68..2f147e55 100644 --- a/src/init.c +++ b/src/init.c @@ -115,7 +115,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { &tld_empty, // tld NULL, // exclusive_arena 0, // cookie - { 0, 0 }, // keys + //{ 0, 0 }, // keys { {0}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max @@ -149,9 +149,9 @@ static mi_decl_cache_align mi_tld_t tld_main = { mi_decl_cache_align mi_heap_t heap_main = { &tld_main, // thread local data + NULL, // exclusive arena 0, // initial cookie - 0, // arena id - { 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) + //{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!) 
{ {0x846ca68b}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max @@ -248,8 +248,8 @@ static void mi_heap_main_init(void) { _mi_random_init(&heap_main.random); #endif heap_main.cookie = _mi_heap_random_next(&heap_main); - heap_main.keys[0] = _mi_heap_random_next(&heap_main); - heap_main.keys[1] = _mi_heap_random_next(&heap_main); + //heap_main.keys[0] = _mi_heap_random_next(&heap_main); + //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); From 7d46478a5f7c16b078b7955df95d3801eb1d585d Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 13:19:06 -0800 Subject: [PATCH 118/264] add initial load/unload for heaps --- include/mimalloc.h | 8 ++++- src/arena.c | 22 +++++++----- src/heap.c | 83 ++++++++++++++++++++++++++++++++++++---------- 3 files changed, 86 insertions(+), 27 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 7a58e54c..b0a20e9e 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -326,7 +326,13 @@ mi_decl_export void mi_heap_guarded_set_size_bound(mi_heap_t* heap, size_t min, //mi_decl_export void mi_os_decommit(void* p, size_t size); mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size); -mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id); +mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id); +mi_decl_export bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena); +mi_decl_export void mi_heap_unload(mi_heap_t* heap); + +// Is a pointer contained in the given arena area? +mi_decl_export bool mi_arena_contains(mi_arena_id_t arena_id, const void* p); + // ------------------------------------------------------ // Convenience diff --git a/src/arena.c b/src/arena.c index 869cba49..aa3c9175 100644 --- a/src/arena.c +++ b/src/arena.c @@ -492,7 +492,6 @@ void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed? - req_arena == NULL && // not a specific arena? size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment { @@ -980,13 +979,21 @@ void _mi_arenas_collect(bool force_purge) { mi_arenas_try_purge(force_purge, force_purge /* visit all? */); } + +// Is a pointer contained in the given arena area? +bool mi_arena_contains(mi_arena_id_t arena_id, const void* p) { + mi_arena_t* arena = _mi_arena_from_id(arena_id); + return (mi_arena_start(arena) <= (const uint8_t*)p && + mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p); +} + // Is a pointer inside any of our arenas? 
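The new exported `mi_arena_contains(arena_id, p)` makes the per-arena range check available to callers of the public API (the internal `_mi_arena_contains` below still scans all arenas of the sub-process). A hedged usage sketch; the sizes and the expectation that small heap allocations land inside the exclusive arena are assumptions, and error handling is minimal:

#include <mimalloc.h>
#include <assert.h>

int main(void) {
  // reserve an exclusive arena and create a heap that allocates from it
  mi_arena_id_t arena_id;
  if (mi_reserve_os_memory_ex(64 * 1024 * 1024, true /*commit*/, false /*allow large*/,
                              true /*exclusive*/, &arena_id) != 0) return 1;
  mi_heap_t* heap = mi_heap_new_in_arena(arena_id);
  void* p = mi_heap_malloc(heap, 128);

  // a block from this heap is expected to lie inside the exclusive arena
  assert(mi_arena_contains(arena_id, p));

  mi_free(p);
  mi_heap_delete(heap);
  return 0;
}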
bool _mi_arena_contains(const void* p) { mi_subproc_t* subproc = _mi_subproc(); const size_t max_arena = mi_arenas_get_count(subproc); for (size_t i = 0; i < max_arena; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); - if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_slices(arena->slice_count) >(const uint8_t*)p) { + if (arena != NULL && mi_arena_contains(arena,p)) { return true; } } @@ -1636,7 +1643,7 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* return true; } -mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, mi_arena_id_t* arena_id) { +mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id) { // assume the memory area is already containing the arena if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } if (start == NULL || size == 0) return false; @@ -1658,13 +1665,10 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, bool is_committed, _mi_warning_message("the reloaded arena is not exclusive\n"); return false; } - arena->memid.is_pinned = is_large; - arena->memid.initially_committed = is_committed; - arena->memid.initially_zero = is_zero; + arena->is_exclusive = true; - arena->is_large = is_large; - arena->subproc = NULL; - if (!mi_arena_add(_mi_subproc(), arena, arena_id)) { + arena->subproc = _mi_subproc(); + if (!mi_arena_add(arena->subproc, arena, arena_id)) { return false; } mi_arena_pages_reregister(arena); diff --git a/src/heap.c b/src/heap.c index f47aaad9..03030b47 100644 --- a/src/heap.c +++ b/src/heap.c @@ -234,7 +234,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena else { // heaps associated wita a specific arena are allocated in that arena // note: takes up at least one slice which is quite wasteful... - heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), sizeof(mi_heap_t), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); + heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); } if (heap==NULL) { _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); @@ -280,7 +280,7 @@ static void mi_heap_reset_pages(mi_heap_t* heap) { } // called from `mi_heap_destroy` and `mi_heap_delete` to free the internal heap resources. 
-static void mi_heap_free(mi_heap_t* heap) { +static void mi_heap_free(mi_heap_t* heap, bool do_free_mem) { mi_assert(heap != NULL); mi_assert_internal(mi_heap_is_initialized(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; @@ -307,7 +307,9 @@ static void mi_heap_free(mi_heap_t* heap) { mi_assert_internal(heap->tld->heaps != NULL); // and free the used memory - _mi_meta_free(heap, sizeof(*heap), heap->memid); + if (do_free_mem) { + _mi_meta_free(heap, sizeof(*heap), heap->memid); + } } // return a heap on the same thread as `heap` specialized for the specified tag (if it exists) @@ -403,7 +405,7 @@ void mi_heap_destroy(mi_heap_t* heap) { #endif // free all pages _mi_heap_destroy_pages(heap); - mi_heap_free(heap); + mi_heap_free(heap,true); } #endif } @@ -462,20 +464,11 @@ void mi_heap_delete(mi_heap_t* heap) mi_assert_expensive(mi_heap_is_valid(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return; - /* - mi_heap_t* bheap = heap->tld->heap_backing; - if (bheap != heap && mi_heaps_are_compatible(bheap,heap)) { - // transfer still used pages to the backing heap - mi_heap_absorb(bheap, heap); - } - else - */ - { - // abandon all pages - _mi_heap_collect_abandon(heap); - } + // abandon all pages + _mi_heap_collect_abandon(heap); + mi_assert_internal(heap->page_count==0); - mi_heap_free(heap); + mi_heap_free(heap,true); } mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { @@ -489,7 +482,63 @@ mi_heap_t* mi_heap_set_default(mi_heap_t* heap) { } +/* ----------------------------------------------------------- + Load/unload heaps +----------------------------------------------------------- */ +void mi_heap_unload(mi_heap_t* heap) { + mi_assert(mi_heap_is_initialized(heap)); + mi_assert_expensive(mi_heap_is_valid(heap)); + if (heap==NULL || !mi_heap_is_initialized(heap)) return; + if (heap->exclusive_arena == NULL) { + _mi_warning_message("cannot unload heaps that are not associated with an exclusive arena\n"); + return; + } + + // abandon all pages so all thread'id in the pages are cleared + _mi_heap_collect_abandon(heap); + mi_assert_internal(heap->page_count==0); + // remove from heap list + mi_heap_free(heap, false /* but don't actually free the memory */); + + // disassociate from the current thread-local and static state + heap->tld = NULL; + return; +} + +bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) { + mi_assert(mi_heap_is_initialized(heap)); + if (heap==NULL || !mi_heap_is_initialized(heap)) return false; + if (heap->exclusive_arena == NULL) { + _mi_warning_message("cannot reload heaps that were not associated with an exclusive arena\n"); + return false; + } + if (heap->tld != NULL) { + _mi_warning_message("cannot reload heaps that were not unloaded first\n"); + return false; + } + mi_arena_t* arena = _mi_arena_from_id(arena_id); + if (heap->exclusive_arena != arena) { + _mi_warning_message("trying to reload a heap at a different arena address: %p vs %p\n", heap->exclusive_arena, arena); + return false; + } + + mi_assert_internal(heap->page_count==0); + + // re-associate from the current thread-local and static state + heap->tld = _mi_tld(); + + // reinit direct pages (as we may be in a different process) + mi_assert_internal(heap->page_count == 0); + for (int i = 0; i < MI_PAGES_DIRECT; i++) { + heap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty; + } + + // push on the thread local heaps list + heap->next = heap->tld->heaps; + heap->tld->heaps = heap; + return true; +} /* ----------------------------------------------------------- Analysis 
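Combined with `mi_arena_unload`/`mi_arena_reload` from the arena changes earlier in this patch, the new `mi_heap_unload`/`mi_heap_reload` let a heap that lives entirely inside an exclusive arena be detached and re-attached later (potentially from another process). A rough end-to-end sketch of the intended flow; it assumes the arena stays (or is mapped back) at the same address and that passing the full reserved size to `mi_arena_reload` is correct, so treat it as an illustration rather than a tested recipe:

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // 1. reserve an exclusive arena and put a heap (and its meta-data) inside it
  mi_arena_id_t arena_id;
  if (mi_reserve_os_memory_ex(64 * 1024 * 1024, true, false, true /*exclusive*/, &arena_id) != 0) return 1;
  mi_heap_t* heap = mi_heap_new_in_arena(arena_id);
  char* msg = (char*)mi_heap_malloc(heap, 32);
  snprintf(msg, 32, "hello");

  // 2. detach: abandon the heap's pages, then unload the arena memory
  mi_heap_unload(heap);
  void*  base = NULL;
  size_t accessed_size = 0;
  size_t full_size = 0;
  if (!mi_arena_unload(arena_id, &base, &accessed_size, &full_size)) return 1;
  // ... `base`/`accessed_size` could now be handed to another process or persisted ...

  // 3. re-attach: reload the arena (same address) and rebind the heap to it
  mi_arena_id_t arena_id2;
  if (!mi_arena_reload(base, full_size, &arena_id2)) return 1;
  if (!mi_heap_reload(heap, arena_id2)) return 1;

  printf("%s\n", msg);   // data inside the arena survived the round-trip
  mi_free(msg);
  mi_heap_delete(heap);
  return 0;
}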
From 108c84e858b7ee2aa2fd3f00de03afb879e89718 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 14:45:14 -0800 Subject: [PATCH 119/264] remove req_arena parameter to arena_reserve --- src/arena.c | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/src/arena.c b/src/arena.c index aa3c9175..af1f737e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -274,11 +274,8 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id); // try to reserve a fresh arena space -static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) +static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_large, mi_arena_id_t* arena_id) { - // if (_mi_preloading()) return false; // use OS only while pre loading - if (req_arena_id != _mi_arena_id_none()) return false; - const size_t arena_count = mi_arenas_get_count(subproc); if (arena_count > (MI_MAX_ARENAS - 4)) return false; @@ -443,7 +440,7 @@ static mi_decl_noinline void* mi_arenas_try_alloc( if (arena_count == mi_arenas_get_count(subproc)) { // we are the first to enter the lock, reserve a fresh arena mi_arena_id_t arena_id = 0; - mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, req_arena, &arena_id); + mi_arena_reserve(subproc, mi_size_of_slices(slice_count), allow_large, &arena_id); } else { // another thread already reserved a new arena From c138fba149d358465345ce0316c42d626afe1328 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 15:49:17 -0800 Subject: [PATCH 120/264] merge from dev --- src/arena-abandon.c | 346 -------------------------------------------- 1 file changed, 346 deletions(-) delete mode 100644 src/arena-abandon.c diff --git a/src/arena-abandon.c b/src/arena-abandon.c deleted file mode 100644 index 460c80fc..00000000 --- a/src/arena-abandon.c +++ /dev/null @@ -1,346 +0,0 @@ -/* ---------------------------------------------------------------------------- -Copyright (c) 2019-2024, Microsoft Research, Daan Leijen -This is free software; you can redistribute it and/or modify it under the -terms of the MIT license. A copy of the license can be found in the file -"LICENSE" at the root of this distribution. ------------------------------------------------------------------------------*/ - -#if !defined(MI_IN_ARENA_C) -#error "this file should be included from 'arena.c' (so mi_arena_t is visible)" -// add includes help an IDE -#include "mimalloc.h" -#include "mimalloc/internal.h" -#include "bitmap.h" -#endif - -// Minimal exports for arena-abandoned. -size_t mi_arena_id_index(mi_arena_id_t id); -mi_arena_t* mi_arena_from_index(size_t idx); -size_t mi_arena_get_count(void); -void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex); -bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index); - -/* ----------------------------------------------------------- - Abandoned blocks/segments: - - _mi_arena_segment_clear_abandoned - _mi_arena_segment_mark_abandoned - - This is used to atomically abandon/reclaim segments - (and crosses the arena API but it is convenient to have here). - - Abandoned segments still have live blocks; they get reclaimed - when a thread frees a block in it, or when a thread needs a fresh - segment. - - Abandoned segments are atomically marked in the `block_abandoned` - bitmap of arenas. 
Any segments allocated outside arenas are put - in the sub-process `abandoned_os_list`. This list is accessed - using locks but this should be uncommon and generally uncontended. - Reclaim and visiting either scan through the `block_abandoned` - bitmaps of the arena's, or visit the `abandoned_os_list` - - A potentially nicer design is to use arena's for everything - and perhaps have virtual arena's to map OS allocated memory - but this would lack the "density" of our current arena's. TBC. ------------------------------------------------------------ */ - - -// reclaim a specific OS abandoned segment; `true` on success. -// sets the thread_id. -static bool mi_arena_segment_os_clear_abandoned(mi_segment_t* segment, bool take_lock) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena, remove from list of abandoned os segments - mi_subproc_t* const subproc = segment->subproc; - if (take_lock && !mi_lock_try_acquire(&subproc->abandoned_os_lock)) { - return false; // failed to acquire the lock, we just give up - } - // remove atomically from the abandoned os list (if possible!) - bool reclaimed = false; - mi_segment_t* const next = segment->abandoned_os_next; - mi_segment_t* const prev = segment->abandoned_os_prev; - if (next != NULL || prev != NULL || subproc->abandoned_os_list == segment) { - #if MI_DEBUG>3 - // find ourselves in the abandoned list (and check the count) - bool found = false; - size_t count = 0; - for (mi_segment_t* current = subproc->abandoned_os_list; current != NULL; current = current->abandoned_os_next) { - if (current == segment) { found = true; } - count++; - } - mi_assert_internal(found); - mi_assert_internal(count == mi_atomic_load_relaxed(&subproc->abandoned_os_list_count)); - #endif - // remove (atomically) from the list and reclaim - if (prev != NULL) { prev->abandoned_os_next = next; } - else { subproc->abandoned_os_list = next; } - if (next != NULL) { next->abandoned_os_prev = prev; } - else { subproc->abandoned_os_list_tail = prev; } - segment->abandoned_os_next = NULL; - segment->abandoned_os_prev = NULL; - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - mi_atomic_decrement_relaxed(&subproc->abandoned_os_list_count); - if (take_lock) { // don't reset the thread_id when iterating - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - reclaimed = true; - } - if (take_lock) { mi_lock_release(&segment->subproc->abandoned_os_lock); } - return reclaimed; -} - -// reclaim a specific abandoned segment; `true` on success. -// sets the thread_id. -bool _mi_arena_segment_clear_abandoned(mi_segment_t* segment) { - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - return mi_arena_segment_os_clear_abandoned(segment, true /* take lock */); - } - // arena segment: use the blocks_abandoned bitmap. 
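The reclaim path that follows relies on the bitmap operations being atomic: exactly one thread can observe a bit flip from set to clear, and only that thread may take ownership of the abandoned segment. Below is a minimal standalone sketch of this test-and-clear / test-and-set pattern in C11 atomics; the single 64-bit field and the names try_reclaim/mark_abandoned are illustrative stand-ins, whereas the real code goes through _mi_bitmap_unclaim and _mi_bitmap_claim and additionally updates abandoned_count and the segment's thread_id.

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stdint.h>

  // one field of an abandoned-bitmap; bit i == 1 means "slot i holds an abandoned segment"
  typedef _Atomic uint64_t abandoned_field_t;

  // atomically clear bit `idx`; returns true only for the single thread
  // that flipped it from 1 to 0 (and may thus reclaim the segment)
  static bool try_reclaim(abandoned_field_t* field, unsigned idx) {
    const uint64_t mask = (uint64_t)1 << idx;
    const uint64_t prev = atomic_fetch_and_explicit(field, ~mask, memory_order_acq_rel);
    return (prev & mask) != 0;
  }

  // atomically set bit `idx` when abandoning; returns true if it was previously clear
  static bool mark_abandoned(abandoned_field_t* field, unsigned idx) {
    const uint64_t mask = (uint64_t)1 << idx;
    const uint64_t prev = atomic_fetch_or_explicit(field, mask, memory_order_acq_rel);
    return (prev & mask) == 0;
  }
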
- size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // reclaim atomically - bool was_marked = _mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx); - if (was_marked) { - mi_assert_internal(mi_atomic_load_acquire(&segment->thread_id) == 0); - mi_atomic_decrement_relaxed(&segment->subproc->abandoned_count); - mi_atomic_store_release(&segment->thread_id, _mi_thread_id()); - } - // mi_assert_internal(was_marked); - mi_assert_internal(!was_marked || _mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - return was_marked; -} - - -// mark a specific OS segment as abandoned -static void mi_arena_segment_os_mark_abandoned(mi_segment_t* segment) { - mi_assert(segment->memid.memkind != MI_MEM_ARENA); - // not in an arena; we use a list of abandoned segments - mi_subproc_t* const subproc = segment->subproc; - mi_lock(&subproc->abandoned_os_lock) { - // push on the tail of the list (important for the visitor) - mi_segment_t* prev = subproc->abandoned_os_list_tail; - mi_assert_internal(prev == NULL || prev->abandoned_os_next == NULL); - mi_assert_internal(segment->abandoned_os_prev == NULL); - mi_assert_internal(segment->abandoned_os_next == NULL); - if (prev != NULL) { prev->abandoned_os_next = segment; } - else { subproc->abandoned_os_list = segment; } - subproc->abandoned_os_list_tail = segment; - segment->abandoned_os_prev = prev; - segment->abandoned_os_next = NULL; - mi_atomic_increment_relaxed(&subproc->abandoned_os_list_count); - mi_atomic_increment_relaxed(&subproc->abandoned_count); - // and release the lock - } - return; -} - -// mark a specific segment as abandoned -// clears the thread_id. -void _mi_arena_segment_mark_abandoned(mi_segment_t* segment) -{ - mi_assert_internal(segment->used == segment->abandoned); - mi_atomic_store_release(&segment->thread_id, (uintptr_t)0); // mark as abandoned for multi-thread free's - if mi_unlikely(segment->memid.memkind != MI_MEM_ARENA) { - mi_arena_segment_os_mark_abandoned(segment); - return; - } - // segment is in an arena, mark it in the arena `blocks_abandoned` bitmap - size_t arena_idx; - size_t bitmap_idx; - mi_arena_memid_indices(segment->memid, &arena_idx, &bitmap_idx); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - mi_assert_internal(arena != NULL); - // set abandonment atomically - mi_subproc_t* const subproc = segment->subproc; // don't access the segment after setting it abandoned - const bool was_unmarked = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - if (was_unmarked) { mi_atomic_increment_relaxed(&subproc->abandoned_count); } - mi_assert_internal(was_unmarked); - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); -} - - -/* ----------------------------------------------------------- - Iterate through the abandoned blocks/segments using a cursor. - This is used for reclaiming and abandoned block visiting. 
------------------------------------------------------------ */ - -// start a cursor at a randomized arena -void _mi_arena_field_cursor_init(mi_heap_t* heap, mi_subproc_t* subproc, bool visit_all, mi_arena_field_cursor_t* current) { - mi_assert_internal(heap == NULL || heap->tld->segments.subproc == subproc); - current->bitmap_idx = 0; - current->subproc = subproc; - current->visit_all = visit_all; - current->hold_visit_lock = false; - const size_t abandoned_count = mi_atomic_load_relaxed(&subproc->abandoned_count); - const size_t abandoned_list_count = mi_atomic_load_relaxed(&subproc->abandoned_os_list_count); - const size_t max_arena = mi_arena_get_count(); - if (heap != NULL && heap->arena_id != _mi_arena_id_none()) { - // for a heap that is bound to one arena, only visit that arena - current->start = mi_arena_id_index(heap->arena_id); - current->end = current->start + 1; - current->os_list_count = 0; - } - else { - // otherwise visit all starting at a random location - if (abandoned_count > abandoned_list_count && max_arena > 0) { - current->start = (heap == NULL || max_arena == 0 ? 0 : (mi_arena_id_t)(_mi_heap_random_next(heap) % max_arena)); - current->end = current->start + max_arena; - } - else { - current->start = 0; - current->end = 0; - } - current->os_list_count = abandoned_list_count; // max entries to visit in the os abandoned list - } - mi_assert_internal(current->start <= max_arena); -} - -void _mi_arena_field_cursor_done(mi_arena_field_cursor_t* current) { - if (current->hold_visit_lock) { - mi_lock_release(¤t->subproc->abandoned_os_visit_lock); - current->hold_visit_lock = false; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_at(mi_arena_t* arena, mi_subproc_t* subproc, mi_bitmap_index_t bitmap_idx) { - // try to reclaim an abandoned segment in the arena atomically - if (!_mi_bitmap_unclaim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx)) return NULL; - mi_assert_internal(_mi_bitmap_is_claimed(arena->blocks_inuse, arena->field_count, 1, bitmap_idx)); - mi_segment_t* segment = (mi_segment_t*)mi_arena_block_start(arena, bitmap_idx); - mi_assert_internal(mi_atomic_load_relaxed(&segment->thread_id) == 0); - // check that the segment belongs to our sub-process - // note: this is the reason we need the `abandoned_visit` lock in the case abandoned visiting is enabled. - // without the lock an abandoned visit may otherwise fail to visit all abandoned segments in the sub-process. - // for regular reclaim it is fine to miss one sometimes so without abandoned visiting we don't need the `abandoned_visit` lock. - if (segment->subproc != subproc) { - // it is from another sub-process, re-mark it and continue searching - const bool was_zero = _mi_bitmap_claim(arena->blocks_abandoned, arena->field_count, 1, bitmap_idx, NULL); - mi_assert_internal(was_zero); MI_UNUSED(was_zero); - return NULL; - } - else { - // success, we unabandoned a segment in our sub-process - mi_atomic_decrement_relaxed(&subproc->abandoned_count); - return segment; - } -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_field(mi_arena_field_cursor_t* previous) { - const size_t max_arena = mi_arena_get_count(); - size_t field_idx = mi_bitmap_index_field(previous->bitmap_idx); - size_t bit_idx = mi_bitmap_index_bit_in_field(previous->bitmap_idx); - // visit arena's (from the previous cursor) - for (; previous->start < previous->end; previous->start++, field_idx = 0, bit_idx = 0) { - // index wraps around - size_t arena_idx = (previous->start >= max_arena ? 
previous->start % max_arena : previous->start); - mi_arena_t* arena = mi_arena_from_index(arena_idx); - if (arena != NULL) { - bool has_lock = false; - // visit the abandoned fields (starting at previous_idx) - for (; field_idx < arena->field_count; field_idx++, bit_idx = 0) { - size_t field = mi_atomic_load_relaxed(&arena->blocks_abandoned[field_idx]); - if mi_unlikely(field != 0) { // skip zero fields quickly - // we only take the arena lock if there are actually abandoned segments present - if (!has_lock && mi_option_is_enabled(mi_option_visit_abandoned)) { - has_lock = (previous->visit_all ? (mi_lock_acquire(&arena->abandoned_visit_lock),true) : mi_lock_try_acquire(&arena->abandoned_visit_lock)); - if (!has_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the visitor lock"); - } - // skip to next arena - break; - } - } - mi_assert_internal(has_lock || !mi_option_is_enabled(mi_option_visit_abandoned)); - // visit each set bit in the field (todo: maybe use `ctz` here?) - for (; bit_idx < MI_BITMAP_FIELD_BITS; bit_idx++) { - // pre-check if the bit is set - size_t mask = ((size_t)1 << bit_idx); - if mi_unlikely((field & mask) == mask) { - mi_bitmap_index_t bitmap_idx = mi_bitmap_index_create(field_idx, bit_idx); - mi_segment_t* const segment = mi_arena_segment_clear_abandoned_at(arena, previous->subproc, bitmap_idx); - if (segment != NULL) { - //mi_assert_internal(arena->blocks_committed == NULL || _mi_bitmap_is_claimed(arena->blocks_committed, arena->field_count, 1, bitmap_idx)); - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - previous->bitmap_idx = mi_bitmap_index_create_ex(field_idx, bit_idx + 1); // start at next one for the next iteration - return segment; - } - } - } - } - } - if (has_lock) { mi_lock_release(&arena->abandoned_visit_lock); } - } - } - return NULL; -} - -static mi_segment_t* mi_arena_segment_clear_abandoned_next_list(mi_arena_field_cursor_t* previous) { - // go through the abandoned_os_list - // we only allow one thread per sub-process to do to visit guarded by the `abandoned_os_visit_lock`. - // The lock is released when the cursor is released. - if (!previous->hold_visit_lock) { - previous->hold_visit_lock = (previous->visit_all ? 
(mi_lock_acquire(&previous->subproc->abandoned_os_visit_lock),true) - : mi_lock_try_acquire(&previous->subproc->abandoned_os_visit_lock)); - if (!previous->hold_visit_lock) { - if (previous->visit_all) { - _mi_error_message(EFAULT, "internal error: failed to visit all abandoned segments due to failure to acquire the OS visitor lock"); - } - return NULL; // we cannot get the lock, give up - } - } - // One list entry at a time - while (previous->os_list_count > 0) { - previous->os_list_count--; - mi_lock_acquire(&previous->subproc->abandoned_os_lock); // this could contend with concurrent OS block abandonment and reclaim from `free` - mi_segment_t* segment = previous->subproc->abandoned_os_list; - // pop from head of the list, a subsequent mark will push at the end (and thus we iterate through os_list_count entries) - if (segment == NULL || mi_arena_segment_os_clear_abandoned(segment, false /* we already have the lock */)) { - mi_lock_release(&previous->subproc->abandoned_os_lock); - return segment; - } - // already abandoned, try again - mi_lock_release(&previous->subproc->abandoned_os_lock); - } - // done - mi_assert_internal(previous->os_list_count == 0); - return NULL; -} - - -// reclaim abandoned segments -// this does not set the thread id (so it appears as still abandoned) -mi_segment_t* _mi_arena_segment_clear_abandoned_next(mi_arena_field_cursor_t* previous) { - if (previous->start < previous->end) { - // walk the arena - mi_segment_t* segment = mi_arena_segment_clear_abandoned_next_field(previous); - if (segment != NULL) { return segment; } - } - // no entries in the arena's anymore, walk the abandoned OS list - mi_assert_internal(previous->start == previous->end); - return mi_arena_segment_clear_abandoned_next_list(previous); -} - - -bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { - // (unfortunately) the visit_abandoned option must be enabled from the start. - // This is to avoid taking locks if abandoned list visiting is not required (as for most programs) - if (!mi_option_is_enabled(mi_option_visit_abandoned)) { - _mi_error_message(EFAULT, "internal error: can only visit abandoned blocks when MIMALLOC_VISIT_ABANDONED=ON"); - return false; - } - mi_arena_field_cursor_t current; - _mi_arena_field_cursor_init(NULL, _mi_subproc_from_id(subproc_id), true /* visit all (blocking) */, ¤t); - mi_segment_t* segment; - bool ok = true; - while (ok && (segment = _mi_arena_segment_clear_abandoned_next(¤t)) != NULL) { - ok = _mi_segment_visit_blocks(segment, heap_tag, visit_blocks, visitor, arg); - _mi_arena_segment_mark_abandoned(segment); - } - _mi_arena_field_cursor_done(¤t); - return ok; -} From da17a59bdb127e1bd5fdd1ecc3dbf8153e1ed4db Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 15:53:50 -0800 Subject: [PATCH 121/264] re-add deferred free and heap retired collect --- include/mimalloc/types.h | 1 + src/init.c | 2 ++ src/page.c | 13 ++++++++----- 3 files changed, 11 insertions(+), 5 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 0b084558..7009a017 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -400,6 +400,7 @@ struct mi_heap_s { size_t page_count; // total number of pages in the `pages` queues. size_t page_retired_min; // smallest retired index (retired pages are fully free, but still in the page queues) size_t page_retired_max; // largest retired index into the `pages` array. 
+ size_t generic_count; // how often is mimalloc_generic invoked? mi_heap_t* next; // list of heaps per thread long full_page_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages diff --git a/src/init.c b/src/init.c index 6bbea58e..5f3fb797 100644 --- a/src/init.c +++ b/src/init.c @@ -119,6 +119,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { { {0}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max + 0, // generic count NULL, // next 0, // full page retain false, // can reclaim @@ -155,6 +156,7 @@ mi_decl_cache_align mi_heap_t heap_main = { { {0x846ca68b}, {0}, 0, true }, // random 0, // page count MI_BIN_FULL, 0, // page retired min/max + 0, // generic count NULL, // next heap 2, // full page retain true, // allow page reclaim diff --git a/src/page.c b/src/page.c index 31dbcc7d..c366439e 100644 --- a/src/page.c +++ b/src/page.c @@ -872,11 +872,14 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al } mi_assert_internal(mi_heap_is_initialized(heap)); - // call potential deferred free routines - // _mi_deferred_free(heap, false); - - // free delayed frees from other threads (but skip contended ones) - // _mi_heap_delayed_free_partial(heap); + // collect every N generic mallocs + if (heap->generic_count++ > 10000) { + heap->generic_count = 0; + // call potential deferred free routines + _mi_deferred_free(heap, false); + // collect retired pages + _mi_heap_collect_retired(heap, false); + } // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); From d7d626cbfae73e22ab85d92a12feb76b9bf8f981 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 16:24:56 -0800 Subject: [PATCH 122/264] enable collecting from the full page queue --- src/heap.c | 23 ----------------------- src/page.c | 39 ++++++++++++++++++++++++++++++++++----- 2 files changed, 34 insertions(+), 28 deletions(-) diff --git a/src/heap.c b/src/heap.c index 03030b47..412c6465 100644 --- a/src/heap.c +++ b/src/heap.c @@ -102,14 +102,6 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t return true; // don't break } -//static bool mi_heap_page_never_delayed_free(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t* page, void* arg1, void* arg2) { -// MI_UNUSED(arg1); -// MI_UNUSED(arg2); -// MI_UNUSED(heap); -// MI_UNUSED(pq); -// _mi_page_use_delayed_free(page, MI_NEVER_DELAYED_FREE, false); -// return true; // don't break -//} static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) { @@ -121,21 +113,6 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // python/cpython#112532: we may be called from a thread that is not the owner of the heap // const bool is_main_thread = (_mi_is_main_thread() && heap->thread_id == _mi_thread_id()); - // note: never reclaim on collect but leave it to threads that need storage to reclaim - //if ( - //#ifdef NDEBUG - // collect == MI_FORCE - //#else - // collect >= MI_FORCE - //#endif - // && is_main_thread && mi_heap_is_backing(heap) && heap->allow_page_reclaim) - //{ - // // the main thread is abandoned (end-of-program), try to reclaim all abandoned segments. - // // if all memory is freed by now, all segments should be freed. 
- // // note: this only collects in the current subprocess - // _mi_arena_reclaim_all_abandoned(heap); - //} - // collect retired pages _mi_heap_collect_retired(heap, force); diff --git a/src/page.c b/src/page.c index c366439e..200cdaa9 100644 --- a/src/page.c +++ b/src/page.c @@ -433,6 +433,36 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) { } +static void mi_heap_collect_full_pages(mi_heap_t* heap) { + // note: normally full pages get immediately abandoned and the full queue is always empty + // this path is only used if abandoning is disabled due to a destroy-able heap or options + // set by the user. + mi_page_queue_t* pq = &heap->pages[MI_BIN_FULL]; + for (mi_page_t* page = pq->first; page != NULL; ) { + mi_page_t* next = page->next; // get next in case we free the page + _mi_page_free_collect(page, false); // register concurrent free's + // no longer full? + if (!mi_page_is_full(page)) { + if (mi_page_all_free(page)) { + _mi_page_free(page, pq); + } + else { + _mi_page_unfull(page); + } + } + page = next; + } +} + +static mi_decl_noinline void mi_heap_generic_collect(mi_heap_t* heap) { + // call potential deferred free routines + _mi_deferred_free(heap, false); + // collect retired pages + _mi_heap_collect_retired(heap, false); + // collect full pages that had concurrent free's + mi_heap_collect_full_pages(heap); +} + /* ----------------------------------------------------------- Initialize the initial free list in a page. In secure mode we initialize a randomized list by @@ -857,6 +887,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme } } + // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed. // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. 
// The `huge_alignment` is normally 0 but is set to a multiple of MI_SLICE_SIZE for @@ -873,17 +904,15 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al mi_assert_internal(mi_heap_is_initialized(heap)); // collect every N generic mallocs - if (heap->generic_count++ > 10000) { + if mi_unlikely(heap->generic_count++ > 10000) { heap->generic_count = 0; - // call potential deferred free routines - _mi_deferred_free(heap, false); - // collect retired pages - _mi_heap_collect_retired(heap, false); + mi_heap_generic_collect(heap); } // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more + mi_heap_generic_collect(heap); mi_heap_collect(heap, true /* force */); page = mi_find_page(heap, size, huge_alignment); } From 1e2221f5126fa3686cff9fd656842cf35059b4e6 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 19:28:53 -0800 Subject: [PATCH 123/264] fix signed/unsigned; fix heap_destroy assert failure --- src/heap.c | 3 ++- src/page-map.c | 13 +++++++++++-- 2 files changed, 13 insertions(+), 3 deletions(-) diff --git a/src/heap.c b/src/heap.c index 412c6465..a1b06c6b 100644 --- a/src/heap.c +++ b/src/heap.c @@ -340,6 +340,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ // mi_page_free(page,false); page->next = NULL; page->prev = NULL; + mi_page_set_heap(page, NULL); _mi_arena_page_free(page); return true; // keep going @@ -507,7 +508,7 @@ bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) { // reinit direct pages (as we may be in a different process) mi_assert_internal(heap->page_count == 0); - for (int i = 0; i < MI_PAGES_DIRECT; i++) { + for (size_t i = 0; i < MI_PAGES_DIRECT; i++) { heap->pages_free_direct[i] = (mi_page_t*)&_mi_page_empty; } diff --git a/src/page-map.c b/src/page-map.c index 7b74c711..d6517f72 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,6 +9,14 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" +// The page-map contains a byte for each 64kb slice in the address space. +// For an address `a` where `n = _mi_page_map[a >> 16]`: +// 0 = unused +// 1 = the slice at `a & ~0xFFFF` is a mimalloc page. +// 1 < n << 127 = the slice is part of a page, starting at `(((a>>16) - n - 1) << 16)`. +// +// 1 byte per slice => 1 GiB page map = 2^30 slices of 2^16 = 2^46 = 64 TiB address space. +// 4 GiB virtual for 256 TiB address space (48 bit) (and 64 KiB for 4 GiB address space (on 32-bit)). 
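To make the byte encoding above concrete: for a slice index idx = (uintptr_t)p >> 16, a stored byte ofs of zero means the slice is not part of a mimalloc page, while a non-zero ofs means the owning page starts ofs - 1 slices below idx. This matches the computation in _mi_ptr_page_ex elsewhere in this series; note that the `- n - 1` in the comment above does not quite match that code, which computes `idx - ofs + 1`. A standalone sketch of the lookup follows, where page_map and SLICE_SHIFT are hypothetical stand-ins for _mi_page_map and MI_ARENA_SLICE_SHIFT.

  #include <stdint.h>
  #include <stddef.h>

  #define SLICE_SHIFT 16                 // 64 KiB slices, as in MI_ARENA_SLICE_SHIFT

  extern uint8_t* page_map;              // one byte per slice; stand-in for _mi_page_map

  // return the start of the mimalloc page containing p, or NULL if p is not in a page
  static void* ptr_page(const void* p) {
    const size_t  idx = (size_t)((uintptr_t)p >> SLICE_SHIFT);
    const uint8_t ofs = page_map[idx];
    if (ofs == 0) return NULL;                                  // unused slice
    return (void*)((uintptr_t)(idx - ofs + 1) << SLICE_SHIFT);  // page starts ofs-1 slices earlier
  }
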
mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; @@ -24,10 +32,11 @@ bool _mi_page_map_init(void) { size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); if (vbits == 0) { vbits = _mi_os_virtual_address_bits(); + #if MI_ARCH_X64 if (vbits >= 48) { vbits = 47; } + #endif } - // 1 byte per block = 2 GiB for 128 TiB address space (48 bit = 256 TiB address space) - // 64 KiB for 4 GiB address space (on 32-bit) + mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); From 56cbddfc7e39ec0a4ea7585641bf333495b83604 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 23:08:52 -0800 Subject: [PATCH 124/264] initial work on a two-level page-map --- include/mimalloc/bits.h | 8 ++ include/mimalloc/internal.h | 64 +++++++++++++--- src/page-map.c | 143 +++++++++++++++++++++++++++++++++++- test/test-stress.c | 4 +- 4 files changed, 206 insertions(+), 13 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 32b9d528..fb6c2e8c 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -68,6 +68,14 @@ typedef int32_t mi_ssize_t; #define MI_MiB (MI_KiB*MI_KiB) #define MI_GiB (MI_MiB*MI_KiB) +#if MI_INTPTR_SIZE > 4 +#define MI_MAX_VABITS (48) +#define MI_PAGE_MAP_FLAT 0 +#else +#define MI_MAX_VABITS (32) +#define MI_PAGE_MAP_FLAT 1 +#endif + /* -------------------------------------------------------------------------------- Architecture diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 208989e3..dbc45133 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -422,6 +422,14 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) { return (heap != &_mi_heap_empty); } +static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) { + mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); + const size_t idx = _mi_wsize_from_size(size); + mi_assert_internal(idx < MI_PAGES_DIRECT); + return heap->pages_free_direct[idx]; +} + + //static inline uintptr_t _mi_ptr_cookie(const void* p) { // extern mi_heap_t _mi_heap_main; // mi_assert_internal(_mi_heap_main.cookie != 0); @@ -433,14 +441,9 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) { Pages ----------------------------------------------------------- */ -static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) { - mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE)); - const size_t idx = _mi_wsize_from_size(size); - mi_assert_internal(idx < MI_PAGES_DIRECT); - return heap->pages_free_direct[idx]; -} - +#if MI_PAGE_MAP_FLAT +// flat page-map committed on demand extern uint8_t* _mi_page_map; static inline uintptr_t _mi_page_map_index(const void* p) { @@ -465,16 +468,59 @@ static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) { static inline mi_page_t* _mi_checked_ptr_page(const void* p) { bool valid; - mi_page_t* const page = _mi_ptr_page_ex(p,&valid); + mi_page_t* const page = _mi_ptr_page_ex(p, &valid); return (valid ? 
page : NULL); } +static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { + return _mi_ptr_page_ex(p, NULL); +} + +#else + +// 2-level page map + +// one page-map directory = 64 KiB => covers 2^16 * 2^16 = 2^32 = 4 GiB address space +// the page-map needs 48-16-16 = 16 bits => 2^16 map directories = 2^16 * 2^3 = 2^19 = 512 KiB size. +// we commit the page-map directories on-demand. (2^16 * 2^16 = 2^32 ~= 4 GiB needed to cover 256 TeB) + +#define MI_PAGE_MAP_SUB_SHIFT (16) // 64 KiB +#define MI_PAGE_MAP_SUB_SIZE (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) +#define MI_PAGE_MAP_SHIFT (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT) +#define MI_PAGE_MAP_COUNT (MI_ZU(1) << MI_PAGE_MAP_SHIFT) + +extern uint8_t** _mi_page_map; + +static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) { + const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; + if (sub_idx != NULL) { *sub_idx = (uint32_t)u % MI_PAGE_MAP_SUB_SIZE; } + return (size_t)(u / MI_PAGE_MAP_COUNT); +} + +static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { + const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; + const uint8_t* const sub = _mi_page_map[u / MI_PAGE_MAP_COUNT]; + const uint8_t ofs = sub[(uint32_t)u % MI_PAGE_MAP_SUB_SIZE]; + return (mi_page_t*)((u - ofs + 1) * MI_ARENA_SLICE_SIZE); +} + +static inline mi_page_t* _mi_checked_ptr_page(const void* p) { + const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; + const uint8_t* const sub = _mi_page_map[u / MI_PAGE_MAP_COUNT]; + //if mi_unlikely(sub == NULL) { return NULL; } + const uint8_t ofs = sub[(uint32_t)u % MI_PAGE_MAP_SUB_SIZE]; + //if mi_unlikely(ofs == 0) { return NULL; } + return (mi_page_t*)((u - ofs + 1) * MI_ARENA_SLICE_SIZE); +} + +#endif + static inline mi_page_t* _mi_ptr_page(const void* p) { mi_assert_internal(p==NULL || mi_is_in_heap_region(p)); #if MI_DEBUG || defined(__APPLE__) return _mi_checked_ptr_page(p); #else - return _mi_ptr_page_ex(p,NULL); + return _mi_unchecked_ptr_page(p); #endif } diff --git a/src/page-map.c b/src/page-map.c index d6517f72..a814610f 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,6 +9,8 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" +#if MI_PAGE_MAP_FLAT + // The page-map contains a byte for each 64kb slice in the address space. // For an address `a` where `n = _mi_page_map[a >> 16]`: // 0 = unused @@ -17,6 +19,9 @@ terms of the MIT license. A copy of the license can be found in the file // // 1 byte per slice => 1 GiB page map = 2^30 slices of 2^16 = 2^46 = 64 TiB address space. // 4 GiB virtual for 256 TiB address space (48 bit) (and 64 KiB for 4 GiB address space (on 32-bit)). 
+ +// 1MiB = 2^20*2^16 = 2^36 = 64GiB address space +// 2^12 pointers = 2^15 k = 32k mi_decl_cache_align uint8_t* _mi_page_map = NULL; static bool mi_page_map_all_committed = false; static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; @@ -25,7 +30,7 @@ static mi_memid_t mi_page_map_memid; // (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) -static mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), +sstatic mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; bool _mi_page_map_init(void) { @@ -101,7 +106,7 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* void _mi_page_map_register(mi_page_t* page) { mi_assert_internal(page != NULL); - mi_assert_internal(_mi_is_aligned(page,MI_PAGE_ALIGN)); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! if mi_unlikely(_mi_page_map == NULL) { if (!_mi_page_map_init()) return; @@ -151,3 +156,137 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att return false; } } + +#else + +mi_decl_cache_align uint8_t** _mi_page_map = NULL; + +static void* mi_page_map_max_address = NULL; +static mi_memid_t mi_page_map_memid; + +bool _mi_page_map_init(void) { + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + if (vbits == 0) { + vbits = _mi_os_virtual_address_bits(); + mi_assert_internal(vbits <= MI_MAX_VABITS); + } + + mi_page_map_max_address = (void*)(MI_PU(1) << vbits); + const size_t os_page_size = _mi_os_page_size(); + const size_t page_map_size = _mi_align_up(MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT + MI_INTPTR_SHIFT), os_page_size); + const size_t reserve_size = page_map_size + (2 * MI_PAGE_MAP_SUB_SIZE); + _mi_page_map = (uint8_t**)_mi_os_alloc_aligned(reserve_size, 1, true /* commit */, true, &mi_page_map_memid); + if (_mi_page_map==NULL) { + _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", reserve_size / MI_KiB); + return false; + } + if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { + _mi_warning_message("the page map was committed but not zero initialized!\n"); + _mi_memzero_aligned(_mi_page_map, reserve_size); + } + + uint8_t* sub0 = (uint8_t*)_mi_page_map + page_map_size; + uint8_t* sub1 = sub0 + MI_PAGE_MAP_SUB_SIZE; + // initialize the first part so NULL pointers get resolved without an access violation + _mi_page_map[0] = sub0; + sub0[0] = 1; // so _mi_ptr_page(NULL) == NULL + // and initialize the 4GiB range where we were allocated + _mi_page_map[_mi_page_map_index(_mi_page_map,NULL)] = sub1; + + mi_assert_internal(_mi_ptr_page(NULL)==NULL); + return true; +} + +static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* sub_idx, size_t* slice_count) { + size_t page_size; + *page_start = mi_page_area(page, &page_size); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer + *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks + return _mi_page_map_index(page,sub_idx); +} + + +static inline void 
mi_page_map_set_range(size_t idx, size_t sub_idx, size_t slice_count, uint8_t (*set)(uint8_t ofs)) { + // is the page map area that contains the page address committed? + uint8_t ofs = 1; + while (slice_count > 0) { + uint8_t* sub = _mi_page_map[idx]; + if (sub == NULL) { + mi_memid_t memid; + sub = (uint8_t*)_mi_os_alloc(MI_PAGE_MAP_SUB_SIZE, &memid); + if (sub == NULL) { + _mi_error_message(EFAULT, "internal error: unable to extend the page map\n"); + return; // abort? + } + } + // set the offsets for the page + while (sub_idx < MI_PAGE_MAP_SUB_SIZE && slice_count > 0) { + sub[sub_idx] = set(ofs); + sub_idx++; + ofs++; + slice_count--; + } + sub_idx = 0; // potentially wrap around to the next idx + } +} + +static uint8_t set_ofs(uint8_t ofs) { + return ofs; +} + +void _mi_page_map_register(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! + if mi_unlikely(_mi_page_map == NULL) { + if (!_mi_page_map_init()) return; + } + mi_assert(_mi_page_map!=NULL); + uint8_t* page_start; + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &page_start, &sub_idx, &slice_count); + mi_page_map_set_range(idx, sub_idx, slice_count, &set_ofs); +} + +static uint8_t set_zero(uint8_t ofs) { + MI_UNUSED(ofs); + return 0; +} + + +void _mi_page_map_unregister(mi_page_t* page) { + mi_assert_internal(_mi_page_map != NULL); + // get index and count + uint8_t* page_start; + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &page_start, &sub_idx, &slice_count); + // unset the offsets + mi_page_map_set_range(idx, sub_idx, slice_count, &set_zero); +} + +void _mi_page_map_unregister_range(void* start, size_t size) { + const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); + size_t sub_idx; + const size_t idx = _mi_page_map_index(start, &sub_idx); + mi_page_map_set_range(idx, sub_idx, slice_count, &set_zero); +} + +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + + if mi_unlikely(p >= mi_page_map_max_address) return false; + size_t sub_idx; + const size_t idx = _mi_page_map_index(p, &sub_idx); + uint8_t* sub = _mi_page_map[idx]; + if (sub != NULL) { + return (sub[sub_idx] != 0); + } + else { + return false; + } +} + + +#endif + diff --git a/test/test-stress.c b/test/test-stress.c index 0920a02e..bbcded65 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -302,8 +302,8 @@ int main(int argc, char** argv) { mi_option_enable(mi_option_visit_abandoned); #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) - // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); - mi_option_set(mi_option_purge_delay,10); + mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + //mi_option_set(mi_option_purge_delay,10); #endif #ifndef USE_STD_MALLOC mi_stats_reset(); From c9b2d31665b9102114569ccf78be1328c2843fe7 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 21 Dec 2024 23:17:11 -0800 Subject: [PATCH 125/264] fix page_map initialization --- src/page-map.c | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/src/page-map.c b/src/page-map.c index a814610f..403be079 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -214,6 +214,12 @@ static inline void mi_page_map_set_range(size_t idx, size_t sub_idx, size_t slic if (sub == NULL) { mi_memid_t memid; sub = (uint8_t*)_mi_os_alloc(MI_PAGE_MAP_SUB_SIZE, &memid); + 
uint8_t* expect = NULL; + if (!mi_atomic_cas_strong_acq_rel(((_Atomic(uint8_t*)*)&_mi_page_map[idx]), &expect, sub)) { + _mi_os_free(sub, MI_PAGE_MAP_SUB_SIZE, memid); + sub = expect; + mi_assert_internal(sub!=NULL); + } if (sub == NULL) { _mi_error_message(EFAULT, "internal error: unable to extend the page map\n"); return; // abort? From 93fa8d895ad7366285782cf1f1259fe427c4d631 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 12:18:53 -0800 Subject: [PATCH 126/264] revert back to flat address map --- include/mimalloc/bits.h | 8 -- include/mimalloc/internal.h | 65 ++-------- src/free.c | 8 +- src/page-map.c | 248 ++++++++---------------------------- 4 files changed, 66 insertions(+), 263 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index fb6c2e8c..32b9d528 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -68,14 +68,6 @@ typedef int32_t mi_ssize_t; #define MI_MiB (MI_KiB*MI_KiB) #define MI_GiB (MI_MiB*MI_KiB) -#if MI_INTPTR_SIZE > 4 -#define MI_MAX_VABITS (48) -#define MI_PAGE_MAP_FLAT 0 -#else -#define MI_MAX_VABITS (32) -#define MI_PAGE_MAP_FLAT 1 -#endif - /* -------------------------------------------------------------------------------- Architecture diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index dbc45133..17c02941 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -169,6 +169,7 @@ bool _mi_page_map_init(void); void _mi_page_map_register(mi_page_t* page); void _mi_page_map_unregister(mi_page_t* page); void _mi_page_map_unregister_range(void* start, size_t size); +mi_page_t* _mi_safe_ptr_page(const void* p); // "page.c" void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; @@ -441,29 +442,18 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si Pages ----------------------------------------------------------- */ -#if MI_PAGE_MAP_FLAT - // flat page-map committed on demand extern uint8_t* _mi_page_map; -static inline uintptr_t _mi_page_map_index(const void* p) { - return (((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT); +static inline size_t _mi_page_map_index(const void* p) { + return (size_t)((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); } static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) { - #if 1 - const uintptr_t idx = _mi_page_map_index(p); + const size_t idx = _mi_page_map_index(p); const size_t ofs = _mi_page_map[idx]; - if (valid != NULL) *valid = (ofs != 0); - return (mi_page_t*)((idx - ofs + 1) << MI_ARENA_SLICE_SHIFT); - #else - const uintptr_t idx = _mi_page_map_index(p); - const uintptr_t up = idx << MI_ARENA_SLICE_SHIFT; - __builtin_prefetch((void*)up); - const size_t ofs = _mi_page_map[idx]; - if (valid != NULL) *valid = (ofs != 0); - return (mi_page_t*)(up - ((ofs - 1) << MI_ARENA_SLICE_SHIFT)); - #endif + if (valid != NULL) { *valid = (ofs != 0); } + return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) + 1 - ofs) << MI_ARENA_SLICE_SHIFT); } static inline mi_page_t* _mi_checked_ptr_page(const void* p) { @@ -476,49 +466,10 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { return _mi_ptr_page_ex(p, NULL); } -#else - -// 2-level page map - -// one page-map directory = 64 KiB => covers 2^16 * 2^16 = 2^32 = 4 GiB address space -// the page-map needs 48-16-16 = 16 bits => 2^16 map directories = 2^16 * 2^3 = 2^19 = 512 KiB size. -// we commit the page-map directories on-demand. 
(2^16 * 2^16 = 2^32 ~= 4 GiB needed to cover 256 TeB) - -#define MI_PAGE_MAP_SUB_SHIFT (16) // 64 KiB -#define MI_PAGE_MAP_SUB_SIZE (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) -#define MI_PAGE_MAP_SHIFT (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT) -#define MI_PAGE_MAP_COUNT (MI_ZU(1) << MI_PAGE_MAP_SHIFT) - -extern uint8_t** _mi_page_map; - -static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) { - const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; - if (sub_idx != NULL) { *sub_idx = (uint32_t)u % MI_PAGE_MAP_SUB_SIZE; } - return (size_t)(u / MI_PAGE_MAP_COUNT); -} - -static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { - const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; - const uint8_t* const sub = _mi_page_map[u / MI_PAGE_MAP_COUNT]; - const uint8_t ofs = sub[(uint32_t)u % MI_PAGE_MAP_SUB_SIZE]; - return (mi_page_t*)((u - ofs + 1) * MI_ARENA_SLICE_SIZE); -} - -static inline mi_page_t* _mi_checked_ptr_page(const void* p) { - const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; - const uint8_t* const sub = _mi_page_map[u / MI_PAGE_MAP_COUNT]; - //if mi_unlikely(sub == NULL) { return NULL; } - const uint8_t ofs = sub[(uint32_t)u % MI_PAGE_MAP_SUB_SIZE]; - //if mi_unlikely(ofs == 0) { return NULL; } - return (mi_page_t*)((u - ofs + 1) * MI_ARENA_SLICE_SIZE); -} - -#endif - static inline mi_page_t* _mi_ptr_page(const void* p) { mi_assert_internal(p==NULL || mi_is_in_heap_region(p)); #if MI_DEBUG || defined(__APPLE__) - return _mi_checked_ptr_page(p); + return _mi_checked_ptr_page(p); #else return _mi_unchecked_ptr_page(p); #endif @@ -637,7 +588,7 @@ static inline bool mi_page_immediate_available(const mi_page_t* page) { return (page->free != NULL); } - + // is the page not yet used up to its reserved space? static inline bool mi_page_is_expandable(const mi_page_t* page) { mi_assert_internal(page != NULL); diff --git a/src/free.c b/src/free.c index 88f784c7..d08123a2 100644 --- a/src/free.c +++ b/src/free.c @@ -145,14 +145,14 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); return NULL; } - #endif - mi_page_t* const page = _mi_ptr_page(p); - #if MI_DEBUG + mi_page_t* const page = _mi_safe_ptr_page(p); if (page == NULL && p != NULL) { _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); } - #endif return page; + #else + return _mi_ptr_page(p); + #endif } // Free a block diff --git a/src/page-map.c b/src/page-map.c index 403be079..a4001359 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,60 +9,61 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" -#if MI_PAGE_MAP_FLAT -// The page-map contains a byte for each 64kb slice in the address space. -// For an address `a` where `n = _mi_page_map[a >> 16]`: +// The page-map contains a byte for each 64kb slice in the address space. +// For an address `a` where `ofs = _mi_page_map[a >> 16]`: // 0 = unused // 1 = the slice at `a & ~0xFFFF` is a mimalloc page. -// 1 < n << 127 = the slice is part of a page, starting at `(((a>>16) - n - 1) << 16)`. -// -// 1 byte per slice => 1 GiB page map = 2^30 slices of 2^16 = 2^46 = 64 TiB address space. -// 4 GiB virtual for 256 TiB address space (48 bit) (and 64 KiB for 4 GiB address space (on 32-bit)). +// 1 < ofs <= 127 = the slice is part of a page, starting at `(((a>>16) - ofs - 1) << 16)`. 
+// +// 1 byte per slice => 1 TiB address space needs a 2^14 * 2^16 = 16 MiB page map. +// A full 256 TiB address space (48 bit) needs a 4 GiB page map. +// A full 4 GiB address space (32 bit) needs only a 64 KiB page map. -// 1MiB = 2^20*2^16 = 2^36 = 64GiB address space -// 2^12 pointers = 2^15 k = 32k mi_decl_cache_align uint8_t* _mi_page_map = NULL; -static bool mi_page_map_all_committed = false; -static size_t mi_page_map_entries_per_commit_bit = MI_ARENA_SLICE_SIZE; -static void* mi_page_map_max_address = NULL; -static mi_memid_t mi_page_map_memid; +static void* mi_page_map_max_address = NULL; +static mi_memid_t mi_page_map_memid; +#define MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT MI_ARENA_SLICE_SIZE +static mi_bitmap_t* mi_page_map_commit; // one bit per committed 64 KiB entries -// (note: we need to initialize statically or otherwise C++ may run a default constructors after process initialization) -sstatic mi_bitmap_t mi_page_map_commit = { MI_ATOMIC_VAR_INIT(MI_BITMAP_DEFAULT_CHUNK_COUNT), MI_ATOMIC_VAR_INIT(0), - { 0 }, { {MI_ATOMIC_VAR_INIT(0)} }, {{{ MI_ATOMIC_VAR_INIT(0) }}} }; +static void mi_page_map_ensure_committed(size_t idx, size_t slice_count); bool _mi_page_map_init(void) { - size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); if (vbits == 0) { vbits = _mi_os_virtual_address_bits(); - #if MI_ARCH_X64 + #if MI_ARCH_X64 // canonical address is limited to the first 128 TiB if (vbits >= 48) { vbits = 47; } #endif } - + + // Allocate the page map and commit bits mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); - - mi_page_map_entries_per_commit_bit = _mi_divide_up(page_map_size, MI_BITMAP_DEFAULT_BIT_COUNT); - // mi_bitmap_init(&mi_page_map_commit, MI_BITMAP_MIN_BIT_COUNT, true); - - mi_page_map_all_committed = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? - _mi_page_map = (uint8_t*)_mi_os_alloc_aligned(page_map_size, 1, mi_page_map_all_committed, true, &mi_page_map_memid); - if (_mi_page_map==NULL) { + const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); + const size_t bitmap_size = (commit ? 
0 : mi_bitmap_size(commit_bits, NULL)); + const size_t reserve_size = bitmap_size + page_map_size; + uint8_t* const base = (uint8_t*)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); + if (base==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); return false; } if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { - _mi_warning_message("the page map was committed but not zero initialized!\n"); - _mi_memzero_aligned(_mi_page_map, page_map_size); + _mi_warning_message("internal: the page map was committed but not zero initialized!\n"); + _mi_memzero_aligned(base, reserve_size); } + if (bitmap_size > 0) { + mi_page_map_commit = (mi_bitmap_t*)base; + _mi_os_commit(mi_page_map_commit, bitmap_size, NULL); + mi_bitmap_init(mi_page_map_commit, commit_bits, true); + } + _mi_page_map = base + bitmap_size; + // commit the first part so NULL pointers get resolved without an access violation - if (!mi_page_map_all_committed) { - bool is_zero; - _mi_os_commit(_mi_page_map, _mi_os_page_size(), &is_zero); - if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(_mi_page_map, _mi_os_page_size()); } + if (!commit) { + mi_page_map_ensure_committed(0, 1); } _mi_page_map[0] = 1; // so _mi_ptr_page(NULL) == NULL mi_assert_internal(_mi_ptr_page(NULL)==NULL); @@ -70,30 +71,31 @@ bool _mi_page_map_init(void) { } static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) { - // is the page map area that contains the page address committed? + // is the page map area that contains the page address committed? // we always set the commit bits so we can track what ranges are in-use. // we only actually commit if the map wasn't committed fully already. 
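The commit-on-demand scheme maps a range of page-map entries to a range of commit bits, one bit per MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT entries, and commits each block whose bit is still clear; if two threads race, a block may be committed twice, which is harmless. A condensed sketch of just the index arithmetic follows, with illustrative names; the real code additionally tests and sets bits in the commit bitmap and calls _mi_os_commit.

  #include <stddef.h>

  #define ENTRIES_PER_COMMIT_BIT (64*1024)   // stand-in for MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT

  // call commit_block(i) for every commit bit covering page-map entries [idx, idx+count);
  // a range can straddle several commit blocks
  static void ensure_committed(size_t idx, size_t count, void (*commit_block)(size_t bit)) {
    const size_t lo = idx / ENTRIES_PER_COMMIT_BIT;
    const size_t hi = (idx + count - 1) / ENTRIES_PER_COMMIT_BIT;
    for (size_t i = lo; i <= hi; i++) {
      commit_block(i);
    }
  }
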
- const size_t commit_bit_idx_lo = idx / mi_page_map_entries_per_commit_bit; - const size_t commit_bit_idx_hi = (idx + slice_count - 1) / mi_page_map_entries_per_commit_bit; - for (size_t i = commit_bit_idx_lo; i <= commit_bit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks - if (mi_bitmap_is_clearN(&mi_page_map_commit, i, 1)) { - // this may race, in which case we do multiple commits (which is ok) - if (!mi_page_map_all_committed) { + if (mi_page_map_commit != NULL) { + const size_t commit_idx = idx / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT; + const size_t commit_idx_hi = (idx + slice_count - 1) / MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT; + for (size_t i = commit_idx; i <= commit_idx_hi; i++) { // per bit to avoid crossing over bitmap chunks + if (mi_bitmap_is_clear(mi_page_map_commit, i)) { + // this may race, in which case we do multiple commits (which is ok) bool is_zero; - uint8_t* const start = _mi_page_map + (i*mi_page_map_entries_per_commit_bit); - const size_t size = mi_page_map_entries_per_commit_bit; + uint8_t* const start = _mi_page_map + (i * MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); + const size_t size = MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT; _mi_os_commit(start, size, &is_zero); - if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); } + if (!is_zero && !mi_page_map_memid.initially_zero) { _mi_memzero(start, size); } + mi_bitmap_set(mi_page_map_commit, i); } - mi_bitmap_set(&mi_page_map_commit, i); } } #if MI_DEBUG > 0 _mi_page_map[idx] = 0; _mi_page_map[idx+slice_count-1] = 0; - #endif + #endif } + static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) { size_t page_size; *page_start = mi_page_area(page, &page_size); @@ -102,8 +104,6 @@ static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* return _mi_page_map_index(page); } - - void _mi_page_map_register(mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); @@ -125,7 +125,6 @@ void _mi_page_map_register(mi_page_t* page) { } } - void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); // get index and count @@ -143,156 +142,17 @@ void _mi_page_map_unregister_range(void* start, size_t size) { _mi_memzero(&_mi_page_map[index], slice_count); } -mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - // if mi_unlikely(_mi_page_map==NULL) { // happens on macOS during loading - // _mi_page_map_init(); - // } - if mi_unlikely(p >= mi_page_map_max_address) return false; - uintptr_t idx = ((uintptr_t)p >> MI_ARENA_SLICE_SHIFT); - if (mi_page_map_all_committed || mi_bitmap_is_setN(&mi_page_map_commit, idx/mi_page_map_entries_per_commit_bit, 1)) { - return (_mi_page_map[idx] != 0); - } - else { - return false; - } -} -#else - -mi_decl_cache_align uint8_t** _mi_page_map = NULL; - -static void* mi_page_map_max_address = NULL; -static mi_memid_t mi_page_map_memid; - -bool _mi_page_map_init(void) { - size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); - if (vbits == 0) { - vbits = _mi_os_virtual_address_bits(); - mi_assert_internal(vbits <= MI_MAX_VABITS); - } - - mi_page_map_max_address = (void*)(MI_PU(1) << vbits); - const size_t os_page_size = _mi_os_page_size(); - const size_t page_map_size = _mi_align_up(MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT + MI_INTPTR_SHIFT), os_page_size); - const size_t reserve_size = page_map_size + (2 * MI_PAGE_MAP_SUB_SIZE); - _mi_page_map = 
(uint8_t**)_mi_os_alloc_aligned(reserve_size, 1, true /* commit */, true, &mi_page_map_memid); - if (_mi_page_map==NULL) { - _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", reserve_size / MI_KiB); - return false; - } - if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { - _mi_warning_message("the page map was committed but not zero initialized!\n"); - _mi_memzero_aligned(_mi_page_map, reserve_size); - } - - uint8_t* sub0 = (uint8_t*)_mi_page_map + page_map_size; - uint8_t* sub1 = sub0 + MI_PAGE_MAP_SUB_SIZE; - // initialize the first part so NULL pointers get resolved without an access violation - _mi_page_map[0] = sub0; - sub0[0] = 1; // so _mi_ptr_page(NULL) == NULL - // and initialize the 4GiB range where we were allocated - _mi_page_map[_mi_page_map_index(_mi_page_map,NULL)] = sub1; - - mi_assert_internal(_mi_ptr_page(NULL)==NULL); - return true; -} - -static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* sub_idx, size_t* slice_count) { - size_t page_size; - *page_start = mi_page_area(page, &page_size); - if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer - *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks - return _mi_page_map_index(page,sub_idx); -} - - -static inline void mi_page_map_set_range(size_t idx, size_t sub_idx, size_t slice_count, uint8_t (*set)(uint8_t ofs)) { - // is the page map area that contains the page address committed? - uint8_t ofs = 1; - while (slice_count > 0) { - uint8_t* sub = _mi_page_map[idx]; - if (sub == NULL) { - mi_memid_t memid; - sub = (uint8_t*)_mi_os_alloc(MI_PAGE_MAP_SUB_SIZE, &memid); - uint8_t* expect = NULL; - if (!mi_atomic_cas_strong_acq_rel(((_Atomic(uint8_t*)*)&_mi_page_map[idx]), &expect, sub)) { - _mi_os_free(sub, MI_PAGE_MAP_SUB_SIZE, memid); - sub = expect; - mi_assert_internal(sub!=NULL); - } - if (sub == NULL) { - _mi_error_message(EFAULT, "internal error: unable to extend the page map\n"); - return; // abort? - } - } - // set the offsets for the page - while (sub_idx < MI_PAGE_MAP_SUB_SIZE && slice_count > 0) { - sub[sub_idx] = set(ofs); - sub_idx++; - ofs++; - slice_count--; - } - sub_idx = 0; // potentially wrap around to the next idx - } -} - -static uint8_t set_ofs(uint8_t ofs) { - return ofs; -} - -void _mi_page_map_register(mi_page_t* page) { - mi_assert_internal(page != NULL); - mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! 
- if mi_unlikely(_mi_page_map == NULL) { - if (!_mi_page_map_init()) return; - } - mi_assert(_mi_page_map!=NULL); - uint8_t* page_start; - size_t slice_count; - size_t sub_idx; - const size_t idx = mi_page_map_get_idx(page, &page_start, &sub_idx, &slice_count); - mi_page_map_set_range(idx, sub_idx, slice_count, &set_ofs); -} - -static uint8_t set_zero(uint8_t ofs) { - MI_UNUSED(ofs); - return 0; -} - - -void _mi_page_map_unregister(mi_page_t* page) { - mi_assert_internal(_mi_page_map != NULL); - // get index and count - uint8_t* page_start; - size_t slice_count; - size_t sub_idx; - const size_t idx = mi_page_map_get_idx(page, &page_start, &sub_idx, &slice_count); - // unset the offsets - mi_page_map_set_range(idx, sub_idx, slice_count, &set_zero); -} - -void _mi_page_map_unregister_range(void* start, size_t size) { - const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); - size_t sub_idx; - const size_t idx = _mi_page_map_index(start, &sub_idx); - mi_page_map_set_range(idx, sub_idx, slice_count, &set_zero); +mi_page_t* _mi_safe_ptr_page(const void* p) { + if mi_unlikely(p >= mi_page_map_max_address) return NULL; + const uintptr_t idx = _mi_page_map_index(p); + if mi_unlikely(mi_page_map_commit == NULL || !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL; + const uintptr_t ofs = _mi_page_map[idx]; + if mi_unlikely(ofs == 0) return NULL; + return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) - ofs + 1) << MI_ARENA_SLICE_SHIFT); } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - - if mi_unlikely(p >= mi_page_map_max_address) return false; - size_t sub_idx; - const size_t idx = _mi_page_map_index(p, &sub_idx); - uint8_t* sub = _mi_page_map[idx]; - if (sub != NULL) { - return (sub[sub_idx] != 0); - } - else { - return false; - } + return (_mi_safe_ptr_page(p) != NULL); } - -#endif - From 8d16303aa6a6d25975f01569b71b7127a0a8d559 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 12:21:31 -0800 Subject: [PATCH 127/264] add -mtune=native with opt arch --- CMakeLists.txt | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index ebd02b20..07a292e0 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -91,7 +91,7 @@ endif() if (CMAKE_GENERATOR MATCHES "^Visual Studio.*$") message(STATUS "Note: when building with Visual Studio the build type is specified when building.") - message(STATUS "For example: 'cmake --build . --config=Release") + message(STATUS "For example: 'cmake --build . 
--config=Release") endif() if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$") @@ -401,9 +401,9 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel") endif() if(MI_OPT_ARCH) if(MI_ARCH STREQUAL "x64") - set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2") # fast bit scan (since 2013) + set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2;-mtune=native") # fast bit scan (since 2013) elseif(MI_ARCH STREQUAL "arm64") - set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics (since 2016) + set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a;-mtune=native") # fast atomics (since 2016) endif() endif() endif() @@ -557,7 +557,7 @@ if(MI_BUILD_SHARED) elseif(MI_ARCH STREQUAL "x64") set(MIMALLOC_REDIRECT_SUFFIX "") if(CMAKE_SYSTEM_PROCESSOR STREQUAL "ARM64") - message(STATUS "Note: x64 code emulated on Windows for arm64 should use an arm64ec build of 'mimalloc-override.dll'") + message(STATUS "Note: x64 code emulated on Windows for arm64 should use an arm64ec build of 'mimalloc-override.dll'") message(STATUS " with 'mimalloc-redirect-arm64ec.dll'. See the 'bin\\readme.md' for more information.") endif() elseif(MI_ARCH STREQUAL "x86") @@ -681,7 +681,7 @@ endif() # ----------------------------------------------------------------------------- if (MI_OVERRIDE) if (MI_BUILD_SHARED) - target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE) + target_compile_definitions(mimalloc PRIVATE MI_MALLOC_OVERRIDE) endif() if(NOT WIN32) # It is only possible to override malloc on Windows when building as a DLL. From 3c7d7e1f11eeca0dec9d48119ed22f40e63ae518 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 14:07:57 -0800 Subject: [PATCH 128/264] experiment with 2 level pagemap --- include/mimalloc/bits.h | 18 ++++ include/mimalloc/internal.h | 43 +++++++++- src/page-map.c | 162 ++++++++++++++++++++++++++++++++++++ 3 files changed, 222 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 32b9d528..ca0b5905 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -107,6 +107,24 @@ typedef int32_t mi_ssize_t; // Define big endian if needed // #define MI_BIG_ENDIAN 1 +#if MI_DEFAULT_VIRTUAL_ADDRESS_BITS > 0 +#define MI_MAX_VABITS MI_DEFAULT_VIRTUAL_ADDRESS_BITS +#elif MI_ARCH_X64 +#define MI_MAX_VABITS (47) +#elif MI_INTPTR_SIZE > 4 +#define MI_MAX_VABITS (48) +#else +#define MI_MAX_VABITS (32) +#endif + +#ifndef MI_PAGE_MAP_FLAT +#if MI_MAX_VABITS <= 40 +#define MI_PAGE_MAP_FLAT 1 +#else +#define MI_PAGE_MAP_FLAT 0 +#endif +#endif + /* -------------------------------------------------------------------------------- Builtin's diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 17c02941..8955db5e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -442,6 +442,8 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si Pages ----------------------------------------------------------- */ +#if MI_PAGE_MAP_FLAT + // flat page-map committed on demand extern uint8_t* _mi_page_map; @@ -466,10 +468,49 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { return _mi_ptr_page_ex(p, NULL); } +#else + +// 2-level page map + +// one sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space +// the page-map needs 48-16-13 = 19 bits => 2^19 sub map pointers = 4 MiB size. +// we commit the page-map and the sub maps on-demand. 
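Concretely, with 48-bit virtual addresses and 64 KiB slices: the low 16 bits of an address locate a byte inside a slice, the next 13 bits select one of the 2^13 pointers in a 64 KiB sub-map (which therefore covers 2^13 * 2^16 = 2^29 bytes = 512 MiB of address space), and the remaining 19 bits select the sub-map itself, giving 2^19 root pointers = 4 MiB. A standalone sketch of this split and the resulting lookup, mirroring _mi_page_map_index and _mi_unchecked_ptr_page with hypothetical names:

  #include <stdint.h>
  #include <stddef.h>

  #define SLICE_SHIFT 16                         // 64 KiB slices (MI_ARENA_SLICE_SHIFT)
  #define SUB_SHIFT   13                         // entries per sub-map (MI_PAGE_MAP_SUB_SHIFT)
  #define SUB_COUNT   ((size_t)1 << SUB_SHIFT)

  typedef struct page_s page_t;                  // stand-in for mi_page_t
  extern page_t*** page_map;                     // root table; stand-in for _mi_page_map

  // split an address into (root index, sub index)
  static size_t page_map_index(const void* p, size_t* sub_idx) {
    const uintptr_t u = (uintptr_t)p >> SLICE_SHIFT;   // slice index
    *sub_idx = (size_t)(u % SUB_COUNT);                // low 13 bits of the slice index
    return (size_t)(u / SUB_COUNT);                    // remaining bits select the sub-map
  }

  // unchecked lookup: assumes the sub-map covering p has been allocated
  static page_t* ptr_page(const void* p) {
    size_t sub_idx;
    const size_t idx = page_map_index(p, &sub_idx);
    return page_map[idx][sub_idx];
  }
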
+ +#define MI_PAGE_MAP_SUB_SHIFT (13) +#define MI_PAGE_MAP_SUB_COUNT (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) + +#define MI_PAGE_MAP_SHIFT (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT) +#define MI_PAGE_MAP_COUNT (MI_ZU(1) << MI_PAGE_MAP_SHIFT) + +extern mi_page_t*** _mi_page_map; + +static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) { + const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; + if (sub_idx != NULL) { *sub_idx = (uint32_t)u % MI_PAGE_MAP_SUB_COUNT; } + return (size_t)(u / MI_PAGE_MAP_SUB_COUNT); +} + +static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { + size_t sub_idx; + const size_t idx = _mi_page_map_index(p, &sub_idx); + return _mi_page_map[idx][sub_idx]; +} + +static inline mi_page_t* _mi_checked_ptr_page(const void* p) { + size_t sub_idx; + const size_t idx = _mi_page_map_index(p, &sub_idx); + mi_page_t** const sub = _mi_page_map[idx]; + if mi_unlikely(sub == NULL) return NULL; + return sub[sub_idx]; +} + +#endif + + static inline mi_page_t* _mi_ptr_page(const void* p) { mi_assert_internal(p==NULL || mi_is_in_heap_region(p)); #if MI_DEBUG || defined(__APPLE__) - return _mi_checked_ptr_page(p); + return _mi_checked_ptr_page(p); #else return _mi_unchecked_ptr_page(p); #endif diff --git a/src/page-map.c b/src/page-map.c index a4001359..99a9b60a 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -9,6 +9,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "mimalloc/internal.h" #include "bitmap.h" +#if MI_PAGE_MAP_FLAT // The page-map contains a byte for each 64kb slice in the address space. // For an address `a` where `ofs = _mi_page_map[a >> 16]`: @@ -156,3 +157,164 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att return (_mi_safe_ptr_page(p) != NULL); } +#else + +mi_decl_cache_align mi_page_t*** _mi_page_map; +static void* mi_page_map_max_address; +static mi_memid_t mi_page_map_memid; + +static _Atomic(mi_bfield_t) mi_page_map_commit; // one bit per committed 64 KiB entries + +static mi_page_t** mi_page_map_ensure_at(size_t idx); +static inline void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count); + +bool _mi_page_map_init(void) { + size_t vbits = (size_t)mi_option_get_clamp(mi_option_max_vabits, 0, MI_SIZE_BITS); + if (vbits == 0) { + vbits = _mi_os_virtual_address_bits(); + #if MI_ARCH_X64 // canonical address is limited to the first 128 TiB + if (vbits >= 48) { vbits = 47; } + #endif + } + + // Allocate the page map and commit bits + mi_page_map_max_address = (void*)(MI_PU(1) << vbits); + const size_t page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT)); + const size_t os_page_size = _mi_os_page_size(); + const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size); + const size_t reserve_size = page_map_size + os_page_size; + const bool commit = page_map_size <= 64*MI_KiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap); // _mi_os_has_overcommit(); // commit on-access on Linux systems? 
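+  // reserve the root map plus one extra OS page; that extra page is used further below as the
+  // (single committed page of the) sub-map backing the NULL address range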
+ _mi_page_map = (mi_page_t***)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); + if (_mi_page_map==NULL) { + _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); + return false; + } + if (mi_page_map_memid.initially_committed && !mi_page_map_memid.initially_zero) { + _mi_warning_message("internal: the page map was committed but not zero initialized!\n"); + _mi_memzero_aligned(_mi_page_map, page_map_size); + } + mi_atomic_store_release(&mi_page_map_commit, (commit ? ~0 : (mi_bfield_t)0)); + + // commit the first part so NULL pointers get resolved without an access violation + mi_page_map_ensure_at(0); + + // note: for the NULL range we only commit one OS page + // mi_page_map_set_range(NULL, 0, 0, 1); + _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); + if (!mi_page_map_memid.initially_committed) { + _mi_os_commit(_mi_page_map[0], os_page_size, NULL); + } + _mi_page_map[0][0] = NULL; + + mi_assert_internal(_mi_ptr_page(NULL)==NULL); + return true; +} + +static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) { + mi_bfield_t commit = mi_atomic_load_relaxed(&mi_page_map_commit); + const size_t bit_idx = (idx*MI_INTPTR_SIZE)/MI_ARENA_SLICE_SIZE; // we commit a slice of entries at a time + mi_assert_internal(bit_idx < MI_BFIELD_BITS); + if (pbit_idx != NULL) { *pbit_idx = bit_idx; } + return ((commit & (MI_ZU(1) << bit_idx)) != 0); +} + +static mi_page_t** mi_page_map_ensure_committed(size_t idx) { + size_t bit_idx; + if mi_unlikely(!mi_page_map_is_committed(idx, &bit_idx)) { + uint8_t* start = (uint8_t*)_mi_page_map + (bit_idx * MI_ARENA_SLICE_SIZE); + _mi_os_commit(start, MI_ARENA_SLICE_SIZE, NULL); + mi_atomic_or_acq_rel(&mi_page_map_commit, MI_ZU(1) << bit_idx); + } + return _mi_page_map[idx]; +} + +static mi_page_t** mi_page_map_ensure_at(size_t idx) { + mi_page_t** sub = mi_page_map_ensure_committed(idx); + if mi_unlikely(sub == NULL) { + // sub map not yet allocated, alloc now + mi_memid_t memid; + sub = (mi_page_t**)_mi_os_alloc(MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), &memid); + mi_page_t** expect = NULL; + if (!mi_atomic_cas_strong_acq_rel(((_Atomic(mi_page_t**)*)&_mi_page_map[idx]), &expect, sub)) { + // another thread already allocated it.. free and continue + _mi_os_free(sub, MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*), memid); + sub = expect; + mi_assert_internal(sub!=NULL); + } + if (sub == NULL) { + _mi_error_message(EFAULT, "internal error: unable to extend the page map\n"); + } + } + return sub; +} + +static void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count) { + // is the page map area that contains the page address committed? 
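+  // walk the range one sub-map at a time: ensure each sub-map is committed and allocated,
+  // fill its entries with `page`, then continue at the next root index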
+ while (slice_count > 0) { + mi_page_t** sub = mi_page_map_ensure_at(idx); + // set the offsets for the page + while (sub_idx < MI_PAGE_MAP_SUB_COUNT) { + sub[sub_idx] = page; + slice_count--; if (slice_count == 0) return; + sub_idx++; + } + idx++; // potentially wrap around to the next idx + sub_idx = 0; + } +} + +static size_t mi_page_map_get_idx(mi_page_t* page, size_t* sub_idx, size_t* slice_count) { + size_t page_size; + uint8_t* page_start = mi_page_area(page, &page_size); + if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; } // furthest interior pointer + *slice_count = mi_slice_count_of_size(page_size) + ((page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE); // add for large aligned blocks + return _mi_page_map_index(page, sub_idx); +} + +void _mi_page_map_register(mi_page_t* page) { + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); // should be initialized before multi-thread access! + if mi_unlikely(_mi_page_map == NULL) { + if (!_mi_page_map_init()) return; + } + mi_assert(_mi_page_map!=NULL); + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); + mi_page_map_set_range(page, idx, sub_idx, slice_count); +} + +void _mi_page_map_unregister(mi_page_t* page) { + mi_assert_internal(_mi_page_map != NULL); + // get index and count + size_t slice_count; + size_t sub_idx; + const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); + // unset the offsets + mi_page_map_set_range(page, idx, sub_idx, slice_count); +} + +void _mi_page_map_unregister_range(void* start, size_t size) { + const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); + size_t sub_idx; + const uintptr_t idx = _mi_page_map_index(start, &sub_idx); + mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not already committed? +} + + +mi_page_t* _mi_safe_ptr_page(const void* p) { + if mi_unlikely(p >= mi_page_map_max_address) return NULL; + size_t sub_idx; + const size_t idx = _mi_page_map_index(p,&sub_idx); + if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL; + mi_page_t** const sub = _mi_page_map[idx]; + if mi_unlikely(sub==NULL) return NULL; + return sub[sub_idx]; +} + +mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { + return (_mi_safe_ptr_page(p) != NULL); +} + +#endif From a42a2a926b5fd68a40bd7b75d1362d5c1f4e7d1b Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 14:18:33 -0800 Subject: [PATCH 129/264] improving level 2 page-map --- include/mimalloc/internal.h | 11 ++++++----- src/page-map.c | 17 ++++++++++++----- 2 files changed, 18 insertions(+), 10 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 8955db5e..5dc2074d 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -470,11 +470,12 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { #else -// 2-level page map - -// one sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space -// the page-map needs 48-16-13 = 19 bits => 2^19 sub map pointers = 4 MiB size. -// we commit the page-map and the sub maps on-demand. +// 2-level page map: +// The page-map is usually 4 MiB and points to sub maps of 64 KiB. 
+// The page-map is committed on-demand (in 64 KiB) parts (and sub-maps are committed on-demand as well) +// One sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space +// The page-map needs 48-16-13 = 19 bits => 2^19 sub map pointers = 4 MiB size. +// (Choosing a MI_PAGE_MAP_SUB_SHIFT of 16 gives slightly better code but will commit the initial sub-map at 512 KiB) #define MI_PAGE_MAP_SUB_SHIFT (13) #define MI_PAGE_MAP_SUB_COUNT (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) diff --git a/src/page-map.c b/src/page-map.c index 99a9b60a..5a25b839 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -159,11 +159,13 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att #else +// A 2-level page map + mi_decl_cache_align mi_page_t*** _mi_page_map; static void* mi_page_map_max_address; static mi_memid_t mi_page_map_memid; -static _Atomic(mi_bfield_t) mi_page_map_commit; // one bit per committed 64 KiB entries +static _Atomic(mi_bfield_t) mi_page_map_commit; static mi_page_t** mi_page_map_ensure_at(size_t idx); static inline void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count); @@ -178,8 +180,10 @@ bool _mi_page_map_init(void) { } // Allocate the page map and commit bits + mi_assert(MI_MAX_VABITS >= vbits); mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT)); + mi_assert(page_map_count <= MI_PAGE_MAP_COUNT); const size_t os_page_size = _mi_os_page_size(); const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size); const size_t reserve_size = page_map_size + os_page_size; @@ -193,7 +197,7 @@ bool _mi_page_map_init(void) { _mi_warning_message("internal: the page map was committed but not zero initialized!\n"); _mi_memzero_aligned(_mi_page_map, page_map_size); } - mi_atomic_store_release(&mi_page_map_commit, (commit ? ~0 : (mi_bfield_t)0)); + mi_atomic_store_release(&mi_page_map_commit, (commit ? 
~MI_ZU(0) : MI_ZU(0))); // commit the first part so NULL pointers get resolved without an access violation mi_page_map_ensure_at(0); @@ -210,9 +214,12 @@ bool _mi_page_map_init(void) { return true; } + +#define MI_PAGE_MAP_ENTRIES_PER_CBIT (MI_PAGE_MAP_COUNT / MI_BFIELD_BITS) + static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) { mi_bfield_t commit = mi_atomic_load_relaxed(&mi_page_map_commit); - const size_t bit_idx = (idx*MI_INTPTR_SIZE)/MI_ARENA_SLICE_SIZE; // we commit a slice of entries at a time + const size_t bit_idx = idx/MI_PAGE_MAP_ENTRIES_PER_CBIT; mi_assert_internal(bit_idx < MI_BFIELD_BITS); if (pbit_idx != NULL) { *pbit_idx = bit_idx; } return ((commit & (MI_ZU(1) << bit_idx)) != 0); @@ -221,8 +228,8 @@ static inline bool mi_page_map_is_committed(size_t idx, size_t* pbit_idx) { static mi_page_t** mi_page_map_ensure_committed(size_t idx) { size_t bit_idx; if mi_unlikely(!mi_page_map_is_committed(idx, &bit_idx)) { - uint8_t* start = (uint8_t*)_mi_page_map + (bit_idx * MI_ARENA_SLICE_SIZE); - _mi_os_commit(start, MI_ARENA_SLICE_SIZE, NULL); + uint8_t* start = (uint8_t*)&_mi_page_map[bit_idx * MI_PAGE_MAP_ENTRIES_PER_CBIT]; + _mi_os_commit(start, MI_PAGE_MAP_ENTRIES_PER_CBIT * sizeof(mi_page_t**), NULL); mi_atomic_or_acq_rel(&mi_page_map_commit, MI_ZU(1) << bit_idx); } return _mi_page_map[idx]; From c5cfc92f0cc8809d7fdd5e86c67321d90dd33a04 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 14:39:57 -0800 Subject: [PATCH 130/264] small fixes --- include/mimalloc/bits.h | 2 ++ src/arena-meta.c | 2 +- src/page-map.c | 2 +- 3 files changed, 4 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index ca0b5905..ed4a7b44 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -107,6 +107,7 @@ typedef int32_t mi_ssize_t; // Define big endian if needed // #define MI_BIG_ENDIAN 1 +// maximum virtual address bits in a user-space pointer #if MI_DEFAULT_VIRTUAL_ADDRESS_BITS > 0 #define MI_MAX_VABITS MI_DEFAULT_VIRTUAL_ADDRESS_BITS #elif MI_ARCH_X64 @@ -117,6 +118,7 @@ typedef int32_t mi_ssize_t; #define MI_MAX_VABITS (32) #endif +// use a flat page-map (or a 2-level one) #ifndef MI_PAGE_MAP_FLAT #if MI_MAX_VABITS <= 40 #define MI_PAGE_MAP_FLAT 1 diff --git a/src/arena-meta.c b/src/arena-meta.c index 065a1331..fcfb680c 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -25,7 +25,7 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE #define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN -#define MI_META_BLOCK_SIZE (64) +#define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit) #define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE #define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 #define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE) diff --git a/src/page-map.c b/src/page-map.c index 5a25b839..190be6c0 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -147,7 +147,7 @@ void _mi_page_map_unregister_range(void* start, size_t size) { mi_page_t* _mi_safe_ptr_page(const void* p) { if mi_unlikely(p >= mi_page_map_max_address) return NULL; const uintptr_t idx = _mi_page_map_index(p); - if mi_unlikely(mi_page_map_commit == NULL || !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL; + if mi_unlikely(mi_page_map_commit != NULL && !mi_bitmap_is_set(mi_page_map_commit, idx/MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT)) return NULL; const uintptr_t ofs = _mi_page_map[idx]; if mi_unlikely(ofs == 0) return NULL; return (mi_page_t*)((((uintptr_t)p >> MI_ARENA_SLICE_SHIFT) - ofs + 1) << MI_ARENA_SLICE_SHIFT); From 516e644359685d38d035e76b1ac7d40df0c22edc Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 16:06:49 -0800 Subject: [PATCH 131/264] rename option pagemap_commit; always commit the page map on macos (for now) --- ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc.h | 4 ++-- include/mimalloc/bits.h | 2 +- include/mimalloc/internal.h | 3 +++ src/alloc.c | 2 +- src/options.c | 11 ++++++++++- src/page-map.c | 4 ++-- test/main-override-static.c | 2 +- 8 files changed, 21 insertions(+), 9 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 87e866bb..2c4477d9 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_SECURE=4;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc.h b/include/mimalloc.h index b0a20e9e..8bff8923 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -398,8 +398,8 @@ typedef enum mi_option_e { mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_full_page_retain, // retain N full pages per size class (=2) mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) - mi_option_max_vabits, // max virtual address bits to consider in user space (=48) - mi_option_debug_commit_full_pagemap, // commit the full pagemap to catch invalid pointer uses (=0) + mi_option_max_vabits, // max user space virtual address bits to consider (=48) + mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index ed4a7b44..875f6230 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -120,7 +120,7 @@ typedef int32_t mi_ssize_t; // use a flat page-map (or a 2-level one) #ifndef MI_PAGE_MAP_FLAT -#if MI_MAX_VABITS <= 40 +#if MI_MAX_VABITS <= 40 && !defined(__APPLE__) #define MI_PAGE_MAP_FLAT 1 #else #define MI_PAGE_MAP_FLAT 0 diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 5dc2074d..9146896c 100644 --- 
a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -445,6 +445,7 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si #if MI_PAGE_MAP_FLAT // flat page-map committed on demand +// single indirection and low commit, but large initial virtual reserve (4 GiB with 48 bit virtual addresses) extern uint8_t* _mi_page_map; static inline size_t _mi_page_map_index(const void* p) { @@ -471,6 +472,8 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { #else // 2-level page map: +// double indirection but low commit and low virtual reserve. +// // The page-map is usually 4 MiB and points to sub maps of 64 KiB. // The page-map is committed on-demand (in 64 KiB) parts (and sub-maps are committed on-demand as well) // One sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space diff --git a/src/alloc.c b/src/alloc.c index e5f2b8ae..6b037987 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -674,7 +674,7 @@ mi_decl_restrict void* _mi_heap_malloc_guarded(mi_heap_t* heap, size_t size, boo #if MI_STAT>1 mi_heap_stat_increase(heap, malloc, mi_usable_size(p)); #endif - _mi_stat_counter_increase(&heap->tld->stats.guarded_alloc_count, 1); + mi_heap_stat_counter_increase(heap, guarded_alloc_count, 1); } #if MI_DEBUG>3 if (p != NULL && zero) { diff --git a/src/options.c b/src/options.c index 4f1a00b8..fc3a2838 100644 --- a/src/options.c +++ b/src/options.c @@ -102,6 +102,14 @@ typedef struct mi_option_desc_s { #endif #endif +#ifndef MI_DEFAULT_PAGEMAP_COMMIT +#if defined(__APPLE__) +#define MI_DEFAULT_PAGEMAP_COMMIT 1 +#else +#define MI_DEFAULT_PAGEMAP_COMMIT 0 +#endif +#endif + static mi_option_desc_t options[_mi_option_last] = { @@ -165,7 +173,8 @@ static mi_option_desc_t options[_mi_option_last] = { 2, UNINIT, MI_OPTION(full_page_retain) }, { 4, UNINIT, MI_OPTION(max_page_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, - { 0, UNINIT, MI_OPTION(debug_commit_full_pagemap) }, + { MI_DEFAULT_PAGEMAP_COMMIT, + UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page-map.c b/src/page-map.c index 190be6c0..37ce3082 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -42,7 +42,7 @@ bool _mi_page_map_init(void) { // Allocate the page map and commit bits mi_page_map_max_address = (void*)(MI_PU(1) << vbits); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); - const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_pagemap_commit)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); const size_t bitmap_size = (commit ? 0 : mi_bitmap_size(commit_bits, NULL)); const size_t reserve_size = bitmap_size + page_map_size; @@ -187,7 +187,7 @@ bool _mi_page_map_init(void) { const size_t os_page_size = _mi_os_page_size(); const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size); const size_t reserve_size = page_map_size + os_page_size; - const bool commit = page_map_size <= 64*MI_KiB || mi_option_is_enabled(mi_option_debug_commit_full_pagemap); // _mi_os_has_overcommit(); // commit on-access on Linux systems? 
+ const bool commit = page_map_size <= 64*MI_KiB || mi_option_is_enabled(mi_option_pagemap_commit); // _mi_os_has_overcommit(); // commit on-access on Linux systems? _mi_page_map = (mi_page_t***)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); diff --git a/test/main-override-static.c b/test/main-override-static.c index 410764bd..b16864db 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -35,7 +35,7 @@ int main() { // corrupt_free(); // block_overflow1(); // block_overflow2(); - // test_canary_leak(); + test_canary_leak(); // test_aslr(); // invalid_free(); // test_reserved(); From 773fe7ae5b914821a1d201fd47b2e12870516f5a Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 17:25:58 -0800 Subject: [PATCH 132/264] support full secure build --- ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc/types.h | 12 ++++++----- src/arena-meta.c | 38 ++++++++++++++++++++++---------- src/arena.c | 43 ++++++++++++++++++++++++++++++------- src/os.c | 4 ++-- src/page.c | 18 ++++++++-------- src/prim/unix/prim.c | 2 +- 7 files changed, 82 insertions(+), 37 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 2c4477d9..dc112272 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_SECURE=4;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=1;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 7009a017..84179458 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -46,11 +46,13 @@ terms of the MIT license. A copy of the license can be found in the file // Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). // #define MI_STAT 1 -// Define MI_SECURE to enable security mitigations -// #define MI_SECURE 1 // guard page around metadata -// #define MI_SECURE 2 // guard page around each mimalloc page -// #define MI_SECURE 3 // encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) -// #define MI_SECURE 4 // checks for double free. (may be more expensive) +// Define MI_SECURE to enable security mitigations. The lowest two have minimal performance impact: +// #define MI_SECURE 1 // guard page around metadata +// #define MI_SECURE 2 // guard page around each mimalloc page (can fragment VMA's with large heaps..) +// +// The next two levels can have more performance cost: +// #define MI_SECURE 3 // randomize allocations, encode free lists (detect corrupted free list (buffer overflow), and invalid pointer free) +// #define MI_SECURE 4 // checks for double free. (may be more expensive) #if !defined(MI_SECURE) #define MI_SECURE 0 diff --git a/src/arena-meta.c b/src/arena-meta.c index fcfb680c..a916706b 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -25,6 +25,12 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE #define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN +#if MI_SECURE +#define MI_META_PAGE_GUARD_SIZE (4*MI_KiB) +#else +#define MI_META_PAGE_GUARD_SIZE (0) +#endif + #define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit) #define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE #define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 @@ -41,7 +47,7 @@ static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_V #if MI_DEBUG > 1 static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) { - mi_meta_page_t* mpage = (mi_meta_page_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN); + mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + MI_META_PAGE_GUARD_SIZE); if (block_idx != NULL) { *block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE; } @@ -54,9 +60,9 @@ static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) { } static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { - mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); + mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - MI_META_PAGE_GUARD_SIZE, MI_META_PAGE_ALIGN)); mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE); - void* p = ((uint8_t*)mpage + (block_idx * MI_META_BLOCK_SIZE)); + void* p = ((uint8_t*)mpage - MI_META_PAGE_GUARD_SIZE + (block_idx * MI_META_BLOCK_SIZE)); mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL)); return p; } @@ -66,22 +72,32 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. mi_memid_t memid; - mi_meta_page_t* mpage = (mi_meta_page_t*)_mi_arena_alloc_aligned(_mi_subproc(), MI_ARENA_SLICE_SIZE, MI_ARENA_SLICE_ALIGN, 0, - true /* commit*/, true /* allow large */, + uint8_t* base = (uint8_t*)_mi_arena_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0, + true /* commit*/, (MI_SECURE==0) /* allow large? 
*/, NULL /* req arena */, 0 /* thread_seq */, &memid); - if (mpage == NULL) return NULL; - mi_assert_internal(_mi_is_aligned(mpage,MI_META_PAGE_ALIGN)); + if (base == NULL) return NULL; + mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { - _mi_memzero_aligned(mpage, MI_ARENA_SLICE_SIZE); + _mi_memzero_aligned(base, MI_ARENA_SLICE_SIZE); } - // initialize the page + // guard pages + #if MI_SECURE + if (!memid.is_pinned) { + _mi_os_decommit(base, MI_META_PAGE_GUARD_SIZE); + _mi_os_decommit(base + MI_META_PAGE_SIZE - MI_META_PAGE_GUARD_SIZE, MI_META_PAGE_GUARD_SIZE); + } + #endif + + // initialize the page and free block bitmap + mi_meta_page_t* mpage = (mi_meta_page_t*)(base + MI_META_PAGE_GUARD_SIZE); mpage->memid = memid; mi_bitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); - mi_assert_internal(info_blocks < MI_META_BLOCKS_PER_PAGE); - mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks); + const size_t guard_blocks = _mi_divide_up(MI_META_PAGE_GUARD_SIZE, MI_META_BLOCK_SIZE); + mi_assert_internal(info_blocks + 2*guard_blocks < MI_META_BLOCKS_PER_PAGE); + mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks + guard_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks - 2*guard_blocks); // push atomically in front of the meta page list // (note: there is no ABA issue since we never free meta-pages) diff --git a/src/arena.c b/src/arena.c index 0cea5776..aa8ba416 100644 --- a/src/arena.c +++ b/src/arena.c @@ -577,10 +577,16 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t return NULL; } +#if MI_SECURE < 2 +#define MI_ARENA_GUARD_PAGE_SIZE (0) +#else +#define MI_ARENA_GUARD_PAGE_SIZE (4*MI_KiB) +#endif + static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, mi_arena_t* req_arena, size_t tseq) { - const bool allow_large = true; + const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page const bool commit = true; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t page_alignment = MI_ARENA_SLICE_ALIGN; @@ -615,6 +621,14 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); + // guard page at the end + const size_t page_body_size = mi_size_of_slices(slice_count) - MI_ARENA_GUARD_PAGE_SIZE; + #if MI_SECURE >= 2 + if (memid.initially_committed && !memid.is_pinned) { + _mi_os_decommit((uint8_t*)page + page_body_size, MI_ARENA_GUARD_PAGE_SIZE); + } + #endif + // claimed free slices: initialize the page partly if (!memid.initially_zero) { mi_track_mem_undefined(page, slice_count * MI_ARENA_SLICE_SIZE); @@ -625,7 +639,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ } #if MI_DEBUG > 1 if (memid.initially_zero) { - if (!mi_mem_is_zero(page, mi_size_of_slices(slice_count))) { + if (!mi_mem_is_zero(page, page_body_size)) { _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n"); memid.initially_zero = false; _mi_memzero_aligned(page, sizeof(*page)); @@ -655,7 +669,7 @@ static mi_page_t* 
mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ // otherwise start after the info block_start = mi_page_info_size(); } - const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size); + const size_t reserved = (os_align ? 1 : (page_body_size - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; @@ -708,7 +722,7 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); - const size_t slice_count = mi_slice_count_of_size(info_size + block_size); + const size_t slice_count = mi_slice_count_of_size(info_size + block_size + MI_ARENA_GUARD_PAGE_SIZE); mi_page_t* page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); if (page == NULL) return NULL; @@ -717,6 +731,7 @@ static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, si mi_assert(page->reserved == 1); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); + _mi_page_init(heap, page); return page; } @@ -774,6 +789,13 @@ void _mi_arena_page_free(mi_page_t* page) { } #endif + // recommit guard page at the end? + #if MI_SECURE >= 2 + if (!page->memid.is_pinned) { + _mi_os_commit((uint8_t*)page + mi_memid_size(page->memid) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE, NULL); + } + #endif + // unregister page _mi_page_map_unregister(page); if (page->memid.memkind == MI_MEM_ARENA) { @@ -1114,12 +1136,17 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_arena_t* arena = (mi_arena_t*)start; // commit & zero if needed - bool is_zero = memid.initially_zero; + const size_t os_page_size = _mi_os_page_size(); if (!memid.initially_committed) { - _mi_os_commit(arena, mi_size_of_slices(info_slices), NULL); + // security: always leave a guard OS page decommitted at the end (already part of info_slices) + _mi_os_commit(arena, mi_size_of_slices(info_slices) - os_page_size, NULL); } - if (!is_zero) { - _mi_memzero(arena, mi_size_of_slices(info_slices)); + else if (!memid.is_pinned) { + // security: decommit a guard OS page at the end of the arena info + _mi_os_decommit((uint8_t*)arena + mi_size_of_slices(info_slices) - os_page_size, os_page_size); + } + if (!memid.initially_zero) { + _mi_memzero(arena, mi_size_of_slices(info_slices) - os_page_size); } // init diff --git a/src/os.c b/src/os.c index 53e8f571..80d44d12 100644 --- a/src/os.c +++ b/src/os.c @@ -536,8 +536,8 @@ static uint8_t* mi_os_claim_huge_pages(size_t pages, size_t* total_size) { start = huge_start; if (start == 0) { // Initialize the start address after the 32TiB area - start = ((uintptr_t)32 << 40); // 32TiB virtual start address - #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode + start = ((uintptr_t)8 << 40); // 8TiB virtual start address + #if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of huge pages unless in debug mode uintptr_t r = _mi_heap_random_next(mi_prim_get_default_heap()); start = start + ((uintptr_t)MI_HUGE_OS_PAGE_SIZE * ((r>>17) & 0x0FFF)); // (randomly 12bits)*1GiB == between 0 to 4TiB #endif diff --git a/src/page.c b/src/page.c index 200cdaa9..6030161a 100644 --- 
a/src/page.c +++ b/src/page.c @@ -82,7 +82,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) { mi_assert_internal(mi_page_block_size(page) > 0); mi_assert_internal(page->used <= page->capacity); mi_assert_internal(page->capacity <= page->reserved); - + // const size_t bsize = mi_page_block_size(page); // uint8_t* start = mi_page_start(page); //mi_assert_internal(start + page->capacity*page->block_size == page->top); @@ -475,7 +475,7 @@ static mi_decl_noinline void mi_heap_generic_collect(mi_heap_t* heap) { static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) { MI_UNUSED(stats); - #if (MI_SECURE<=2) + #if (MI_SECURE<3) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); #endif @@ -533,7 +533,7 @@ static void mi_page_free_list_extend_secure(mi_heap_t* const heap, mi_page_t* co static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, const size_t bsize, const size_t extend, mi_stats_t* const stats) { MI_UNUSED(stats); - #if (MI_SECURE <= 2) + #if (MI_SECURE<3) mi_assert_internal(page->free == NULL); mi_assert_internal(page->local_free == NULL); #endif @@ -561,7 +561,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co ----------------------------------------------------------- */ #define MI_MAX_EXTEND_SIZE (4*1024) // heuristic, one OS page seems to work well. -#if (MI_SECURE>0) +#if (MI_SECURE>=3) #define MI_MIN_EXTEND (8*MI_SECURE) // extend at least by this many #else #define MI_MIN_EXTEND (1) @@ -574,7 +574,7 @@ static mi_decl_noinline void mi_page_free_list_extend( mi_page_t* const page, co // extra test in malloc? or cache effects?) static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_expensive(mi_page_is_valid_init(page)); - #if (MI_SECURE<=2) + #if (MI_SECURE<3) mi_assert(page->free == NULL); mi_assert(page->local_free == NULL); if (page->free != NULL) return; @@ -605,7 +605,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(extend < (1UL<<16)); // and append the extend the free list - if (extend < MI_MIN_SLICES || MI_SECURE==0) { //!mi_option_is_enabled(mi_option_secure)) { + if (extend < MI_MIN_SLICES || MI_SECURE<3) { //!mi_option_is_enabled(mi_option_secure)) { mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats ); } else { @@ -621,7 +621,7 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { mi_assert(page != NULL); mi_page_set_heap(page, heap); - + size_t page_size; uint8_t* page_start = mi_page_area(page, &page_size); MI_UNUSED(page_start); mi_track_mem_noaccess(page_start,page_size); @@ -653,7 +653,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif mi_assert_internal(page->block_size_shift == 0 || (mi_page_block_size(page) == ((size_t)1 << page->block_size_shift))); mi_assert_expensive(mi_page_is_valid_init(page)); - + // initialize an initial free list mi_page_extend_free(heap,page); mi_assert(mi_page_immediate_available(page)); @@ -740,7 +740,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m } // for each page mi_debug_heap_stat_counter_increase(heap, searches, count); - + // set the page to the best candidate if (page_candidate != NULL) { page = page_candidate; diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index eb351f69..b47fff90 100644 --- a/src/prim/unix/prim.c +++ 
b/src/prim/unix/prim.c @@ -412,7 +412,7 @@ int _mi_prim_decommit(void* start, size_t size, bool* needs_recommit) { int err = 0; // decommit: use MADV_DONTNEED as it decreases rss immediately (unlike MADV_FREE) err = unix_madvise(start, size, MADV_DONTNEED); - #if !MI_DEBUG && !MI_SECURE + #if !MI_DEBUG && MI_SECURE<=2 *needs_recommit = false; #else *needs_recommit = true; From 9ecadaecd5c04f6ddd7597d665f42329b9c502ab Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 17:55:56 -0800 Subject: [PATCH 133/264] clean up --- src/arena.c | 35 +++++++++++++++++++---------------- 1 file changed, 19 insertions(+), 16 deletions(-) diff --git a/src/arena.c b/src/arena.c index aa8ba416..b9fbef05 100644 --- a/src/arena.c +++ b/src/arena.c @@ -583,6 +583,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t #define MI_ARENA_GUARD_PAGE_SIZE (4*MI_KiB) #endif +// Allocate a fresh page static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, mi_arena_t* req_arena, size_t tseq) { @@ -622,10 +623,10 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); // guard page at the end - const size_t page_body_size = mi_size_of_slices(slice_count) - MI_ARENA_GUARD_PAGE_SIZE; + const size_t page_noguard_size = mi_size_of_slices(slice_count) - MI_ARENA_GUARD_PAGE_SIZE; #if MI_SECURE >= 2 if (memid.initially_committed && !memid.is_pinned) { - _mi_os_decommit((uint8_t*)page + page_body_size, MI_ARENA_GUARD_PAGE_SIZE); + _mi_os_decommit((uint8_t*)page + page_noguard_size, MI_ARENA_GUARD_PAGE_SIZE); } #endif @@ -639,7 +640,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ } #if MI_DEBUG > 1 if (memid.initially_zero) { - if (!mi_mem_is_zero(page, page_body_size)) { + if (!mi_mem_is_zero(page, page_noguard_size)) { _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n"); memid.initially_zero = false; _mi_memzero_aligned(page, sizeof(*page)); @@ -669,7 +670,7 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ // otherwise start after the info block_start = mi_page_info_size(); } - const size_t reserved = (os_align ? 1 : (page_body_size - block_start) / block_size); + const size_t reserved = (os_align ? 1 : (page_noguard_size - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; @@ -695,7 +696,8 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ return page; } -static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size_t block_size) { +// Allocate a regular small/medium/large page. 
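+// (The slice count is chosen by the caller, _mi_arena_page_alloc below, from the page kind
+//  via mi_slice_count_of_size(MI_SMALL/MEDIUM/LARGE_PAGE_SIZE).)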
+static mi_page_t* mi_arena_page_regular_alloc(mi_heap_t* heap, size_t slice_count, size_t block_size) { mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; @@ -716,21 +718,22 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size return NULL; } - -static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { +// Allocate a page containing one block (very large, or with large alignment) +static mi_page_t* mi_arena_page_singleton_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); - const size_t slice_count = mi_slice_count_of_size(info_size + block_size + MI_ARENA_GUARD_PAGE_SIZE); + #if MI_ARENA_GUARD_PAGE_SIZE == 0 + const size_t slice_count = mi_slice_count_of_size(info_size + block_size); + #else + const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, MI_ARENA_GUARD_PAGE_SIZE) + MI_ARENA_GUARD_PAGE_SIZE); + #endif mi_page_t* page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); if (page == NULL) return NULL; - mi_assert(page != NULL); mi_assert(page->reserved == 1); - mi_assert_internal(_mi_ptr_page(page)==page); - mi_assert_internal(_mi_ptr_page(mi_page_start(page))==page); _mi_page_init(heap, page); return page; @@ -741,19 +744,19 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block mi_page_t* page; if mi_unlikely(block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { mi_assert_internal(_mi_is_power_of_two(block_alignment)); - page = mi_singleton_page_alloc(heap, block_size, block_alignment); + page = mi_arena_page_singleton_alloc(heap, block_size, block_alignment); } else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); } else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } else { - page = mi_singleton_page_alloc(heap, block_size, block_alignment); + page = mi_arena_page_singleton_alloc(heap, block_size, block_alignment); } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); From db82baf1a8f2952d83c6df91bee9cca4a463e0eb Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 18:09:16 -0800 Subject: [PATCH 134/264] cleanup, some renaming --- include/mimalloc/internal.h | 241 ++++++++++++++++++------------------ src/arena-meta.c | 8 +- src/arena.c | 58 ++++----- src/free.c | 8 +- src/heap.c | 4 +- src/init.c | 2 +- src/page.c | 6 +- 7 files changed, 159 insertions(+), 168 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 9146896c..041e7653 100644 --- a/include/mimalloc/internal.h +++ 
b/include/mimalloc/internal.h @@ -57,171 +57,168 @@ terms of the MIT license. A copy of the license can be found in the file #endif // "libc.c" -#include -void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); -void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); -char _mi_toupper(char c); -int _mi_strnicmp(const char* s, const char* t, size_t n); -void _mi_strlcpy(char* dest, const char* src, size_t dest_size); -void _mi_strlcat(char* dest, const char* src, size_t dest_size); -size_t _mi_strlen(const char* s); -size_t _mi_strnlen(const char* s, size_t max_len); -bool _mi_getenv(const char* name, char* result, size_t result_size); +#include +void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); +void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...); +char _mi_toupper(char c); +int _mi_strnicmp(const char* s, const char* t, size_t n); +void _mi_strlcpy(char* dest, const char* src, size_t dest_size); +void _mi_strlcat(char* dest, const char* src, size_t dest_size); +size_t _mi_strlen(const char* s); +size_t _mi_strnlen(const char* s, size_t max_len); +bool _mi_getenv(const char* name, char* result, size_t result_size); // "options.c" -void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); -void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); -void _mi_warning_message(const char* fmt, ...); -void _mi_verbose_message(const char* fmt, ...); -void _mi_trace_message(const char* fmt, ...); -void _mi_output_message(const char* fmt, ...); -void _mi_options_init(void); -long _mi_option_get_fast(mi_option_t option); -void _mi_error_message(int err, const char* fmt, ...); +void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message); +void _mi_fprintf(mi_output_fun* out, void* arg, const char* fmt, ...); +void _mi_warning_message(const char* fmt, ...); +void _mi_verbose_message(const char* fmt, ...); +void _mi_trace_message(const char* fmt, ...); +void _mi_output_message(const char* fmt, ...); +void _mi_options_init(void); +long _mi_option_get_fast(mi_option_t option); +void _mi_error_message(int err, const char* fmt, ...); // random.c -void _mi_random_init(mi_random_ctx_t* ctx); -void _mi_random_init_weak(mi_random_ctx_t* ctx); -void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); -void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); -uintptr_t _mi_random_next(mi_random_ctx_t* ctx); -uintptr_t _mi_heap_random_next(mi_heap_t* heap); -uintptr_t _mi_os_random_weak(uintptr_t extra_seed); +void _mi_random_init(mi_random_ctx_t* ctx); +void _mi_random_init_weak(mi_random_ctx_t* ctx); +void _mi_random_reinit_if_weak(mi_random_ctx_t * ctx); +void _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx); +uintptr_t _mi_random_next(mi_random_ctx_t* ctx); +uintptr_t _mi_heap_random_next(mi_heap_t* heap); +uintptr_t _mi_os_random_weak(uintptr_t extra_seed); static inline uintptr_t _mi_random_shuffle(uintptr_t x); // init.c extern mi_decl_cache_align const mi_page_t _mi_page_empty; -void _mi_process_load(void); +void _mi_process_load(void); void mi_cdecl _mi_process_done(void); -bool _mi_is_redirected(void); -bool _mi_allocator_init(const char** message); -void _mi_allocator_done(void); -bool _mi_is_main_thread(void); -size_t _mi_current_thread_count(void); -bool _mi_preloading(void); // true while the C runtime is not initialized yet -void _mi_thread_done(mi_heap_t* heap); +bool _mi_is_redirected(void); +bool _mi_allocator_init(const 
char** message); +void _mi_allocator_done(void); +bool _mi_is_main_thread(void); +size_t _mi_current_thread_count(void); +bool _mi_preloading(void); // true while the C runtime is not initialized yet +void _mi_thread_done(mi_heap_t* heap); -mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` +mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` mi_subproc_t* _mi_subproc(void); mi_subproc_t* _mi_subproc_main(void); +mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; size_t _mi_thread_seq_id(void) mi_attr_noexcept; - -mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap -mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); void _mi_heap_guarded_init(mi_heap_t* heap); // os.c -void _mi_os_init(void); // called from process init -void* _mi_os_alloc(size_t size, mi_memid_t* memid); -void* _mi_os_zalloc(size_t size, mi_memid_t* memid); -void _mi_os_free(void* p, size_t size, mi_memid_t memid); -void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); +void _mi_os_init(void); // called from process init +void* _mi_os_alloc(size_t size, mi_memid_t* memid); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid); +void _mi_os_free(void* p, size_t size, mi_memid_t memid); +void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); -size_t _mi_os_page_size(void); -size_t _mi_os_good_alloc_size(size_t size); -bool _mi_os_has_overcommit(void); -bool _mi_os_has_virtual_reserve(void); -size_t _mi_os_virtual_address_bits(void); +size_t _mi_os_page_size(void); +size_t _mi_os_good_alloc_size(size_t size); +bool _mi_os_has_overcommit(void); +bool _mi_os_has_virtual_reserve(void); +size_t _mi_os_virtual_address_bits(void); -bool _mi_os_reset(void* addr, size_t size); -bool _mi_os_commit(void* p, size_t size, bool* is_zero); -bool _mi_os_decommit(void* addr, size_t size); -bool _mi_os_protect(void* addr, size_t size); -bool _mi_os_unprotect(void* addr, size_t size); -bool _mi_os_purge(void* p, size_t size); -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); +bool _mi_os_reset(void* addr, size_t size); +bool _mi_os_commit(void* p, size_t size, bool* is_zero); +bool _mi_os_decommit(void* addr, size_t size); +bool _mi_os_protect(void* addr, size_t size); +bool _mi_os_unprotect(void* addr, size_t size); +bool _mi_os_purge(void* p, size_t size); +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); -void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); -void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); +void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); +void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); -void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); -bool _mi_os_use_large_page(size_t size, size_t alignment); -size_t _mi_os_large_page_size(void); +void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size); +bool _mi_os_use_large_page(size_t size, size_t alignment); +size_t _mi_os_large_page_size(void); -void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); +void* 
_mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_secs, size_t* pages_reserved, size_t* psize, mi_memid_t* memid); // arena.c mi_arena_id_t _mi_arena_id_none(void); -mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); +mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); -void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); -void* _mi_arena_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); -void _mi_arena_free(void* p, size_t size, mi_memid_t memid); -bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); -bool _mi_arena_contains(const void* p); -void _mi_arenas_collect(bool force_purge); -void _mi_arena_unsafe_destroy_all(void); +void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); +bool _mi_arenas_contain(const void* p); +void _mi_arenas_collect(bool force_purge); +void _mi_arenas_unsafe_destroy_all(void); -mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); -void _mi_arena_page_free(mi_page_t* page); -void _mi_arena_page_abandon(mi_page_t* page); -void _mi_arena_page_unabandon(mi_page_t* page); -bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page); +mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); +void _mi_arenas_page_free(mi_page_t* page); +void _mi_arenas_page_abandon(mi_page_t* page); +void _mi_arenas_page_unabandon(mi_page_t* page); +bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page); // arena-meta.c -void* _mi_meta_zalloc( size_t size, mi_memid_t* memid ); -void _mi_meta_free(void* p, size_t size, mi_memid_t memid); -bool _mi_meta_is_meta_page(void* p); +void* _mi_meta_zalloc( size_t size, mi_memid_t* memid ); +void _mi_meta_free(void* p, size_t size, mi_memid_t memid); +bool _mi_meta_is_meta_page(void* p); // "page-map.c" -bool _mi_page_map_init(void); -void _mi_page_map_register(mi_page_t* page); -void _mi_page_map_unregister(mi_page_t* page); -void _mi_page_map_unregister_range(void* start, size_t size); -mi_page_t* _mi_safe_ptr_page(const void* p); +bool _mi_page_map_init(void); +void _mi_page_map_register(mi_page_t* page); +void _mi_page_map_unregister(mi_page_t* page); +void _mi_page_map_unregister_range(void* start, size_t size); +mi_page_t* _mi_safe_ptr_page(const void* p); // "page.c" -void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; +void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept mi_attr_malloc; -void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks -void _mi_page_unfull(mi_page_t* page); -void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq); // free the page -void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... 
-void _mi_page_force_abandon(mi_page_t* page); -void _mi_heap_collect_retired(mi_heap_t* heap, bool force); +void _mi_page_retire(mi_page_t* page) mi_attr_noexcept; // free the page if there are no other pages with many free blocks +void _mi_page_unfull(mi_page_t* page); +void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq); // free the page +void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq); // abandon the page, to be picked up by another thread... +void _mi_heap_collect_retired(mi_heap_t* heap, bool force); -size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); -void _mi_deferred_free(mi_heap_t* heap, bool force); +size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); +void _mi_deferred_free(mi_heap_t* heap, bool force); -void _mi_page_free_collect(mi_page_t* page,bool force); -void _mi_page_init(mi_heap_t* heap, mi_page_t* page); +void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_init(mi_heap_t* heap, mi_page_t* page); -size_t _mi_bin_size(uint8_t bin); // for stats -uint8_t _mi_bin(size_t size); // for stats +size_t _mi_bin_size(uint8_t bin); // for stats +uint8_t _mi_bin(size_t size); // for stats // "heap.c" -mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld); -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld); -void _mi_heap_destroy_pages(mi_heap_t* heap); -void _mi_heap_collect_abandon(mi_heap_t* heap); -void _mi_heap_set_default_direct(mi_heap_t* heap); -bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); -void _mi_heap_unsafe_destroy_all(void); -mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); -void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); -bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); -void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); +mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena_id, mi_tld_t* tld); +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t tag, mi_tld_t* tld); +void _mi_heap_destroy_pages(mi_heap_t* heap); +void _mi_heap_collect_abandon(mi_heap_t* heap); +void _mi_heap_set_default_direct(mi_heap_t* heap); +bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid); +void _mi_heap_unsafe_destroy_all(void); +mi_heap_t* _mi_heap_by_tag(mi_heap_t* heap, uint8_t tag); +void _mi_heap_area_init(mi_heap_area_t* area, mi_page_t* page); +bool _mi_heap_area_visit_blocks(const mi_heap_area_t* area, mi_page_t* page, mi_block_visit_fun* visitor, void* arg); +void _mi_heap_page_reclaim(mi_heap_t* heap, mi_page_t* page); // "stats.c" -void _mi_stats_done(mi_stats_t* stats); -void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from); -mi_msecs_t _mi_clock_now(void); -mi_msecs_t _mi_clock_end(mi_msecs_t start); -mi_msecs_t _mi_clock_start(void); +void _mi_stats_done(mi_stats_t* stats); +void _mi_stats_merge_from(mi_stats_t* to, mi_stats_t* from); +mi_msecs_t _mi_clock_now(void); +mi_msecs_t _mi_clock_end(mi_msecs_t start); +mi_msecs_t _mi_clock_start(void); // "alloc.c" -void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` -void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` -void* 
_mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` -void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; -void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` -void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; -mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); -void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); +void* _mi_page_malloc_zero(mi_heap_t* heap, mi_page_t* page, size_t size, bool zero) mi_attr_noexcept; // called from `_mi_malloc_generic` +void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_page_malloc_zeroed(mi_heap_t* heap, mi_page_t* page, size_t size) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_heap_malloc_zero(mi_heap_t* heap, size_t size, bool zero) mi_attr_noexcept; +void* _mi_heap_malloc_zero_ex(mi_heap_t* heap, size_t size, bool zero, size_t huge_alignment) mi_attr_noexcept; // called from `_mi_heap_malloc_aligned` +void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) mi_attr_noexcept; +mi_block_t* _mi_page_ptr_unalign(const mi_page_t* page, const void* p); +void _mi_padding_shrink(const mi_page_t* page, const mi_block_t* block, const size_t min_size); #if MI_DEBUG>1 -bool _mi_page_is_valid(mi_page_t* page); +bool _mi_page_is_valid(mi_page_t* page); #endif @@ -718,8 +715,8 @@ static inline bool _mi_page_unown(mi_page_t* page) { while mi_unlikely(mi_tf_block(tf_old) != NULL) { _mi_page_free_collect(page, false); // update used if (mi_page_all_free(page)) { // it may become free just before unowning it - _mi_arena_page_unabandon(page); - _mi_arena_page_free(page); + _mi_arenas_page_unabandon(page); + _mi_arenas_page_free(page); return true; } tf_old = mi_atomic_load_relaxed(&page->xthread_free); diff --git a/src/arena-meta.c b/src/arena-meta.c index a916706b..34be6e0e 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -72,9 +72,9 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { // allocate a fresh arena slice // note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. mi_memid_t memid; - uint8_t* base = (uint8_t*)_mi_arena_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0, - true /* commit*/, (MI_SECURE==0) /* allow large? */, - NULL /* req arena */, 0 /* thread_seq */, &memid); + uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0, + true /* commit*/, (MI_SECURE==0) /* allow large? 
*/, + NULL /* req arena */, 0 /* thread_seq */, &memid); if (base == NULL) return NULL; mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN)); if (!memid.initially_zero) { @@ -165,7 +165,7 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { mi_bitmap_setN(&mpage->blocks_free, block_idx, block_count,NULL); } else { - _mi_arena_free(p,size,memid); + _mi_arenas_free(p,size,memid); } } diff --git a/src/arena.c b/src/arena.c index b9fbef05..7a016165 100644 --- a/src/arena.c +++ b/src/arena.c @@ -467,7 +467,7 @@ static void* mi_arena_os_alloc_aligned( // Allocate large sized memory -void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, +void* _mi_arenas_alloc_aligned( mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) @@ -493,9 +493,9 @@ void* _mi_arena_alloc_aligned( mi_subproc_t* subproc, return p; } -void* _mi_arena_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) +void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid) { - return _mi_arena_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid); + return _mi_arenas_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid); } @@ -521,7 +521,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, // note: this normally never happens unless heaptags are actually used. // (an unown might free the page, and depending on that we can keep it in the abandoned map or not) // note: a minor wrinkle: the page will still be mapped but the abandoned map entry is (temporarily) clear at this point. - // so we cannot check in `mi_arena_free` for this invariant to hold. + // so we cannot check in `mi_arenas_free` for this invariant to hold. const bool freed = _mi_page_unown(page); *keep_abandoned = !freed; return false; @@ -531,7 +531,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, mi_arena_t* arena, return true; } -static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t slice_count, size_t block_size, mi_arena_t* req_arena, mi_heaptag_t heaptag, size_t tseq) +static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_t slice_count, size_t block_size, mi_arena_t* req_arena, mi_heaptag_t heaptag, size_t tseq) { MI_UNUSED(slice_count); const size_t bin = _mi_bin(block_size); @@ -584,7 +584,7 @@ static mi_page_t* mi_arena_page_try_find_abandoned(mi_subproc_t* subproc, size_t #endif // Allocate a fresh page -static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, +static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, mi_arena_t* req_arena, size_t tseq) { const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page @@ -697,18 +697,18 @@ static mi_page_t* mi_arena_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_ } // Allocate a regular small/medium/large page. 
-static mi_page_t* mi_arena_page_regular_alloc(mi_heap_t* heap, size_t slice_count, size_t block_size) { +static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_count, size_t block_size) { mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; // 1. look for an abandoned page - mi_page_t* page = mi_arena_page_try_find_abandoned(tld->subproc, slice_count, block_size, req_arena, heap->tag, tld->thread_seq); + mi_page_t* page = mi_arenas_page_try_find_abandoned(tld->subproc, slice_count, block_size, req_arena, heap->tag, tld->thread_seq); if (page != NULL) { return page; // return as abandoned } // 2. find a free block, potentially allocating a new arena - page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq); + page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -719,7 +719,7 @@ static mi_page_t* mi_arena_page_regular_alloc(mi_heap_t* heap, size_t slice_coun } // Allocate a page containing one block (very large, or with large alignment) -static mi_page_t* mi_arena_page_singleton_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { +static mi_page_t* mi_arenas_page_singleton_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { mi_arena_t* req_arena = heap->exclusive_arena; mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); @@ -730,7 +730,7 @@ static mi_page_t* mi_arena_page_singleton_alloc(mi_heap_t* heap, size_t block_si const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, MI_ARENA_GUARD_PAGE_SIZE) + MI_ARENA_GUARD_PAGE_SIZE); #endif - mi_page_t* page = mi_arena_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); + mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); if (page == NULL) return NULL; mi_assert(page->reserved == 1); @@ -740,23 +740,23 @@ static mi_page_t* mi_arena_page_singleton_alloc(mi_heap_t* heap, size_t block_si } -mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { +mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) { mi_page_t* page; if mi_unlikely(block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) { mi_assert_internal(_mi_is_power_of_two(block_alignment)); - page = mi_arena_page_singleton_alloc(heap, block_size, block_alignment); + page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) { - page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); + page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size); } else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { - page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); + page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - page = mi_arena_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } 
else { - page = mi_arena_page_singleton_alloc(heap, block_size, block_alignment); + page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); @@ -767,7 +767,7 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block return page; } -void _mi_arena_page_free(mi_page_t* page) { +void _mi_arenas_page_free(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -804,14 +804,14 @@ void _mi_arena_page_free(mi_page_t* page) { if (page->memid.memkind == MI_MEM_ARENA) { mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); } - _mi_arena_free(page, mi_memid_size(page->memid), page->memid); + _mi_arenas_free(page, mi_memid_size(page->memid), page->memid); } /* ----------------------------------------------------------- Arena abandon ----------------------------------------------------------- */ -void _mi_arena_page_abandon(mi_page_t* page) { +void _mi_arenas_page_abandon(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -855,7 +855,7 @@ void _mi_arena_page_abandon(mi_page_t* page) { _mi_page_unown(page); } -bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { +bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -871,13 +871,13 @@ bool _mi_arena_page_try_reabandon_to_mapped(mi_page_t* page) { mi_subproc_t* subproc = _mi_subproc(); mi_subproc_stat_counter_increase( subproc, pages_reabandon_full, 1); mi_subproc_stat_adjust_decrease( subproc, pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh - _mi_arena_page_abandon(page); + _mi_arenas_page_abandon(page); return true; } } // called from `mi_free` if trying to unabandon an abandoned page -void _mi_arena_page_unabandon(mi_page_t* page) { +void _mi_arenas_page_unabandon(mi_page_t* page) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -917,12 +917,6 @@ void _mi_arena_page_unabandon(mi_page_t* page) { } } -void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { - MI_UNUSED(heap); - // TODO: implement this - return; -} - /* ----------------------------------------------------------- Arena free @@ -930,7 +924,7 @@ void _mi_arena_reclaim_all_abandoned(mi_heap_t* heap) { static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); static void mi_arenas_try_purge(bool force, bool visit_all); -void _mi_arena_free(void* p, size_t size, mi_memid_t memid) { +void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) { if (p==NULL) return; if (size==0) return; @@ -1001,7 +995,7 @@ bool mi_arena_contains(mi_arena_id_t arena_id, const void* p) { } // Is a pointer inside any of our arenas? 
-bool _mi_arena_contains(const void* p) { +bool _mi_arenas_contain(const void* p) { mi_subproc_t* subproc = _mi_subproc(); const size_t max_arena = mi_arenas_get_count(subproc); for (size_t i = 0; i < max_arena; i++) { @@ -1043,7 +1037,7 @@ static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arena_unsafe_destroy_all(void) { +void _mi_arenas_unsafe_destroy_all(void) { mi_arenas_unsafe_destroy(_mi_subproc()); _mi_arenas_collect(true /* force purge */); // purge non-owned arenas } diff --git a/src/free.c b/src/free.c index d08123a2..4d72cc7a 100644 --- a/src/free.c +++ b/src/free.c @@ -210,9 +210,9 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) - _mi_arena_page_unabandon(page); + _mi_arenas_page_unabandon(page); // we can free the page directly - _mi_arena_page_free(page); + _mi_arenas_page_free(page); return; } @@ -240,7 +240,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { { if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arena_page_unabandon(page); + _mi_arenas_page_unabandon(page); _mi_heap_page_reclaim(tagheap, page); mi_heap_stat_counter_increase(tagheap, pages_reclaim_on_free, 1); return; @@ -252,7 +252,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations if (!mi_page_is_used_at_frac(page,8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && - _mi_arena_page_try_reabandon_to_mapped(page)) + _mi_arenas_page_try_reabandon_to_mapped(page)) { return; } diff --git a/src/heap.c b/src/heap.c index a1b06c6b..25ddf9b7 100644 --- a/src/heap.c +++ b/src/heap.c @@ -211,7 +211,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena else { // heaps associated wita a specific arena are allocated in that arena // note: takes up at least one slice which is quite wasteful... 
- heap = (mi_heap_t*)_mi_arena_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); + heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid); } if (heap==NULL) { _mi_error_message(ENOMEM, "unable to allocate heap meta-data\n"); @@ -341,7 +341,7 @@ static bool _mi_heap_page_destroy(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ page->next = NULL; page->prev = NULL; mi_page_set_heap(page, NULL); - _mi_arena_page_free(page); + _mi_arenas_page_free(page); return true; // keep going } diff --git a/src/init.c b/src/init.c index 5f3fb797..8233f8a3 100644 --- a/src/init.c +++ b/src/init.c @@ -713,7 +713,7 @@ void mi_cdecl _mi_process_done(void) { if (mi_option_is_enabled(mi_option_destroy_on_exit)) { mi_collect(true /* force */); _mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) - _mi_arena_unsafe_destroy_all(); + _mi_arenas_unsafe_destroy_all(); } if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { diff --git a/src/page.c b/src/page.c index 6030161a..7c8429a9 100644 --- a/src/page.c +++ b/src/page.c @@ -252,7 +252,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { else { mi_page_queue_remove(pq, page); mi_page_set_heap(page, NULL); - _mi_arena_page_abandon(page); + _mi_arenas_page_abandon(page); } } @@ -264,7 +264,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_assert_internal(mi_heap_contains_queue(heap, pq)); mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); #endif - mi_page_t* page = _mi_arena_page_alloc(heap, block_size, page_alignment); + mi_page_t* page = _mi_arenas_page_alloc(heap, block_size, page_alignment); if (page == NULL) { // out-of-memory return NULL; @@ -357,7 +357,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) { // and free it mi_page_set_heap(page,NULL); - _mi_arena_page_free(page); + _mi_arenas_page_free(page); } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE From e61ab67185f1c89c71dec7f7e4508bc8ed9c7f82 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 18:31:33 -0800 Subject: [PATCH 135/264] cleanup --- ide/vs2022/mimalloc.vcxproj | 2 +- src/arena.c | 46 ++++++++++++++++++++++--------------- 2 files changed, 29 insertions(+), 19 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index dc112272..87e866bb 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=1;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/src/arena.c b/src/arena.c index 7a016165..b5c17d95 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,6 +42,7 @@ typedef struct mi_arena_s { int numa_node; // associated NUMA node bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) + long purge_delay; // from the options, but allows setting per arena _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. mi_bitmap_t* slices_free; // is the slice free? 
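The `purge_delay` field added above caches the option-derived purge delay per arena. A small sketch of that computation and its meaning, matching the initialization later in this patch; the concrete numbers are assumed option defaults and are not taken from the diff:

    // same expression as used to initialize arena->purge_delay below
    long delay = mi_option_get(mi_option_purge_delay)        // e.g. 10 (milliseconds)
               * mi_option_get(mi_option_arena_purge_mult);  // e.g. x10  => 100 ms
    // delay < 0  : purging is disabled
    // delay == 0 : purge immediately
    // delay > 0  : delay purging by `delay` milliseconds (tracked via `purge_expire`)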
@@ -793,6 +794,7 @@ void _mi_arenas_page_free(mi_page_t* page) { #endif // recommit guard page at the end? + // we must do this since we may later allocate large spans over this page and cannot have a guard page in between #if MI_SECURE >= 2 if (!page->memid.is_pinned) { _mi_os_commit((uint8_t*)page + mi_memid_size(page->memid) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE, NULL); @@ -1047,7 +1049,7 @@ void _mi_arenas_unsafe_destroy_all(void) { Add an arena. ----------------------------------------------------------- */ -static bool mi_arena_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id) { +static bool mi_arenas_add(mi_subproc_t* subproc, mi_arena_t* arena, mi_arena_id_t* arena_id) { mi_assert_internal(arena != NULL); mi_assert_internal(arena->slice_count > 0); if (arena_id != NULL) { *arena_id = NULL; } @@ -1089,7 +1091,7 @@ static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_bas const size_t size = base_size + bitmaps_size; const size_t os_page_size = _mi_os_page_size(); - const size_t info_size = _mi_align_up(size, os_page_size) + os_page_size; // + guard page + const size_t info_size = _mi_align_up(size, os_page_size) + MI_ARENA_GUARD_PAGE_SIZE; const size_t info_slices = mi_slice_count_of_size(info_size); if (bitmap_base != NULL) *bitmap_base = base_size; @@ -1132,18 +1134,19 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_arena_t* arena = (mi_arena_t*)start; - // commit & zero if needed - const size_t os_page_size = _mi_os_page_size(); + // commit & zero if needed if (!memid.initially_committed) { - // security: always leave a guard OS page decommitted at the end (already part of info_slices) - _mi_os_commit(arena, mi_size_of_slices(info_slices) - os_page_size, NULL); + // if MI_SECURE, leave a guard OS page decommitted at the end + _mi_os_commit(arena, mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE, NULL); } else if (!memid.is_pinned) { - // security: decommit a guard OS page at the end of the arena info - _mi_os_decommit((uint8_t*)arena + mi_size_of_slices(info_slices) - os_page_size, os_page_size); + #if MI_SECURE > 0 + // if MI_SECURE, decommit a guard OS page at the end of the arena info + _mi_os_decommit((uint8_t*)arena + mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE); + #endif } if (!memid.initially_zero) { - _mi_memzero(arena, mi_size_of_slices(info_slices) - os_page_size); + _mi_memzero(arena, mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE); } // init @@ -1155,6 +1158,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s arena->numa_node = numa_node; // TODO: or get the current numa node if -1? 
(now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; + arena->purge_delay = mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult); // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps @@ -1184,7 +1188,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_bitmap_setN(arena->slices_dirty, 0, info_slices, NULL); } - return mi_arena_add(subproc, arena, arena_id); + return mi_arenas_add(subproc, arena, arena_id); } @@ -1427,9 +1431,14 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv Arena purge ----------------------------------------------------------- */ -static long mi_arena_purge_delay(void) { +static long mi_arena_purge_delay(mi_arena_t* arena) { // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); + if (arena==NULL) { + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); + } + else { + return arena->purge_delay; + } } // reset or decommit in an arena and update the commit bitmap @@ -1459,8 +1468,8 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c // Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. // Note: assumes we (still) own the area as we may purge immediately static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { - const long delay = mi_arena_purge_delay(); - if (delay < 0 || _mi_preloading()) return; // is purging allowed at all? + const long delay = mi_arena_purge_delay(arena); + if (arena->memid.is_pinned || delay < 0 || _mi_preloading()) return; // is purging allowed at all? mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); if (delay == 0) { @@ -1542,7 +1551,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) // this also clears those ranges atomically (so any newly freed blocks will get purged next // time around) - mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; + mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(arena), true /*all?*/, false /*any?*/}; _mi_bitmap_forall_setc_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); return vinfo.any_purged; @@ -1551,7 +1560,8 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) static void mi_arenas_try_purge(bool force, bool visit_all) { - if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + const long delay = mi_arena_purge_delay(NULL); + if (_mi_preloading() || delay <= 0) return; // nothing will be scheduled // check if any arena needs purging? mi_tld_t* tld = _mi_tld(); @@ -1568,7 +1578,7 @@ static void mi_arenas_try_purge(bool force, bool visit_all) mi_atomic_guard(&purge_guard) { // increase global expire: at most one purge per delay cycle - mi_atomic_store_release(&subproc->purge_expire, now + mi_arena_purge_delay()); + mi_atomic_store_release(&subproc->purge_expire, now + delay); const size_t arena_start = tld->thread_seq % max_arena; size_t max_purge_count = (visit_all ? 
max_arena : 2); bool all_visited = true; @@ -1688,7 +1698,7 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* are arena->is_exclusive = true; arena->subproc = _mi_subproc(); - if (!mi_arena_add(arena->subproc, arena, arena_id)) { + if (!mi_arenas_add(arena->subproc, arena, arena_id)) { return false; } mi_arena_pages_reregister(arena); From f605cb73e524cbfcab36c86cc351a6310640a3fb Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 18:33:44 -0800 Subject: [PATCH 136/264] old purge delay --- src/arena.c | 17 +++++------------ 1 file changed, 5 insertions(+), 12 deletions(-) diff --git a/src/arena.c b/src/arena.c index b5c17d95..4926e667 100644 --- a/src/arena.c +++ b/src/arena.c @@ -42,7 +42,6 @@ typedef struct mi_arena_s { int numa_node; // associated NUMA node bool is_exclusive; // only allow allocations if specifically for this arena bool is_large; // memory area consists of large- or huge OS pages (always committed) - long purge_delay; // from the options, but allows setting per arena _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. mi_bitmap_t* slices_free; // is the slice free? @@ -1158,7 +1157,6 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->is_large = is_large; arena->purge_expire = 0; - arena->purge_delay = mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult); // mi_lock_init(&arena->abandoned_visit_lock); // init bitmaps @@ -1431,14 +1429,9 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv Arena purge ----------------------------------------------------------- */ -static long mi_arena_purge_delay(mi_arena_t* arena) { +static long mi_arena_purge_delay(void) { // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - if (arena==NULL) { - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); - } - else { - return arena->purge_delay; - } + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); } // reset or decommit in an arena and update the commit bitmap @@ -1468,7 +1461,7 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c // Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. // Note: assumes we (still) own the area as we may purge immediately static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { - const long delay = mi_arena_purge_delay(arena); + const long delay = mi_arena_purge_delay(); if (arena->memid.is_pinned || delay < 0 || _mi_preloading()) return; // is purging allowed at all? 
mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); @@ -1551,7 +1544,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) // this also clears those ranges atomically (so any newly freed blocks will get purged next // time around) - mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(arena), true /*all?*/, false /*any?*/}; + mi_purge_visit_info_t vinfo = { now, mi_arena_purge_delay(), true /*all?*/, false /*any?*/}; _mi_bitmap_forall_setc_ranges(arena->slices_purge, &mi_arena_try_purge_visitor, arena, &vinfo); return vinfo.any_purged; @@ -1560,7 +1553,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) static void mi_arenas_try_purge(bool force, bool visit_all) { - const long delay = mi_arena_purge_delay(NULL); + const long delay = mi_arena_purge_delay(); if (_mi_preloading() || delay <= 0) return; // nothing will be scheduled // check if any arena needs purging? From 8d2b7b0383a6ed10b02881531b3e7e25f6c68a38 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 18:34:39 -0800 Subject: [PATCH 137/264] merge from dev3 --- src/arena.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/src/arena.c b/src/arena.c index ca22c47c..0f6388a9 100644 --- a/src/arena.c +++ b/src/arena.c @@ -176,10 +176,6 @@ static size_t mi_memid_size(mi_memid_t memid) { /* ----------------------------------------------------------- Arena Allocation ----------------------------------------------------------- */ -static long mi_arena_purge_delay(void) { - // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); -} static mi_decl_noinline void* mi_arena_try_alloc_at( mi_arena_t* arena, size_t slice_count, bool commit, size_t tseq, mi_memid_t* memid) From dd1b37c9f8dc0d712b9b32bc88ef40bdb71e46a9 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 21:03:03 -0800 Subject: [PATCH 138/264] fix recursive tls access on macOS <= 14 --- include/mimalloc/internal.h | 5 ++--- src/arena.c | 15 +++++++-------- src/heap.c | 14 +++++++------- src/init.c | 24 +++++++++++++++--------- src/page.c | 4 +++- src/stats.c | 14 +++++++------- 6 files changed, 41 insertions(+), 35 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 041e7653..e98a37f5 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -101,7 +101,6 @@ size_t _mi_current_thread_count(void); bool _mi_preloading(void); // true while the C runtime is not initialized yet void _mi_thread_done(mi_heap_t* heap); -mi_tld_t* _mi_tld(void); // current tld: `_mi_tld() == _mi_heap_get_default()->tld` mi_subproc_t* _mi_subproc(void); mi_subproc_t* _mi_subproc_main(void); mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); @@ -148,8 +147,8 @@ void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); bool _mi_arenas_contain(const void* p); -void _mi_arenas_collect(bool force_purge); -void _mi_arenas_unsafe_destroy_all(void); +void _mi_arenas_collect(bool force_purge, mi_tld_t* tld); +void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld); mi_page_t* 
_mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); void _mi_arenas_page_free(mi_page_t* page); diff --git a/src/arena.c b/src/arena.c index 4926e667..88524ea2 100644 --- a/src/arena.c +++ b/src/arena.c @@ -923,7 +923,7 @@ void _mi_arenas_page_unabandon(mi_page_t* page) { Arena free ----------------------------------------------------------- */ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_t slices); -static void mi_arenas_try_purge(bool force, bool visit_all); +static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld); void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) { if (p==NULL) return; @@ -979,12 +979,12 @@ void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) { } // try to purge expired decommits - mi_arenas_try_purge(false, false); + // mi_arenas_try_purge(false, false, NULL); } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */); +void _mi_arenas_collect(bool force_purge, mi_tld_t* tld) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, tld); } @@ -1038,9 +1038,9 @@ static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { // destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` // for dynamic libraries that are unloaded and need to release all their allocated memory. -void _mi_arenas_unsafe_destroy_all(void) { +void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld) { mi_arenas_unsafe_destroy(_mi_subproc()); - _mi_arenas_collect(true /* force purge */); // purge non-owned arenas + _mi_arenas_collect(true /* force purge */, tld); // purge non-owned arenas } @@ -1551,13 +1551,12 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) } -static void mi_arenas_try_purge(bool force, bool visit_all) +static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) { const long delay = mi_arena_purge_delay(); if (_mi_preloading() || delay <= 0) return; // nothing will be scheduled // check if any arena needs purging? - mi_tld_t* tld = _mi_tld(); mi_subproc_t* subproc = tld->subproc; const mi_msecs_t now = _mi_clock_now(); mi_msecs_t arenas_expire = mi_atomic_load_acquire(&subproc->purge_expire); diff --git a/src/heap.c b/src/heap.c index 25ddf9b7..6632861b 100644 --- a/src/heap.c +++ b/src/heap.c @@ -120,7 +120,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); // collect arenas (this is program wide so don't force purges on abandonment of threads) - _mi_arenas_collect(collect == MI_FORCE /* force purge? */); + _mi_arenas_collect(collect == MI_FORCE /* force purge? 
*/, heap->tld); } void _mi_heap_collect_abandon(mi_heap_t* heap) { @@ -204,7 +204,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena mi_assert(heap_tag >= 0 && heap_tag < 256); // allocate and initialize a heap mi_memid_t memid; - mi_heap_t* heap; + mi_heap_t* heap; if (arena_id == _mi_arena_id_none()) { heap = (mi_heap_t*)_mi_meta_zalloc(sizeof(mi_heap_t), &memid); } @@ -444,7 +444,7 @@ void mi_heap_delete(mi_heap_t* heap) // abandon all pages _mi_heap_collect_abandon(heap); - + mi_assert_internal(heap->page_count==0); mi_heap_free(heap,true); } @@ -471,7 +471,7 @@ void mi_heap_unload(mi_heap_t* heap) { _mi_warning_message("cannot unload heaps that are not associated with an exclusive arena\n"); return; } - + // abandon all pages so all thread'id in the pages are cleared _mi_heap_collect_abandon(heap); mi_assert_internal(heap->page_count==0); @@ -485,7 +485,7 @@ void mi_heap_unload(mi_heap_t* heap) { } bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) { - mi_assert(mi_heap_is_initialized(heap)); + mi_assert(mi_heap_is_initialized(heap)); if (heap==NULL || !mi_heap_is_initialized(heap)) return false; if (heap->exclusive_arena == NULL) { _mi_warning_message("cannot reload heaps that were not associated with an exclusive arena\n"); @@ -503,8 +503,8 @@ bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena_id) { mi_assert_internal(heap->page_count==0); - // re-associate from the current thread-local and static state - heap->tld = _mi_tld(); + // re-associate with the current thread-local and static state + heap->tld = mi_heap_get_default()->tld; // reinit direct pages (as we may be in a different process) mi_assert_internal(heap->page_count == 0); diff --git a/src/init.c b/src/init.c index 8233f8a3..5240611c 100644 --- a/src/init.c +++ b/src/init.c @@ -309,17 +309,21 @@ static mi_tld_t* mi_tld_alloc(void) { #define MI_TLD_INVALID ((mi_tld_t*)1) -mi_decl_noinline static void mi_tld_free(void) { - mi_tld_t* tld = _mi_tld(); +mi_decl_noinline static void mi_tld_free(mi_tld_t* tld) { if (tld != NULL && tld != MI_TLD_INVALID) { _mi_stats_done(&tld->stats); _mi_meta_free(tld, sizeof(mi_tld_t), tld->memid); } - tld = MI_TLD_INVALID; + #if 0 + // do not read/write to `thread_tld` on older macOS <= 14 as that will re-initialize the thread local storage + // (since we are calling this during pthread shutdown) + // (and this could happen on other systems as well, so let's never do it) + thread_tld = MI_TLD_INVALID; + #endif mi_atomic_decrement_relaxed(&thread_count); } -mi_decl_noinline mi_tld_t* _mi_tld(void) { +static mi_tld_t* mi_tld(void) { mi_tld_t* tld = thread_tld; if (tld == MI_TLD_INVALID) { _mi_error_message(EFAULT, "internal error: tld is accessed after the thread terminated\n"); @@ -337,11 +341,11 @@ mi_subproc_t* _mi_subproc(void) { // on such systems we can check for this with the _mi_prim_get_default_heap as those are protected (by being // stored in a TLS slot for example) mi_heap_t* heap = mi_prim_get_default_heap(); - if (heap == NULL || heap == &_mi_heap_empty) { + if (heap == NULL) { return _mi_subproc_main(); } else { - return thread_tld->subproc; // don't call `_mi_tld()` + return heap->tld->subproc; // avoid using thread local storage (`thread_tld`) } } @@ -395,7 +399,7 @@ void mi_subproc_delete(mi_subproc_id_t subproc_id) { } void mi_subproc_add_current_thread(mi_subproc_id_t subproc_id) { - mi_tld_t* tld = _mi_tld(); + mi_tld_t* tld = mi_tld(); if (tld == NULL) return; mi_assert(tld->subproc == &subproc_main); if (tld->subproc != 
&subproc_main) return; @@ -553,10 +557,12 @@ void _mi_thread_done(mi_heap_t* heap) if (heap->tld->thread_id != _mi_prim_thread_id()) return; // abandon the thread local heap + // note: we store the tld as we should avoid reading `thread_tld` at this point (to avoid reinitializing the thread local storage) + mi_tld_t* tld = heap->tld; _mi_thread_heap_done(heap); // returns true if already ran // free thread local data - mi_tld_free(); + mi_tld_free(tld); } void _mi_heap_set_default_direct(mi_heap_t* heap) { @@ -713,7 +719,7 @@ void mi_cdecl _mi_process_done(void) { if (mi_option_is_enabled(mi_option_destroy_on_exit)) { mi_collect(true /* force */); _mi_heap_unsafe_destroy_all(); // forcefully release all memory held by all heaps (of this thread only!) - _mi_arenas_unsafe_destroy_all(); + _mi_arenas_unsafe_destroy_all(&tld_main); } if (mi_option_is_enabled(mi_option_show_stats) || mi_option_is_enabled(mi_option_verbose)) { diff --git a/src/page.c b/src/page.c index 7c8429a9..239d5d6e 100644 --- a/src/page.c +++ b/src/page.c @@ -252,7 +252,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { else { mi_page_queue_remove(pq, page); mi_page_set_heap(page, NULL); - _mi_arenas_page_abandon(page); + _mi_arenas_page_abandon(page); } } @@ -356,8 +356,10 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) { mi_page_queue_remove(pq, page); // and free it + mi_heap_t* heap = page->heap; mi_page_set_heap(page,NULL); _mi_arenas_page_free(page); + _mi_arenas_collect(false, heap->tld); // allow purging } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE diff --git a/src/stats.c b/src/stats.c index 102373ec..057dc093 100644 --- a/src/stats.c +++ b/src/stats.c @@ -47,11 +47,11 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) { // Adjust stats to compensate; for example before committing a range, -// first adjust downwards with parts that were already committed so +// first adjust downwards with parts that were already committed so // we avoid double counting. static void mi_stat_adjust_mt(mi_stat_count_t* stat, int64_t amount, bool on_alloc) { if (amount == 0) return; - // adjust atomically + // adjust atomically mi_atomic_addi64_relaxed(&stat->current, amount); mi_atomic_addi64_relaxed((on_alloc ? &stat->allocated : &stat->freed), amount); } @@ -74,7 +74,7 @@ void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount) { void __mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) { stat->count++; - stat->total += amount; + stat->total += amount; } void __mi_stat_increase_mt(mi_stat_count_t* stat, size_t amount) { @@ -150,7 +150,7 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_counter_add(&stats->page_no_retire, &src->page_no_retire, 1); mi_stat_counter_add(&stats->searches, &src->searches, 1); mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); - mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); + mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { @@ -347,7 +347,7 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) #endif #if MI_STAT mi_stat_print(&stats->normal, "normal", (stats->normal_count.count == 0 ? 1 : -(stats->normal.allocated / stats->normal_count.count)), out, arg); - mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 
1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); + mi_stat_print(&stats->huge, "huge", (stats->huge_count.count == 0 ? 1 : -(stats->huge.allocated / stats->huge_count.count)), out, arg); mi_stat_count_t total = { 0,0,0,0 }; mi_stat_add(&total, &stats->normal, 1); mi_stat_add(&total, &stats->huge, 1); @@ -408,7 +408,7 @@ static mi_msecs_t mi_process_start; // = 0 // return thread local stats static mi_stats_t* mi_get_tld_stats(void) { - return &_mi_tld()->stats; + return &mi_heap_get_default()->tld->stats; } void mi_stats_reset(void) mi_attr_noexcept { @@ -492,7 +492,7 @@ mi_decl_export void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, s pinfo.page_faults = 0; _mi_prim_process_info(&pinfo); - + if (elapsed_msecs!=NULL) *elapsed_msecs = (pinfo.elapsed < 0 ? 0 : (pinfo.elapsed < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.elapsed : PTRDIFF_MAX)); if (user_msecs!=NULL) *user_msecs = (pinfo.utime < 0 ? 0 : (pinfo.utime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.utime : PTRDIFF_MAX)); if (system_msecs!=NULL) *system_msecs = (pinfo.stime < 0 ? 0 : (pinfo.stime < (mi_msecs_t)PTRDIFF_MAX ? (size_t)pinfo.stime : PTRDIFF_MAX)); From 04970f43e5d45fe18e868a020db58aabe2180f3c Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 21:55:40 -0800 Subject: [PATCH 139/264] document way to use a TLS slot on windows --- include/mimalloc/prim.h | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 99791585..2d681062 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -207,6 +207,20 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce #endif } +#elif 0 && _MSC_VER && _WIN32 +// On Windows, using a fixed TLS slot has better codegen than a thread-local +// but it might clash with an application trying to use the same slot. (so we disable this by default) +#include + +#define MI_HAS_TLS_SLOT +#define MI_TLS_SLOT 63 // last available slot + +static inline void* mi_prim_tls_slot(size_t slot) mi_attr_noexcept { + return NtCurrentTeb()->TlsSlots[slot]; +} +static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexcept { + NtCurrentTeb()->TlsSlots[slot] = value; +} #endif // Do we have __builtin_thread_pointer? 
This would be the preferred way to get a unique thread id From bc5ae316493d58047d22df2dcd4689d7c4a82246 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 22 Dec 2024 22:31:16 -0800 Subject: [PATCH 140/264] add abandoned_visit_blocks --- src/arena.c | 68 ++++++++++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 65 insertions(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 88524ea2..00ff3720 100644 --- a/src/arena.c +++ b/src/arena.c @@ -352,6 +352,7 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena mi_arena_t* name_arena; \ if (req_arena != NULL) { \ name_arena = req_arena; /* if there is a specific req_arena, only search that one */\ + if (_i > 0) break; /* only once */ \ } \ else { \ size_t _idx; \ @@ -369,7 +370,6 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena #define mi_forall_arenas_end() \ } \ - if (req_arena != NULL) break; \ } \ } @@ -1594,10 +1594,71 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) } } +/* ----------------------------------------------------------- + Visit abandoned pages +----------------------------------------------------------- */ + +typedef struct mi_abandoned_page_visit_info_s { + int heap_tag; + mi_block_visit_fun* visitor; + void* arg; + bool visit_blocks; +} mi_abandoned_page_visit_info_t; + +static bool abandoned_page_visit(mi_page_t* page, mi_abandoned_page_visit_info_t* vinfo) { + if (page->heap_tag != vinfo->heap_tag) { return true; } // continue + mi_heap_area_t area; + _mi_heap_area_init(&area, page); + if (!vinfo->visitor(NULL, &area, NULL, area.block_size, vinfo->arg)) { + return false; + } + if (vinfo->visit_blocks) { + return _mi_heap_area_visit_blocks(&area, page, vinfo->visitor, vinfo->arg); + } + else { + return true; + } +} + +static bool abandoned_page_visit_at(size_t slice_index, size_t slice_count, mi_arena_t* arena, void* arg) { + MI_UNUSED(slice_count); + mi_abandoned_page_visit_info_t* vinfo = (mi_abandoned_page_visit_info_t*)arg; + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); + mi_assert_internal(mi_page_is_abandoned_mapped(page)); + return abandoned_page_visit(page, vinfo); +} + +// Visit all abandoned pages in this subproc. bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg) { + mi_abandoned_page_visit_info_t visit_info = { heap_tag, visitor, arg, visit_blocks }; MI_UNUSED(subproc_id); MI_UNUSED(heap_tag); MI_UNUSED(visit_blocks); MI_UNUSED(visitor); MI_UNUSED(arg); - _mi_error_message(EINVAL, "implement mi_abandoned_visit_blocks\n"); - return false; + + // visit abandoned pages in the arenas + // we don't have to claim because we assume we are the only thread running (in this subproc). + // (but we could atomically claim as well by first doing abandoned_reclaim and afterwards reabandoning). + bool ok = true; + mi_subproc_t* subproc = _mi_subproc_from_id(subproc_id); + mi_forall_arenas(subproc, NULL, 0, arena) { + mi_assert_internal(arena->subproc == subproc); + for (size_t bin = 0; ok && bin < MI_BIN_COUNT; bin++) { + // todo: if we had a single abandoned page map as well, this can be faster. 
+ if (mi_atomic_load_relaxed(&subproc->abandoned_count[bin]) > 0) { + ok = _mi_bitmap_forall_set(arena->pages_abandoned[bin], &abandoned_page_visit_at, arena, &visit_info); + } + } + } + mi_forall_arenas_end(); + if (!ok) return false; + + // visit abandoned pages in OS allocated memory + // (technically we don't need the lock as we assume we are the only thread running in this subproc) + mi_lock(&subproc->os_abandoned_pages_lock) { + for (mi_page_t* page = subproc->os_abandoned_pages; ok && page != NULL; page = page->next) { + ok = abandoned_page_visit(page, &visit_info); + } + } + + return ok; } @@ -1697,3 +1758,4 @@ mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* are return true; } + From 657135de36edad2082323426aea3e2fa1a9cf19a Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 09:53:52 -0800 Subject: [PATCH 141/264] commit 2level page-map on over-commit systems --- CMakeLists.txt | 18 +++++++++++------- include/mimalloc/internal.h | 26 ++++++++++++-------------- src/options.c | 2 +- src/page-map.c | 3 ++- 4 files changed, 26 insertions(+), 23 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 07a292e0..c184a0b3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,10 +10,9 @@ option(MI_PADDING "Enable padding to detect heap block overflow (alway option(MI_OVERRIDE "Override the standard malloc interface (i.e. define entry points for 'malloc', 'free', etc)" ON) option(MI_XMALLOC "Enable abort() call on memory allocation failure by default" OFF) option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF) -option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF) -option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF) -option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) +option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF) option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF) + option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON) option(MI_OPT_SIMD "Use SIMD instructions (requires MI_OPT_ARCH to be enabled)" OFF) option(MI_SEE_ASM "Generate assembly files" OFF) @@ -21,14 +20,19 @@ option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" option(MI_OSX_ZONE "Use malloc zone to override standard malloc on macOS" ON) option(MI_WIN_REDIRECT "Use redirection module ('mimalloc-redirect') on Windows if compiling mimalloc as a DLL" ON) option(MI_LOCAL_DYNAMIC_TLS "Use local-dynamic-tls, a slightly slower but dlopen-compatible thread local storage mechanism (Unix)" OFF) -option(MI_LIBC_MUSL "Set this when linking with musl libc" OFF) +option(MI_LIBC_MUSL "Enable this when linking with musl libc" OFF) + +option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF) +option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF) +option(MI_TRACK_VALGRIND "Compile with Valgrind support (adds a small overhead)" OFF) +option(MI_TRACK_ASAN "Compile with address sanitizer support (adds a small overhead)" OFF) +option(MI_TRACK_ETW "Compile with Windows event tracing (ETW) support (adds a small overhead)" OFF) + option(MI_BUILD_SHARED "Build shared library" ON) option(MI_BUILD_STATIC "Build static library" ON) 
option(MI_BUILD_OBJECT "Build object library" ON) option(MI_BUILD_TESTS "Build test executables" ON) -option(MI_DEBUG_TSAN "Build with thread sanitizer (needs clang)" OFF) -option(MI_DEBUG_UBSAN "Build with undefined-behavior sanitizer (needs clang++)" OFF) -option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF) + option(MI_SKIP_COLLECT_ON_EXIT "Skip collecting memory on program exit" OFF) option(MI_NO_PADDING "Force no use of padding even in DEBUG mode etc." OFF) option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead of PREFIX/lib/mimalloc-version" OFF) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e98a37f5..4cb54d6f 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -435,13 +435,14 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si /* ----------------------------------------------------------- - Pages + The page map maps addresses to `mi_page_t` pointers ----------------------------------------------------------- */ #if MI_PAGE_MAP_FLAT -// flat page-map committed on demand +// flat page-map committed on demand, using one byte per slice (64 KiB). // single indirection and low commit, but large initial virtual reserve (4 GiB with 48 bit virtual addresses) +// used by default on <= 40 bit virtual address spaces. extern uint8_t* _mi_page_map; static inline size_t _mi_page_map_index(const void* p) { @@ -468,26 +469,23 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { #else // 2-level page map: -// double indirection but low commit and low virtual reserve. -// -// The page-map is usually 4 MiB and points to sub maps of 64 KiB. -// The page-map is committed on-demand (in 64 KiB) parts (and sub-maps are committed on-demand as well) -// One sub page-map = 64 KiB => covers 2^13 * 2^16 = 2^32 = 512 MiB address space -// The page-map needs 48-16-13 = 19 bits => 2^19 sub map pointers = 4 MiB size. -// (Choosing a MI_PAGE_MAP_SUB_SHIFT of 16 gives slightly better code but will commit the initial sub-map at 512 KiB) - +// double indirection, but low commit and low virtual reserve. +// +// the page-map is usually 4 MiB and points to sub maps of 64 KiB. +// the page-map is committed on-demand (in 64 KiB parts) (and sub-maps are committed on-demand as well) +// one sub page-map = 64 KiB => covers 2^(16-3) * 2^16 = 2^29 = 512 MiB address space +// the page-map needs 48-(16+13) = 19 bits => 2^19 sub map pointers = 4 MiB size. 
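To make the address arithmetic above concrete, here is an illustrative sketch of the resulting two-level lookup (assuming 48-bit virtual addresses and 64 KiB slices; the actual `_mi_page_map_index` and `_mi_unchecked_ptr_page` definitions follow in this hunk):

    static inline mi_page_t* lookup_sketch(const void* p) {
      const size_t u   = (size_t)((uintptr_t)p / MI_ARENA_SLICE_SIZE);  // slice number
      const size_t idx = u / MI_PAGE_MAP_SUB_COUNT;   // top level: 2^19 entries * 8 bytes = 4 MiB
      const size_t sub = u % MI_PAGE_MAP_SUB_COUNT;   // sub map  : 2^13 entries * 8 bytes = 64 KiB
      return _mi_page_map[idx][sub];                  // each sub map covers 2^13 * 64 KiB = 512 MiB
    }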
#define MI_PAGE_MAP_SUB_SHIFT (13) #define MI_PAGE_MAP_SUB_COUNT (MI_ZU(1) << MI_PAGE_MAP_SUB_SHIFT) - #define MI_PAGE_MAP_SHIFT (MI_MAX_VABITS - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT) #define MI_PAGE_MAP_COUNT (MI_ZU(1) << MI_PAGE_MAP_SHIFT) extern mi_page_t*** _mi_page_map; static inline size_t _mi_page_map_index(const void* p, size_t* sub_idx) { - const uintptr_t u = (uintptr_t)p / MI_ARENA_SLICE_SIZE; - if (sub_idx != NULL) { *sub_idx = (uint32_t)u % MI_PAGE_MAP_SUB_COUNT; } - return (size_t)(u / MI_PAGE_MAP_SUB_COUNT); + const size_t u = (size_t)((uintptr_t)p / MI_ARENA_SLICE_SIZE); + if (sub_idx != NULL) { *sub_idx = u % MI_PAGE_MAP_SUB_COUNT; } + return (u / MI_PAGE_MAP_SUB_COUNT); } static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { diff --git a/src/options.c b/src/options.c index fc3a2838..7562cd46 100644 --- a/src/options.c +++ b/src/options.c @@ -103,7 +103,7 @@ typedef struct mi_option_desc_s { #endif #ifndef MI_DEFAULT_PAGEMAP_COMMIT -#if defined(__APPLE__) +#if defined(__APPLE__) // when overloading malloc, we still get mixed pointers sometimes on macOS; this avoids a bad access #define MI_DEFAULT_PAGEMAP_COMMIT 1 #else #define MI_DEFAULT_PAGEMAP_COMMIT 0 diff --git a/src/page-map.c b/src/page-map.c index 37ce3082..db14265b 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -187,7 +187,8 @@ bool _mi_page_map_init(void) { const size_t os_page_size = _mi_os_page_size(); const size_t page_map_size = _mi_align_up( page_map_count * sizeof(mi_page_t**), os_page_size); const size_t reserve_size = page_map_size + os_page_size; - const bool commit = page_map_size <= 64*MI_KiB || mi_option_is_enabled(mi_option_pagemap_commit); // _mi_os_has_overcommit(); // commit on-access on Linux systems? + const bool commit = page_map_size <= 64*MI_KiB || + mi_option_is_enabled(mi_option_pagemap_commit) || _mi_os_has_overcommit(); _mi_page_map = (mi_page_t***)_mi_os_alloc_aligned(reserve_size, 1, commit, true /* allow large */, &mi_page_map_memid); if (_mi_page_map==NULL) { _mi_error_message(ENOMEM, "unable to reserve virtual memory for the page map (%zu KiB)\n", page_map_size / MI_KiB); From 88d8ee964f818b09ccd56c078b90851c78cd9af2 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 15:04:06 -0800 Subject: [PATCH 142/264] remove is_large member (and use is_pinned for this) --- doc/mimalloc-doc.h | 7 +++---- include/mimalloc.h | 4 ++-- include/mimalloc/internal.h | 4 ++-- src/arena.c | 23 ++++++++++------------- 4 files changed, 17 insertions(+), 21 deletions(-) diff --git a/doc/mimalloc-doc.h b/doc/mimalloc-doc.h index e1c14b44..e9da9b90 100644 --- a/doc/mimalloc-doc.h +++ b/doc/mimalloc-doc.h @@ -431,12 +431,11 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large); /// @param start Start of the memory area /// @param size The size of the memory area. /// @param is_committed Is the area already committed? -/// @param is_large Does it consist of large OS pages? Set this to \a true as well for memory -/// that should not be decommitted or protected (like rdma etc.) +/// @param is_pinned Can the memory not be decommitted or reset? (usually the case for large OS pages) /// @param is_zero Does the area consists of zero's? /// @param numa_node Possible associated numa node or `-1`. /// @return \a true if successful, and \a false on error. 
-bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node); +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node); /// Reserve \a pages of huge OS pages (1GiB) evenly divided over \a numa_nodes nodes, /// but stops after at most `timeout_msecs` seconds. @@ -589,7 +588,7 @@ void mi_subproc_add_current_thread(mi_subproc_id_t subproc); /// Allocate \a size bytes aligned by \a alignment. /// @param size number of bytes to allocate. -/// @param alignment the minimal alignment of the allocated memory. +/// @param alignment the minimal alignment of the allocated memory. /// @returns pointer to the allocated memory or \a NULL if out of memory, /// or if the alignment is not a power of 2 (including 0). The \a size is unrestricted /// (and does not have to be an integral multiple of the \a alignment). diff --git a/include/mimalloc.h b/include/mimalloc.h index 8bff8923..508e6aec 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -274,7 +274,7 @@ mi_decl_export int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; -mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept; +mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept; mi_decl_export void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept; @@ -283,7 +283,7 @@ typedef void* mi_arena_id_t; mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size); mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; mi_decl_export int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; -mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; +mi_decl_export bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept; #if MI_MALLOC_VERSION >= 182 // Create a heap that only allocates in the specified arena diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 4cb54d6f..281f531a 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -143,8 +143,8 @@ mi_arena_id_t _mi_arena_id_none(void); mi_arena_t* _mi_arena_from_id(mi_arena_id_t id); bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena); -void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); -void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); +void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, 
mi_memid_t* memid); +void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); bool _mi_arenas_contain(const void* p); void _mi_arenas_collect(bool force_purge, mi_tld_t* tld); diff --git a/src/arena.c b/src/arena.c index 00ff3720..7b97fbbc 100644 --- a/src/arena.c +++ b/src/arena.c @@ -41,7 +41,6 @@ typedef struct mi_arena_s { size_t info_slices; // initial slices reserved for the arena bitmaps int numa_node; // associated NUMA node bool is_exclusive; // only allow allocations if specifically for this arena - bool is_large; // memory area consists of large- or huge OS pages (always committed) _Atomic(mi_msecs_t) purge_expire; // expiration time when slices can be purged from `slices_purge`. mi_bitmap_t* slices_free; // is the slice free? @@ -333,8 +332,8 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ Arena iteration ----------------------------------------------------------- */ -static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_large) { - if (!allow_large && arena->is_large) return false; +static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_pinned) { + if (!allow_pinned && arena->memid.is_pinned) return false; if (!mi_arena_id_is_suitable(arena, req_arena)) return false; if (req_arena == NULL) { // if not specific, check numa affinity const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); @@ -1104,7 +1103,7 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { } -static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); @@ -1154,8 +1153,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s arena->is_exclusive = exclusive; arena->slice_count = slice_count; arena->info_slices = info_slices; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) - arena->is_large = is_large; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? 
(now it allows anyone to allocate on -1) arena->purge_expire = 0; // mi_lock_init(&arena->abandoned_visit_lock); @@ -1190,14 +1188,14 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s } -bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_pinned, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); memid.mem.os.base = start; memid.mem.os.size = size; memid.initially_committed = is_committed; memid.initially_zero = is_zero; - memid.is_pinned = is_large; - return mi_manage_os_memory_ex2(_mi_subproc(), start, size, is_large, numa_node, exclusive, memid, arena_id); + memid.is_pinned = is_pinned; + return mi_manage_os_memory_ex2(_mi_subproc(), start, size, numa_node, exclusive, memid, arena_id); } // Reserve a range of regular OS memory @@ -1207,13 +1205,12 @@ static int mi_reserve_os_memory_ex2(mi_subproc_t* subproc, size_t size, bool com mi_memid_t memid; void* start = _mi_os_alloc_aligned(size, MI_ARENA_SLICE_ALIGN, commit, allow_large, &memid); if (start == NULL) return ENOMEM; - const bool is_large = memid.is_pinned; // todo: use separate is_large field? - if (!mi_manage_os_memory_ex2(subproc, start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(subproc, start, size, -1 /* numa node */, exclusive, memid, arena_id)) { _mi_os_free_ex(start, size, commit, memid); _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); return ENOMEM; } - _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), memid.is_pinned ? 
" (in large os pages)" : ""); // mi_debug_show_arenas(true, true, false); return 0; @@ -1373,7 +1370,7 @@ int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_m } _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); - if (!mi_manage_os_memory_ex2(_mi_subproc(), p, hsize, true, numa_node, exclusive, memid, arena_id)) { + if (!mi_manage_os_memory_ex2(_mi_subproc(), p, hsize, numa_node, exclusive, memid, arena_id)) { _mi_os_free(p, hsize, memid); return ENOMEM; } From b515a0ad4c58f1e264213f22998c628470746bc1 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 16:28:34 -0800 Subject: [PATCH 143/264] add _mi_os_guard_page_size --- include/mimalloc/internal.h | 8 ++++++ include/mimalloc/types.h | 7 +++-- src/arena-meta.c | 26 +++++++----------- src/arena.c | 38 +++++++++++--------------- src/os.c | 54 ++++++++++++++++++++++++++++++++++++- 5 files changed, 91 insertions(+), 42 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 281f531a..7c49d590 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -116,6 +116,7 @@ void _mi_os_free(void* p, size_t size, mi_memid_t memid); void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid); size_t _mi_os_page_size(void); +size_t _mi_os_guard_page_size(void); size_t _mi_os_good_alloc_size(size_t size); bool _mi_os_has_overcommit(void); bool _mi_os_has_virtual_reserve(void); @@ -129,6 +130,13 @@ bool _mi_os_unprotect(void* addr, size_t size); bool _mi_os_purge(void* p, size_t size); bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); +size_t _mi_os_secure_guard_page_size(void); +bool _mi_os_secure_guard_page_set_at(void* addr, bool is_pinned); +bool _mi_os_secure_guard_page_set_before(void* addr, bool is_pinned); +bool _mi_os_secure_guard_page_reset_at(void* addr); +bool _mi_os_secure_guard_page_reset_before(void* addr); + + void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid); void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 84179458..c2ce4a26 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -46,8 +46,12 @@ terms of the MIT license. A copy of the license can be found in the file // Define MI_STAT as 1 to maintain statistics; set it to 2 to have detailed statistics (but costs some performance). // #define MI_STAT 1 -// Define MI_SECURE to enable security mitigations. The lowest two have minimal performance impact: +// Define MI_SECURE to enable security mitigations. Level 1 has minimal performance impact, +// but protects most metadata with guard pages: // #define MI_SECURE 1 // guard page around metadata +// +// Level 2 has more performance impact but protect well against various buffer overflows +// by surrounding all mimalloc pages with guard pages: // #define MI_SECURE 2 // guard page around each mimalloc page (can fragment VMA's with large heaps..) // // The next two levels can have more performance cost: @@ -126,7 +130,6 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) #define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bitmap) - // Maximum number of size classes. 
(spaced exponentially in 12.5% increments) #define MI_BIN_HUGE (73U) #define MI_BIN_FULL (MI_BIN_HUGE+1) diff --git a/src/arena-meta.c b/src/arena-meta.c index 34be6e0e..c8c0cac6 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -25,12 +25,6 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE #define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN -#if MI_SECURE -#define MI_META_PAGE_GUARD_SIZE (4*MI_KiB) -#else -#define MI_META_PAGE_GUARD_SIZE (0) -#endif - #define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit) #define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE #define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 @@ -47,7 +41,7 @@ static mi_decl_cache_align _Atomic(mi_meta_page_t*) mi_meta_pages = MI_ATOMIC_V #if MI_DEBUG > 1 static mi_meta_page_t* mi_meta_page_of_ptr(void* p, size_t* block_idx) { - mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + MI_META_PAGE_GUARD_SIZE); + mi_meta_page_t* mpage = (mi_meta_page_t*)((uint8_t*)mi_align_down_ptr(p,MI_META_PAGE_ALIGN) + _mi_os_secure_guard_page_size()); if (block_idx != NULL) { *block_idx = ((uint8_t*)p - (uint8_t*)mpage) / MI_META_BLOCK_SIZE; } @@ -60,9 +54,9 @@ static mi_meta_page_t* mi_meta_page_next( mi_meta_page_t* mpage ) { } static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) { - mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - MI_META_PAGE_GUARD_SIZE, MI_META_PAGE_ALIGN)); + mi_assert_internal(_mi_is_aligned((uint8_t*)mpage - _mi_os_secure_guard_page_size(), MI_META_PAGE_ALIGN)); mi_assert_internal(block_idx < MI_META_BLOCKS_PER_PAGE); - void* p = ((uint8_t*)mpage - MI_META_PAGE_GUARD_SIZE + (block_idx * MI_META_BLOCK_SIZE)); + void* p = ((uint8_t*)mpage - _mi_os_secure_guard_page_size() + (block_idx * MI_META_BLOCK_SIZE)); mi_assert_internal(mpage == mi_meta_page_of_ptr(p,NULL)); return p; } @@ -82,20 +76,18 @@ static mi_meta_page_t* mi_meta_page_zalloc(void) { } // guard pages - #if MI_SECURE - if (!memid.is_pinned) { - _mi_os_decommit(base, MI_META_PAGE_GUARD_SIZE); - _mi_os_decommit(base + MI_META_PAGE_SIZE - MI_META_PAGE_GUARD_SIZE, MI_META_PAGE_GUARD_SIZE); - } + #if MI_SECURE >= 1 + _mi_os_secure_guard_page_set_at(base, memid.is_pinned); + _mi_os_secure_guard_page_set_before(base + MI_META_PAGE_SIZE, memid.is_pinned); #endif - + // initialize the page and free block bitmap - mi_meta_page_t* mpage = (mi_meta_page_t*)(base + MI_META_PAGE_GUARD_SIZE); + mi_meta_page_t* mpage = (mi_meta_page_t*)(base + _mi_os_secure_guard_page_size()); mpage->memid = memid; mi_bitmap_init(&mpage->blocks_free, MI_META_BLOCKS_PER_PAGE, true /* already_zero */); const size_t mpage_size = offsetof(mi_meta_page_t,blocks_free) + mi_bitmap_size(MI_META_BLOCKS_PER_PAGE, NULL); const size_t info_blocks = _mi_divide_up(mpage_size,MI_META_BLOCK_SIZE); - const size_t guard_blocks = _mi_divide_up(MI_META_PAGE_GUARD_SIZE, MI_META_BLOCK_SIZE); + const size_t guard_blocks = _mi_divide_up(_mi_os_secure_guard_page_size(), MI_META_BLOCK_SIZE); mi_assert_internal(info_blocks + 2*guard_blocks < MI_META_BLOCKS_PER_PAGE); mi_bitmap_unsafe_setN(&mpage->blocks_free, info_blocks + guard_blocks, MI_META_BLOCKS_PER_PAGE - info_blocks - 2*guard_blocks); diff --git a/src/arena.c b/src/arena.c index 7b97fbbc..3349abb1 100644 --- a/src/arena.c +++ b/src/arena.c @@ -576,12 +576,6 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ return NULL; } 
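Aside: the `_mi_os_secure_guard_page_*` helpers introduced in this patch replace the per-file guard-size constants removed above (MI_META_PAGE_GUARD_SIZE) and just below (MI_ARENA_GUARD_PAGE_SIZE). A minimal POSIX-only sketch of the underlying idea follows; mimalloc decommits the guard page through its _mi_os_* layer, and mprotect() is used here only as an assumed stand-in with the same faulting behaviour.

/* Simplified sketch: a guard page is placed at the end of an area so an
   overflow into it faults, and is restored before the span is reused. */
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>
#include <unistd.h>

static size_t guard_size(void) {
  return (size_t)sysconf(_SC_PAGESIZE);   // one OS page acts as the guard
}

// place a guard page just before `end` (the end of an allocated area);
// pinned memory (e.g. large OS pages) cannot be protected this way
static bool guard_set_before(uint8_t* end, bool is_pinned) {
  if (is_pinned) return false;
  return mprotect(end - guard_size(), guard_size(), PROT_NONE) == 0;
}

// remove the guard again so the whole area can later be reused as one span
static bool guard_reset_before(uint8_t* end) {
  return mprotect(end - guard_size(), guard_size(), PROT_READ | PROT_WRITE) == 0;
}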
-#if MI_SECURE < 2 -#define MI_ARENA_GUARD_PAGE_SIZE (0) -#else -#define MI_ARENA_GUARD_PAGE_SIZE (4*MI_KiB) -#endif - // Allocate a fresh page static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, mi_arena_t* req_arena, size_t tseq) @@ -621,11 +615,14 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment)); - // guard page at the end - const size_t page_noguard_size = mi_size_of_slices(slice_count) - MI_ARENA_GUARD_PAGE_SIZE; - #if MI_SECURE >= 2 - if (memid.initially_committed && !memid.is_pinned) { - _mi_os_decommit((uint8_t*)page + page_noguard_size, MI_ARENA_GUARD_PAGE_SIZE); + // guard page at the end of mimalloc page? + #if MI_SECURE < 2 + const size_t page_noguard_size = mi_size_of_slices(slice_count); + #else + mi_assert(mi_size_of_slices(slice_count) > _mi_os_secure_guard_page_size()); + const size_t page_noguard_size = mi_size_of_slices(slice_count) - _mi_os_secure_guard_page_size(); + if (memid.initially_committed) { + _mi_os_secure_guard_page_set_at((uint8_t*)page + page_noguard_size, memid.is_pinned); } #endif @@ -795,7 +792,7 @@ void _mi_arenas_page_free(mi_page_t* page) { // we must do this since we may later allocate large spans over this page and cannot have a guard page in between #if MI_SECURE >= 2 if (!page->memid.is_pinned) { - _mi_os_commit((uint8_t*)page + mi_memid_size(page->memid) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE, NULL); + _mi_os_secure_guard_page_reset_before((uint8_t*)page + mi_memid_size(page->memid)); } #endif @@ -1089,7 +1086,7 @@ static size_t mi_arena_info_slices_needed(size_t slice_count, size_t* bitmap_bas const size_t size = base_size + bitmaps_size; const size_t os_page_size = _mi_os_page_size(); - const size_t info_size = _mi_align_up(size, os_page_size) + MI_ARENA_GUARD_PAGE_SIZE; + const size_t info_size = _mi_align_up(size, os_page_size) + _mi_os_secure_guard_page_size(); const size_t info_slices = mi_slice_count_of_size(info_size); if (bitmap_base != NULL) *bitmap_base = base_size; @@ -1105,7 +1102,6 @@ static mi_bitmap_t* mi_arena_bitmap_init(size_t slice_count, uint8_t** base) { static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t size, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept { - mi_assert(!is_large || (memid.initially_committed && memid.is_pinned)); mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)); mi_assert(start!=NULL); if (start==NULL) return false; @@ -1134,17 +1130,15 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s // commit & zero if needed if (!memid.initially_committed) { - // if MI_SECURE, leave a guard OS page decommitted at the end - _mi_os_commit(arena, mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE, NULL); + // leave a guard OS page decommitted at the end + _mi_os_commit(arena, mi_size_of_slices(info_slices) - _mi_os_secure_guard_page_size(), NULL); } - else if (!memid.is_pinned) { - #if MI_SECURE > 0 - // if MI_SECURE, decommit a guard OS page at the end of the arena info - _mi_os_decommit((uint8_t*)arena + mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE, MI_ARENA_GUARD_PAGE_SIZE); - #endif + else { + // if MI_SECURE, set a guard page at the end + _mi_os_secure_guard_page_set_before((uint8_t*)arena + 
mi_size_of_slices(info_slices), memid.is_pinned); } if (!memid.initially_zero) { - _mi_memzero(arena, mi_size_of_slices(info_slices) - MI_ARENA_GUARD_PAGE_SIZE); + _mi_memzero(arena, mi_size_of_slices(info_slices) - _mi_os_secure_guard_page_size()); } // init diff --git a/src/os.c b/src/os.c index 80d44d12..399aac6c 100644 --- a/src/os.c +++ b/src/os.c @@ -61,8 +61,16 @@ size_t _mi_os_large_page_size(void) { return (mi_os_mem_config.large_page_size != 0 ? mi_os_mem_config.large_page_size : _mi_os_page_size()); } +size_t _mi_os_guard_page_size(void) { + const size_t gsize = _mi_os_page_size(); + mi_assert(gsize <= (MI_ARENA_SLICE_SIZE/8)); + return gsize; +} + size_t _mi_os_virtual_address_bits(void) { - return mi_os_mem_config.virtual_address_bits; + const size_t vbits = mi_os_mem_config.virtual_address_bits; + mi_assert(vbits <= MI_MAX_VABITS); + return vbits; } bool _mi_os_use_large_page(size_t size, size_t alignment) { @@ -99,6 +107,50 @@ void* _mi_os_get_aligned_hint(size_t try_alignment, size_t size) { return NULL; } +// In secure mode, return the size of a guard page, otherwise 0 +size_t _mi_os_secure_guard_page_size(void) { + #if MI_SECURE > 0 + return _mi_os_guard_page_size(); + #else + return 0; + #endif +} + +// In secure mode, try to decommit an area and output a warning if this fails. +bool _mi_os_secure_guard_page_set_at(void* addr, bool is_pinned) { + if (addr == NULL) return true; + #if MI_SECURE > 0 + const bool ok = (is_pinned ? false : _mi_os_decommit(addr, _mi_os_secure_guard_page_size())); + if (!ok) { + _mi_error_message(EINVAL, "secure level %d, but failed to commit guard page (at %p of size %zu)\n", MI_SECURE, addr, _mi_os_secure_guard_page_size()); + } + return ok; + #else + MI_UNUSED(is_pinned); + return true; + #endif +} + +// In secure mode, try to decommit an area and output a warning if this fails. 
+bool _mi_os_secure_guard_page_set_before(void* addr, bool is_pinned) { + return _mi_os_secure_guard_page_set_at((uint8_t*)addr - _mi_os_secure_guard_page_size(), is_pinned); +} + +// In secure mode, try to recommit an area +bool _mi_os_secure_guard_page_reset_at(void* addr) { + if (addr == NULL) return true; + #if MI_SECURE > 0 + return _mi_os_commit(addr, _mi_os_secure_guard_page_size(), NULL); + #else + return true; + #endif +} + +// In secure mode, try to recommit an area +bool _mi_os_secure_guard_page_reset_before(void* addr) { + return _mi_os_secure_guard_page_reset_at((uint8_t*)addr - _mi_os_secure_guard_page_size()); +} + /* ----------------------------------------------------------- Free memory From c65c6d83bd0a1c3d00bcbe8ce4fc1bc10ddc947e Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 16:31:42 -0800 Subject: [PATCH 144/264] fix guard page size --- ide/vs2022/mimalloc.vcxproj | 2 +- src/arena.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 87e866bb..63bc7d1d 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=0;MI_SECURE=4;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/src/arena.c b/src/arena.c index 3349abb1..9ae44d85 100644 --- a/src/arena.c +++ b/src/arena.c @@ -720,10 +720,10 @@ static mi_page_t* mi_arenas_page_singleton_alloc(mi_heap_t* heap, size_t block_s mi_tld_t* const tld = heap->tld; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t info_size = (os_align ? MI_PAGE_ALIGN : mi_page_info_size()); - #if MI_ARENA_GUARD_PAGE_SIZE == 0 + #if MI_SECURE < 2 const size_t slice_count = mi_slice_count_of_size(info_size + block_size); #else - const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, MI_ARENA_GUARD_PAGE_SIZE) + MI_ARENA_GUARD_PAGE_SIZE); + const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, _mi_os_secure_guard_page_size()) + _mi_os_secure_guard_page_size()); #endif mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); From 9bad269c518a4104ac13584bc9474e0e357efd1c Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 16:47:01 -0800 Subject: [PATCH 145/264] fix purge delay check for arenas --- src/arena.c | 2 +- src/options.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index 9ae44d85..af0d1d0a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1551,7 +1551,7 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) mi_subproc_t* subproc = tld->subproc; const mi_msecs_t now = _mi_clock_now(); mi_msecs_t arenas_expire = mi_atomic_load_acquire(&subproc->purge_expire); - if (!force && (arenas_expire == 0 || arenas_expire < now)) return; + if (!force && (arenas_expire == 0 || arenas_expire > now)) return; const size_t max_arena = mi_arenas_get_count(subproc); if (max_arena == 0) return; diff --git a/src/options.c b/src/options.c index 7562cd46..63d8a68f 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 1000,UNINIT, 
MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 500, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From b77b34df968d610d7d26b0671f4375a072b39943 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 17:10:34 -0800 Subject: [PATCH 146/264] double arena per 4; large page objects 1/8 of large page size --- include/mimalloc/types.h | 2 +- src/arena.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 72c8d0a7..53c543d0 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -324,7 +324,7 @@ typedef struct mi_page_s { // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 11 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 128 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 1 MiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 1 MiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index b2113ec0..bc88acf3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -280,7 +280,7 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ if (arena_count >= 1 && arena_count <= 128) { // scale up the arena sizes exponentially every 4 entries - const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/2, 0, 16); + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/4, 0, 16); size_t reserve = 0; if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { arena_reserve = reserve; From 9a7c0d443a0e04f2610044ffb4bdfa752ada8864 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 17:15:13 -0800 Subject: [PATCH 147/264] max obj size 1/8 of a page --- include/mimalloc/types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index c2ce4a26..b21d0970 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -322,9 +322,9 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 8 KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/2) // < 2 MiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 512 KiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) From ba68810333e74dbb0fd32becc92ef8cabc0f5c3b Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 23 Dec 2024 18:33:37 -0800 Subject: [PATCH 148/264] commit page on demand --- ide/vs2022/mimalloc.vcxproj | 2 +- include/mimalloc.h | 1 + include/mimalloc/types.h | 7 ++-- src/arena.c | 64 ++++++++++++++++++++++++++----------- src/init.c | 1 + src/options.c | 3 +- src/page.c | 16 ++++++++-- 7 files changed, 69 insertions(+), 25 deletions(-) diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index 63bc7d1d..87e866bb 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -190,7 +190,7 @@ true Default ../../include - MI_DEBUG=3;MI_GUARDED=0;MI_SECURE=4;%(PreprocessorDefinitions); + MI_DEBUG=3;MI_GUARDED=0;%(PreprocessorDefinitions); CompileAsCpp false stdcpp20 diff --git a/include/mimalloc.h b/include/mimalloc.h index 508e6aec..5f856411 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -400,6 +400,7 @@ typedef enum mi_option_e { mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) mi_option_max_vabits, // max user space virtual address bits to consider (=48) mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) + mi_option_page_commit_on_demand, // commit page memory on-demand _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index b21d0970..a4e158d6 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -139,6 +139,8 @@ terms of the MIT license. A copy of the license can be found in the file // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX +#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE + // ------------------------------------------------------ // Arena's are large reserved areas of memory allocated from // the OS that are managed by mimalloc to efficiently @@ -290,7 +292,7 @@ typedef struct mi_page_s { _Atomic(mi_page_flags_t) xflags; // `in_full_queue` and `has_aligned` flags size_t block_size; // size available in each block (always `>0`) - uint8_t* page_start; // start of the blocks + uint8_t* page_start; // start of the blocks mi_heaptag_t heap_tag; // tag of the owning heap, used to separate heaps by object type bool free_is_zero; // `true` if the blocks in the free list are zero initialized // padding @@ -301,6 +303,7 @@ typedef struct mi_page_s { mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` + size_t page_committed; // committed size relative to `page_start`. 
mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -324,7 +327,7 @@ typedef struct mi_page_s { // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 512 KiB +#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 512 KiB #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index af0d1d0a..c31f1fe3 100644 --- a/src/arena.c +++ b/src/arena.c @@ -562,7 +562,7 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); @@ -578,16 +578,16 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ // Allocate a fresh page static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment, - mi_arena_t* req_arena, size_t tseq) + mi_arena_t* req_arena, size_t tseq, bool commit) { const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page - const bool commit = true; const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN); const size_t page_alignment = MI_ARENA_SLICE_ALIGN; // try to allocate from free space in arena's mi_memid_t memid = _mi_memid_none(); mi_page_t* page = NULL; + const size_t alloc_size = mi_size_of_slices(slice_count); if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // allowed to allocate from arena's? !os_align && // not large alignment slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large @@ -604,10 +604,10 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice if (os_align) { // note: slice_count already includes the page mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment)); - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(alloc_size, block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena, &memid); } else { - page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena, &memid); + page = (mi_page_t*)mi_arena_os_alloc_aligned(alloc_size, page_alignment, 0 /* align offset */, commit, allow_large, req_arena, &memid); } } @@ -617,25 +617,25 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice // guard page at the end of mimalloc page? 
#if MI_SECURE < 2 - const size_t page_noguard_size = mi_size_of_slices(slice_count); + const size_t page_noguard_size = alloc_size; #else - mi_assert(mi_size_of_slices(slice_count) > _mi_os_secure_guard_page_size()); - const size_t page_noguard_size = mi_size_of_slices(slice_count) - _mi_os_secure_guard_page_size(); + mi_assert(alloc_size > _mi_os_secure_guard_page_size()); + const size_t page_noguard_size = alloc_size - _mi_os_secure_guard_page_size(); if (memid.initially_committed) { _mi_os_secure_guard_page_set_at((uint8_t*)page + page_noguard_size, memid.is_pinned); } #endif // claimed free slices: initialize the page partly - if (!memid.initially_zero) { + if (!memid.initially_zero && memid.initially_committed) { mi_track_mem_undefined(page, slice_count * MI_ARENA_SLICE_SIZE); _mi_memzero_aligned(page, sizeof(*page)); } - else { + else if (memid.initially_committed) { mi_track_mem_defined(page, slice_count * MI_ARENA_SLICE_SIZE); } #if MI_DEBUG > 1 - if (memid.initially_zero) { + if (memid.initially_zero && memid.initially_committed) { if (!mi_mem_is_zero(page, page_noguard_size)) { _mi_error_message(EFAULT, "internal error: page memory was not zero initialized.\n"); memid.initially_zero = false; @@ -644,6 +644,7 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice } #endif mi_assert(MI_PAGE_INFO_SIZE >= mi_page_info_size()); + size_t block_start; #if MI_GUARDED // in a guarded build, we align pages with blocks a multiple of an OS page size, to the OS page size @@ -668,9 +669,24 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice } const size_t reserved = (os_align ? 1 : (page_noguard_size - block_start) / block_size); mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX); + + // commit first block? + size_t commit_size = 0; + if (!memid.initially_committed) { + commit_size = _mi_align_up(block_start + block_size, MI_PAGE_MIN_COMMIT_SIZE); + if (commit_size > page_noguard_size) { commit_size = page_noguard_size; } + bool is_zero; + _mi_os_commit(page, commit_size, &is_zero); + if (!memid.initially_zero && !is_zero) { + _mi_memzero_aligned(page, commit_size); + } + } + + // initialize page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; + page->page_committed = (commit_size == 0 ? 0 : commit_size - block_start); mi_assert(commit_size == 0 || commit_size >= block_start + block_size); page->memid = memid; page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { @@ -704,7 +720,8 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou } // 2. 
find a free block, potentially allocating a new arena - page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq); + page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, + !mi_option_is_enabled(mi_option_page_commit_on_demand)); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -726,7 +743,7 @@ static mi_page_t* mi_arenas_page_singleton_alloc(mi_heap_t* heap, size_t block_s const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, _mi_os_secure_guard_page_size()) + _mi_os_secure_guard_page_size()); #endif - mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq); + mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq, true /* commit singletons always */); if (page == NULL) return NULL; mi_assert(page->reserved == 1); @@ -779,7 +796,7 @@ void _mi_arenas_page_free(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may @@ -799,7 +816,16 @@ void _mi_arenas_page_free(mi_page_t* page) { // unregister page _mi_page_map_unregister(page); if (page->memid.memkind == MI_MEM_ARENA) { - mi_bitmap_clear(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index); + mi_arena_t* arena = page->memid.mem.arena.arena; + mi_bitmap_clear(arena->pages, page->memid.mem.arena.slice_index); + if (page->page_committed > 0) { + // if committed on-demand, set the commit bits to account commit properly + const size_t total_committed = (page->page_start - (uint8_t*)page) + page->page_committed; + mi_assert_internal(mi_memid_size(page->memid) >= total_committed); + const size_t total_slices = _mi_divide_up(total_committed, MI_ARENA_SLICE_SIZE); + mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices); + mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); + } } _mi_arenas_free(page, mi_memid_size(page->memid), page->memid); } @@ -824,7 +850,7 @@ void _mi_arenas_page_abandon(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(!mi_page_is_singleton(page)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); @@ -889,7 +915,7 @@ void _mi_arenas_page_unabandon(mi_page_t* page) { mi_arena_t* arena = 
mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); @@ -1430,7 +1456,7 @@ static long mi_arena_purge_delay(void) { // returns if the memory is no longer committed (versus reset which keeps the commit) static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_count) { mi_assert_internal(!arena->memid.is_pinned); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); // we own it? const size_t size = mi_size_of_slices(slice_count); void* const p = mi_arena_slice_start(arena, slice_index); @@ -1455,7 +1481,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ const long delay = mi_arena_purge_delay(); if (arena->memid.is_pinned || delay < 0 || _mi_preloading()) return; // is purging allowed at all? - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); // we still own it? if (delay == 0) { // purge directly mi_arena_purge(arena, slice_index, slice_count); diff --git a/src/init.c b/src/init.c index 5240611c..16c1dea4 100644 --- a/src/init.c +++ b/src/init.c @@ -35,6 +35,7 @@ const mi_page_t _mi_page_empty = { #endif NULL, // xheap NULL, NULL, // next, prev + MI_ARENA_SLICE_SIZE, // page_committed MI_MEMID_STATIC // memid }; diff --git a/src/options.c b/src/options.c index 63d8a68f..faeb9da4 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 500, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 250, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -175,6 +175,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index 239d5d6e..ed94cae1 100644 --- a/src/page.c +++ b/src/page.c @@ -606,6 +606,18 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(extend > 0 && extend + page->capacity <= page->reserved); mi_assert_internal(extend < (1UL<<16)); + // commit on demand? 
+ if (page->page_committed > 0) { + const size_t needed_size = (page->capacity + extend)*bsize; + if (needed_size > page->page_committed) { + size_t commit_size = _mi_align_up(needed_size, MI_PAGE_MIN_COMMIT_SIZE); + const size_t max_size = page->reserved * bsize; + if (commit_size > max_size) { commit_size = max_size; } + mi_assert(commit_size > page->page_committed); + _mi_os_commit(mi_page_start(page) + page->page_committed, commit_size - page->page_committed, NULL); + } + } + // and append the extend the free list if (extend < MI_MIN_SLICES || MI_SECURE<3) { //!mi_option_is_enabled(mi_option_secure)) { mi_page_free_list_extend(page, bsize, extend, &heap->tld->stats ); @@ -635,8 +647,8 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif #if MI_DEBUG>2 if (page->memid.initially_zero) { - mi_track_mem_defined(page->page_start, page_size); - mi_assert_expensive(mi_mem_is_zero(page_start, page_size)); + mi_track_mem_defined(page->page_start, (page->page_committed == 0 ? page_size : page->page_committed)); + mi_assert_expensive(mi_mem_is_zero(page_start, (page->page_committed == 0 ? page_size : page->page_committed))); } #endif From d21114b5f2904aaefd8d97871e938e5ef839d942 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 11:37:52 -0800 Subject: [PATCH 149/264] improve page commit on demand --- include/mimalloc/internal.h | 25 +++++++++++-- include/mimalloc/types.h | 3 +- src/arena.c | 75 +++++++++++++++++++++++-------------- src/heap.c | 5 ++- src/options.c | 4 +- src/os.c | 20 ++++++---- src/page.c | 25 +++++++------ 7 files changed, 101 insertions(+), 56 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 7c49d590..5b877635 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -128,7 +128,8 @@ bool _mi_os_decommit(void* addr, size_t size); bool _mi_os_protect(void* addr, size_t size); bool _mi_os_unprotect(void* addr, size_t size); bool _mi_os_purge(void* p, size_t size); -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset); +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stats_size); +bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size); size_t _mi_os_secure_guard_page_size(void); bool _mi_os_secure_guard_page_set_at(void* addr, bool is_pinned); @@ -155,7 +156,7 @@ void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid); void _mi_arenas_free(void* p, size_t size, mi_memid_t memid); bool _mi_arenas_contain(const void* p); -void _mi_arenas_collect(bool force_purge, mi_tld_t* tld); +void _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld); void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld); mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); @@ -534,9 +535,12 @@ static inline uint8_t* mi_page_start(const mi_page_t* page) { return page->page_start; } +static inline size_t mi_page_size(const mi_page_t* page) { + return mi_page_block_size(page) * page->reserved; +} static inline uint8_t* mi_page_area(const mi_page_t* page, size_t* size) { - if (size) { *size = mi_page_block_size(page) * page->reserved; } + if (size) { *size = mi_page_size(page); } return mi_page_start(page); } @@ -564,6 +568,21 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) { return 
mi_page_block_size(page) - MI_PADDING_SIZE; } +// This may change if we locate page info outside the page data slices +static inline uint8_t* mi_page_slice_start(const mi_page_t* page) { + return (uint8_t*)page; +} + +// This gives the offset relative to the start slice of a page. This may change if we ever +// locate page info outside the page-data itself. +static inline size_t mi_page_slice_offset_of(const mi_page_t* page, size_t offset_relative_to_page_start) { + return (page->page_start - mi_page_slice_start(page)) + offset_relative_to_page_start; +} + +static inline size_t mi_page_committed(const mi_page_t* page) { + return (page->slice_committed == 0 ? mi_page_size(page) : page->slice_committed - (page->page_start - mi_page_slice_start(page))); +} + static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { return page->heap; } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index a4e158d6..627aa6f9 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -139,6 +139,7 @@ terms of the MIT license. A copy of the license can be found in the file // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX +// Minimal commit for a page on-demand commit (should be >= OS page size, and >= MI_ARENA_SLICE_SIZE for correct stats) #define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE // ------------------------------------------------------ @@ -303,7 +304,7 @@ typedef struct mi_page_s { mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - size_t page_committed; // committed size relative to `page_start`. + size_t slice_committed; // committed size relative to the first arena slice of the page data mi_memid_t memid; // provenance of the page memory } mi_page_t; diff --git a/src/arena.c b/src/arena.c index c31f1fe3..a5b83bf5 100644 --- a/src/arena.c +++ b/src/arena.c @@ -207,12 +207,12 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( size_t already_committed_count = 0; mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed_count); // adjust the stats so we don't double count the commits - if (already_committed_count > 0) { - mi_subproc_stat_adjust_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count), true /* on alloc */); - } + //if (already_committed_count > 0) { + // mi_subproc_stat_adjust_decrease(arena->subproc, committed, mi_size_of_slices(already_committed_count), true /* on alloc */); + //} // now actually commit bool commit_zero = false; - if (!_mi_os_commit(p, mi_size_of_slices(slice_count), &commit_zero)) { + if (!_mi_os_commit_ex(p, mi_size_of_slices(slice_count), &commit_zero, mi_size_of_slices(slice_count - already_committed_count))) { // failed to commit (todo: give warning?) if (already_committed_count > 0) { mi_subproc_stat_increase(arena->subproc, committed, mi_size_of_slices(already_committed_count)); @@ -686,7 +686,7 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice page->reserved = (uint16_t)reserved; page->page_start = (uint8_t*)page + block_start; page->block_size = block_size; - page->page_committed = (commit_size == 0 ? 
0 : commit_size - block_start); mi_assert(commit_size == 0 || commit_size >= block_start + block_size); + page->slice_committed = commit_size; page->memid = memid; page->free_is_zero = memid.initially_zero; if (block_size > 0 && _mi_is_power_of_two(block_size)) { @@ -720,8 +720,10 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou } // 2. find a free block, potentially allocating a new arena - page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, - !mi_option_is_enabled(mi_option_page_commit_on_demand)); + const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) || // always commit small pages + _mi_os_has_overcommit() || // no need to commit on demand on an OS that already does this for us + !mi_option_is_enabled(mi_option_page_commit_on_demand)); + page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, commit); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); _mi_page_init(heap, page); @@ -818,13 +820,18 @@ void _mi_arenas_page_free(mi_page_t* page) { if (page->memid.memkind == MI_MEM_ARENA) { mi_arena_t* arena = page->memid.mem.arena.arena; mi_bitmap_clear(arena->pages, page->memid.mem.arena.slice_index); - if (page->page_committed > 0) { + if (page->slice_committed > 0) { // if committed on-demand, set the commit bits to account commit properly - const size_t total_committed = (page->page_start - (uint8_t*)page) + page->page_committed; - mi_assert_internal(mi_memid_size(page->memid) >= total_committed); - const size_t total_slices = _mi_divide_up(total_committed, MI_ARENA_SLICE_SIZE); + mi_assert_internal(mi_memid_size(page->memid) >= page->slice_committed); + const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE; // conservative + mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices); - mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); + if (total_slices > 0) { + mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); + } + } + else { + mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, page->memid.mem.arena.slice_index, page->memid.mem.arena.slice_count)); } } _mi_arenas_free(page, mi_memid_size(page->memid), page->memid); @@ -1005,8 +1012,8 @@ void _mi_arenas_free(void* p, size_t size, mi_memid_t memid) { } // Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired -void _mi_arenas_collect(bool force_purge, mi_tld_t* tld) { - mi_arenas_try_purge(force_purge, force_purge /* visit all? */, tld); +void _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld) { + mi_arenas_try_purge(force_purge, visit_all, tld); } @@ -1062,7 +1069,7 @@ static void mi_arenas_unsafe_destroy(mi_subproc_t* subproc) { // for dynamic libraries that are unloaded and need to release all their allocated memory. 
void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld) { mi_arenas_unsafe_destroy(_mi_subproc()); - _mi_arenas_collect(true /* force purge */, tld); // purge non-owned arenas + _mi_arenas_collect(true /* force purge */, true /* visit all*/, tld); // purge non-owned arenas } @@ -1462,15 +1469,23 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c void* const p = mi_arena_slice_start(arena, slice_index); //const bool all_committed = mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count); size_t already_committed; - mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); + mi_bitmap_setN(arena->slices_committed, slice_index, slice_count, &already_committed); // pretend all committed.. (as we lack a clearN call that counts the already set bits..) const bool all_committed = (already_committed == slice_count); - const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */); + const bool needs_recommit = _mi_os_purge_ex(p, size, all_committed /* allow reset? */, mi_size_of_slices(already_committed)); - // update committed bitmap if (needs_recommit) { - mi_subproc_stat_adjust_decrease( arena->subproc, committed, mi_size_of_slices(slice_count - already_committed), false /* on freed */); + // no longer committed mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); + // we just counted in the purge to decommit all, but the some part was not committed so adjust that here + // mi_os_stat_decrease(committed, mi_size_of_slices(slice_count - already_committed)); } + else if (!all_committed) { + // we cannot assume any of these are committed any longer (even with reset since we did setN and may have marked uncommitted slices as committed) + mi_bitmap_clearN(arena->slices_committed, slice_index, slice_count); + // we adjust the commit count as parts will be re-committed + // mi_os_stat_decrease(committed, mi_size_of_slices(already_committed)); + } + return needs_recommit; } @@ -1493,6 +1508,7 @@ static void mi_arena_schedule_purge(mi_arena_t* arena, size_t slice_index, size_ if (mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire0, expire)) { // expiration was not yet set // maybe set the global arenas expire as well (if it wasn't set already) + mi_assert_internal(expire0==0); mi_atomic_casi64_strong_acq_rel(&arena->subproc->purge_expire, &expire0, expire); } else { @@ -1554,8 +1570,8 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (!force && (expire == 0 || expire > now)) return false; - // reset expire (if not already set concurrently) - mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + // reset expire + mi_atomic_store_release(&arena->purge_expire, (mi_msecs_t)0); mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1); // go through all purge info's (with max MI_BFIELD_BITS ranges at a time) @@ -1570,33 +1586,36 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) { + // try purge can be called often so try to only run when needed const long delay = mi_arena_purge_delay(); if (_mi_preloading() || delay <= 0) return; // nothing will be scheduled // check if any arena needs purging? 
mi_subproc_t* subproc = tld->subproc; const mi_msecs_t now = _mi_clock_now(); - mi_msecs_t arenas_expire = mi_atomic_load_acquire(&subproc->purge_expire); - if (!force && (arenas_expire == 0 || arenas_expire > now)) return; + const mi_msecs_t arenas_expire = mi_atomic_load_acquire(&subproc->purge_expire); + if (!visit_all && !force && (arenas_expire == 0 || arenas_expire > now)) return; const size_t max_arena = mi_arenas_get_count(subproc); if (max_arena == 0) return; - // allow only one thread to purge at a time + // allow only one thread to purge at a time (todo: allow concurrent purging?) static mi_atomic_guard_t purge_guard; mi_atomic_guard(&purge_guard) { // increase global expire: at most one purge per delay cycle - mi_atomic_store_release(&subproc->purge_expire, now + delay); + if (arenas_expire > now) { mi_atomic_store_release(&subproc->purge_expire, now + (delay/10)); } const size_t arena_start = tld->thread_seq % max_arena; - size_t max_purge_count = (visit_all ? max_arena : 2); + size_t max_purge_count = (visit_all ? max_arena : (max_arena/4)+1); bool all_visited = true; + bool any_purged = false; for (size_t _i = 0; _i < max_arena; _i++) { size_t i = _i + arena_start; if (i >= max_arena) { i -= max_arena; } mi_arena_t* arena = mi_arena_from_index(subproc,i); if (arena != NULL) { if (mi_arena_try_purge(arena, now, force)) { + any_purged = true; if (max_purge_count <= 1) { all_visited = false; break; @@ -1605,8 +1624,8 @@ static void mi_arenas_try_purge(bool force, bool visit_all, mi_tld_t* tld) } } } - if (all_visited) { - mi_atomic_store_release(&subproc->purge_expire, (mi_msecs_t)0); + if (all_visited && !any_purged) { + mi_atomic_store_release(&subproc->purge_expire, 0); } } } diff --git a/src/heap.c b/src/heap.c index 6632861b..f0d495a3 100644 --- a/src/heap.c +++ b/src/heap.c @@ -119,8 +119,9 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - // collect arenas (this is program wide so don't force purges on abandonment of threads) - _mi_arenas_collect(collect == MI_FORCE /* force purge? */, heap->tld); + // collect arenas (this is program wide so don't force purges on abandonment of threads) + //mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1); + _mi_arenas_collect(collect == MI_FORCE /* force purge? */, true /* visit all? */, heap->tld); } void _mi_heap_collect_abandon(mi_heap_t* heap) { diff --git a/src/options.c b/src/options.c index faeb9da4..b613f983 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 250, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -175,7 +175,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, + { 1, UNINIT, MI_OPTION(page_commit_on_demand) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/os.c b/src/os.c index 399aac6c..79c2bc17 100644 --- a/src/os.c +++ b/src/os.c @@ -429,9 +429,9 @@ static void* mi_os_page_align_area_conservative(void* addr, size_t size, size_t* return mi_os_page_align_areax(true, addr, size, newsize); } -bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { +bool _mi_os_commit_ex(void* addr, size_t size, bool* is_zero, size_t stat_size) { if (is_zero != NULL) { *is_zero = false; } - mi_os_stat_increase(committed, size); // use size for precise commit vs. decommit + mi_os_stat_increase(committed, stat_size); // use size for precise commit vs. decommit mi_os_stat_counter_increase(commit_calls, 1); // page align range @@ -458,9 +458,13 @@ bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { return true; } -static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) { +bool _mi_os_commit(void* addr, size_t size, bool* is_zero) { + return _mi_os_commit_ex(addr, size, is_zero, size); +} + +static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit, size_t stats_size) { mi_assert_internal(needs_recommit!=NULL); - mi_os_stat_decrease(committed, size); + mi_os_stat_decrease(committed, stats_size); // page align size_t csize; @@ -479,7 +483,7 @@ static bool mi_os_decommit_ex(void* addr, size_t size, bool* needs_recommit) { bool _mi_os_decommit(void* addr, size_t size) { bool needs_recommit; - return mi_os_decommit_ex(addr, size, &needs_recommit); + return mi_os_decommit_ex(addr, size, &needs_recommit, size); } @@ -509,7 +513,7 @@ bool _mi_os_reset(void* addr, size_t size) { // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. -bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) +bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset, size_t stats_size) { if (mi_option_get(mi_option_purge_delay) < 0) return false; // is purging allowed? mi_os_stat_counter_increase(purge_calls, 1); @@ -519,7 +523,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) !_mi_preloading()) // don't decommit during preloading (unsafe) { bool needs_recommit = true; - mi_os_decommit_ex(p, size, &needs_recommit); + mi_os_decommit_ex(p, size, &needs_recommit, stats_size); return needs_recommit; } else { @@ -533,7 +537,7 @@ bool _mi_os_purge_ex(void* p, size_t size, bool allow_reset) // either resets or decommits memory, returns true if the memory needs // to be recommitted if it is to be re-used later on. 
bool _mi_os_purge(void* p, size_t size) { - return _mi_os_purge_ex(p, size, true); + return _mi_os_purge_ex(p, size, true, size); } diff --git a/src/page.c b/src/page.c index ed94cae1..aba548e9 100644 --- a/src/page.c +++ b/src/page.c @@ -251,8 +251,10 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { } else { mi_page_queue_remove(pq, page); + mi_tld_t* tld = page->heap->tld; mi_page_set_heap(page, NULL); - _mi_arenas_page_abandon(page); + _mi_arenas_page_abandon(page); + _mi_arenas_collect(false, false, tld); // allow purging } } @@ -263,7 +265,7 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size mi_assert_internal(pq != NULL); mi_assert_internal(mi_heap_contains_queue(heap, pq)); mi_assert_internal(page_alignment > 0 || block_size > MI_LARGE_MAX_OBJ_SIZE || block_size == pq->block_size); - #endif + #endif mi_page_t* page = _mi_arenas_page_alloc(heap, block_size, page_alignment); if (page == NULL) { // out-of-memory @@ -359,7 +361,7 @@ void _mi_page_free(mi_page_t* page, mi_page_queue_t* pq) { mi_heap_t* heap = page->heap; mi_page_set_heap(page,NULL); _mi_arenas_page_free(page); - _mi_arenas_collect(false, heap->tld); // allow purging + _mi_arenas_collect(false, false, heap->tld); // allow purging } #define MI_MAX_RETIRE_SIZE MI_LARGE_OBJ_SIZE_MAX // should be less than size for MI_BIN_HUGE @@ -607,14 +609,13 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page) { mi_assert_internal(extend < (1UL<<16)); // commit on demand? - if (page->page_committed > 0) { + if (page->slice_committed > 0) { const size_t needed_size = (page->capacity + extend)*bsize; - if (needed_size > page->page_committed) { - size_t commit_size = _mi_align_up(needed_size, MI_PAGE_MIN_COMMIT_SIZE); - const size_t max_size = page->reserved * bsize; - if (commit_size > max_size) { commit_size = max_size; } - mi_assert(commit_size > page->page_committed); - _mi_os_commit(mi_page_start(page) + page->page_committed, commit_size - page->page_committed, NULL); + const size_t needed_commit = _mi_align_up( mi_page_slice_offset_of(page, needed_size), MI_PAGE_MIN_COMMIT_SIZE ); + if (needed_commit > page->slice_committed) { + mi_assert_internal(((needed_commit - page->slice_committed) % _mi_os_page_size()) == 0); + _mi_os_commit(mi_page_slice_start(page) + page->slice_committed, needed_commit - page->slice_committed, NULL); + page->slice_committed = needed_commit; } } @@ -647,8 +648,8 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) { #endif #if MI_DEBUG>2 if (page->memid.initially_zero) { - mi_track_mem_defined(page->page_start, (page->page_committed == 0 ? page_size : page->page_committed)); - mi_assert_expensive(mi_mem_is_zero(page_start, (page->page_committed == 0 ? 
page_size : page->page_committed))); + mi_track_mem_defined(page->page_start, mi_page_committed(page)); + mi_assert_expensive(mi_mem_is_zero(page_start, mi_page_committed(page))); } #endif From 71a1645d4d06fc5c7c1b91b7df6d94ff956c647e Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 24 Dec 2024 12:04:21 -0800 Subject: [PATCH 150/264] fix build --- src/arena.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/arena.c b/src/arena.c index 9915cdcf..48fa0315 100644 --- a/src/arena.c +++ b/src/arena.c @@ -563,7 +563,7 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); + mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); From 016b36d9173cc7adf51c4f3836bc1e22682e1837 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 24 Dec 2024 12:10:34 -0800 Subject: [PATCH 151/264] fix max va bits on unix --- src/os.c | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/src/os.c b/src/os.c index 79c2bc17..ef440fcd 100644 --- a/src/os.c +++ b/src/os.c @@ -15,14 +15,6 @@ terms of the MIT license. A copy of the license can be found in the file /* ----------------------------------------------------------- Initialization. ----------------------------------------------------------- */ -#ifndef MI_DEFAULT_VIRTUAL_ADDRESS_BITS -#if MI_INTPTR_SIZE < 8 -#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 32 -#else -#define MI_DEFAULT_VIRTUAL_ADDRESS_BITS 48 -#endif -#endif - #ifndef MI_DEFAULT_PHYSICAL_MEMORY #if MI_INTPTR_SIZE < 8 #define MI_DEFAULT_PHYSICAL_MEMORY 4*MI_GiB @@ -36,7 +28,7 @@ static mi_os_mem_config_t mi_os_mem_config = { 0, // large page size (usually 2MiB) 4096, // allocation granularity MI_DEFAULT_PHYSICAL_MEMORY, - MI_DEFAULT_VIRTUAL_ADDRESS_BITS, + MI_MAX_VABITS, // in `bits.h` true, // has overcommit? (if true we use MAP_NORESERVE on mmap systems) false, // can we partially free allocated blocks? (on mmap systems we can free anywhere in a mapped range, but on Windows we must free the entire span) true // has virtual reserve? 
(if true we can reserve virtual address space without using commit or physical memory) From ad6f48f3e4b85d0f8a0f3de1a4ba2aeb9db8adb5 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 15:00:05 -0800 Subject: [PATCH 152/264] fix assertion for huge pages --- src/page-queue.c | 4 ++-- src/page.c | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/src/page-queue.c b/src/page-queue.c index 9e3aaacc..128ae8e3 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -156,7 +156,7 @@ static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { mi_heap_t* heap = mi_page_heap(page); mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); - mi_assert_expensive(mi_page_queue_contains(pq, page)); + mi_assert_expensive(mi_page_is_huge(page) || mi_page_queue_contains(pq, page)); return pq; } @@ -210,7 +210,7 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); - mi_assert_expensive(mi_page_queue_contains(queue, page)); + mi_assert_expensive(mi_page_is_huge(page) || mi_page_queue_contains(queue, page)); mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); diff --git a/src/page.c b/src/page.c index aba548e9..1e15644e 100644 --- a/src/page.c +++ b/src/page.c @@ -123,7 +123,7 @@ bool _mi_page_is_valid(mi_page_t* page) { //mi_assert_internal(!_mi_process_is_initialized); { mi_page_queue_t* pq = mi_page_queue_of(page); - mi_assert_internal(mi_page_queue_contains(pq, page)); + mi_assert_internal(mi_page_is_huge(page) || mi_page_queue_contains(pq, page)); mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page)); // mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); } @@ -298,7 +298,7 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); if (page==NULL) return NULL; mi_assert_internal(pq->block_size==mi_page_block_size(page)); - mi_assert_internal(pq==mi_heap_page_queue_of(heap, page)); + mi_assert_internal(mi_page_is_huge(page) || pq==mi_heap_page_queue_of(heap, page)); return page; } From d862e57955e7f00d16024b9780e43bb2e964eeae Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 16:39:54 -0800 Subject: [PATCH 153/264] fix huge page allocation size --- src/page-queue.c | 4 ++-- src/page.c | 29 ++++++++++++++++------------- 2 files changed, 18 insertions(+), 15 deletions(-) diff --git a/src/page-queue.c b/src/page-queue.c index 128ae8e3..9e3aaacc 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -156,7 +156,7 @@ static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* static mi_page_queue_t* mi_page_queue_of(const mi_page_t* page) { mi_heap_t* heap = mi_page_heap(page); mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page); - mi_assert_expensive(mi_page_is_huge(page) || mi_page_queue_contains(pq, page)); + mi_assert_expensive(mi_page_queue_contains(pq, page)); return pq; } @@ -210,7 +210,7 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); - mi_assert_expensive(mi_page_is_huge(page) || mi_page_queue_contains(queue, page)); + 
mi_assert_expensive(mi_page_queue_contains(queue, page)); mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); diff --git a/src/page.c b/src/page.c index 1e15644e..9a96da85 100644 --- a/src/page.c +++ b/src/page.c @@ -123,7 +123,7 @@ bool _mi_page_is_valid(mi_page_t* page) { //mi_assert_internal(!_mi_process_is_initialized); { mi_page_queue_t* pq = mi_page_queue_of(page); - mi_assert_internal(mi_page_is_huge(page) || mi_page_queue_contains(pq, page)); + mi_assert_internal(mi_page_queue_contains(pq, page)); mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page)); // mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq)); } @@ -298,7 +298,7 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) { mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0); if (page==NULL) return NULL; mi_assert_internal(pq->block_size==mi_page_block_size(page)); - mi_assert_internal(mi_page_is_huge(page) || pq==mi_heap_page_queue_of(heap, page)); + mi_assert_internal(pq==mi_heap_page_queue_of(heap, page)); return page; } @@ -794,8 +794,9 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // Find a page with free blocks of `size`. -static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, size_t size) { - mi_page_queue_t* pq = mi_page_queue(heap, size); +static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) { + // mi_page_queue_t* pq = mi_page_queue(heap, size); + mi_assert_internal(!mi_page_queue_is_huge(pq)); // check the first page: we even do this with candidate search or otherwise we re-search every time mi_page_t* page = pq->first; @@ -853,13 +854,13 @@ void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noex // Huge pages contain just one block, and the segment contains just that page. // Huge pages are also use if the requested alignment is very large (> MI_BLOCK_ALIGNMENT_MAX) // so their size is not always `> MI_LARGE_OBJ_SIZE_MAX`. -static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment) { - size_t block_size = _mi_os_good_alloc_size(size); - mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); +static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_alignment, mi_page_queue_t* pq) { + const size_t block_size = _mi_os_good_alloc_size(size); + // mi_assert_internal(mi_bin(block_size) == MI_BIN_HUGE || page_alignment > 0); #if MI_HUGE_PAGE_ABANDON - mi_page_queue_t* pq = NULL; + #error todo. #else - mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size + // mi_page_queue_t* pq = mi_page_queue(heap, MI_LARGE_MAX_OBJ_SIZE+1); // always in the huge queue regardless of the block size mi_assert_internal(mi_page_queue_is_huge(pq)); #endif mi_page_t* page = mi_page_fresh_alloc(heap, pq, block_size, page_alignment); @@ -882,15 +883,17 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a // Allocate a page // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { + mi_page_queue_t* pq = mi_page_queue(heap, (huge_alignment ? MI_LARGE_MAX_OBJ_SIZE+1 : size)); // huge allocation? 
- const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - if mi_unlikely(req_size > (MI_LARGE_MAX_OBJ_SIZE - MI_PADDING_SIZE) || huge_alignment > 0) { + if mi_unlikely(mi_page_queue_is_huge(pq)) { + const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` + //if mi_unlikely(req_size > (MI_LARGE_MAX_OBJ_SIZE - MI_PADDING_SIZE) || huge_alignment > 0) { if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); return NULL; } else { - return mi_huge_page_alloc(heap,size,huge_alignment); + return mi_huge_page_alloc(heap,size,huge_alignment,pq); } } else { @@ -898,7 +901,7 @@ static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignme #if MI_PADDING mi_assert_internal(size >= MI_PADDING_SIZE); #endif - return mi_find_free_page(heap, size); + return mi_find_free_page(heap, pq); } } From 1e1a12bf3c4194ee121776aa3d383b218442c2a2 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 17:07:11 -0800 Subject: [PATCH 154/264] fix rounding issue with huge size allocations --- include/mimalloc/internal.h | 5 +++-- include/mimalloc/types.h | 2 +- src/page.c | 19 ++++++++----------- 3 files changed, 12 insertions(+), 14 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 5b877635..0e161951 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -706,9 +706,10 @@ static inline bool mi_page_is_huge(const mi_page_t* page) { (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page)); } - static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { - return &((mi_heap_t*)heap)->pages[_mi_bin(size)]; + mi_page_queue_t* const pq = &((mi_heap_t*)heap)->pages[_mi_bin(size)]; + if (size <= MI_LARGE_MAX_OBJ_SIZE) { mi_assert_internal(pq->block_size <= MI_LARGE_MAX_OBJ_SIZE); } + return pq; } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 627aa6f9..4bede252 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -328,7 +328,7 @@ typedef struct mi_page_s { // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE ((MI_LARGE_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 512 KiB +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with _mi_bin #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/page.c b/src/page.c index 9a96da85..542496a0 100644 --- a/src/page.c +++ b/src/page.c @@ -883,18 +883,15 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size, size_t page_a // Allocate a page // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed. static mi_page_t* mi_find_page(mi_heap_t* heap, size_t size, size_t huge_alignment) mi_attr_noexcept { - mi_page_queue_t* pq = mi_page_queue(heap, (huge_alignment ? 
MI_LARGE_MAX_OBJ_SIZE+1 : size)); + const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` + if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { + _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); + return NULL; + } + mi_page_queue_t* pq = mi_page_queue(heap, (huge_alignment > 0 ? MI_LARGE_MAX_OBJ_SIZE+1 : size)); // huge allocation? - if mi_unlikely(mi_page_queue_is_huge(pq)) { - const size_t req_size = size - MI_PADDING_SIZE; // correct for padding_size in case of an overflow on `size` - //if mi_unlikely(req_size > (MI_LARGE_MAX_OBJ_SIZE - MI_PADDING_SIZE) || huge_alignment > 0) { - if mi_unlikely(req_size > MI_MAX_ALLOC_SIZE) { - _mi_error_message(EOVERFLOW, "allocation request is too large (%zu bytes)\n", req_size); - return NULL; - } - else { - return mi_huge_page_alloc(heap,size,huge_alignment,pq); - } + if mi_unlikely(mi_page_queue_is_huge(pq) || req_size > MI_MAX_ALLOC_SIZE) { + return mi_huge_page_alloc(heap,size,huge_alignment,pq); } else { // otherwise find a page with free blocks in our size segregated queues From 4d1d3471cff8e7285705fe590d46dcfe51e22d0c Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 17:14:53 -0800 Subject: [PATCH 155/264] rename page options --- include/mimalloc.h | 4 ++-- src/heap.c | 4 ++-- src/init.c | 4 ++-- src/options.c | 4 ++-- src/page.c | 2 +- 5 files changed, 9 insertions(+), 9 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 5f856411..6432e41a 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -396,8 +396,8 @@ typedef enum mi_option_e { mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) mi_option_target_segments_per_thread, // experimental (=0) mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) - mi_option_full_page_retain, // retain N full pages per size class (=2) - mi_option_max_page_candidates, // max candidate pages to consider for allocation (=4) + mi_option_page_full_retain, // retain N full pages per size class (=2) + mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4) mi_option_max_vabits, // max user space virtual address bits to consider (=48) mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) mi_option_page_commit_on_demand, // commit page memory on-demand diff --git a/src/heap.c b/src/heap.c index f0d495a3..09cc2574 100644 --- a/src/heap.c +++ b/src/heap.c @@ -170,8 +170,8 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->tld = tld; // avoid reading the thread-local tld during initialization heap->exclusive_arena = _mi_arena_from_id(arena_id); heap->allow_page_reclaim = !noreclaim; - heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_full_page_retain) >= 0); - heap->full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); + heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_page_full_retain) >= 0); + heap->full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. 
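For context, this rename patch changes only the option identifiers, not their behavior: mi_option_full_page_retain becomes mi_option_page_full_retain and mi_option_max_page_candidates becomes mi_option_page_max_candidates, with the defaults (2 and 4) unchanged; the hunks below update init.c, options.c and page.c to match. The following is a minimal usage sketch through the public option API, not part of the patch itself; the values are illustrative only, and it is assumed the matching environment variable names (e.g. MIMALLOC_PAGE_FULL_RETAIN) follow mimalloc's usual derivation from the option name.

  #include <mimalloc.h>

  int main(void) {
    // use the renamed option identifiers from this patch
    mi_option_set(mi_option_page_full_retain, 4);     // retain up to 4 full pages per size class (default 2)
    mi_option_set(mi_option_page_max_candidates, 8);  // consider up to 8 candidate pages per allocation (default 4)
    void* p = mi_malloc(64);
    mi_free(p);
    return 0;
  }
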
diff --git a/src/init.c b/src/init.c index 16c1dea4..4631d9d9 100644 --- a/src/init.c +++ b/src/init.c @@ -254,8 +254,8 @@ static void mi_heap_main_init(void) { //heap_main.keys[0] = _mi_heap_random_next(&heap_main); //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); - heap_main.allow_page_abandon = (mi_option_get(mi_option_full_page_retain) >= 0); - heap_main.full_page_retain = mi_option_get_clamp(mi_option_full_page_retain, -1, 32); + heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); + heap_main.full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } } diff --git a/src/options.c b/src/options.c index b613f983..0d9bea28 100644 --- a/src/options.c +++ b/src/options.c @@ -170,8 +170,8 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free - { 2, UNINIT, MI_OPTION(full_page_retain) }, - { 4, UNINIT, MI_OPTION(max_page_candidates) }, + { 2, UNINIT, MI_OPTION(page_full_retain) }, + { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? diff --git a/src/page.c b/src/page.c index 542496a0..474d8d2d 100644 --- a/src/page.c +++ b/src/page.c @@ -721,7 +721,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // we prefer non-expandable pages with high usage as candidates (to reduce commit, and increase chances of free-ing up pages) if (page_candidate == NULL) { page_candidate = page; - candidate_limit = _mi_option_get_fast(mi_option_max_page_candidates); + candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates); } else if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); From 8259c0eb7ca96787a50bcbad24d28b5bb2407acd Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 20:10:44 -0800 Subject: [PATCH 156/264] nice colors for heap maps --- include/mimalloc.h | 2 +- src/arena.c | 119 ++++++++++++++++++++++++++++++++------------- src/libc.c | 18 ++++++- src/options.c | 4 +- test/test-stress.c | 14 +++--- 5 files changed, 111 insertions(+), 46 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 6432e41a..dacc647e 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef void* mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index a5b83bf5..083fc35b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -720,9 +720,9 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou } // 2. 
find a free block, potentially allocating a new arena + const long commit_on_demand = mi_option_get(mi_option_page_commit_on_demand); const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) || // always commit small pages - _mi_os_has_overcommit() || // no need to commit on demand on an OS that already does this for us - !mi_option_is_enabled(mi_option_page_commit_on_demand)); + (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 1)); page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, commit); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); @@ -824,7 +824,7 @@ void _mi_arenas_page_free(mi_page_t* page) { // if committed on-demand, set the commit bits to account commit properly mi_assert_internal(mi_memid_size(page->memid) >= page->slice_committed); const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE; // conservative - mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); + //mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices); if (total_slices > 0) { mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); @@ -1262,56 +1262,106 @@ int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noe /* ----------------------------------------------------------- Debugging ----------------------------------------------------------- */ -static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { +static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf, size_t* k) { size_t bit_set_count = 0; for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); if (is_set) bit_set_count++; - buf[bit] = (is_set ? 'x' : '.'); + buf[*k++] = (is_set ? 
'x' : '.'); } return bit_set_count; } -static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, mi_arena_t* arena, size_t slice_index) { +typedef enum mi_ansi_color_e { + MI_BLACK = 30, + MI_MAROON, + MI_DARKGREEN, + MI_ORANGE, + MI_NAVY, + MI_PURPLE, + MI_TEAL, + MI_GRAY, + MI_DARKGRAY = 90, + MI_RED, + MI_GREEN, + MI_YELLOW, + MI_BLUE, + MI_MAGENTA, + MI_CYAN, + MI_WHITE +} mi_ansi_color_t; + +static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) { + buf[*k] = '\x1b'; + buf[*k+1] = '['; + buf[*k+2] = (char)(((int)color / 10) + '0'); + buf[*k+3] = (char)(((int)color % 10) + '0'); + buf[*k+4] = 'm'; + *k += 5; +} + +static int mi_page_commit_usage(mi_page_t* page) { + if (mi_page_size(page) <= MI_PAGE_MIN_COMMIT_SIZE) return 100; + const size_t committed_size = mi_page_committed(page); + const size_t used_size = page->used * mi_page_block_size(page); + return (int)(used_size * 100 / committed_size); +} + +static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, mi_arena_t* arena, size_t slice_index) { size_t bit_set_count = 0; long bit_of_page = 0; + mi_ansi_color_t color = MI_GRAY; + mi_ansi_color_t prev_color = MI_GRAY; for (int bit = 0; bit < MI_BFIELD_BITS; bit++, bit_of_page--) { bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); void* start = mi_arena_slice_start(arena, slice_index + bit); + char c = ' '; if (is_set) { mi_assert_internal(bit_of_page <= 0); bit_set_count++; mi_page_t* page = (mi_page_t*)start; - char c = 'p'; + c = 'p'; + color = MI_GRAY; if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } else if (mi_page_is_abandoned(page)) { c = (mi_page_is_singleton(page) ? 's' : 'f'); } + int commit_usage = mi_page_commit_usage(page); + if (commit_usage < 25) { color = MI_MAROON; } + else if (commit_usage < 50) { color = MI_ORANGE; } + else if (commit_usage < 75) { color = MI_TEAL; } + else color = MI_DARKGREEN; bit_of_page = (long)page->memid.mem.arena.slice_count; - buf[bit] = c; } else { - char c = '?'; + c = '?'; if (bit_of_page > 0) { c = '-'; } - else if (_mi_meta_is_meta_page(start)) { c = 'm'; } - else if (slice_index + bit < arena->info_slices) { c = 'i'; } + else if (_mi_meta_is_meta_page(start)) { c = 'm'; color = MI_GRAY; } + else if (slice_index + bit < arena->info_slices) { c = 'i'; color = MI_GRAY; } // else if (mi_bitmap_is_setN(arena->pages_purge, slice_index + bit, NULL)) { c = '*'; } else if (mi_bitmap_is_set(arena->slices_free, slice_index+bit)) { - if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; } - else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; } - else { c = '.'; } + if (mi_bitmap_is_set(arena->slices_purge, slice_index + bit)) { c = '~'; color = MI_ORANGE; } + else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; color = MI_GRAY; } + else { c = '.'; color = MI_GRAY; } } - if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; } - buf[bit] = c; + if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; } } + if (color != prev_color) { + mi_debug_color(buf, k, color); + prev_color = color; + } + buf[*k] = c; *k += 1; } + mi_debug_color(buf, k, MI_GRAY); return bit_set_count; } +#define MI_FIELDS_PER_LINE (4) + static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { - _mi_output_message("%s:\n", header); + _mi_output_message("\x1B[37m%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header); 
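  // Reference for the escape codes above (not part of this hunk): mi_debug_color and the
  // header string emit plain ANSI SGR sequences, ESC '[' <n> 'm', where <n> is one of the
  // mi_ansi_color_t values (30..37 normal, 90..97 bright foreground) and 0 resets, so the
  // colored heap map only renders as intended on an ANSI-capable terminal.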
size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { - char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); + char buf[10*MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf)); size_t k = 0; mi_bchunk_t* chunk = &bitmap->chunks[i]; @@ -1320,17 +1370,18 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { - if (j > 0 && (j % 4) == 0) { - buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5; + if (j > 0 && (j % MI_FIELDS_PER_LINE) == 0) { + _mi_output_message(" %s\n\x1B[37m", buf); + _mi_memzero(buf, sizeof(buf)); + k = 0; buf[k++] = ' '; buf[k++] = ' '; buf[k++] = ' '; } if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; - size_t xcount = (arena!=NULL ? mi_debug_show_page_bfield(bfield, buf + k, arena, bit_count) - : mi_debug_show_bfield(bfield, buf + k)); + size_t xcount = (arena!=NULL ? mi_debug_show_page_bfield(bfield, buf, &k, arena, bit_count) + : mi_debug_show_bfield(bfield, buf, &k)); if (invert) xcount = MI_BFIELD_BITS - xcount; bit_set_count += xcount; - k += MI_BFIELD_BITS; buf[k++] = ' '; } else { @@ -1339,16 +1390,16 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi } bit_count += MI_BFIELD_BITS; } - _mi_output_message(" %s\n", buf); + _mi_output_message(" %s\n\x1B[37m", buf); } - _mi_output_message(" total ('x'): %zu\n", bit_set_count); + _mi_output_message("\x1B[0m total ('x'): %zu\n", bit_set_count); return bit_set_count; } -void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_attr_noexcept { +void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { mi_subproc_t* subproc = _mi_subproc(); size_t max_arenas = mi_arenas_get_count(subproc); - size_t free_total = 0; + //size_t free_total = 0; size_t slice_total = 0; //size_t abandoned_total = 0; size_t page_total = 0; @@ -1358,12 +1409,12 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) mi_assert(arena->subproc == subproc); slice_total += arena->slice_count; _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? 
", pinned" : ""), arena->subproc); - if (show_inuse) { - free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); - } - if (show_committed) { - mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); - } + //if (show_inuse) { + // free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); + //} + //if (show_committed) { + // mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false, NULL); + //} // todo: abandoned slices //if (show_purge) { // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); @@ -1372,7 +1423,7 @@ void mi_debug_show_arenas(bool show_pages, bool show_inuse, bool show_committed) page_total += mi_debug_show_bitmap("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, false, arena); } } - if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); + // if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); // if (show_abandoned) _mi_verbose_message("total abandoned slices: %zu\n", abandoned_total); if (show_pages) _mi_output_message("total pages in arenas: %zu\n", page_total); } diff --git a/src/libc.c b/src/libc.c index 0ec2164d..a0eeca17 100644 --- a/src/libc.c +++ b/src/libc.c @@ -171,7 +171,18 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { char c; MI_NEXTC(); if (c != '%') { - if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t') { // output visible ascii or standard control only + if (c == '\\') { + MI_NEXTC(); + switch (c) { + case 'e': mi_outc('\x1B', &out, end); break; + case 't': mi_outc('\t', &out, end); break; + case 'n': mi_outc('\n', &out, end); break; + case 'r': mi_outc('\r', &out, end); break; + case '\\': mi_outc('\\', &out, end); break; + default: /* ignore */ break; + } + } + else if ((c >= ' ' && c <= '~') || c=='\n' || c=='\r' || c=='\t' || c=='\x1b') { // output visible ascii or standard control only mi_outc(c, &out, end); } } @@ -199,7 +210,10 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { } char* start = out; - if (c == 's') { + if (c == '%') { + mi_outc('%', &out, end); + } + else if (c == 's') { // string const char* s = va_arg(args, const char*); mi_outs(s, &out, end); diff --git a/src/options.c b/src/options.c index 0d9bea28..0d51cc00 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 1000,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -175,7 +175,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 1, UNINIT, MI_OPTION(page_commit_on_demand) }, + { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/test/test-stress.c b/test/test-stress.c index bbcded65..527d6dce 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -260,9 +260,9 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); - mi_debug_show_arenas(true, false, false); + mi_debug_show_arenas(true); //mi_collect(true); - //mi_debug_show_arenas(true, false, false); + //mi_debug_show_arenas(true); } #endif } @@ -346,13 +346,13 @@ int main(int argc, char** argv) { #ifndef USE_STD_MALLOC #ifndef NDEBUG - //mi_debug_show_arenas(true, true, false); - mi_debug_show_arenas(true, false, false); - mi_collect(true); - mi_debug_show_arenas(true,false,false); + //mi_debug_show_arenas(true); + mi_debug_show_arenas(true); + //mi_collect(true); + //mi_debug_show_arenas(true); #else //mi_collect(true); - mi_debug_show_arenas(true,false,false); + mi_debug_show_arenas(true); mi_stats_print(NULL); #endif #else From 24b8384f80b62d4382b504c80c9b62a5fa9b91cf Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 20:23:37 -0800 Subject: [PATCH 157/264] remove is_expandable requirement on page candidates --- src/options.c | 2 +- src/page.c | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/options.c b/src/options.c index 0d51cc00..0a9a5f92 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 1000,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose diff --git a/src/page.c b/src/page.c index 474d8d2d..2f0ec406 100644 --- a/src/page.c +++ b/src/page.c @@ -728,7 +728,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page_candidate = page; } // prefer to reuse fuller pages (in the hope the less used page gets freed) - else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page) && !mi_page_is_expandable(page)) { + else if (page->used >= page_candidate->used && !mi_page_is_mostly_used(page)) { // && !mi_page_is_expandable(page)) { page_candidate = page; } // if we find a non-expandable candidate, or searched for N pages, return with the best candidate From 5a663da9aaca48e90ced03d832f387ad42e976bc Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 24 Dec 2024 20:38:36 -0800 Subject: [PATCH 158/264] fix build warning --- src/arena.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/arena.c b/src/arena.c index 083fc35b..bbc0907e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1400,14 +1400,14 @@ void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { mi_subproc_t* subproc = _mi_subproc(); size_t max_arenas = mi_arenas_get_count(subproc); //size_t free_total = 0; - size_t slice_total = 0; + //size_t slice_total = 0; //size_t abandoned_total = 0; size_t page_total = 0; for (size_t i = 0; i < max_arenas; i++) { mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &subproc->arenas[i]); if (arena == NULL) break; mi_assert(arena->subproc == subproc); - slice_total += arena->slice_count; + // slice_total += arena->slice_count; _mi_output_message("arena %zu at %p: %zu slices (%zu MiB)%s, subproc: %p\n", i, arena, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : ""), arena->subproc); //if (show_inuse) { // free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true, NULL); From ce7eb4db7a746aba77a35fa332d9b01f23430b9b Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 10:49:49 -0800 Subject: [PATCH 159/264] fix page commit-on-demand setting --- src/arena.c | 42 ++++++++++++++++++++---------------------- 1 file changed, 20 insertions(+), 22 deletions(-) diff --git a/src/arena.c b/src/arena.c index bbc0907e..bd1c3e70 100644 --- a/src/arena.c +++ b/src/arena.c @@ -213,10 +213,6 @@ static mi_decl_noinline void* mi_arena_try_alloc_at( // now actually commit bool commit_zero = false; if (!_mi_os_commit_ex(p, mi_size_of_slices(slice_count), &commit_zero, mi_size_of_slices(slice_count - already_committed_count))) { - // failed to commit (todo: give warning?) - if (already_committed_count > 0) { - mi_subproc_stat_increase(arena->subproc, committed, mi_size_of_slices(already_committed_count)); - } memid->initially_committed = false; } else { @@ -308,7 +304,9 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_ // on an OS with overcommit (Linux) we don't count the commit yet as it is on-demand. Once a slice // is actually allocated for the first time it will be counted. 
const bool adjust = (overcommit && arena_commit); - if (adjust) { mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true /* on alloc */); } + if (adjust) { + mi_subproc_stat_adjust_decrease( subproc, committed, arena_reserve, true /* on alloc */); + } // and try to reserve the arena int err = mi_reserve_os_memory_ex2(subproc, arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id); if (err != 0) { @@ -562,7 +560,7 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); @@ -722,7 +720,7 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou // 2. find a free block, potentially allocating a new arena const long commit_on_demand = mi_option_get(mi_option_page_commit_on_demand); const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) || // always commit small pages - (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 1)); + (commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 0)); page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, commit); if (page != NULL) { mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count); @@ -798,7 +796,7 @@ void _mi_arenas_page_free(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_clearN(arena->pages_abandoned[bin], slice_index, 1)); mi_assert_internal(mi_bitmap_is_setN(page->memid.mem.arena.arena->pages, page->memid.mem.arena.slice_index, 1)); // note: we cannot check for `!mi_page_is_abandoned_and_mapped` since that may @@ -857,7 +855,7 @@ void _mi_arenas_page_abandon(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(!mi_page_is_singleton(page)); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, slice_count)); - mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); mi_assert_internal(mi_bitmap_is_setN(arena->slices_dirty, slice_index, slice_count)); mi_page_set_abandoned_mapped(page); @@ -922,7 +920,7 @@ void _mi_arenas_page_unabandon(mi_page_t* page) { mi_arena_t* arena = mi_page_arena(page, &slice_index, &slice_count); mi_assert_internal(mi_bitmap_is_clearN(arena->slices_free, slice_index, 
slice_count)); - mi_assert_internal(mi_option_is_enabled(mi_option_page_commit_on_demand) || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); + mi_assert_internal(page->slice_committed > 0 || mi_bitmap_is_setN(arena->slices_committed, slice_index, slice_count)); // this busy waits until a concurrent reader (from alloc_abandoned) is done mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); @@ -1161,9 +1159,9 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s mi_arena_t* arena = (mi_arena_t*)start; - // commit & zero if needed + // commit & zero if needed if (!memid.initially_committed) { - // leave a guard OS page decommitted at the end + // leave a guard OS page decommitted at the end _mi_os_commit(arena, mi_size_of_slices(info_slices) - _mi_os_secure_guard_page_size(), NULL); } else { @@ -1180,7 +1178,7 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s arena->is_exclusive = exclusive; arena->slice_count = slice_count; arena->info_slices = info_slices; - arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) arena->purge_expire = 0; // mi_lock_init(&arena->abandoned_visit_lock); @@ -1292,7 +1290,7 @@ typedef enum mi_ansi_color_e { } mi_ansi_color_t; static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) { - buf[*k] = '\x1b'; + buf[*k] = '\x1b'; buf[*k+1] = '['; buf[*k+2] = (char)(((int)color / 10) + '0'); buf[*k+3] = (char)(((int)color % 10) + '0'); @@ -1342,7 +1340,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, else if (mi_bitmap_is_setN(arena->slices_committed, slice_index + bit, 1)) { c = '_'; color = MI_GRAY; } else { c = '.'; color = MI_GRAY; } } - if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; } + if (bit==MI_BFIELD_BITS-1 && bit_of_page > 1) { c = '>'; } } if (color != prev_color) { mi_debug_color(buf, k, color); @@ -1357,7 +1355,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, #define MI_FIELDS_PER_LINE (4) static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { - _mi_output_message("\x1B[37m%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header); + _mi_output_message("\x1B[37m%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header); size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) { @@ -1506,7 +1504,7 @@ int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserv static long mi_arena_purge_delay(void) { // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay - return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); } // reset or decommit in an arena and update the commit bitmap @@ -1533,7 +1531,7 @@ static bool mi_arena_purge(mi_arena_t* arena, size_t slice_index, size_t slice_c else if (!all_committed) { // we cannot assume any of these are committed any longer (even with reset since we did setN and may have marked uncommitted slices as committed) mi_bitmap_clearN(arena->slices_committed, slice_index, 
slice_count); - // we adjust the commit count as parts will be re-committed + // we adjust the commit count as parts will be re-committed // mi_os_stat_decrease(committed, mi_size_of_slices(already_committed)); } @@ -1621,7 +1619,7 @@ static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force) mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); if (!force && (expire == 0 || expire > now)) return false; - // reset expire + // reset expire mi_atomic_store_release(&arena->purge_expire, (mi_msecs_t)0); mi_subproc_stat_counter_increase(arena->subproc, arena_purges, 1); @@ -1696,8 +1694,8 @@ static bool abandoned_page_visit(mi_page_t* page, mi_abandoned_page_visit_info_t if (page->heap_tag != vinfo->heap_tag) { return true; } // continue mi_heap_area_t area; _mi_heap_area_init(&area, page); - if (!vinfo->visitor(NULL, &area, NULL, area.block_size, vinfo->arg)) { - return false; + if (!vinfo->visitor(NULL, &area, NULL, area.block_size, vinfo->arg)) { + return false; } if (vinfo->visit_blocks) { return _mi_heap_area_visit_blocks(&area, page, vinfo->visitor, vinfo->arg); @@ -1712,7 +1710,7 @@ static bool abandoned_page_visit_at(size_t slice_index, size_t slice_count, mi_a mi_abandoned_page_visit_info_t* vinfo = (mi_abandoned_page_visit_info_t*)arg; mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, slice_index); mi_assert_internal(mi_page_is_abandoned_mapped(page)); - return abandoned_page_visit(page, vinfo); + return abandoned_page_visit(page, vinfo); } // Visit all abandoned pages in this subproc. From 15061be4b2fec43ed8bfaa807fd98624366d04e6 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 10:50:49 -0800 Subject: [PATCH 160/264] commit page-map within one allocation --- src/page-map.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-) diff --git a/src/page-map.c b/src/page-map.c index db14265b..a917175a 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -160,6 +160,7 @@ mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_att #else // A 2-level page map +#define MI_PAGE_MAP_SUB_SIZE (MI_PAGE_MAP_SUB_COUNT * sizeof(mi_page_t*)) mi_decl_cache_align mi_page_t*** _mi_page_map; static void* mi_page_map_max_address; @@ -167,6 +168,7 @@ static mi_memid_t mi_page_map_memid; static _Atomic(mi_bfield_t) mi_page_map_commit; +static mi_page_t** mi_page_map_ensure_committed(size_t idx); static mi_page_t** mi_page_map_ensure_at(size_t idx); static inline void mi_page_map_set_range(mi_page_t* page, size_t idx, size_t sub_idx, size_t slice_count); @@ -200,16 +202,17 @@ bool _mi_page_map_init(void) { } mi_atomic_store_release(&mi_page_map_commit, (commit ? 
~MI_ZU(0) : MI_ZU(0))); - // commit the first part so NULL pointers get resolved without an access violation - mi_page_map_ensure_at(0); - - // note: for the NULL range we only commit one OS page - // mi_page_map_set_range(NULL, 0, 0, 1); - _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); + // note: for the NULL range we only commit one OS page (in the map and sub) if (!mi_page_map_memid.initially_committed) { - _mi_os_commit(_mi_page_map[0], os_page_size, NULL); + _mi_os_commit(&_mi_page_map[0], os_page_size, NULL); // commit first part of the map + } + _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved 2 subs at the end already + if (!mi_page_map_memid.initially_committed) { + _mi_os_commit(_mi_page_map[0], os_page_size, NULL); // only first OS page + } + if (!mi_page_map_memid.initially_zero) { + _mi_page_map[0][0] = NULL; } - _mi_page_map[0][0] = NULL; mi_assert_internal(_mi_ptr_page(NULL)==NULL); return true; From 7ae726bb390fe40aed6da0791d3934f59712beb9 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 13:30:42 -0800 Subject: [PATCH 161/264] small fixes --- include/mimalloc/types.h | 4 ++-- src/arena.c | 9 +++++++-- src/options.c | 2 +- 3 files changed, 10 insertions(+), 5 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 06db5639..8b72140a 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -139,8 +139,8 @@ terms of the MIT license. A copy of the license can be found in the file // We never allocate more than PTRDIFF_MAX (see also ) #define MI_MAX_ALLOC_SIZE PTRDIFF_MAX -// Minimal commit for a page on-demand commit (should be >= OS page size, and >= MI_ARENA_SLICE_SIZE for correct stats) -#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE +// Minimal commit for a page on-demand commit (should be >= OS page size) +#define MI_PAGE_MIN_COMMIT_SIZE MI_ARENA_SLICE_SIZE // (4*MI_KiB) // ------------------------------------------------------ // Arena's are large reserved areas of memory allocated from diff --git a/src/arena.c b/src/arena.c index 5cdf0d22..c8d4c9cd 100644 --- a/src/arena.c +++ b/src/arena.c @@ -832,10 +832,15 @@ void _mi_arenas_page_free(mi_page_t* page) { const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE; // conservative //mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices); - mi_assert_internal(total_slices > 0); if (total_slices > 0) { mi_bitmap_setN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices, NULL); } + // any left over? 
+ const size_t extra = page->slice_committed % MI_ARENA_SLICE_SIZE; + if (extra > 0) { + // pretend it was decommitted already + mi_os_stat_decrease(committed, extra); + } } else { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, page->memid.mem.arena.slice_index, page->memid.mem.arena.slice_count)); @@ -1308,7 +1313,7 @@ static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) { } static int mi_page_commit_usage(mi_page_t* page) { - if (mi_page_size(page) <= MI_PAGE_MIN_COMMIT_SIZE) return 100; + // if (mi_page_size(page) <= MI_PAGE_MIN_COMMIT_SIZE) return 100; const size_t committed_size = mi_page_committed(page); const size_t used_size = page->used * mi_page_block_size(page); return (int)(used_size * 100 / committed_size); diff --git a/src/options.c b/src/options.c index 0a9a5f92..13174798 100644 --- a/src/options.c +++ b/src/options.c @@ -430,7 +430,7 @@ void _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* me // Define our own limited `fprintf` that avoids memory allocation. // We do this using `_mi_vsnprintf` with a limited buffer. static void mi_vfprintf( mi_output_fun* out, void* arg, const char* prefix, const char* fmt, va_list args ) { - char buf[768]; + char buf[992]; if (fmt==NULL) return; if (!mi_recurse_enter()) return; _mi_vsnprintf(buf, sizeof(buf)-1, fmt, args); From 5f13941c1859c3a08d5a5b321c7f99481ca66dae Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 14:12:45 -0800 Subject: [PATCH 162/264] fix constructor re-initialization on subproc_main --- src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index 4631d9d9..4feee790 100644 --- a/src/init.c +++ b/src/init.c @@ -97,7 +97,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -static mi_decl_cache_align mi_subproc_t subproc_main; +static mi_decl_cache_align mi_subproc_t subproc_main = { }; // note: empty initializer to prevent running the constructor (in C++ compilation) static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id From efe10513ec056d9e81f713e6c441376dcd2bbf43 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Wed, 25 Dec 2024 14:40:32 -0800 Subject: [PATCH 163/264] fix initializer warning on clang-18 --- src/init.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index 4feee790..81aca206 100644 --- a/src/init.c +++ b/src/init.c @@ -97,7 +97,7 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -static mi_decl_cache_align mi_subproc_t subproc_main = { }; // note: empty initializer to prevent running the constructor (in C++ compilation) +static mi_decl_cache_align mi_subproc_t subproc_main = { 0 }; // note: empty initializer to prevent running the constructor (in C++ compilation) static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id From 27e0c467aefb5b7591cb291e4456823966f58344 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 25 Dec 2024 14:56:11 -0800 Subject: [PATCH 164/264] fix c++ initializer warning --- src/init.c | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/init.c b/src/init.c index 81aca206..cc96e993 100644 --- a/src/init.c +++ b/src/init.c @@ -97,7 +97,12 @@ const mi_page_t _mi_page_empty = { // may lead to allocation itself on some platforms) // -------------------------------------------------------- -static 
mi_decl_cache_align mi_subproc_t subproc_main = { 0 }; // note: empty initializer to prevent running the constructor (in C++ compilation) +static mi_decl_cache_align mi_subproc_t subproc_main +#if __cplusplus += { }; // empty initializer to prevent running the constructor (with msvc) +#else += { 0 }; // C zero initialize +#endif static mi_decl_cache_align mi_tld_t tld_empty = { 0, // thread_id From f72ac7a5aa85eb95f5e29a410c43a52543cfd444 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 26 Dec 2024 10:28:36 -0800 Subject: [PATCH 165/264] add attr_noexept for better codegen on msvc --- ide/vs2022/mimalloc-test-stress.vcxproj | 4 ++-- src/free.c | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index cb761f94..9568b2d3 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea6} + + {abb5eae7-b3e6-432e-b636-333449892ea7} diff --git a/src/free.c b/src/free.c index 4d72cc7a..7467adc1 100644 --- a/src/free.c +++ b/src/free.c @@ -48,10 +48,10 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } // Forward declaration for multi-threaded collect -static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page); +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept; // Free a block multi-threaded -static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) +static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_noexcept { // adjust stats (after padding check and potentially recursive `mi_free` above) mi_stat_free(page, block); // stat_free may access the padding @@ -195,7 +195,7 @@ void mi_free(void* p) mi_attr_noexcept // ------------------------------------------------------ -static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) { +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); From 0a7fd7eb6fa030c77ac98d6327c323b4409608f2 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 26 Dec 2024 10:42:24 -0800 Subject: [PATCH 166/264] use fixed tls on windows with static linking --- ide/vs2022/mimalloc-lib.vcxproj | 2 +- ide/vs2022/mimalloc-test-stress.vcxproj | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index c82dbec7..a0c8101b 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -299,7 +299,7 @@ true Default ../../include - %(PreprocessorDefinitions);NDEBUG + %(PreprocessorDefinitions);NDEBUG;MI_WIN_USE_FIXED_TLS=1 AssemblyAndSourceCode $(IntDir) false diff --git a/ide/vs2022/mimalloc-test-stress.vcxproj b/ide/vs2022/mimalloc-test-stress.vcxproj index 9568b2d3..cb761f94 100644 --- a/ide/vs2022/mimalloc-test-stress.vcxproj +++ b/ide/vs2022/mimalloc-test-stress.vcxproj @@ -279,8 +279,8 @@ - - {abb5eae7-b3e6-432e-b636-333449892ea7} + + {abb5eae7-b3e6-432e-b636-333449892ea6} From e359e9b12ba39c885e122acd6177bcf5b2cb77ed Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 26 Dec 2024 10:43:10 -0800 Subject: [PATCH 167/264] merge from dev3 --- ide/vs2022/mimalloc-lib.vcxproj | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index a0c8101b..c82dbec7 100644 --- 
a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -299,7 +299,7 @@ true Default ../../include - %(PreprocessorDefinitions);NDEBUG;MI_WIN_USE_FIXED_TLS=1 + %(PreprocessorDefinitions);NDEBUG AssemblyAndSourceCode $(IntDir) false From 8a4c26377f128dd3010f94076c4ab819f1076c8b Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 26 Dec 2024 23:12:03 -0800 Subject: [PATCH 168/264] add neon code for bit clear --- include/mimalloc/bits.h | 2 ++ src/bitmap.c | 23 ++++++++++++++++++++++- 2 files changed, 24 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 875f6230..5b847f4b 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -92,6 +92,8 @@ typedef int32_t mi_ssize_t; #if MI_ARCH_X64 && defined(__AVX2__) #include +#elif MI_ARCH_ARM64 && MI_OPT_SIMD +#include #endif #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) #include diff --git a/src/bitmap.c b/src/bitmap.c index e4a4cc2d..15ae66a0 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -573,6 +573,27 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } + #elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64 + while(true) { + // a cache line is 64b so we can just as well load all at the same time (?) + const uint64x2_t vzero1_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields)); // 2x64 bit is_zero + const uint64x2_t vzero1_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 2)); // 2x64 bit is_zero + const uint64x2_t vzero2_lo = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 4)); // 2x64 bit is_zero + const uint64x2_t vzero2_hi = vceqzq_u64(vld1q_u64((uint64_t*)chunk->bfields + 6)); // 2x64 bit is_zero + const uint32x4_t vzero1 = vuzp1q_u32(vreinterpretq_u32_u64(vzero1_lo),vreinterpretq_u32_u64(vzero1_hi)); // unzip even elements: narrow to 4x32 bit is_zero () + const uint32x4_t vzero2 = vuzp1q_u32(vreinterpretq_u32_u64(vzero2_lo),vreinterpretq_u32_u64(vzero2_hi)); // unzip even elements: narrow to 4x32 bit is_zero () + const uint32x4_t vzero1x = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero1), 24)); // shift-right 2x32bit elem by 24: lo 16 bits contain the 2 lo bytes + const uint32x4_t vzero2x = vreinterpretq_u32_u64(vshrq_n_u64(vreinterpretq_u64_u32(vzero2), 24)); + const uint16x8_t vzero12 = vreinterpretq_u16_u32(vuzp1q_u32(vzero1x,vzero2x)); // unzip even 32-bit elements into one vector + const uint8x8_t vzero = vmovn_u32(vzero12); // narrow the bottom 16-bits + const uint64_t mask = ~vget_lane_u64(vreinterpret_u64_u8(vzero), 0); // 1 byte for each bfield (0xFF => bfield has a bit set) + if (mask==0) return false; + mi_assert_internal((mi_ctz(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
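// An illustrative scalar sketch (not code from this patch) of what the NEON
// reduction above computes, assuming MI_BCHUNK_BITS==512 so a chunk holds eight
// 64-bit bfields, and assuming <stdint.h>: `mask` gets one byte per bfield,
// 0xFF when that bfield has any bit set, hence mi_ctz(mask)/8 below is the index
// of the first candidate bfield. The function name is hypothetical.
static inline uint64_t mi_chunk_nonzero_bytemask_sketch(const uint64_t bfields[8]) {
  uint64_t mask = 0;
  for (int i = 0; i < 8; i++) {
    if (bfields[i] != 0) { mask |= ((uint64_t)0xFF << (8*i)); }   // mark non-empty bfield i
  }
  return mask;  // 0 means every bfield in the chunk is clear
}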
+ const size_t chunk_idx = mi_ctz(mask) / 8; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + // try again + // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded + } #else // try first to find a field that is not all set (to reduce fragmentation) for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { @@ -590,7 +611,7 @@ static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, return mi_bchunk_try_find_and_clear(chunk, pidx); } -#if !MI_OPT_SIMD +#if !(MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)) static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); if (!allow_all_set && (~b == 0)) return false; From dddcd5de16f0eb61e9ecd6f0a13e0695ddcad257 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 26 Dec 2024 23:49:38 -0800 Subject: [PATCH 169/264] add neon version for chunk_is_clear --- src/bitmap.c | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 15ae66a0..03e21c89 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -868,6 +868,13 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { const __m256i vec1 = _mm256_load_si256((const __m256i*)chunk->bfields); const __m256i vec2 = _mm256_load_si256(((const __m256i*)chunk->bfields)+1); return (mi_mm256_is_zero(_mm256_or_si256(vec1,vec2))); + #elif MI_OPT_SIMD && (MI_BCHUNK_BITS==512) && MI_ARCH_ARM64 + const uint64x2_t v0 = vld1q_u64((uint64_t*)chunk->bfields); + const uint64x2_t v1 = vld1q_u64((uint64_t*)chunk->bfields + 2); + const uint64x2_t v2 = vld1q_u64((uint64_t*)chunk->bfields + 4); + const uint64x2_t v3 = vld1q_u64((uint64_t*)chunk->bfields + 6); + const uint64x2_t v = vorrq_u64(vorrq_u64(v0,v1),vorrq_u64(v2,v3)); + return (vmaxvq_u32(vreinterpretq_u32_u64(v)) == 0); #else for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_atomic_load_relaxed(&chunk->bfields[i]) != 0) return false; @@ -876,7 +883,6 @@ static inline bool mi_bchunk_all_are_clear_relaxed(mi_bchunk_t* chunk) { #endif } - static bool mi_bchunk_bsr(mi_bchunk_t* chunk, size_t* pidx) { for (size_t i = MI_BCHUNK_FIELDS; i > 0; ) { i--; From 0d302cd1749ac8025893923b1c1d77f9246199e0 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 31 Dec 2024 15:11:09 -0800 Subject: [PATCH 170/264] add comments --- include/mimalloc/types.h | 27 ++++++++++++++------------- 1 file changed, 14 insertions(+), 13 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 8b72140a..c5029a14 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -100,9 +100,10 @@ terms of the MIT license. A copy of the license can be found in the file #endif -// ------------------------------------------------------ +// -------------------------------------------------------------- // Sizes of internal data-structures -// ------------------------------------------------------ +// (comments specify sizes on 64-bit, usually 32-bit is halved) +// -------------------------------------------------------------- // Sizes are for 64-bit #ifndef MI_ARENA_SLICE_SHIFT @@ -116,19 +117,19 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_BCHUNK_BITS_SHIFT (6 + MI_SIZE_SHIFT) // optimized for 512 bits per chunk (avx512) #endif -#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) -#define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) +#define MI_BCHUNK_BITS (1 << MI_BCHUNK_BITS_SHIFT) // sub-bitmaps are "bchunks" of 512 bits +#define MI_ARENA_SLICE_SIZE (MI_ZU(1) << MI_ARENA_SLICE_SHIFT) // arena's allocate in slices of 64 KiB #define MI_ARENA_SLICE_ALIGN (MI_ARENA_SLICE_SIZE) -#define MI_ARENA_MIN_OBJ_SLICES (1) -#define MI_ARENA_MAX_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) +#define MI_ARENA_MIN_OBJ_SLICES (1) +#define MI_ARENA_MAX_OBJ_SLICES (MI_BCHUNK_BITS) // 32 MiB (for now, cannot cross chunk boundaries) #define MI_ARENA_MIN_OBJ_SIZE (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE) #define MI_ARENA_MAX_OBJ_SIZE (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE) -#define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE -#define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bitmap) -#define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bitmap) +#define MI_SMALL_PAGE_SIZE MI_ARENA_MIN_OBJ_SIZE // 64 KiB +#define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bchunk bitmap) +#define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bchunk bitmap) // Maximum number of size classes. (spaced exponentially in 12.5% increments) #define MI_BIN_HUGE (73U) @@ -272,7 +273,7 @@ typedef uint8_t mi_heaptag_t; // // Notes: // - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). -// - If a page is not part of a heap it is called "abandoned" -- in +// - If a page is not part of a heap it is called "abandoned" (`heap==NULL`) -- in // that case the `xthreadid` is 0 or 1 (1 is for abandoned pages that // are in the abandoned page lists of an arena, these are called "mapped" abandoned pages). // - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` @@ -304,7 +305,7 @@ typedef struct mi_page_s { mi_heap_t* heap; // the heap owning this page (or NULL for abandoned pages) struct mi_page_s* next; // next page owned by the heap with the same `block_size` struct mi_page_s* prev; // previous page owned by the heap with the same `block_size` - size_t slice_committed; // committed size relative to the first arena slice of the page data + size_t slice_committed; // committed size relative to the first arena slice of the page data (or 0 if the page is fully committed already) mi_memid_t memid; // provenance of the page memory } mi_page_t; @@ -315,7 +316,7 @@ typedef struct mi_page_s { #define MI_PAGE_ALIGN MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map. 
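// A few illustrative sanity checks (not part of this patch) for the size
// relations documented above, assuming the 64-bit defaults (64 KiB slices,
// 512-bit bchunks) and C11's static_assert from <assert.h>:
static_assert((512*1024) / (64*1024) == 8,     "a 512 KiB medium page is 8 slices, i.e. one byte in a bchunk");
static_assert((4*1024*1024) / (64*1024) == 64, "a 4 MiB large page is 64 slices, i.e. one 64-bit word in a bchunk");
static_assert(512 * (64*1024) == 32*1024*1024, "one 512-bit bchunk covers 32 MiB of slices (the max arena object size)");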
#define MI_PAGE_MIN_START_BLOCK_ALIGN MI_MAX_ALIGN_SIZE // minimal block alignment for the first block in a page (16b) -#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks +#define MI_PAGE_MAX_START_BLOCK_ALIGN2 MI_KiB // maximal block alignment for "power of 2"-sized blocks (such that we guarantee natural alignment) #define MI_PAGE_MAX_OVERALLOC_ALIGN MI_ARENA_SLICE_SIZE // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation #if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8 @@ -328,7 +329,7 @@ typedef struct mi_page_s { // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with _mi_bin +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) From c507ee3d96a2146717d6ac5fe120d1dc2da545dd Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 11:42:28 -0800 Subject: [PATCH 171/264] make bitmap scan cross bfields for NX; disable the use of large object pages --- src/arena.c | 6 +- src/bitmap.c | 287 +++++++++++++++++++++++++++++---------------------- src/bitmap.h | 4 +- 3 files changed, 170 insertions(+), 127 deletions(-) diff --git a/src/arena.c b/src/arena.c index c8d4c9cd..11a4f82f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -773,9 +773,9 @@ mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t bloc else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } - else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); - } + //else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { + // page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + // } else { page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } diff --git a/src/bitmap.c b/src/bitmap.c index 03e21c89..5cecc606 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -130,6 +130,7 @@ static inline bool mi_bfield_atomic_set_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t } // Clear a mask set of bits atomically, and return true of the mask bits transitioned from all 1's to 0's +// `all_clear` is set to `true` if the new bfield became zero. static inline bool mi_bfield_atomic_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); @@ -155,6 +156,7 @@ static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_cle // Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0 // and false otherwise (leaving the bit field as is). +// `all_clear` is set to `true` if the new bfield became zero. 
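// A minimal C11 sketch of the try-clear-mask pattern documented above
// (illustration only; the real code below uses the mi_atomic_* wrappers rather
// than <stdatomic.h> directly, and the helper name here is hypothetical).
// Assumes <stdatomic.h>, <stdbool.h>, <stdint.h>.
static bool try_clear_mask_sketch(_Atomic(uint64_t)* b, uint64_t mask, bool* all_clear) {
  uint64_t old = atomic_load_explicit(b, memory_order_relaxed);
  do {
    if ((old & mask) != mask) return false;   // not all mask bits are set: fail, leave the field as-is
  } while (!atomic_compare_exchange_weak_explicit(b, &old, old & ~mask,
                                                  memory_order_acq_rel, memory_order_relaxed));
  if (all_clear != NULL) { *all_clear = ((old & ~mask) == 0); }
  return true;                                // the mask bits transitioned from all 1's to all 0's
}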
static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { mi_assert_internal(mask != 0); mi_bfield_t old = mi_atomic_load_relaxed(b); @@ -170,9 +172,9 @@ static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bf } -// Tries to set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0) +// Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. -// `all_clear` is set to true if the new bfield is zero (and false otherwise) +// `all_clear` is set to true if the new bfield became zero (and false otherwise) static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[i], idx); + const bool was_clear = mi_bfield_atomic_set(&chunk->bfields[i], idx); + if (already_set != NULL) { *already_set = (was_clear ? 0 : 1); } + return was_clear; } +// Set `0 < n <= MI_BFIELD_BITS`, and return true of the mask bits transitioned from all 0's to 1's. +// `already_set` contains the count of bits that were already set (used when committing ranges to account +// statistics correctly). +// Can cross over two bfields. static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { mi_assert_internal(cidx < MI_BCHUNK_BITS); + mi_assert_internal(n > 0 && n <= MI_BFIELD_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - const mi_bfield_t mask = mi_bfield_mask(n, idx); - return mi_bfield_atomic_set_mask(&chunk->bfields[i], mask, already_set); -} - -static inline bool mi_bchunk_setX(mi_bchunk_t* chunk, size_t cidx, size_t* already_set) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - mi_assert_internal((cidx%MI_BFIELD_BITS)==0); - const size_t i = cidx / MI_BFIELD_BITS; - return mi_bfield_atomic_setX(&chunk->bfields[i], already_set); + if mi_likely(idx + n <= MI_BFIELD_BITS) { + // within one field + return mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(n,idx), already_set); + } + else { + // spanning two fields + const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field + mi_assert_internal(m < n); + mi_assert_internal(i < MI_BCHUNK_FIELDS - 1); + size_t already_set1; + const bool all_set1 = mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &already_set1); + mi_assert_internal(n - m > 0); + mi_assert_internal(n - m < MI_BFIELD_BITS); + size_t already_set2; + const bool all_set2 = mi_bfield_atomic_set_mask(&chunk->bfields[i+1], mi_bfield_mask(n - m, 0), &already_set2); + if (already_set != NULL) { *already_set = already_set1 + already_set2; } + return (all_set1 && all_set2); + } } // Set a sequence of `n` bits within a chunk. 
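// An illustrative helper (not part of the patch) showing how the NX routines,
// such as mi_bchunk_setNX above, split an `n`-bit range starting at bit `idx`
// over two adjacent 64-bit bfields. `bfield_mask_sketch` is a hypothetical
// stand-in for mi_bfield_mask; assumes <stdint.h> and <stddef.h>.
static inline uint64_t bfield_mask_sketch(size_t bit_count, size_t shift) {
  // assumes 0 < bit_count <= 64 and shift + bit_count <= 64
  const uint64_t ones = (bit_count >= 64 ? ~(uint64_t)0 : (((uint64_t)1 << bit_count) - 1));
  return (ones << shift);
}
// For `n` bits at `idx` with idx + n > 64, the two partial masks are:
//   first field : bfield_mask_sketch(64 - idx, idx)       // the top (64 - idx) bits of field i
//   second field: bfield_mask_sketch(n - (64 - idx), 0)   // the remaining low bits of field i+1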
@@ -298,6 +319,7 @@ mi_decl_noinline static bool mi_bchunk_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, // next field field++; idx = 0; + mi_assert_internal(m <= n); n -= m; } if (palready_set!=NULL) { *palready_set = total_already_set; } @@ -307,13 +329,10 @@ mi_decl_noinline static bool mi_bchunk_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, static inline bool mi_bchunk_setN(mi_bchunk_t* chunk, size_t cidx, size_t n, size_t* already_set) { mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); - if (n==1) { - bool was_clear = mi_bchunk_set(chunk, cidx); - if (already_set != NULL) { *already_set = !was_clear; } - return was_clear; - } - if (n==MI_BFIELD_BITS) return mi_bchunk_setX(chunk, cidx, already_set); - if (n bfields[i], idx, all_clear); } -static inline bool mi_bchunk_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - const size_t i = cidx / MI_BFIELD_BITS; - const size_t idx = cidx % MI_BFIELD_BITS; - const mi_bfield_t mask = mi_bfield_mask(n, idx); - return mi_bfield_atomic_clear_mask(&chunk->bfields[i], mask, all_clear); -} - -static inline bool mi_bchunk_clearX(mi_bchunk_t* chunk, size_t cidx, bool* all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - mi_assert_internal((cidx%MI_BFIELD_BITS)==0); - const size_t i = cidx / MI_BFIELD_BITS; - return mi_bfield_atomic_clearX(&chunk->bfields[i], all_clear); -} - static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { mi_assert_internal(n>0 && n <= MI_BCHUNK_BITS); if (n==1) return mi_bchunk_clear(chunk, cidx, maybe_all_clear); - if (n==MI_BFIELD_BITS) return mi_bchunk_clearX(chunk, cidx, maybe_all_clear); - if (n bfields[i], idx); } - if mi_likely(n<=MI_BFIELD_BITS) { return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } + if (n==1) { return mi_bfield_atomic_is_xset(set, &chunk->bfields[i], idx); } + if (idx + n <= MI_BFIELD_BITS) { return mi_bfield_atomic_is_xset_mask(set, &chunk->bfields[i], mi_bfield_mask(n, idx)); } return mi_bchunk_is_xsetN_(set, chunk, i, idx, n); } // ------- mi_bchunk_try_clear --------------------------------------- +// Clear `0 < n <= MI_BITFIELD_BITS`. Can cross over a bfield boundary. static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { mi_assert_internal(cidx < MI_BCHUNK_BITS); mi_assert_internal(n <= MI_BFIELD_BITS); const size_t i = cidx / MI_BFIELD_BITS; const size_t idx = cidx % MI_BFIELD_BITS; - mi_assert_internal(idx + n <= MI_BFIELD_BITS); - const size_t mask = mi_bfield_mask(n, idx); - return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mask, pmaybe_all_clear); + if mi_likely(idx + n <= MI_BFIELD_BITS) { + // within one field + return mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mi_bfield_mask(n, idx), pmaybe_all_clear); + } + else { + // spanning two fields (todo: use double-word atomic ops?) 
+ const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field + mi_assert_internal(m < n); + mi_assert_internal(i < MI_BCHUNK_FIELDS - 1); + bool field1_is_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &field1_is_clear)) return false; + // try the second field as well + mi_assert_internal(n - m > 0); + mi_assert_internal(n - m < MI_BFIELD_BITS); + bool field2_is_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[i+1], mi_bfield_mask(n - m, 0), &field2_is_clear)) { + // we failed to clear the second field, restore the first one + mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), NULL); + return false; + } + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = field1_is_clear && field2_is_clear; } + return true; + } } +// Clear a full aligned bfield. static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { mi_assert_internal(cidx < MI_BCHUNK_BITS); mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); @@ -405,60 +432,51 @@ static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* p return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); } -// Try to atomically set/clear a sequence of `n` bits within a chunk. -// Returns true if all bits transitioned from 0 to 1 (or 1 to 0), +// Try to atomically clear a sequence of `n` bits within a chunk. +// Returns true if all bits transitioned from 1 to 0, // and false otherwise leaving all bit fields as is. -// Note: this is a hard one as we need to unwind partial atomic operations -// if we fail halfway.. +// Note: this is the complex one as we need to unwind partial atomic operations if we fail halfway.. +// `maybe_all_clear` is set to `true` if all the bfields involved become zero. mi_decl_noinline static bool mi_bchunk_try_clearN_(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* pmaybe_all_clear) { mi_assert_internal(cidx + n <= MI_BCHUNK_BITS); mi_assert_internal(n>0); + if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = true; } if (n==0) return true; - size_t start_idx = cidx % MI_BFIELD_BITS; - size_t start_field = cidx / MI_BFIELD_BITS; - size_t end_field = MI_BCHUNK_FIELDS; - mi_bfield_t mask_mid = 0; - mi_bfield_t mask_end = 0; - bool field_is_clear; - bool maybe_all_clear = true; - if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = false; } - + // first field + const size_t start_idx = cidx % MI_BFIELD_BITS; + const size_t start_field = cidx / MI_BFIELD_BITS; size_t field = start_field; - size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field + size_t m = MI_BFIELD_BITS - start_idx; // m are the bits to clear in this field if (m > n) { m = n; } mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); mi_assert_internal(start_field < MI_BCHUNK_FIELDS); const mi_bfield_t mask_start = mi_bfield_mask(m, start_idx); - if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &field_is_clear)) return false; - maybe_all_clear = maybe_all_clear && field_is_clear; + bool maybe_all_clear; + if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_start, &maybe_all_clear)) return false; // done? 
+ mi_assert_internal(m <= n); n -= m; - if (n==0) { - if (pmaybe_all_clear != NULL) { *pmaybe_all_clear = maybe_all_clear; } - return true; - } - - // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields - - // mid fields + + // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields + // mid fields? while (n >= MI_BFIELD_BITS) { field++; mi_assert_internal(field < MI_BCHUNK_FIELDS); - mask_mid = mi_bfield_all_set(); - if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_mid, &field_is_clear)) goto restore; + bool field_is_clear; + if (!mi_bfield_atomic_try_clearX(&chunk->bfields[field], &field_is_clear)) goto restore; maybe_all_clear = maybe_all_clear && field_is_clear; n -= MI_BFIELD_BITS; } - // last field + // last field? if (n > 0) { mi_assert_internal(n < MI_BFIELD_BITS); field++; mi_assert_internal(field < MI_BCHUNK_FIELDS); - end_field = field; - mask_end = mi_bfield_mask(n, 0); + const mi_bfield_t mask_end = mi_bfield_mask(n, 0); + bool field_is_clear; if (!mi_bfield_atomic_try_clear_mask(&chunk->bfields[field], mask_end, &field_is_clear)) goto restore; maybe_all_clear = maybe_all_clear && field_is_clear; } @@ -467,12 +485,16 @@ mi_decl_noinline static bool mi_bchunk_try_clearN_(mi_bchunk_t* chunk, size_t ci return true; restore: - // field is on the field that failed to set atomically; we need to restore all previous fields + // `field` is the index of the field that failed to set atomically; we need to restore all previous fields mi_assert_internal(field > start_field); while( field > start_field) { field--; - const size_t mask = (field == start_field ? mask_start : (field == end_field ? mask_end : mask_mid)); - mi_bfield_atomic_set_mask(&chunk->bfields[field], mask, NULL); + if (field == start_field) { + mi_bfield_atomic_set_mask(&chunk->bfields[field], mask_start, NULL); + } + else { + mi_bfield_atomic_setX(&chunk->bfields[field], NULL); // mid-field: set all bits again + } } return false; } @@ -480,8 +502,8 @@ restore: static inline bool mi_bchunk_try_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, bool* maybe_all_clear) { mi_assert_internal(n>0); - if (n==MI_BFIELD_BITS) return mi_bchunk_try_clearX(chunk, cidx, maybe_all_clear); - if (n MI_BFIELD_BITS) return false; const mi_bfield_t mask = mi_bfield_mask(n, 0); + // for all fields in the chunk for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t idx; + // is there a range inside the field? 
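// An illustrative single-word version (not the patch code) of the search loop
// that follows: find `n` consecutive 1-bits inside one 64-bit field. Assumes
// 0 < n < 64, GCC/Clang's __builtin_ctzll, and <stdint.h>/<stdbool.h>/<stddef.h>;
// the real code additionally handles the cross-over into the next bfield.
static inline bool find_ones_run_sketch(uint64_t b, size_t n, size_t* pidx) {
  const uint64_t mask = (((uint64_t)1 << n) - 1);
  while (b != 0) {
    const size_t idx = (size_t)__builtin_ctzll(b);                    // least set bit
    if (idx + n > 64) return false;                                   // no room left in this word
    if (((b >> idx) & mask) == mask) { *pidx = idx; return true; }    // found n consecutive 1's
    const size_t ones = (size_t)__builtin_ctzll(~(b >> idx));         // length of the 1-run at idx (< n)
    b &= ~((((uint64_t)1 << ones) - 1) << idx);                       // clear that run and keep searching
  }
  return false;
}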
while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - if (idx + n > MI_BFIELD_BITS) break; + if (idx + n > MI_BFIELD_BITS) break; // too short, maybe cross over, or continue with the next field const size_t bmask = mask<>idx == mask); - if ((b&bmask) == bmask) { // found a match + if ((b&bmask) == bmask) { // found a match with all bits set, try clearing atomically if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); @@ -753,7 +776,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, return true; } else { - // if failed to atomically commit, reload b and try again from this position + // if we failed to atomically commit, reload b and try again from the start b = mi_atomic_load_acquire(&chunk->bfields[i]); } } @@ -764,6 +787,25 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, b = b & ~mi_bfield_mask(ones, idx); // clear the ones } } + + // check if we can cross into the next bfield + if (i < MI_BCHUNK_FIELDS-1) { + const size_t post = mi_bfield_clz(~b); + if (post > 0) { + const size_t pre = mi_bfield_ctz(mi_atomic_load_relaxed(&chunk->bfields[i+1])); + if (post + pre <= n) { + // it fits -- try to claim it atomically + const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post); + if (mi_bchunk_try_clearNX(chunk, cidx, n, NULL)) { + // we cleared all atomically + *pidx = cidx; + mi_assert_internal(*pidx < MI_BCHUNK_BITS); + mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); + return true; + } + } + } + } } return false; } @@ -775,46 +817,47 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, size_t n, size_t* pidx) { if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - const size_t skip_count = n/MI_BFIELD_BITS; + // we first scan ahead to see if there is a range of `n` set bits, and only then try to clear atomically + mi_assert_internal(n>0); + const size_t skip_count = (n-1)/MI_BFIELD_BITS; size_t cidx; - for (size_t i = 0; i <= MI_BCHUNK_FIELDS - skip_count; i++) + for (size_t i = 0; i < MI_BCHUNK_FIELDS - skip_count; i++) { size_t m = n; // bits to go // first field mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t ones = mi_bfield_clz(~b); - cidx = i*MI_BFIELD_BITS + (MI_BFIELD_BITS - ones); // start index + cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - ones); // start index if (ones >= m) { // we found enough bits! m = 0; } else { m -= ones; - mi_assert_internal(m>0); - } - - // keep scanning further fields? - size_t j = 1; // field count from i - while (i+j < MI_BCHUNK_FIELDS) { - mi_assert_internal(m > 0); - b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); - ones = mi_bfield_ctz(~b); - if (ones >= m) { - // we found enough bits - m = 0; - break; - } - else if (ones == MI_BFIELD_BITS) { - // not enough yet, proceed to the next field - j++; - m -= MI_BFIELD_BITS; - } - else { - // the range was not enough, start from scratch - i = i + j - 1; // no need to re-scan previous fields, except the last one (with clz this time) - mi_assert_internal(m>0); - break; + + // keep scanning further fields? 
+ size_t j = 1; // field count from i + while (i+j < MI_BCHUNK_FIELDS) { + mi_assert_internal(m > 0); + b = mi_atomic_load_relaxed(&chunk->bfields[i+j]); + ones = mi_bfield_ctz(~b); + if (ones >= m) { + // we found enough bits + m = 0; + break; + } + else if (ones == MI_BFIELD_BITS) { + // not enough yet, proceed to the next field + j++; + m -= MI_BFIELD_BITS; + } + else { + // the range was not enough, start from scratch + i = i + j - 1; // no need to re-scan previous fields, except the last one (with clz this time) + mi_assert_internal(m>0); + break; + } } } @@ -838,9 +881,9 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, //static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { // if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages // if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages -// if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages -// if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk -// if (n < MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); +// // if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages +// if (n==0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk +// if (n<=MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); // return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); //} @@ -909,7 +952,7 @@ static void mi_bitmap_chunkmap_set_max(mi_bitmap_t* bitmap, size_t chunk_idx) { static void mi_bitmap_chunkmap_set(mi_bitmap_t* bitmap, size_t chunk_idx) { mi_assert(chunk_idx < mi_bitmap_chunk_count(bitmap)); - mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL); mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); } @@ -922,7 +965,7 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx) // .. but a concurrent set may have happened in between our all-clear test and the clearing of the // bit in the mask. We check again to catch this situation. 
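// The summary-bit protocol sketched standalone (illustrative only; the real
// chunk is a 512-bit bchunk, simplified here to one atomic word, and the name is
// hypothetical; assumes <stdatomic.h>, <stdint.h>, <stdbool.h>, <stddef.h>,
// chunk_idx < 64): clear the chunkmap bit first, then re-check the chunk and
// restore the bit if a concurrent set made the chunk non-empty again.
static bool chunkmap_try_clear_sketch(_Atomic(uint64_t)* chunkmap, _Atomic(uint64_t)* chunk, size_t chunk_idx) {
  if (atomic_load_explicit(chunk, memory_order_relaxed) != 0) return false;                  // chunk not empty
  atomic_fetch_and_explicit(chunkmap, ~((uint64_t)1 << chunk_idx), memory_order_acq_rel);    // clear summary bit
  if (atomic_load_explicit(chunk, memory_order_relaxed) != 0) {
    atomic_fetch_or_explicit(chunkmap, ((uint64_t)1 << chunk_idx), memory_order_acq_rel);    // raced: restore
    return false;
  }
  return true;
}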
if (!mi_bchunk_all_are_clear_relaxed(&bitmap->chunks[chunk_idx])) { - mi_bchunk_set(&bitmap->chunkmap, chunk_idx); + mi_bchunk_set(&bitmap->chunkmap, chunk_idx, NULL); return false; } mi_bitmap_chunkmap_set_max(bitmap, chunk_idx); @@ -1018,7 +1061,7 @@ bool mi_bitmap_set(mi_bitmap_t* bitmap, size_t idx) { const size_t chunk_idx = idx / MI_BCHUNK_BITS; const size_t cidx = idx % MI_BCHUNK_BITS; mi_assert_internal(chunk_idx < mi_bitmap_chunk_count(bitmap)); - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx, NULL); mi_bitmap_chunkmap_set(bitmap, chunk_idx); // set afterwards return wasclear; } @@ -1235,9 +1278,9 @@ bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pid return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, 8, pidx, &mi_bchunk_try_find_and_clear_8); } -bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { - return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); -} +//bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx) { +// return mi_bitmap_try_find_and_clear_generic(bitmap, tseq, MI_BFIELD_BITS, pidx, &mi_bchunk_try_find_and_clear_X); +//} bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t tseq, size_t n, size_t* pidx) { mi_assert_internal(n<=MI_BFIELD_BITS); @@ -1279,7 +1322,7 @@ static bool mi_bitmap_try_find_and_claim_visit(mi_bitmap_t* bitmap, size_t chunk else { // failed to claim it, set abandoned mapping again (unless the page was freed) if (keep_set) { - const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx); + const bool wasclear = mi_bchunk_set(&bitmap->chunks[chunk_idx], cidx, NULL); mi_assert_internal(wasclear); MI_UNUSED(wasclear); } } diff --git a/src/bitmap.h b/src/bitmap.h index 16ecea07..09967fb9 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -191,7 +191,7 @@ static inline bool mi_bitmap_try_clear(mi_bitmap_t* bitmap, size_t idx) { // Specialized versions for common bit sequence sizes mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 1-bit mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // 8-bits -mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS +// mi_decl_nodiscard bool mi_bitmap_try_find_and_clearX(mi_bitmap_t* bitmap, size_t tseq, size_t* pidx); // MI_BFIELD_BITS mi_decl_nodiscard bool mi_bitmap_try_find_and_clearNX(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // < MI_BFIELD_BITS mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx); // > MI_BFIELD_BITS <= MI_BCHUNK_BITS @@ -200,7 +200,7 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN_(mi_bitmap_t* bitmap, size_ mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx) { if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages - if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages + // if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk if (n < 
MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); From 5e26ba6fe62e5624dd65564501ef8d2fd915e56d Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 12:14:12 -0800 Subject: [PATCH 172/264] fix debug output --- src/arena.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/arena.c b/src/arena.c index 11a4f82f..4c363a57 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1319,10 +1319,10 @@ static int mi_page_commit_usage(mi_page_t* page) { return (int)(used_size * 100 / committed_size); } -static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, mi_arena_t* arena, size_t slice_index) { +static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, mi_arena_t* arena, size_t slice_index, long* pbit_of_page, mi_ansi_color_t* pcolor_of_page ) { size_t bit_set_count = 0; - long bit_of_page = 0; - mi_ansi_color_t color = MI_GRAY; + long bit_of_page = *pbit_of_page; + mi_ansi_color_t color = *pcolor_of_page; mi_ansi_color_t prev_color = MI_GRAY; for (int bit = 0; bit < MI_BFIELD_BITS; bit++, bit_of_page--) { bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); @@ -1331,9 +1331,9 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, if (is_set) { mi_assert_internal(bit_of_page <= 0); bit_set_count++; - mi_page_t* page = (mi_page_t*)start; c = 'p'; color = MI_GRAY; + mi_page_t* page = (mi_page_t*)start; if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } else if (mi_page_is_abandoned(page)) { c = (mi_page_is_singleton(page) ? 's' : 'f'); } int commit_usage = mi_page_commit_usage(page); @@ -1362,7 +1362,9 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, } buf[*k] = c; *k += 1; } - mi_debug_color(buf, k, MI_GRAY); + mi_debug_color(buf, k, MI_GRAY); + *pbit_of_page = bit_of_page; + *pcolor_of_page = color; return bit_set_count; } @@ -1381,6 +1383,8 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi else if (i<100) { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; } else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); } + long bit_of_page = 0; + mi_ansi_color_t color_of_page = MI_GRAY; for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { if (j > 0 && (j % MI_FIELDS_PER_LINE) == 0) { _mi_output_message(" %s\n\x1B[37m", buf); @@ -1390,7 +1394,7 @@ static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bi if (bit_count < slice_count) { mi_bfield_t bfield = chunk->bfields[j]; if (invert) bfield = ~bfield; - size_t xcount = (arena!=NULL ? mi_debug_show_page_bfield(bfield, buf, &k, arena, bit_count) + size_t xcount = (arena!=NULL ? 
mi_debug_show_page_bfield(bfield, buf, &k, arena, bit_count, &bit_of_page, &color_of_page) : mi_debug_show_bfield(bfield, buf, &k)); if (invert) xcount = MI_BFIELD_BITS - xcount; bit_set_count += xcount; From 10b40f90fc12b1e6895555410561c07b0cba0344 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 14:59:42 -0800 Subject: [PATCH 173/264] fix scan of NX --- src/bitmap.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 5cecc606..067faff0 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -281,6 +281,7 @@ static inline bool mi_bchunk_setNX(mi_bchunk_t* chunk, size_t cidx, size_t n, si const size_t m = MI_BFIELD_BITS - idx; // bits to clear in the first field mi_assert_internal(m < n); mi_assert_internal(i < MI_BCHUNK_FIELDS - 1); + mi_assert_internal(idx + m <= MI_BFIELD_BITS); size_t already_set1; const bool all_set1 = mi_bfield_atomic_set_mask(&chunk->bfields[i], mi_bfield_mask(m, idx), &already_set1); mi_assert_internal(n - m > 0); @@ -792,7 +793,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, if (i < MI_BCHUNK_FIELDS-1) { const size_t post = mi_bfield_clz(~b); if (post > 0) { - const size_t pre = mi_bfield_ctz(mi_atomic_load_relaxed(&chunk->bfields[i+1])); + const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1])); if (post + pre <= n) { // it fits -- try to claim it atomically const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post); From 34e402e128402c4d534f0513b76f54ecfaa573dd Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 15:00:17 -0800 Subject: [PATCH 174/264] fix NX test in try_find_and_clearN --- src/bitmap.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/bitmap.h b/src/bitmap.h index 09967fb9..8ab06216 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -201,8 +201,8 @@ mi_decl_nodiscard static inline bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* if (n==1) return mi_bitmap_try_find_and_clear(bitmap, tseq, pidx); // small pages if (n==8) return mi_bitmap_try_find_and_clear8(bitmap, tseq, pidx); // medium pages // if (n==MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearX(bitmap, tseq, pidx); // large pages - if (n == 0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk - if (n < MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); + if (n==0 || n>MI_BCHUNK_BITS) return false; // cannot be more than a chunk + if (n <= MI_BFIELD_BITS) return mi_bitmap_try_find_and_clearNX(bitmap, tseq, n, pidx); return mi_bitmap_try_find_and_clearN_(bitmap, tseq, n, pidx); } From ab78d57a843476edd6e89139585a98011e107911 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 2 Jan 2025 15:19:08 -0800 Subject: [PATCH 175/264] search size bins from small to large --- src/bitmap.c | 16 ++++++++++------ 1 file changed, 10 insertions(+), 6 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index a03aef69..b9daf7c6 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1552,14 +1552,16 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); const size_t cmap_cycle = cmap_acc+1; const mi_bbin_t bbin = mi_bbin_of(n); - // visit bins from largest size bin up to the NONE bin - for(int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL - // const mi_bbin_t bin = bbin; + // visit bins from smallest to largest (to reduce fragmentation on the larger blocks) + 
for(int bin = MI_BBIN_SMALL; bin <= bbin; bin++) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL + // (int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // visit bins from largest size bin up to the NONE bin { mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { // don't search into non-accessed memory until we tried other size bins as well - if (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) { + if (bin < bbin && cmap_idx > cmap_acc) + // (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) // large to small + { break; } @@ -1573,8 +1575,10 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); // only in the current size class! const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_relaxed(&bbitmap->chunk_bins[chunk_idx]); - if // (bin >= chunk_bin) { - ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { + if ((mi_bbin_t)bin == chunk_bin || (bin == bbin && chunk_bin == MI_BBIN_NONE)) // only allow NONE at the final run + // ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { largest to smallest + + { mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx]; size_t cidx; if ((*on_find)(chunk, n, &cidx)) { From 2a75500ac2a43fc52c394181579347c7cb336965 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 08:38:36 -0800 Subject: [PATCH 176/264] disable large pages by default --- include/mimalloc/types.h | 15 +++++++++++++-- src/arena.c | 6 +++--- 2 files changed, 16 insertions(+), 5 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index c5029a14..9fefdf60 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -99,6 +99,10 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ENCODE_FREELIST 1 #endif +// Enable large pages for objects between 128KiB and 512KiB. Disabled by default. +#ifndef MI_ENABLE_LARGE_PAGES +#define MI_ENABLE_LARGE_PAGES 0 +#endif // -------------------------------------------------------------- // Sizes of internal data-structures @@ -131,6 +135,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_MEDIUM_PAGE_SIZE (8*MI_SMALL_PAGE_SIZE) // 512 KiB (=byte in the bchunk bitmap) #define MI_LARGE_PAGE_SIZE (MI_SIZE_SIZE*MI_MEDIUM_PAGE_SIZE) // 4 MiB (=word in the bchunk bitmap) + // Maximum number of size classes. (spaced exponentially in 12.5% increments) #define MI_BIN_HUGE (73U) #define MI_BIN_FULL (MI_BIN_HUGE+1) @@ -328,8 +333,14 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
// (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` + +#if MI_ENABLE_LARGE_PAGES +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#else +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB +#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE +#endif #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index 60046cdc..cf1836f7 100644 --- a/src/arena.c +++ b/src/arena.c @@ -773,9 +773,9 @@ mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t bloc else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } - //else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { - // page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); - // } + else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { + page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); + } else { page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } From bbd7a492f0f5cab84c08a0bab38151e28908a63e Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 08:46:30 -0800 Subject: [PATCH 177/264] fix signedness warning --- src/bitmap.c | 2 +- src/bitmap.h | 15 ++++++++++++--- 2 files changed, 13 insertions(+), 4 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index b9daf7c6..ce92fe3f 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1553,7 +1553,7 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap, const size_t cmap_cycle = cmap_acc+1; const mi_bbin_t bbin = mi_bbin_of(n); // visit bins from smallest to largest (to reduce fragmentation on the larger blocks) - for(int bin = MI_BBIN_SMALL; bin <= bbin; bin++) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL + for(mi_bbin_t bin = MI_BBIN_SMALL; bin <= bbin; bin = mi_bbin_inc(bin)) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL // (int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // visit bins from largest size bin up to the NONE bin { mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) diff --git a/src/bitmap.h b/src/bitmap.h index 9969aec0..9afdffce 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -219,12 +219,21 @@ typedef enum mi_bbin_e { MI_BBIN_SMALL, // slice_count == 1 MI_BBIN_OTHER, // slice_count: any other from the other bins, and 1 <= slice_count <= MI_BCHUNK_BITS MI_BBIN_MEDIUM, // slice_count == 8 - MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS -- not used for now! + MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS -- only used if MI_ENABLE_LARGE_PAGES is 1 MI_BBIN_COUNT } mi_bbin_t; -static inline mi_bbin_t mi_bbin_of(size_t n) { - return (n==1 ? MI_BBIN_SMALL : (n==8 ? MI_BBIN_MEDIUM : MI_BBIN_OTHER)); // (n==64 ? 
MI_BBIN_LARGE : MI_BBIN_OTHER))); +static inline mi_bbin_t mi_bbin_inc(mi_bbin_t bbin) { + return (mi_bbin_t)((int)bbin + 1); +} + +static inline mi_bbin_t mi_bbin_of(size_t slice_count) { + if (slice_count==1) return MI_BBIN_SMALL; + if (slice_count==8) return MI_BBIN_MEDIUM; + #if MI_ENABLE_LARGE_PAGES + if (slice_count==MI_BFIELD_BITS) return MI_BBIN_LARGE; + #endif + return MI_BBIN_OTHER; } // An atomic "binned" bitmap for the free slices where we keep chunks reserved for particalar size classes From 281a513642df30ef0ee54b047fe12e64499e7a44 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 08:48:06 -0800 Subject: [PATCH 178/264] fix initialization warning on gcc --- src/bitmap.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 067faff0..6b371aed 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1181,7 +1181,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n while(_bcount##SUF > 0) { \ _bcount##SUF--;\ if (_b##SUF==0) { _b##SUF = bfield & ~_cycle_mask##SUF; } /* process [0,start> + [cycle, MI_BFIELD_BITS> next */ \ - size_t name_idx; \ + /* size_t name_idx; */ \ bool _found##SUF = mi_bfield_find_least_bit(_b##SUF,&name_idx); \ mi_assert_internal(_found##SUF); MI_UNUSED(_found##SUF); \ { \ @@ -1221,11 +1221,13 @@ static inline bool mi_bitmap_find(mi_bitmap_t* bitmap, size_t tseq, size_t n, si mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS); const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0); const size_t cmap_cycle = cmap_acc+1; + size_t cmap_idx = 0; mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X) { // and for each chunkmap entry we iterate over its bits to find the chunks mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bitmap->chunkmap.bfields[cmap_idx]); size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? 
MI_BFIELD_BITS : cmap_acc_bits); + size_t eidx = 0; mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`) { mi_assert_internal(eidx <= MI_BFIELD_BITS); From b6adbbca0cb02f7796112903c2b154e678ba2cce Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 13:15:46 -0800 Subject: [PATCH 179/264] combine flags and xthread_id --- include/mimalloc/internal.h | 25 ++++++++++++++++--------- include/mimalloc/prim.h | 23 +++++++++++++++-------- include/mimalloc/types.h | 11 ++++++----- src/alloc.c | 2 +- src/free.c | 31 ++++++++++++++++--------------- src/init.c | 3 +-- src/page-map.c | 8 +++----- 7 files changed, 58 insertions(+), 45 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e83186e8..e175f331 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -622,11 +622,16 @@ static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { } -// Thread id of thread that owns this page -static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { +// Thread id of thread that owns this page (with flags in the bottom 2 bits) +static inline mi_threadid_t mi_page_xthread_id(const mi_page_t* page) { return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id); } +// Plain thread id of the thread that owns this page +static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { + return (mi_page_xthread_id(page) & ~MI_PAGE_FLAG_MASK); +} + // Thread free access static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); @@ -695,19 +700,21 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id) <= 1); + return (mi_page_xthread_id(page) <= MI_PAGE_IS_ABANDONED_MAPPED); } static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return (mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id) == 1); + return (mi_page_xthread_id(page) == MI_PAGE_IS_ABANDONED_MAPPED); } static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { - mi_atomic_or_relaxed(&page->xthread_id, (uintptr_t)1); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_atomic_or_relaxed(&page->xthread_id, MI_PAGE_IS_ABANDONED_MAPPED); } static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { - mi_atomic_and_relaxed(&page->xthread_id, ~(uintptr_t)1); + mi_assert_internal(mi_page_is_abandoned_mapped(page)); + mi_atomic_and_relaxed(&page->xthread_id, ~MI_PAGE_IS_ABANDONED_MAPPED); } @@ -766,15 +773,15 @@ static inline bool _mi_page_unown(mi_page_t* page) { // Page flags //----------------------------------------------------------- static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { - return mi_atomic_load_relaxed(&((mi_page_t*)page)->xflags); + return (mi_page_xthread_id(page) & MI_PAGE_FLAG_MASK); } static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { if (set) { - mi_atomic_or_relaxed(&page->xflags, newflag); + mi_atomic_or_relaxed(&page->xthread_id, newflag); } else { - mi_atomic_and_relaxed(&page->xflags, ~newflag); + mi_atomic_and_relaxed(&page->xthread_id, ~newflag); } } diff --git a/include/mimalloc/prim.h 
b/include/mimalloc/prim.h index 687729c5..8043fd7f 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -270,35 +270,42 @@ static inline void mi_prim_tls_slot_set(size_t slot, void* value) mi_attr_noexce // defined in `init.c`; do not use these directly -extern mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from -extern bool _mi_process_is_initialized; // has mi_process_init been called? +extern mi_decl_hidden mi_decl_thread mi_heap_t* _mi_heap_default; // default heap to allocate from +extern mi_decl_hidden bool _mi_process_is_initialized; // has mi_process_init been called? -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept; +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept; + +static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { + const mi_threadid_t tid = __mi_prim_thread_id(); + mi_assert_internal(tid > 1); + mi_assert_internal((tid & MI_PAGE_FLAG_MASK) == 0); // bottom 2 bits are clear? + return tid; +} // Get a unique id for the current thread. #if defined(MI_PRIM_THREAD_ID) -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { return MI_PRIM_THREAD_ID(); // used for example by CPython for a free threaded build (see python/cpython#115488) } #elif defined(_WIN32) -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { // Windows: works on Intel and ARM in both 32- and 64-bit return (uintptr_t)NtCurrentTeb(); } #elif MI_USE_BUILTIN_THREAD_POINTER -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { // Works on most Unix based platforms with recent compilers return (uintptr_t)__builtin_thread_pointer(); } #elif MI_HAS_TLS_SLOT -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { #if defined(__BIONIC__) // issue #384, #495: on the Bionic libc (Android), slot 1 is the thread id // see: https://github.com/aosp-mirror/platform_bionic/blob/c44b1d0676ded732df4b3b21c5f798eacae93228/libc/platform/bionic/tls_defines.h#L86 @@ -314,7 +321,7 @@ static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { #else // otherwise use portable C, taking the address of a thread local variable (this is still very fast on most platforms). -static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept { +static inline mi_threadid_t __mi_prim_thread_id(void) mi_attr_noexcept { return (uintptr_t)&_mi_heap_default; } diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 9fefdf60..1cab7742 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -241,14 +241,16 @@ typedef struct mi_block_s { } mi_block_t; -// The `in_full` and `has_aligned` page flags are put in the same field -// to efficiently test if both are false (`full_aligned == 0`) in the `mi_free` routine. 
+// The `in_full` and `has_aligned` page flags are put in the bottom bits of the thread_id (for fast test in `mi_free`) // `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing) // `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing) -#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) -#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) +#define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) +#define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) +#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) +#define MI_PAGE_FLAG_MASK MI_ZU(0x07) typedef size_t mi_page_flags_t; + // Thread free list. // Points to a list of blocks that are freed by other threads. // The low-bit is set if the page is owned by the current thread. (`mi_page_is_owned`). @@ -296,7 +298,6 @@ typedef struct mi_page_s { mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads - _Atomic(mi_page_flags_t) xflags; // `in_full_queue` and `has_aligned` flags size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks diff --git a/src/alloc.c b/src/alloc.c index 6b037987..9cd44338 100644 --- a/src/alloc.c +++ b/src/alloc.c @@ -272,7 +272,7 @@ void* _mi_heap_realloc_zero(mi_heap_t* heap, void* p, size_t newsize, bool zero) // if p == NULL then behave as malloc. // else if size == 0 then reallocate to a zero-sized block (and don't return NULL, just as mi_malloc(0)). // (this means that returning NULL always indicates an error, and `p` will not have been freed in that case.) - const size_t size = _mi_usable_size(p,"mi_realloc"); // also works if p == NULL (with size 0) + const size_t size = (p==NULL ? 0 : _mi_usable_size(p,"mi_realloc")); if mi_unlikely(newsize <= size && newsize >= (size / 2) && newsize > 0) { // note: newsize must be > 0 or otherwise we return NULL for realloc(NULL,0) mi_assert_internal(p!=NULL); // todo: do not track as the usable size is still the same in the free; adjust potential padding? diff --git a/src/free.c b/src/free.c index 7467adc1..f63a55cb 100644 --- a/src/free.c +++ b/src/free.c @@ -122,6 +122,7 @@ static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_ // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept { + if (p==NULL) return; // a NULL pointer is seen as abandoned (tid==0) with a full flag set mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_block_check_unguard(page, block, p); mi_free_block_mt(page, block); @@ -160,24 +161,24 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) void mi_free(void* p) mi_attr_noexcept { mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); - if mi_unlikely(page==NULL) return; - const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page)); - const mi_page_flags_t flags = mi_page_flags(page); - if mi_likely(is_local) { // thread-local free? 
- if mi_likely(flags == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned) - // thread-local, aligned, and not a full page - mi_block_t* const block = (mi_block_t*)p; - mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); - } - else { - // page is full or contains (inner) aligned blocks; use generic path - mi_free_generic_local(page, p); - } + #if MI_PAGE_MAP_FLAT // if not flat, NULL will point to `_mi_page_empty` and get to `mi_free_generic_mt` + if mi_unlikely(page==NULL) return; + #endif + + const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page)); + if mi_likely(xtid == 0) { // thread-local free? `tid==mi_page_thread_id(page) && mi_page_flags(page)==0` + // thread-local, aligned, and not a full page + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); + } + else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid= = mi_page_thread_id(page) && mi_page_flags(page)!=0` + // page is local, but is full or contains (inner) aligned blocks; use generic path + mi_free_generic_local(page, p); } else { - // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) - if mi_likely(flags == 0) { + // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) + if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0` // blocks are aligned (and not a full page) mi_block_t* const block = (mi_block_t*)p; mi_free_block_mt(page,block); diff --git a/src/init.c b/src/init.c index 439a914c..c697a1e9 100644 --- a/src/init.c +++ b/src/init.c @@ -16,7 +16,7 @@ terms of the MIT license. 
A copy of the license can be found in the file // Empty page used to initialize the small free pages array const mi_page_t _mi_page_empty = { - MI_ATOMIC_VAR_INIT(0), // xthread_id + MI_ATOMIC_VAR_INIT(MI_PAGE_IN_FULL_QUEUE), // xthread_id (must set flag to catch NULL on a free) NULL, // free 0, // used 0, // capacity @@ -25,7 +25,6 @@ const mi_page_t _mi_page_empty = { 0, // retire_expire NULL, // local_free MI_ATOMIC_VAR_INIT(0), // xthread_free - MI_ATOMIC_VAR_INIT(0), // xflags 0, // block_size NULL, // page_start 0, // heap tag diff --git a/src/page-map.c b/src/page-map.c index a917175a..1cf0b07b 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -210,11 +210,9 @@ bool _mi_page_map_init(void) { if (!mi_page_map_memid.initially_committed) { _mi_os_commit(_mi_page_map[0], os_page_size, NULL); // only first OS page } - if (!mi_page_map_memid.initially_zero) { - _mi_page_map[0][0] = NULL; - } - - mi_assert_internal(_mi_ptr_page(NULL)==NULL); + _mi_page_map[0][0] = (mi_page_t*)&_mi_page_empty; // caught in `mi_free` + + mi_assert_internal(_mi_ptr_page(NULL)==&_mi_page_empty); return true; } From f6c2550eac92710b23c7b5af3bb2e20bccd2cc96 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 3 Jan 2025 13:50:31 -0800 Subject: [PATCH 180/264] fix enable large pages --- include/mimalloc/types.h | 8 +------- src/arena.c | 2 ++ 2 files changed, 3 insertions(+), 7 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 1cab7742..089ed199 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -334,14 +334,8 @@ typedef struct mi_page_s { // The max object size are checked to not waste more than 12.5% internally over the page sizes. // (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB - -#if MI_ENABLE_LARGE_PAGES -#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB #define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` -#else -#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE -#endif #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/arena.c b/src/arena.c index cf1836f7..0c571c96 100644 --- a/src/arena.c +++ b/src/arena.c @@ -773,9 +773,11 @@ mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t bloc else if (block_size <= MI_MEDIUM_MAX_OBJ_SIZE) { page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_MEDIUM_PAGE_SIZE), block_size); } + #if MI_ENABLE_LARGE_PAGES else if (block_size <= MI_LARGE_MAX_OBJ_SIZE) { page = mi_arenas_page_regular_alloc(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size); } + #endif else { page = mi_arenas_page_singleton_alloc(heap, block_size, block_alignment); } From 6099f76c8c9f67c815ba147506451008616d9282 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 14:26:32 -0800 Subject: [PATCH 181/264] nicer logic in free --- src/free.c | 20 +++++++++----------- 1 file changed, 9 insertions(+), 11 deletions(-) diff --git a/src/free.c b/src/free.c index f63a55cb..5efe0280 100644 --- a/src/free.c +++ b/src/free.c @@ -176,18 +176,16 @@ void mi_free(void* p) mi_attr_noexcept // page is local, but is full or contains (inner) aligned blocks; use generic path 
mi_free_generic_local(page, p); } - else { - // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) - if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0` - // blocks are aligned (and not a full page) - mi_block_t* const block = (mi_block_t*)p; - mi_free_block_mt(page,block); - } - else { - // page is full or contains (inner) aligned blocks; use generic multi-thread path - mi_free_generic_mt(page, p); - } + // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) + else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0` + // blocks are aligned (and not a full page) + mi_block_t* const block = (mi_block_t*)p; + mi_free_block_mt(page,block); } + else { + // page is full or contains (inner) aligned blocks; use generic multi-thread path + mi_free_generic_mt(page, p); + } } From c95d9865a876e598d54b53fe293f0a348926517e Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 14:27:18 -0800 Subject: [PATCH 182/264] merge from dev3-bin --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index ce045173..3363c68a 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From e14c8fc795cd1a8ef21605225b3d556e74b434f7 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 18:08:34 -0800 Subject: [PATCH 183/264] bump version to 3.0.0 --- azure-pipelines.yml | 23 ----------------------- cmake/mimalloc-config-version.cmake | 6 +++--- 2 files changed, 3 insertions(+), 26 deletions(-) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index e1a199d3..5393035e 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -331,26 +331,3 @@ jobs: workingDirectory: $(BuildType) displayName: CTest -- job: - displayName: macOS 12 (Monterey) - pool: - vmImage: - macOS-12 - strategy: - matrix: - Debug: - BuildType: debug - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Debug -DMI_DEBUG_FULL=ON - Release: - BuildType: release - cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release - steps: - - task: CMake@1 - inputs: - workingDirectory: $(BuildType) - cmakeArgs: .. 
$(cmakeExtraArgs) - - script: make -j$(sysctl -n hw.ncpu) -C $(BuildType) - displayName: Make - - script: ctest --verbose --timeout 180 - workingDirectory: $(BuildType) - displayName: CTest diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index f92d52e6..04f27e6d 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ -set(mi_version_major 1) -set(mi_version_minor 8) -set(mi_version_patch 8) +set(mi_version_major 3) +set(mi_version_minor 0) +set(mi_version_patch 0) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) From 46ae913f22c46fc60006d0fcb0829d078a2dea76 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 3 Jan 2025 18:43:38 -0800 Subject: [PATCH 184/264] bump version to 3.0.1 for further development --- cmake/mimalloc-config-version.cmake | 2 +- include/mimalloc.h | 2 +- include/mimalloc/types.h | 13 +++++++------ src/free.c | 10 +++++----- 4 files changed, 14 insertions(+), 13 deletions(-) diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index 04f27e6d..60cc2d3d 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 3) set(mi_version_minor 0) -set(mi_version_patch 0) +set(mi_version_patch 1) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/include/mimalloc.h b/include/mimalloc.h index 7383ce8a..fb7efcde 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 300 // major + 2 digits minor +#define MI_MALLOC_VERSION 301 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index f13149b1..ec4144d1 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -241,9 +241,10 @@ typedef struct mi_block_s { } mi_block_t; -// The `in_full` and `has_aligned` page flags are put in the bottom bits of the thread_id (for fast test in `mi_free`) +// The page flags are put in the bottom 3 bits of the thread_id (for a fast test in `mi_free`) // `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing) // `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing) +// `is_abandoned_mapped` is true if the page is abandoned (thread_id==0) and it is in an arena so can be quickly found for reuse ("mapped") #define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) #define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) #define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) @@ -253,10 +254,9 @@ typedef size_t mi_page_flags_t; // Thread free list. // Points to a list of blocks that are freed by other threads. -// The low-bit is set if the page is owned by the current thread. (`mi_page_is_owned`). +// The least-bit is set if the page is owned by the current thread. (`mi_page_is_owned`). // Ownership is required before we can read any non-atomic fields in the page. -// This way we can push a block on the thread free list and try to claim ownership -// atomically in `free.c:mi_free_block_mt`. +// This way we can push a block on the thread free list and try to claim ownership atomically in `free.c:mi_free_block_mt`. 
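// (Editor's sketch, not part of this patch: the encoding described above packs the
//  ownership bit into the least-significant bit of the block pointer, roughly
//      mi_thread_free_t tf  = (mi_thread_free_t)block | (owned ? 1 : 0);
//      mi_block_t* head     = (mi_block_t*)(tf & ~(mi_thread_free_t)1);   // cf. `mi_tf_block`
//  so a single atomic update of `xthread_free` can push a block and claim ownership together.)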
typedef uintptr_t mi_thread_free_t; // A heap can serve only specific objects signified by its heap tag (e.g. various object types in CPython) @@ -281,13 +281,14 @@ typedef uint8_t mi_heaptag_t; // Notes: // - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). // - If a page is not part of a heap it is called "abandoned" (`heap==NULL`) -- in -// that case the `xthreadid` is 0 or 1 (1 is for abandoned pages that +// that case the `xthreadid` is 0 or 4 (4 is for abandoned pages that // are in the abandoned page lists of an arena, these are called "mapped" abandoned pages). +// - page flags are in the bottom 3 bits of `xthread_id` for the fast path in `mi_free`. // - The layout is optimized for `free.c:mi_free` and `alloc.c:mi_page_alloc` // - Using `uint16_t` does not seem to slow things down typedef struct mi_page_s { - _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= heap->thread_id, or 0 or 1 if abandoned) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= heap->thread_id (or 0 if abandoned) | page_flags) mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) uint16_t used; // number of blocks in use (including blocks in `thread_free`) diff --git a/src/free.c b/src/free.c index 5efe0280..ed1b830e 100644 --- a/src/free.c +++ b/src/free.c @@ -167,18 +167,18 @@ void mi_free(void* p) mi_attr_noexcept #endif const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page)); - if mi_likely(xtid == 0) { // thread-local free? `tid==mi_page_thread_id(page) && mi_page_flags(page)==0` + if mi_likely(xtid == 0) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) == 0` // thread-local, aligned, and not a full page mi_block_t* const block = (mi_block_t*)p; mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */); } - else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid= = mi_page_thread_id(page) && mi_page_flags(page)!=0` + else if (xtid <= MI_PAGE_FLAG_MASK) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) != 0` // page is local, but is full or contains (inner) aligned blocks; use generic path mi_free_generic_local(page, p); } - // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap) - else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid!=mi_page_thread_id(page) && mi_page_flags(page)==0` - // blocks are aligned (and not a full page) + // free-ing in a page owned by a heap in another thread, or an abandoned page (not belonging to a heap) + else if ((xtid & MI_PAGE_FLAG_MASK) == 0) { // `tid != mi_page_thread_id(page) && mi_page_flags(page) == 0` + // blocks are aligned (and not a full page); push on the thread_free list mi_block_t* const block = (mi_block_t*)p; mi_free_block_mt(page,block); } From fab6bee76406340d8582e15530380a6cdde954a3 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 4 Jan 2025 22:39:06 -0800 Subject: [PATCH 185/264] nicer arena debug output --- src/arena.c | 16 ++++++---------- 1 file changed, 6 insertions(+), 10 deletions(-) diff --git a/src/arena.c b/src/arena.c index 2dae0fb5..64b1327f 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1317,12 +1317,7 @@ typedef enum mi_ansi_color_e { } mi_ansi_color_t; static void mi_debug_color(char* buf, size_t* k, mi_ansi_color_t color) { - buf[*k] = '\x1b'; - buf[*k+1] = '['; - buf[*k+2] = (char)(((int)color / 10) + '0'); - buf[*k+3] = (char)(((int)color % 10) + '0'); - buf[*k+4] = 'm'; - *k += 5; + *k += 
_mi_snprintf(buf + *k, 32, "\x1B[%dm", (int)color); } static int mi_page_commit_usage(mi_page_t* page) { @@ -1347,13 +1342,14 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, c = 'p'; color = MI_GRAY; mi_page_t* page = (mi_page_t*)start; - if (mi_page_is_abandoned_mapped(page)) { c = 'a'; } - else if (mi_page_is_abandoned(page)) { c = (mi_page_is_singleton(page) ? 's' : 'f'); } + if (mi_page_is_singleton(page)) { c = 's'; } + else if (mi_page_is_full(page)) { c = 'f'; } + if (!mi_page_is_abandoned(page)) { c = _mi_toupper(c); } int commit_usage = mi_page_commit_usage(page); if (commit_usage < 25) { color = MI_MAROON; } else if (commit_usage < 50) { color = MI_ORANGE; } else if (commit_usage < 75) { color = MI_TEAL; } - else color = MI_DARKGREEN; + else color = MI_DARKGREEN; bit_of_page = (long)page->memid.mem.arena.slice_count; } else { @@ -1476,7 +1472,7 @@ void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap_binned("pages (p:page, a:abandoned, f:full-abandoned, s:singleton-abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved)", arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena); + page_total += mi_debug_show_bitmap_binned("pages (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved) (chunk bin: S:small, M:medium, L:large, X:other)", arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena); } } // if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); From c518312fb6d67505b64e93759839c177cb9e6c0d Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 4 Jan 2025 22:49:25 -0800 Subject: [PATCH 186/264] allow narrow arena debug output --- include/mimalloc.h | 2 +- src/arena.c | 29 +++++++++++------------------ 2 files changed, 12 insertions(+), 19 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index fb7efcde..281f5ead 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? 
*/, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef void* mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index 64b1327f..55b6fb9b 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1377,10 +1377,9 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k, return bit_set_count; } -#define MI_FIELDS_PER_LINE (4) - -static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena) { - _mi_output_message("\x1B[37m%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header); +static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) { + _mi_output_message("\x1B[37m%s%s%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header1, header2, header3); + const size_t fields_per_line = (narrow ? 2 : 4); size_t bit_count = 0; size_t bit_set_count = 0; for (size_t i = 0; i < chunk_count && bit_count < slice_count; i++) { @@ -1408,7 +1407,7 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ long bit_of_page = 0; mi_ansi_color_t color_of_page = MI_GRAY; for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) { - if (j > 0 && (j % MI_FIELDS_PER_LINE) == 0) { + if (j > 0 && (j % fields_per_line) == 0) { // buf[k++] = '\n'; _mi_memset(buf+k,' ',7); k += 7; _mi_output_message(" %s\n\x1B[37m", buf); _mi_memzero(buf, sizeof(buf)); @@ -1435,20 +1434,11 @@ static size_t mi_debug_show_chunks(const char* header, size_t slice_count, size_ return bit_set_count; } -//static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert, mi_arena_t* arena) { -// return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], NULL, invert, arena); -//} - -static size_t mi_debug_show_bitmap_binned(const char* header, size_t slice_count, mi_bitmap_t* bitmap, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena) { - return mi_debug_show_chunks(header, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena); +static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count, mi_bitmap_t* bitmap, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) { + return mi_debug_show_chunks(header1, header2, header3, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena, narrow); } -//static size_t mi_debug_show_bbitmap(const char* header, size_t slice_count, mi_bbitmap_t* bbitmap, bool invert, mi_arena_t* arena) { -// return mi_debug_show_chunks(header, slice_count, mi_bbitmap_chunk_count(bbitmap), &bbitmap->chunks[0], &bbitmap->chunk_bins[0], invert, arena); -//} - - -void mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { +void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept { mi_subproc_t* subproc = _mi_subproc(); size_t max_arenas = mi_arenas_get_count(subproc); //size_t free_total = 0; @@ -1472,7 +1462,10 @@ void 
mi_debug_show_arenas(bool show_pages) mi_attr_noexcept { // purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false, NULL); //} if (show_pages) { - page_total += mi_debug_show_bitmap_binned("pages (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:heap-meta-data, ~:free-purgable, _:free-committed, .:free-reserved) (chunk bin: S:small, M:medium, L:large, X:other)", arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena); + const char* header1 = "pages (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:meta-data, ~:free-purgable, _:free-committed, .:free-reserved)"; + const char* header2 = (narrow ? "\n " : " "); + const char* header3 = "(chunk bin: S:small, M : medium, L : large, X : other)"; + page_total += mi_debug_show_bitmap_binned(header1, header2, header3, arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena, narrow); } } // if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total); From 18244cebc5e40aef6c7a8377c0885f316a993f20 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 5 Jan 2025 11:03:41 -0800 Subject: [PATCH 187/264] refine MI_ENABLE_LARGE_PAGES --- include/mimalloc/types.h | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index ec4144d1..1d3c7b07 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -270,16 +270,19 @@ typedef uint8_t mi_heaptag_t; // The `local_free` and `thread_free` lists are migrated to the `free` list // when it is exhausted. The separate `local_free` list is necessary to // implement a monotonic heartbeat. The `thread_free` list is needed for -// avoiding atomic operations in the common case. +// avoiding atomic operations when allocating from the owning thread. // // `used - |thread_free|` == actual blocks that are in use (alive) // `used - |thread_free| + |free| + |local_free| == capacity` // -// We don't count `freed` (as |free|) but use `used` to reduce +// We don't count "freed" (as |free|) but use only the `used` field to reduce // the number of memory accesses in the `mi_page_all_free` function(s). +// Use `_mi_page_free_collect` to collect the thread_free list and update the `used` count. // // Notes: -// - Non-atomic fields can only be accessed if having ownership (low bit of `xthread_free`). +// - Non-atomic fields can only be accessed if having _ownership_ (low bit of `xthread_free` is 1). +// Combining the `thread_free` list with an ownership bit allows a concurrent `free` to atomically +// free an object and (re)claim ownership if the page was abandoned. // - If a page is not part of a heap it is called "abandoned" (`heap==NULL`) -- in // that case the `xthreadid` is 0 or 4 (4 is for abandoned pages that // are in the abandoned page lists of an arena, these are called "mapped" abandoned pages). @@ -288,17 +291,17 @@ typedef uint8_t mi_heaptag_t; // - Using `uint16_t` does not seem to slow things down typedef struct mi_page_s { - _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= heap->thread_id (or 0 if abandoned) | page_flags) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. 
(= `heap->thread_id (or 0 if abandoned) | page_flags`) mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) uint16_t used; // number of blocks in use (including blocks in `thread_free`) - uint16_t capacity; // number of blocks committed (must be the first field for proper zero-initialisation) + uint16_t capacity; // number of blocks committed uint16_t reserved; // number of blocks reserved in memory uint8_t block_size_shift; // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`) uint8_t retire_expire; // expiration count for retired blocks mi_block_t* local_free; // list of deferred free blocks by this thread (migrates to `free`) - _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads + _Atomic(mi_thread_free_t) xthread_free; // list of deferred free blocks freed by other threads (= `mi_block_t* | (1 if owned)`) size_t block_size; // size available in each block (always `>0`) uint8_t* page_start; // start of the blocks @@ -333,13 +336,16 @@ typedef struct mi_page_s { #endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. -// (Except for large pages since huge objects are allocated in 4MiB chunks) #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#if MI_ENABLE_LARGE_PAGES #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/4) // <= 512 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 256 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#else +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB +#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // <= 64 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#endif #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) - #if (MI_LARGE_MAX_OBJ_WSIZE >= 655360) #error "mimalloc internal: define more bins" #endif @@ -352,7 +358,7 @@ typedef struct mi_page_s { typedef enum mi_page_kind_e { MI_PAGE_SMALL, // small blocks go into 64KiB pages MI_PAGE_MEDIUM, // medium blocks go into 512KiB pages - MI_PAGE_LARGE, // larger blocks go into 4MiB pages + MI_PAGE_LARGE, // larger blocks go into 4MiB pages (if `MI_ENABLE_LARGE_PAGES==1`) MI_PAGE_SINGLETON // page containing a single block. // used for blocks `> MI_LARGE_MAX_OBJ_SIZE` or an aligment `> MI_PAGE_MAX_OVERALLOC_ALIGN`. } mi_page_kind_t; From a9324a2f2fe2e58f42b05c69a3e3cb291d7bc1b2 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 5 Jan 2025 11:06:37 -0800 Subject: [PATCH 188/264] merge from dev3 --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index c1144616..9fcc6ef3 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From bbe81101db67b02c97037b1a1b5c17c688aee6f8 Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 5 Jan 2025 11:12:27 -0800 Subject: [PATCH 189/264] add comment --- include/mimalloc/types.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 1d3c7b07..e45da9a7 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -99,7 +99,8 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ENCODE_FREELIST 1 #endif -// Enable large pages for objects between 128KiB and 512KiB. Disabled by default. +// Enable large pages for objects between 64KiB and 256KiB. +// Disabled by default as for many workloads the block sizes above 64 KiB are quite random which can lead to too many partially used large pages. #ifndef MI_ENABLE_LARGE_PAGES #define MI_ENABLE_LARGE_PAGES 0 #endif From bd3392466b151767ad449f82007091d693e685ae Mon Sep 17 00:00:00 2001 From: daanx Date: Sun, 5 Jan 2025 11:39:42 -0800 Subject: [PATCH 190/264] remove mi_debug_show_arenas parameter --- include/mimalloc.h | 2 +- src/arena.c | 6 +++++- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 281f5ead..10695def 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -276,7 +276,7 @@ mi_decl_export int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size mi_decl_export int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept; mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_pinned /* cannot decommit/reset? */, bool is_zero, int numa_node) mi_attr_noexcept; -mi_decl_export void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept; +mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept; // Experimental: heaps associated with specific memory arena's typedef void* mi_arena_id_t; diff --git a/src/arena.c b/src/arena.c index 55b6fb9b..f7e7b44a 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1438,7 +1438,7 @@ static size_t mi_debug_show_bitmap_binned(const char* header1, const char* heade return mi_debug_show_chunks(header1, header2, header3, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena, narrow); } -void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept { +static void mi_debug_show_arenas_ex(bool show_pages, bool narrow) mi_attr_noexcept { mi_subproc_t* subproc = _mi_subproc(); size_t max_arenas = mi_arenas_get_count(subproc); //size_t free_total = 0; @@ -1473,6 +1473,10 @@ void mi_debug_show_arenas(bool show_pages, bool narrow) mi_attr_noexcept { if (show_pages) _mi_output_message("total pages in arenas: %zu\n", page_total); } +void mi_debug_show_arenas(void) mi_attr_noexcept { + mi_debug_show_arenas_ex(true /* show pages */, false /* narrow? */); +} + /* ----------------------------------------------------------- Reserve a huge page arena. 
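[Editor's note: the sketch below is illustrative only and is not part of any commit in this series.]
The free-path rework earlier in this series depends on the page flags living in the low bits of
`xthread_id`, so a single XOR with the current thread id classifies a call to `mi_free`
(names as used in the patches, logic simplified):

    const mi_threadid_t xtid = _mi_prim_thread_id() ^ mi_page_xthread_id(page);
    if (xtid == 0)                             { /* local free, no flags set: fast path */ }
    else if (xtid <= MI_PAGE_FLAG_MASK)        { /* local free, but page is full or has aligned blocks */ }
    else if ((xtid & MI_PAGE_FLAG_MASK) == 0)  { /* non-local free, no flags: push on the thread_free list */ }
    else                                       { /* non-local free on a full/aligned/abandoned page: generic path */ }

This is also why `_mi_prim_thread_id` asserts that the bottom flag bits of a thread id are always clear.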
From 8210c9aa0a3508075e99148227e507c4aaafad2c Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 5 Jan 2025 15:47:52 -0800 Subject: [PATCH 191/264] bump version for further development --- cmake/mimalloc-config-version.cmake | 2 +- include/mimalloc.h | 2 +- test/test-stress.c | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake index 60cc2d3d..527b1874 100644 --- a/cmake/mimalloc-config-version.cmake +++ b/cmake/mimalloc-config-version.cmake @@ -1,6 +1,6 @@ set(mi_version_major 3) set(mi_version_minor 0) -set(mi_version_patch 1) +set(mi_version_patch 2) set(mi_version ${mi_version_major}.${mi_version_minor}) set(PACKAGE_VERSION ${mi_version}) diff --git a/include/mimalloc.h b/include/mimalloc.h index 10695def..8b453247 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file #ifndef MIMALLOC_H #define MIMALLOC_H -#define MI_MALLOC_VERSION 301 // major + 2 digits minor +#define MI_MALLOC_VERSION 302 // major + 2 digits minor // ------------------------------------------------------ // Compiler specific attributes diff --git a/test/test-stress.c b/test/test-stress.c index 1f66460f..fb27a786 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -261,9 +261,9 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: %3d\n", ITER - (n + 1)); - mi_debug_show_arenas(true); + mi_debug_show_arenas(); //mi_collect(true); - //mi_debug_show_arenas(true); + //mi_debug_show_arenas(); } #endif } From 1b5399c965d00901f0303d28d822d0589c190acb Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 5 Jan 2025 15:50:07 -0800 Subject: [PATCH 192/264] set default purge delay to 0 --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index c1144616..9fcc6ef3 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose From 86550d09bcf845034a81fb46acbab42dcaf26d23 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 13:19:44 -0800 Subject: [PATCH 193/264] set more conservative options with increased medium and small object sizes --- include/mimalloc/types.h | 8 ++++---- src/options.c | 4 ++-- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index e45da9a7..613bc69c 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -337,13 +337,13 @@ typedef struct mi_page_s { #endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. 
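// (Editor's note, illustrative arithmetic only: with a divisor of 8 at least 8 blocks of the
//  maximum size fit in a page, so the unused tail stays below one block, i.e. under ~12.5%
//  of the page; the divisor of 4 chosen below permits larger in-page objects at the cost of
//  a worst-case tail of roughly 25%.)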
-#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 16 KiB #if MI_ENABLE_LARGE_PAGES -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB #define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 256 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #else -#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // <= 64 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/4) // <= 128 KiB +#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #endif #define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE) diff --git a/src/options.c b/src/options.c index 9fcc6ef3..3d34d9b6 100644 --- a/src/options.c +++ b/src/options.c @@ -169,8 +169,8 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. - { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free - { 2, UNINIT, MI_OPTION(page_full_retain) }, + { 0, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION(page_full_retain) }, { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, { MI_DEFAULT_PAGEMAP_COMMIT, From b2cdf81e8e2bde09e8c2eb1325f5bfcc3f9e32f9 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 13:34:45 -0800 Subject: [PATCH 194/264] comment --- src/bitmap.c | 20 +++++++++++--------- 1 file changed, 11 insertions(+), 9 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 80bc8ff7..ff1a139f 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -1420,9 +1420,16 @@ void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) { /* -------------------------------------------------------------------------------- - binned bitmap chunkmap + binned bitmap used to track free slices -------------------------------------------------------------------------------- */ +// Assign a specific size bin to a chunk +static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) { + mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); + mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin); +} + +// Track the index of the highest chunk that is accessed. 
static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) { size_t oldmax = mi_atomic_load_relaxed(&bbitmap->chunk_max_accessed); if mi_unlikely(chunk_idx > oldmax) { @@ -1430,12 +1437,13 @@ static void mi_bbitmap_chunkmap_set_max(mi_bbitmap_t* bbitmap, size_t chunk_idx) } } +// Set a bit in the chunkmap static void mi_bbitmap_chunkmap_set(mi_bbitmap_t* bbitmap, size_t chunk_idx, bool check_all_set) { mi_assert(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); if (check_all_set) { if (mi_bchunk_all_are_set_relaxed(&bbitmap->chunks[chunk_idx])) { // all slices are free in this chunk: return back to the NONE bin - mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], MI_BBIN_NONE); + mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, MI_BBIN_NONE); } } mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL); @@ -1449,7 +1457,7 @@ static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_id // clear the chunkmap bit mi_bchunk_clear(&bbitmap->chunkmap, chunk_idx, NULL); // .. but a concurrent set may have happened in between our all-clear test and the clearing of the - // bit in the mask. We check again to catch this situation. + // bit in the mask. We check again to catch this situation. (note: mi_bchunk_clear must be acq-rel) if (!mi_bchunk_all_are_clear_relaxed(&bbitmap->chunks[chunk_idx])) { mi_bchunk_set(&bbitmap->chunkmap, chunk_idx, NULL); return false; @@ -1458,12 +1466,6 @@ static bool mi_bbitmap_chunkmap_try_clear(mi_bbitmap_t* bbitmap, size_t chunk_id return true; } -// Assign from the NONE bin to a specific size bin -static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) { - mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap)); - mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin); -} - /* -------------------------------------------------------------------------------- mi_bbitmap_setN, try_clearN, and is_xsetN From dd4b4a36b1b7ad9bdb4c394cd5a51439a5d62772 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 17:42:42 -0800 Subject: [PATCH 195/264] use standard heap_collect every 10k generic allocations, disable reclaim_on_free by default --- src/heap.c | 2 +- src/options.c | 4 ++-- src/page.c | 16 ++++------------ 3 files changed, 7 insertions(+), 15 deletions(-) diff --git a/src/heap.c b/src/heap.c index abb36da4..b744c153 100644 --- a/src/heap.c +++ b/src/heap.c @@ -123,7 +123,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect arenas (this is program wide so don't force purges on abandonment of threads) //mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1); - _mi_arenas_collect(collect == MI_FORCE /* force purge? */, true /* visit all? */, heap->tld); + _mi_arenas_collect(collect == MI_FORCE /* force purge? */, collect >= MI_FORCE /* visit all? 
*/, heap->tld); } void _mi_heap_collect_abandon(mi_heap_t* heap) { diff --git a/src/options.c b/src/options.c index 9fcc6ef3..8d66b320 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -169,7 +169,7 @@ static mi_option_desc_t options[_mi_option_last] = UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. - { 1, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(page_full_retain) }, { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, diff --git a/src/page.c b/src/page.c index 2af89c66..7e52d68f 100644 --- a/src/page.c +++ b/src/page.c @@ -436,7 +436,7 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force) { heap->page_retired_max = max; } - +/* static void mi_heap_collect_full_pages(mi_heap_t* heap) { // note: normally full pages get immediately abandoned and the full queue is always empty // this path is only used if abandoning is disabled due to a destroy-able heap or options @@ -457,15 +457,8 @@ static void mi_heap_collect_full_pages(mi_heap_t* heap) { page = next; } } +*/ -static mi_decl_noinline void mi_heap_generic_collect(mi_heap_t* heap) { - // call potential deferred free routines - _mi_deferred_free(heap, false); - // collect retired pages - _mi_heap_collect_retired(heap, false); - // collect full pages that had concurrent free's - mi_heap_collect_full_pages(heap); -} /* ----------------------------------------------------------- Initialize the initial free list in a page. @@ -921,14 +914,13 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size, bool zero, size_t huge_al // collect every N generic mallocs if mi_unlikely(heap->generic_count++ > 10000) { heap->generic_count = 0; - mi_heap_generic_collect(heap); + mi_heap_collect(heap, false /* force? */); } // find (or allocate) a page of the right size mi_page_t* page = mi_find_page(heap, size, huge_alignment); if mi_unlikely(page == NULL) { // first time out of memory, try to collect and retry the allocation once more - mi_heap_generic_collect(heap); - mi_heap_collect(heap, true /* force */); + mi_heap_collect(heap, true /* force? 
*/); page = mi_find_page(heap, size, huge_alignment); } From 061ef80de7c6240c756c6786f73dfbfeba2e006c Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 21:39:11 -0800 Subject: [PATCH 196/264] clarify allow_destroy --- src/free.c | 4 ++-- src/heap.c | 14 +++++++------- src/init.c | 1 + 3 files changed, 10 insertions(+), 9 deletions(-) diff --git a/src/free.c b/src/free.c index ed1b830e..5d9628f0 100644 --- a/src/free.c +++ b/src/free.c @@ -250,8 +250,8 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations if (!mi_page_is_used_at_frac(page,8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page - !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && - _mi_arenas_page_try_reabandon_to_mapped(page)) + !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && + _mi_arenas_page_try_reabandon_to_mapped(page)) { return; } diff --git a/src/heap.c b/src/heap.c index b744c153..6d5e328e 100644 --- a/src/heap.c +++ b/src/heap.c @@ -167,7 +167,7 @@ mi_heap_t* mi_heap_get_backing(void) { } // todo: make order of parameters consistent (but would that break compat with CPython?) -void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint8_t heap_tag, mi_tld_t* tld) +void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, uint8_t heap_tag, mi_tld_t* tld) { mi_assert_internal(heap!=NULL); mi_memid_t memid = heap->memid; @@ -175,15 +175,15 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool noreclaim, uint heap->memid = memid; heap->tld = tld; // avoid reading the thread-local tld during initialization heap->exclusive_arena = _mi_arena_from_id(arena_id); - heap->allow_page_reclaim = !noreclaim; - heap->allow_page_abandon = (!noreclaim && mi_option_get(mi_option_page_full_retain) >= 0); + heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_reclaim_on_free)); + heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0); heap->full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. - // (but abandoning is good in this case) heap->allow_page_reclaim = false; - // and halve the full page retain (possibly to 0) + // .. but abandoning is good in this case: quarter the full page retain (possibly to 0) + // (so blocked threads do not hold on to too much memory) if (heap->full_page_retain >= 0) { heap->full_page_retain = heap->full_page_retain / 4; } @@ -236,12 +236,12 @@ mi_decl_nodiscard mi_heap_t* mi_heap_new_ex(int heap_tag, bool allow_destroy, mi } mi_decl_nodiscard mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t arena_id) { - return mi_heap_new_ex(0 /* default heap tag */, false /* don't allow `mi_heap_destroy` */, arena_id); + return mi_heap_new_ex(0 /* default heap tag */, false /* allow destroy? */, arena_id); } mi_decl_nodiscard mi_heap_t* mi_heap_new(void) { // don't reclaim abandoned memory or otherwise destroy is unsafe - return mi_heap_new_ex(0 /* default heap tag */, true /* no reclaim */, _mi_arena_id_none()); + return mi_heap_new_ex(0 /* default heap tag */, true /* allow destroy? 
*/, _mi_arena_id_none()); } bool _mi_heap_memid_is_suitable(mi_heap_t* heap, mi_memid_t memid) { diff --git a/src/init.c b/src/init.c index 1d352248..40d6143f 100644 --- a/src/init.c +++ b/src/init.c @@ -259,6 +259,7 @@ static void mi_heap_main_init(void) { //heap_main.keys[0] = _mi_heap_random_next(&heap_main); //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); + heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_reclaim_on_free); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); heap_main.full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } From 57eee51f46b4a5710468259c3251157900f9abcd Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 21:42:30 -0800 Subject: [PATCH 197/264] rename full_page_retain to page_full_retain for consistency with the option --- include/mimalloc/types.h | 2 +- src/heap.c | 6 +++--- src/init.c | 2 +- src/page.c | 6 +++--- 4 files changed, 8 insertions(+), 8 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index e45da9a7..c61b0498 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -426,7 +426,7 @@ struct mi_heap_s { size_t page_retired_max; // largest retired index into the `pages` array. size_t generic_count; // how often is mimalloc_generic invoked? mi_heap_t* next; // list of heaps per thread - long full_page_retain; // how many full pages can be retained per queue (before abondoning them) + long page_full_retain; // how many full pages can be retained per queue (before abondoning them) bool allow_page_reclaim; // `true` if this heap should not reclaim abandoned pages bool allow_page_abandon; // `true` if this heap can abandon pages to reduce memory footprint uint8_t tag; // custom tag, can be used for separating heaps based on the object types diff --git a/src/heap.c b/src/heap.c index 6d5e328e..82ca05cb 100644 --- a/src/heap.c +++ b/src/heap.c @@ -177,15 +177,15 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, heap->exclusive_arena = _mi_arena_from_id(arena_id); heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_reclaim_on_free)); heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0); - heap->full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); + heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. heap->allow_page_reclaim = false; // .. 
but abandoning is good in this case: quarter the full page retain (possibly to 0) // (so blocked threads do not hold on to too much memory) - if (heap->full_page_retain >= 0) { - heap->full_page_retain = heap->full_page_retain / 4; + if (heap->page_full_retain >= 0) { + heap->page_full_retain = heap->page_full_retain / 4; } } diff --git a/src/init.c b/src/init.c index 40d6143f..ac49d292 100644 --- a/src/init.c +++ b/src/init.c @@ -261,7 +261,7 @@ static void mi_heap_main_init(void) { _mi_heap_guarded_init(&heap_main); heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_reclaim_on_free); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); - heap_main.full_page_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); + heap_main.page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } } diff --git a/src/page.c b/src/page.c index 7e52d68f..d2d6a854 100644 --- a/src/page.c +++ b/src/page.c @@ -680,7 +680,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m size_t count = 0; #endif long candidate_limit = 0; // we reset this on the first candidate to limit the search - long full_page_retain = heap->full_page_retain; + long page_full_retain = heap->page_full_retain; mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; @@ -703,8 +703,8 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // if the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often. if (!immediate_available && !mi_page_is_expandable(page)) { - full_page_retain--; - if (full_page_retain < 0) { + page_full_retain--; + if (page_full_retain < 0) { mi_assert_internal(!mi_page_is_in_full(page) && !mi_page_immediate_available(page)); mi_page_to_full(page, pq); } From 0caf80ec3c59de23dc5865de34d321df22e40fa4 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 7 Jan 2025 21:50:55 -0800 Subject: [PATCH 198/264] default purge delay to 100ms --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index 3d34d9b6..a920fdcb 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 0, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 100, UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. 
{ 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas)
{ 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose

From d9065115cd4c82e81df545a7417935ee3b86a93c Mon Sep 17 00:00:00 2001
From: daanx
Date: Mon, 13 Jan 2025 14:49:06 -0800
Subject: [PATCH 199/264] fix netBSD compilation (issue #988)

---
 src/prim/unix/prim.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 4c4a013e..8ef0bd72 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -201,7 +201,8 @@ static void* unix_mmap_prim(void* addr, size_t size, size_t try_alignment, int p
 void* p = NULL;
 #if defined(MAP_ALIGNED) // BSD
 if (addr == NULL && try_alignment > 1 && (try_alignment % _mi_os_page_size()) == 0) {
- size_t n = mi_bsr(try_alignment);
+ size_t idx;
+ size_t n = mi_bsr(try_alignment, &idx);
 if (((size_t)1 << n) == try_alignment && n >= 12 && n <= 30) { // alignment is a power of 2 and 4096 <= alignment <= 1GiB
 p = mmap(addr, size, protect_flags, flags | MAP_ALIGNED(n), fd, 0);
 if (p==MAP_FAILED || !_mi_is_aligned(p,try_alignment)) {

From c9d623a2ef325258eaa692234d1139481eea0cf2 Mon Sep 17 00:00:00 2001
From: daanx
Date: Mon, 13 Jan 2025 16:02:35 -0800
Subject: [PATCH 200/264] add INTERFACE_INCLUDE_DIRECTORIES to vcpkg wrapper

---
 contrib/vcpkg/readme.md | 2 +-
 contrib/vcpkg/vcpkg-cmake-wrapper.cmake | 1 +
 2 files changed, 2 insertions(+), 1 deletion(-)

diff --git a/contrib/vcpkg/readme.md b/contrib/vcpkg/readme.md
index b1f6047c..014f2867 100644
--- a/contrib/vcpkg/readme.md
+++ b/contrib/vcpkg/readme.md
@@ -9,7 +9,7 @@ to check out a specific commit, version, or branch of mimalloc, or set further o
 You can install such custom port as:

 ```sh
-$ vcpkg install mimalloc[override] --recurse --overlay-ports=./contrib/vcpkg
+$ vcpkg install "mimalloc[override]" --recurse --overlay-ports=./contrib/vcpkg
 ```

 This will also show the correct sha512 hash if you use a custom version.
diff --git a/contrib/vcpkg/vcpkg-cmake-wrapper.cmake b/contrib/vcpkg/vcpkg-cmake-wrapper.cmake
index 1b355722..6b917347 100644
--- a/contrib/vcpkg/vcpkg-cmake-wrapper.cmake
+++ b/contrib/vcpkg/vcpkg-cmake-wrapper.cmake
@@ -17,4 +17,5 @@ endif()
 if(TARGET mimalloc-static AND NOT TARGET mimalloc)
 add_library(mimalloc INTERFACE IMPORTED)
 set_target_properties(mimalloc PROPERTIES INTERFACE_LINK_LIBRARIES mimalloc-static)
+ set_target_properties(mimalloc PROPERTIES INTERFACE_INCLUDE_DIRECTORIES mimalloc-static)
 endif()

From e4befd1ce820c6988210156e291d5021e263b5d3 Mon Sep 17 00:00:00 2001
From: Daan
Date: Mon, 13 Jan 2025 17:02:02 -0800
Subject: [PATCH 201/264] vcpkg: bump sha

---
 contrib/vcpkg/portfile.cmake | 4 ++--
 contrib/vcpkg/vcpkg.json | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/contrib/vcpkg/portfile.cmake b/contrib/vcpkg/portfile.cmake
index f5f39009..55f0172f 100644
--- a/contrib/vcpkg/portfile.cmake
+++ b/contrib/vcpkg/portfile.cmake
@@ -5,11 +5,11 @@ vcpkg_from_github(
 # The "REF" can be a commit hash, branch name (dev2), or a version (v2.2.1).
 # REF "v${VERSION}"
- REF 866ce5b89db1dbc3e66bbf89041291fd16329518
+ REF 6a89f8554eaab8d8d00e17b5b09f79e1d8dbf61b

 # The sha512 is the hash of the tar.gz bundle.
 # (To get the sha512, run `vcpkg install mimalloc[override] --overlay-ports=` and copy the sha from the error message.)
- SHA512 0b0e5ff823c49b9534b8c32800679806c5d7c29020af058da043c3e6e36ae3c32a1cdd5a21ece97dd60bc7dd4703967f683beac435dbb8514638a6cc55e5dea8 + SHA512 32b87a3195efcc558b83a546348a8fb544fed335cdd6c9f8e7e9d0e8e64540fdcf1f4aa57fd0e783b78731518f4810292b832227d7e7665bf8426f1e6ce96f9d ) vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS diff --git a/contrib/vcpkg/vcpkg.json b/contrib/vcpkg/vcpkg.json index bdbe9ba1..95d3b15d 100644 --- a/contrib/vcpkg/vcpkg.json +++ b/contrib/vcpkg/vcpkg.json @@ -1,6 +1,6 @@ { "name": "mimalloc", - "version": "1.9.2", + "version": "3.0.2", "port-version": 2, "description": "Compact general purpose allocator with excellent performance", "homepage": "https://github.com/microsoft/mimalloc", From bc10fe27c657d72fb26592f78d31e9d763165438 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 15 Jan 2025 11:37:20 -0800 Subject: [PATCH 202/264] fix unregister from the page-map --- src/page-map.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/src/page-map.c b/src/page-map.c index 1cf0b07b..25f8a7ec 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -296,12 +296,16 @@ void _mi_page_map_register(mi_page_t* page) { void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); + mi_assert_internal(page != NULL); + mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); + mi_assert_internal(_mi_page_map != NULL); + if mi_unlikely(_mi_page_map == NULL) return; // get index and count size_t slice_count; size_t sub_idx; const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); // unset the offsets - mi_page_map_set_range(page, idx, sub_idx, slice_count); + mi_page_map_set_range(NULL, idx, sub_idx, slice_count); } void _mi_page_map_unregister_range(void* start, size_t size) { From be2cb44de44b13c5905886a16c7b46c498125321 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 15 Jan 2025 12:02:34 -0800 Subject: [PATCH 203/264] fix NULL pointer in _mi_safe_ptr_page to return a reference to the empty page --- src/page-map.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/page-map.c b/src/page-map.c index 25f8a7ec..641ab405 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -305,7 +305,7 @@ void _mi_page_map_unregister(mi_page_t* page) { size_t sub_idx; const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); // unset the offsets - mi_page_map_set_range(NULL, idx, sub_idx, slice_count); + // mi_page_map_set_range(NULL, idx, sub_idx, slice_count); } void _mi_page_map_unregister_range(void* start, size_t size) { @@ -318,6 +318,7 @@ void _mi_page_map_unregister_range(void* start, size_t size) { mi_page_t* _mi_safe_ptr_page(const void* p) { if mi_unlikely(p >= mi_page_map_max_address) return NULL; + if (p == NULL) return (mi_page_t*)&_mi_page_empty; // to match mi_free expectation size_t sub_idx; const size_t idx = _mi_page_map_index(p,&sub_idx); if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL; From 5af1eb1144bf4777495f76bfff435443e8302e7f Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 15 Jan 2025 12:07:06 -0800 Subject: [PATCH 204/264] fix NULL pointer in _mi_safe_ptr_page to return a reference to the empty page --- src/page-map.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/page-map.c b/src/page-map.c index 641ab405..be99814c 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -315,10 +315,10 @@ void _mi_page_map_unregister_range(void* start, size_t size) { mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not 
already committed? } - +// Return the empty page for the NULL pointer to match the behaviour of `_mi_ptr_page` mi_page_t* _mi_safe_ptr_page(const void* p) { if mi_unlikely(p >= mi_page_map_max_address) return NULL; - if (p == NULL) return (mi_page_t*)&_mi_page_empty; // to match mi_free expectation + if (p == NULL) return (mi_page_t*)&_mi_page_empty; // to match `_mi_ptr_page` (see `mi_free` as well) size_t sub_idx; const size_t idx = _mi_page_map_index(p,&sub_idx); if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL; @@ -328,7 +328,7 @@ mi_page_t* _mi_safe_ptr_page(const void* p) { } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - return (_mi_safe_ptr_page(p) != NULL); + return (p != NULL && _mi_safe_ptr_page(p) != NULL); } #endif From 7b8a7107747935d059c04ee8d555dc8170057d35 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 16 Jan 2025 14:00:42 -0800 Subject: [PATCH 205/264] windows on arm threadpool detect --- src/prim/windows/prim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 5ba7aa4f..da664318 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -834,7 +834,7 @@ static void NTAPI mi_win_main(PVOID module, DWORD reason, LPVOID reserved) { #endif bool _mi_prim_thread_is_in_threadpool(void) { - #if (MI_ARCH_X64 || MI_ARCH_X86) + #if (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64) if (win_major_version >= 6) { // check if this thread belongs to a windows threadpool // see: From 899fd7694b15d31e3fb86c3d099cc6c2e4f144df Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 19:28:43 -0800 Subject: [PATCH 206/264] fix unused function warnings; unregister pages --- src/bitmap.c | 24 +++++++++++++----------- src/page-map.c | 4 ++-- 2 files changed, 15 insertions(+), 13 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index ff1a139f..8a7a9442 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -153,11 +153,11 @@ static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already return (old==0); } -static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) { - const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero()); - if (all_clear!=NULL) { *all_clear = true; } - return (~old==0); -} +// static inline bool mi_bfield_atomic_clearX(_Atomic(mi_bfield_t)*b, bool* all_clear) { +// const mi_bfield_t old = mi_atomic_exchange_release(b, mi_bfield_zero()); +// if (all_clear!=NULL) { *all_clear = true; } +// return (~old==0); +// } // ------- mi_bfield_atomic_try_clear --------------------------------------- @@ -434,12 +434,12 @@ static inline bool mi_bchunk_try_clearNX(mi_bchunk_t* chunk, size_t cidx, size_t } // Clear a full aligned bfield. -static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { - mi_assert_internal(cidx < MI_BCHUNK_BITS); - mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); - const size_t i = cidx / MI_BFIELD_BITS; - return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); -} +// static inline bool mi_bchunk_try_clearX(mi_bchunk_t* chunk, size_t cidx, bool* pmaybe_all_clear) { +// mi_assert_internal(cidx < MI_BCHUNK_BITS); +// mi_assert_internal((cidx%MI_BFIELD_BITS) == 0); +// const size_t i = cidx / MI_BFIELD_BITS; +// return mi_bfield_atomic_try_clearX(&chunk->bfields[i], pmaybe_all_clear); +// } // Try to atomically clear a sequence of `n` bits within a chunk. 
// Returns true if all bits transitioned from 1 to 0, @@ -717,6 +717,7 @@ static inline bool mi_bchunk_try_find_and_clear_8(mi_bchunk_t* chunk, size_t n, // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. // Used to find large size pages in the free blocks. // todo: try neon version +/* static mi_decl_noinline bool mi_bchunk_try_find_and_clearX(mi_bchunk_t* chunk, size_t* pidx) { #if MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512) while (true) { @@ -759,6 +760,7 @@ static inline bool mi_bchunk_try_find_and_clear_X(mi_bchunk_t* chunk, size_t n, mi_assert_internal(n==MI_BFIELD_BITS); MI_UNUSED(n); return mi_bchunk_try_find_and_clearX(chunk, pidx); } +*/ // find a sequence of `n` bits in a chunk with `0 < n <= MI_BFIELD_BITS` with all bits set, // and try to clear them atomically. diff --git a/src/page-map.c b/src/page-map.c index be99814c..2b610935 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -298,17 +298,17 @@ void _mi_page_map_unregister(mi_page_t* page) { mi_assert_internal(_mi_page_map != NULL); mi_assert_internal(page != NULL); mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); - mi_assert_internal(_mi_page_map != NULL); if mi_unlikely(_mi_page_map == NULL) return; // get index and count size_t slice_count; size_t sub_idx; const size_t idx = mi_page_map_get_idx(page, &sub_idx, &slice_count); // unset the offsets - // mi_page_map_set_range(NULL, idx, sub_idx, slice_count); + mi_page_map_set_range(NULL, idx, sub_idx, slice_count); } void _mi_page_map_unregister_range(void* start, size_t size) { + if mi_unlikely(_mi_page_map == NULL) return; const size_t slice_count = _mi_divide_up(size, MI_ARENA_SLICE_SIZE); size_t sub_idx; const uintptr_t idx = _mi_page_map_index(start, &sub_idx); From 6137ae4ab8f507a8b70b722ca8f075c52338278d Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 20:12:13 -0800 Subject: [PATCH 207/264] fix page_flags --- include/mimalloc/internal.h | 32 +++++++++++++++++--------------- include/mimalloc/types.h | 2 +- src/arena.c | 10 ++++++++-- 3 files changed, 26 insertions(+), 18 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 535fe1fb..e43d4420 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -597,19 +597,6 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { return page->heap; } -static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { - if (heap != NULL) { - page->heap = heap; - page->heap_tag = heap->tag; - mi_atomic_store_release(&page->xthread_id, heap->tld->thread_id); - } - else { - page->heap = NULL; - mi_atomic_store_release(&page->xthread_id,0); - } -} - - // Thread free flag helpers static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { return (mi_block_t*)(tf & ~1); @@ -700,11 +687,11 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_abandoned(const mi_page_t* page) { // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_page_xthread_id(page) <= MI_PAGE_IS_ABANDONED_MAPPED); + return (mi_page_thread_id(page) == 0); } static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return (mi_page_xthread_id(page) == MI_PAGE_IS_ABANDONED_MAPPED); + return ((mi_page_xthread_id(page) & ~(MI_PAGE_IS_ABANDONED_MAPPED - 1)) == MI_PAGE_IS_ABANDONED_MAPPED); } static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { @@ -801,6 +788,21 @@ static inline void 
mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED); } +static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { + mi_assert_internal(!mi_page_is_in_full(page)); + const mi_page_flags_t flags = mi_page_flags(page); + const mi_threadid_t tid = (heap != NULL ? heap->tld->thread_id : 0) | flags; // for MI_PAGE_HAS_ALIGNED + if (heap != NULL) { + page->heap = heap; + page->heap_tag = heap->tag; + } + else { + page->heap = NULL; + } + mi_atomic_store_release(&page->xthread_id, tid); +} + + /* ------------------------------------------------------------------- Guarded objects ------------------------------------------------------------------- */ diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 7e968e10..2a1702ff 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -248,7 +248,7 @@ typedef struct mi_block_s { // `is_abandoned_mapped` is true if the page is abandoned (thread_id==0) and it is in an arena so can be quickly found for reuse ("mapped") #define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) #define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) -#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) +#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) // must be highest flag (see `internal.h:mi_page_is_abandoned_mapped`) #define MI_PAGE_FLAG_MASK MI_ZU(0x07) typedef size_t mi_page_flags_t; diff --git a/src/arena.c b/src/arena.c index bcde865e..e111a417 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1833,9 +1833,15 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* // find accessed size size_t asize; // scan the commit map for the highest entry + // scan the commit map for the highest entry size_t idx; - if (mi_bitmap_bsr(arena->slices_committed, &idx)) { - asize = (idx + 1)* MI_ARENA_SLICE_SIZE; + //if (mi_bitmap_bsr(arena->slices_committed, &idx)) { + // asize = (idx + 1)* MI_ARENA_SLICE_SIZE; + //} + if (mi_bitmap_bsr(arena->pages, &idx)) { + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, idx); + const size_t page_slice_count = page->memid.mem.arena.slice_count; + asize = mi_size_of_slices(idx + page_slice_count); } else { asize = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; From 3f6d286a088c726b96a38d38bed6000249b098bf Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 20:38:02 -0800 Subject: [PATCH 208/264] fix bug in page flag set that would keep pages abandoned --- include/mimalloc/internal.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e43d4420..d96cfa4c 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -790,11 +790,12 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { mi_assert_internal(!mi_page_is_in_full(page)); - const mi_page_flags_t flags = mi_page_flags(page); - const mi_threadid_t tid = (heap != NULL ? heap->tld->thread_id : 0) | flags; // for MI_PAGE_HAS_ALIGNED + // only the aligned flag is retained (and in particular clear the abandoned-mapped flag). + const mi_page_flags_t flags = (mi_page_has_aligned(page) ? MI_PAGE_HAS_ALIGNED : 0); + const mi_threadid_t tid = (heap == NULL ? 
0 : heap->tld->thread_id) | flags; if (heap != NULL) { page->heap = heap; - page->heap_tag = heap->tag; + page->heap_tag = heap->tag; } else { page->heap = NULL; From 570b6b5a7a4509cf659b38ff032eeedb58923db2 Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 20:53:16 -0800 Subject: [PATCH 209/264] slightly better bsf --- include/mimalloc/bits.h | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 5b847f4b..64875e9d 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -205,9 +205,8 @@ static inline size_t mi_ctz(size_t x) { #elif mi_has_builtinz(ctz) return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) - if (x==0) return MI_SIZE_BITS; - size_t r; - __asm ("bsf\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + size_t r = MI_SIZE_BITS; // bsf leaves destination unmodified if the argument is 0 (see ) + __asm ("bsf\t%1, %0" : "+r"(r) : "r"(x) : "cc"); return r; #elif MI_HAS_FAST_POPCOUNT return (x!=0 ? (mi_popcount(x^(x-1))-1) : MI_SIZE_BITS); From 5946e9cebf8e713fc17d23417cc6c34acf6cd76f Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 21 Jan 2025 20:58:45 -0800 Subject: [PATCH 210/264] fix assert --- include/mimalloc/internal.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index d96cfa4c..01373025 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -789,7 +789,7 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { } static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { - mi_assert_internal(!mi_page_is_in_full(page)); + // mi_assert_internal(!mi_page_is_in_full(page)); // can happen when destroying pages on heap_destroy // only the aligned flag is retained (and in particular clear the abandoned-mapped flag). const mi_page_flags_t flags = (mi_page_has_aligned(page) ? MI_PAGE_HAS_ALIGNED : 0); const mi_threadid_t tid = (heap == NULL ? 0 : heap->tld->thread_id) | flags; From 7703d14e8c3cf47140270b00e10cefcc4eea18cd Mon Sep 17 00:00:00 2001 From: Daan Date: Wed, 22 Jan 2025 11:21:22 -0800 Subject: [PATCH 211/264] redefine abandoned mapped as a special thread id --- include/mimalloc/internal.h | 215 +++++++++++++++++------------------- include/mimalloc/types.h | 12 +- 2 files changed, 110 insertions(+), 117 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 01373025..8e7ed5e9 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -597,45 +597,6 @@ static inline mi_heap_t* mi_page_heap(const mi_page_t* page) { return page->heap; } -// Thread free flag helpers -static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { - return (mi_block_t*)(tf & ~1); -} -static inline bool mi_tf_is_owned(mi_thread_free_t tf) { - return ((tf & 1) == 1); -} -static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { - return (mi_thread_free_t)((uintptr_t)block | (owned ? 
1 : 0)); -} - - -// Thread id of thread that owns this page (with flags in the bottom 2 bits) -static inline mi_threadid_t mi_page_xthread_id(const mi_page_t* page) { - return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id); -} - -// Plain thread id of the thread that owns this page -static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { - return (mi_page_xthread_id(page) & ~MI_PAGE_FLAG_MASK); -} - -// Thread free access -static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { - return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); -} - -// Owned? -static inline bool mi_page_is_owned(const mi_page_t* page) { - return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); -} - - -//static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) { -// return mi_tf_make(mi_tf_block(tf),delayed); -//} -//static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) { -// return mi_tf_make(block, mi_tf_delayed(tf)); -//} // are all blocks in a page freed? // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`. @@ -644,12 +605,6 @@ static inline bool mi_page_all_free(const mi_page_t* page) { return (page->used == 0); } -// are there any available blocks? -static inline bool mi_page_has_any_available(const mi_page_t* page) { - mi_assert_internal(page != NULL && page->reserved > 0); - return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); -} - // are there immediately available blocks, i.e. blocks available on the free list. static inline bool mi_page_immediate_available(const mi_page_t* page) { mi_assert_internal(page != NULL); @@ -685,25 +640,6 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { return (page->reserved - page->used <= frac); } -static inline bool mi_page_is_abandoned(const mi_page_t* page) { - // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) - return (mi_page_thread_id(page) == 0); -} - -static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { - return ((mi_page_xthread_id(page) & ~(MI_PAGE_IS_ABANDONED_MAPPED - 1)) == MI_PAGE_IS_ABANDONED_MAPPED); -} - -static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { - mi_assert_internal(mi_page_is_abandoned(page)); - mi_atomic_or_relaxed(&page->xthread_id, MI_PAGE_IS_ABANDONED_MAPPED); -} - -static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { - mi_assert_internal(mi_page_is_abandoned_mapped(page)); - mi_atomic_and_relaxed(&page->xthread_id, ~MI_PAGE_IS_ABANDONED_MAPPED); -} - static inline bool mi_page_is_huge(const mi_page_t* page) { return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || @@ -717,6 +653,109 @@ static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) } +//----------------------------------------------------------- +// Page thread id and flags +//----------------------------------------------------------- + +// Thread id of thread that owns this page (with flags in the bottom 2 bits) +static inline mi_threadid_t mi_page_xthread_id(const mi_page_t* page) { + return mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_id); +} + +// Plain thread id of the thread that owns this page +static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) { + return (mi_page_xthread_id(page) & ~MI_PAGE_FLAG_MASK); +} + +static inline mi_page_flags_t mi_page_flags(const 
mi_page_t* page) { + return (mi_page_xthread_id(page) & MI_PAGE_FLAG_MASK); +} + +static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { + if (set) { mi_atomic_or_relaxed(&page->xthread_id, newflag); } + else { mi_atomic_and_relaxed(&page->xthread_id, ~newflag); } +} + +static inline bool mi_page_is_in_full(const mi_page_t* page) { + return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0); +} + +static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { + mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE); +} + +static inline bool mi_page_has_aligned(const mi_page_t* page) { + return ((mi_page_flags(page) & MI_PAGE_HAS_ALIGNED) != 0); +} + +static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { + mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED); +} + +static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { + // mi_assert_internal(!mi_page_is_in_full(page)); // can happen when destroying pages on heap_destroy + const mi_threadid_t tid = (heap == NULL ? MI_THREADID_ABANDONED : heap->tld->thread_id) | mi_page_flags(page); + if (heap != NULL) { + page->heap = heap; + page->heap_tag = heap->tag; + } + else { + page->heap = NULL; + } + mi_atomic_store_release(&page->xthread_id, tid); +} + +static inline bool mi_page_is_abandoned(const mi_page_t* page) { + // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free) + return (mi_page_thread_id(page) <= MI_THREADID_ABANDONED_MAPPED); +} + +static inline bool mi_page_is_abandoned_mapped(const mi_page_t* page) { + return (mi_page_thread_id(page) == MI_THREADID_ABANDONED_MAPPED); +} + +static inline void mi_page_set_abandoned_mapped(mi_page_t* page) { + mi_assert_internal(mi_page_is_abandoned(page)); + mi_atomic_or_relaxed(&page->xthread_id, MI_THREADID_ABANDONED_MAPPED); +} + +static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) { + mi_assert_internal(mi_page_is_abandoned_mapped(page)); + mi_atomic_and_relaxed(&page->xthread_id, MI_PAGE_FLAG_MASK); +} + +//----------------------------------------------------------- +// Thread free list and ownership +//----------------------------------------------------------- + +// Thread free flag helpers +static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) { + return (mi_block_t*)(tf & ~1); +} +static inline bool mi_tf_is_owned(mi_thread_free_t tf) { + return ((tf & 1) == 1); +} +static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) { + return (mi_thread_free_t)((uintptr_t)block | (owned ? 1 : 0)); +} + +// Thread free access +static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) { + return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); +} + +// are there any available blocks? +static inline bool mi_page_has_any_available(const mi_page_t* page) { + mi_assert_internal(page != NULL && page->reserved > 0); + return (page->used < page->reserved || (mi_page_thread_free(page) != NULL)); +} + + +// Owned? 
+static inline bool mi_page_is_owned(const mi_page_t* page) { + return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free)); +} + // Unown a page that is currently owned static inline void _mi_page_unown_unconditional(mi_page_t* page) { mi_assert_internal(mi_page_is_owned(page)); @@ -725,7 +764,6 @@ static inline void _mi_page_unown_unconditional(mi_page_t* page) { mi_assert_internal((old&1)==1); MI_UNUSED(old); } - // get ownership if it is not yet owned static inline bool mi_page_try_claim_ownership(mi_page_t* page) { const uintptr_t old = mi_atomic_or_acq_rel(&page->xthread_free, 1); @@ -756,53 +794,6 @@ static inline bool _mi_page_unown(mi_page_t* page) { return false; } -//----------------------------------------------------------- -// Page flags -//----------------------------------------------------------- -static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) { - return (mi_page_xthread_id(page) & MI_PAGE_FLAG_MASK); -} - -static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) { - if (set) { - mi_atomic_or_relaxed(&page->xthread_id, newflag); - } - else { - mi_atomic_and_relaxed(&page->xthread_id, ~newflag); - } -} - -static inline bool mi_page_is_in_full(const mi_page_t* page) { - return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0); -} - -static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) { - mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE); -} - -static inline bool mi_page_has_aligned(const mi_page_t* page) { - return ((mi_page_flags(page) & MI_PAGE_HAS_ALIGNED) != 0); -} - -static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) { - mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED); -} - -static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) { - // mi_assert_internal(!mi_page_is_in_full(page)); // can happen when destroying pages on heap_destroy - // only the aligned flag is retained (and in particular clear the abandoned-mapped flag). - const mi_page_flags_t flags = (mi_page_has_aligned(page) ? MI_PAGE_HAS_ALIGNED : 0); - const mi_threadid_t tid = (heap == NULL ? 
0 : heap->tld->thread_id) | flags; - if (heap != NULL) { - page->heap = heap; - page->heap_tag = heap->tag; - } - else { - page->heap = NULL; - } - mi_atomic_store_release(&page->xthread_id, tid); -} - /* ------------------------------------------------------------------- Guarded objects diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 2a1702ff..0bf5722b 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -242,16 +242,18 @@ typedef struct mi_block_s { } mi_block_t; -// The page flags are put in the bottom 3 bits of the thread_id (for a fast test in `mi_free`) +// The page flags are put in the bottom 2 bits of the thread_id (for a fast test in `mi_free`) // `has_aligned` is true if the page has pointers at an offset in a block (so we unalign before free-ing) // `in_full_queue` is true if the page is full and resides in the full queue (so we move it to a regular queue on free-ing) -// `is_abandoned_mapped` is true if the page is abandoned (thread_id==0) and it is in an arena so can be quickly found for reuse ("mapped") #define MI_PAGE_IN_FULL_QUEUE MI_ZU(0x01) #define MI_PAGE_HAS_ALIGNED MI_ZU(0x02) -#define MI_PAGE_IS_ABANDONED_MAPPED MI_ZU(0x04) // must be highest flag (see `internal.h:mi_page_is_abandoned_mapped`) -#define MI_PAGE_FLAG_MASK MI_ZU(0x07) +#define MI_PAGE_FLAG_MASK MI_ZU(0x03) typedef size_t mi_page_flags_t; +// There are two special threadid's: 0 for abandoned threads, and 4 for abandoned & mapped threads -- +// abandoned-mapped pages are abandoned but also mapped in an arena so can be quickly found for reuse. +#define MI_THREADID_ABANDONED MI_ZU(0) +#define MI_THREADID_ABANDONED_MAPPED (MI_PAGE_FLAG_MASK + 1) // Thread free list. // Points to a list of blocks that are freed by other threads. @@ -292,7 +294,7 @@ typedef uint8_t mi_heaptag_t; // - Using `uint16_t` does not seem to slow things down typedef struct mi_page_s { - _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. (= `heap->thread_id (or 0 if abandoned) | page_flags`) + _Atomic(mi_threadid_t) xthread_id; // thread this page belongs to. 
(= `heap->thread_id (or 0 or 4 if abandoned) | page_flags`) mi_block_t* free; // list of available free blocks (`malloc` allocates from this list) uint16_t used; // number of blocks in use (including blocks in `thread_free`) From a7370dcbd21f0497bbeb666f22f2e653001ab4c4 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 22 Jan 2025 12:25:02 -0800 Subject: [PATCH 212/264] fix highest allocated page for arena unload --- src/arena.c | 9 +++++++-- test/main-override-dep.cpp | 10 ++++++++++ test/main-override-dep.h | 1 + test/main-override.cpp | 10 ++++++---- 4 files changed, 24 insertions(+), 6 deletions(-) diff --git a/src/arena.c b/src/arena.c index bcde865e..4ad4bb0e 100644 --- a/src/arena.c +++ b/src/arena.c @@ -1834,8 +1834,13 @@ mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* size_t asize; // scan the commit map for the highest entry size_t idx; - if (mi_bitmap_bsr(arena->slices_committed, &idx)) { - asize = (idx + 1)* MI_ARENA_SLICE_SIZE; + //if (mi_bitmap_bsr(arena->slices_committed, &idx)) { + // asize = (idx + 1)* MI_ARENA_SLICE_SIZE; + //} + if (mi_bitmap_bsr(arena->pages, &idx)) { + mi_page_t* page = (mi_page_t*)mi_arena_slice_start(arena, idx); + const size_t page_slice_count = page->memid.mem.arena.slice_count; + asize = mi_size_of_slices(idx + page_slice_count); } else { asize = mi_arena_info_slices(arena) * MI_ARENA_SLICE_SIZE; diff --git a/test/main-override-dep.cpp b/test/main-override-dep.cpp index e92f6fc4..edb57f1f 100644 --- a/test/main-override-dep.cpp +++ b/test/main-override-dep.cpp @@ -12,4 +12,14 @@ std::string TestAllocInDll::GetString() std::string r = test; delete[] test; return r; +} + +#include + +void TestAllocInDll::TestHeapAlloc() +{ + HANDLE heap = GetProcessHeap(); + int* p = (int*)HeapAlloc(heap, 0, sizeof(int)); + *p = 42; + HeapFree(heap, 0, p); } \ No newline at end of file diff --git a/test/main-override-dep.h b/test/main-override-dep.h index 4826f25f..9d4aabfd 100644 --- a/test/main-override-dep.h +++ b/test/main-override-dep.h @@ -8,4 +8,5 @@ class TestAllocInDll { public: __declspec(dllexport) std::string GetString(); + __declspec(dllexport) void TestHeapAlloc(); }; diff --git a/test/main-override.cpp b/test/main-override.cpp index db594acc..af385992 100644 --- a/test/main-override.cpp +++ b/test/main-override.cpp @@ -37,7 +37,7 @@ static void test_thread_local(); // issue #944 static void test_mixed1(); // issue #942 static void test_stl_allocators(); -#if x_WIN32 +#if _WIN32 #include "main-override-dep.h" static void test_dep(); // issue #981: test overriding in another DLL #else @@ -46,8 +46,8 @@ static void test_dep() { }; int main() { mi_stats_reset(); // ignore earlier allocations - various_tests(); - test_mixed1(); + //various_tests(); + //test_mixed1(); test_dep(); @@ -145,11 +145,13 @@ static bool test_stl_allocator1() { struct some_struct { int i; int j; double z; }; -#if x_WIN32 +#if _WIN32 static void test_dep() { TestAllocInDll t; std::string s = t.GetString(); + + t.TestHeapAlloc(); } #endif From dd4b6fc0783868c6ca19a57d6fb341f92a854e1e Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 31 Jan 2025 11:54:51 -0800 Subject: [PATCH 213/264] update options --- include/mimalloc.h | 7 +++---- src/options.c | 7 +++---- src/page.c | 2 +- 3 files changed, 7 insertions(+), 9 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 8b453247..46335619 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -381,11 +381,11 @@ typedef enum mi_option_e { mi_option_os_tag, // tag used for OS logging (macOS 
only for now) (=100) mi_option_max_errors, // issue at most N error messages mi_option_max_warnings, // issue at most N warning messages - mi_option_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%) + mi_option_deprecated_max_segment_reclaim, // max. percentage of the abandoned segments can be reclaimed per try (=10%) mi_option_destroy_on_exit, // if set, release all memory on exit; sometimes used for dynamic unloading but can be unsafe mi_option_arena_reserve, // initial memory size for arena reservation (= 1 GiB on 64-bit) (internally, this value is in KiB; use `mi_option_get_size`) mi_option_arena_purge_mult, // multiplier for `purge_delay` for the purging delay for arenas (=10) - mi_option_purge_extend_delay, + mi_option_deprecated_purge_extend_delay, mi_option_disallow_arena_alloc, // 1 = do not use arena's for allocation (except if using specific arena id's) mi_option_retry_on_oom, // retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. (only on windows) mi_option_visit_abandoned, // allow visiting heap blocks from abandoned threads (=0) @@ -394,8 +394,7 @@ typedef enum mi_option_e { mi_option_guarded_precise, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0) mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) - mi_option_target_segments_per_thread, // experimental (=0) - mi_option_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) + mi_option_page_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) mi_option_page_full_retain, // retain N full pages per size class (=2) mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4) mi_option_max_vabits, // max user space virtual address bits to consider (=48) diff --git a/src/options.c b/src/options.c index 8d66b320..7b643092 100644 --- a/src/options.c +++ b/src/options.c @@ -150,11 +150,11 @@ static mi_option_desc_t options[_mi_option_last] = { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose { 32, UNINIT, MI_OPTION(max_errors) }, // maximum errors that are output { 32, UNINIT, MI_OPTION(max_warnings) }, // maximum warnings that are output - { 10, UNINIT, MI_OPTION(max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. + { 10, UNINIT, MI_OPTION(deprecated_max_segment_reclaim)}, // max. percentage of the abandoned segments to be reclaimed per try. { 0, UNINIT, MI_OPTION(destroy_on_exit)}, // release all OS memory on process exit; careful with dangling pointer or after-exit frees! { MI_DEFAULT_ARENA_RESERVE, UNINIT, MI_OPTION(arena_reserve) }, // reserve memory N KiB at a time (=1GiB) (use `option_get_size`) { 1, UNINIT, MI_OPTION(arena_purge_mult) }, // purge delay multiplier for arena's - { 1, UNINIT, MI_OPTION_LEGACY(purge_extend_delay, decommit_extend_delay) }, + { 1, UNINIT, MI_OPTION_LEGACY(deprecated_purge_extend_delay, decommit_extend_delay) }, { MI_DEFAULT_DISALLOW_ARENA_ALLOC, UNINIT, MI_OPTION(disallow_arena_alloc) }, // 1 = do not use arena's for allocation (except if using specific arena id's) { 400, UNINIT, MI_OPTION(retry_on_oom) }, // windows only: retry on out-of-memory for N milli seconds (=400), set to 0 to disable retries. 
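  // usage sketch: the renamed options above can still be tuned at startup through the regular
  // option API, for example
  //   mi_option_set(mi_option_page_reclaim_on_free, 0);  // disable reclaiming abandoned pages on free
  //   mi_option_set(mi_option_page_full_retain, 4);      // retain up to 4 full pages per size class
  // or via the corresponding MIMALLOC_ environment variables.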
#if defined(MI_VISIT_ABANDONED) @@ -168,8 +168,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_GUARDED_SAMPLE_RATE, UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, - { 0, UNINIT, MI_OPTION(target_segments_per_thread) }, // abandon segments beyond this point, or 0 to disable. - { 0, UNINIT, MI_OPTION_LEGACY(reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 1, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(page_full_retain) }, { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, diff --git a/src/page.c b/src/page.c index d2d6a854..af1d5072 100644 --- a/src/page.c +++ b/src/page.c @@ -680,7 +680,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m size_t count = 0; #endif long candidate_limit = 0; // we reset this on the first candidate to limit the search - long page_full_retain = heap->page_full_retain; + long page_full_retain = (pq->block_size > MI_SMALL_MAX_OBJ_SIZE ? 0 : heap->page_full_retain); // only retain small pages mi_page_t* page_candidate = NULL; // a page with free space mi_page_t* page = pq->first; From 274bcb61db6b7b7447db2b3b0901d7005a242f85 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 31 Jan 2025 12:11:25 -0800 Subject: [PATCH 214/264] update option names --- src/free.c | 4 ++-- src/heap.c | 2 +- src/init.c | 2 +- test/test-stress.c | 6 +++--- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/src/free.c b/src/free.c index 5d9628f0..865efafa 100644 --- a/src/free.c +++ b/src/free.c @@ -217,7 +217,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe // 2. if the page is not too full, we can try to reclaim it for ourselves // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. - if (_mi_option_get_fast(mi_option_reclaim_on_free) != 0 && + if (_mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && !mi_page_is_used_at_frac(page,8) // && !mi_page_is_abandoned_mapped(page) ) @@ -237,7 +237,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe (_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) 
) { - if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for an block_size we don't use + if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for a block_size we don't use // first remove it from the abandoned pages in the arena -- this waits for any readers to finish _mi_arenas_page_unabandon(page); _mi_heap_page_reclaim(tagheap, page); diff --git a/src/heap.c b/src/heap.c index 82ca05cb..1ae7e99f 100644 --- a/src/heap.c +++ b/src/heap.c @@ -175,7 +175,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, heap->memid = memid; heap->tld = tld; // avoid reading the thread-local tld during initialization heap->exclusive_arena = _mi_arena_from_id(arena_id); - heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_reclaim_on_free)); + heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_page_reclaim_on_free)); heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0); heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; diff --git a/src/init.c b/src/init.c index ac49d292..33c9794d 100644 --- a/src/init.c +++ b/src/init.c @@ -259,7 +259,7 @@ static void mi_heap_main_init(void) { //heap_main.keys[0] = _mi_heap_random_next(&heap_main); //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); - heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_reclaim_on_free); + heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_page_reclaim_on_free); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); heap_main.page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } diff --git a/test/test-stress.c b/test/test-stress.c index fb27a786..303d9f42 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -303,12 +303,12 @@ int main(int argc, char** argv) { mi_option_enable(mi_option_visit_abandoned); #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) - mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); - mi_option_set(mi_option_purge_delay,1); + // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + // mi_option_set(mi_option_purge_delay,1); #endif #if defined(NDEBUG) && !defined(USE_STD_MALLOC) // mi_option_set(mi_option_purge_delay,-1); - mi_option_set(mi_option_reclaim_on_free, 0); + mi_option_set(mi_option_page_reclaim_on_free, 0); #endif #ifndef USE_STD_MALLOC mi_stats_reset(); From d55fde118981a481655a679b38befda877e78192 Mon Sep 17 00:00:00 2001 From: Daan Date: Fri, 31 Jan 2025 13:34:16 -0800 Subject: [PATCH 215/264] change defaults in test-stress --- test/test-stress.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test-stress.c b/test/test-stress.c index fb27a786..f7ae6fea 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -303,8 +303,8 @@ int main(int argc, char** argv) { mi_option_enable(mi_option_visit_abandoned); #endif #if !defined(NDEBUG) && !defined(USE_STD_MALLOC) - mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); - mi_option_set(mi_option_purge_delay,1); + // mi_option_set(mi_option_arena_reserve, 32 * 1024 /* in kib = 32MiB */); + // mi_option_set(mi_option_purge_delay,1); #endif #if defined(NDEBUG) && !defined(USE_STD_MALLOC) // mi_option_set(mi_option_purge_delay,-1); From 59eeeadc3473e6d38dd83bc41d317b494df1f8ef Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: 
Tue, 4 Feb 2025 12:26:21 -0800 Subject: [PATCH 216/264] only allow page_reclaim_on_free for small block pages --- bin/readme.md | 2 +- include/mimalloc/internal.h | 2 +- src/free.c | 9 +++++---- 3 files changed, 7 insertions(+), 6 deletions(-) diff --git a/bin/readme.md b/bin/readme.md index f08b2e87..b79157de 100644 --- a/bin/readme.md +++ b/bin/readme.md @@ -63,7 +63,7 @@ need a specific redirection DLL: mode on Windows arm64. Unfortunately we cannot run x64 code emulated on Windows arm64 with the x64 mimalloc override directly (since the C runtime always uses `arm64ec`). Instead: 1. Build the program as normal for x64 and link as normal with the x64 - `mimalloc.lib` export library. + `mimalloc.dll.lib` export library. 2. Now separately build `mimalloc.dll` in `arm64ec` mode and _overwrite_ your previous (x64) `mimalloc.dll` -- the loader can handle the mix of arm64ec and x64 code. Now use `mimalloc-redirect-arm64ec.dll` to match your new diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 8e7ed5e9..e18390a8 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -433,7 +433,7 @@ static inline bool mi_heap_is_backing(const mi_heap_t* heap) { return (heap->tld->heap_backing == heap); } -static inline bool mi_heap_is_initialized(mi_heap_t* heap) { +static inline bool mi_heap_is_initialized(const mi_heap_t* heap) { mi_assert_internal(heap != NULL); return (heap != NULL && heap != &_mi_heap_empty); } diff --git a/src/free.c b/src/free.c index 865efafa..1a81c504 100644 --- a/src/free.c +++ b/src/free.c @@ -185,7 +185,7 @@ void mi_free(void* p) mi_attr_noexcept else { // page is full or contains (inner) aligned blocks; use generic multi-thread path mi_free_generic_mt(page, p); - } + } } @@ -218,7 +218,8 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe // 2. if the page is not too full, we can try to reclaim it for ourselves // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. if (_mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && - !mi_page_is_used_at_frac(page,8) + page->block_size <= MI_SMALL_MAX_OBJ_SIZE && // only for small sized blocks + !mi_page_is_used_at_frac(page,8) // and not too full // && !mi_page_is_abandoned_mapped(page) ) { @@ -228,11 +229,11 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noe // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) mi_heap_t* const heap = mi_prim_get_default_heap(); - if (heap != (mi_heap_t*)&_mi_heap_empty) // we did not already terminate our thread (can this happen? + if (mi_heap_is_initialized(heap)) // we did not already terminate our thread { mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); if ((tagheap != NULL) && // don't reclaim across heap object types - (tagheap->allow_page_reclaim) && // we are allowed to reclaim abandoned pages + (tagheap->allow_page_reclaim) && // and we are allowed to reclaim abandoned pages // (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) (_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) 
) From db7930f961ceb781cd4e70140676e389db4576f1 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 4 Feb 2025 14:58:08 -0800 Subject: [PATCH 217/264] avoid atomics in mi_free_try_collect_mt --- include/mimalloc/internal.h | 3 +- src/free.c | 14 ++++--- src/heap.c | 8 ++-- src/page.c | 80 ++++++++++++++++++++++++------------- 4 files changed, 67 insertions(+), 38 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index e18390a8..c1e55ddc 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -199,7 +199,8 @@ void _mi_heap_collect_retired(mi_heap_t* heap, bool force); size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append); void _mi_deferred_free(mi_heap_t* heap, bool force); -void _mi_page_free_collect(mi_page_t* page,bool force); +void _mi_page_free_collect(mi_page_t* page, bool force); +void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head); void _mi_page_init(mi_heap_t* heap, mi_page_t* page); size_t _mi_bin_size(uint8_t bin); // for stats diff --git a/src/free.c b/src/free.c index 1a81c504..ebcf08ab 100644 --- a/src/free.c +++ b/src/free.c @@ -48,7 +48,7 @@ static inline void mi_free_block_local(mi_page_t* page, mi_block_t* block, bool } // Forward declaration for multi-threaded collect -static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept; +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept; // Free a block multi-threaded static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_noexcept @@ -69,14 +69,14 @@ static inline void mi_free_block_mt(mi_page_t* page, mi_block_t* block) mi_attr_ mi_thread_free_t tf_old = mi_atomic_load_relaxed(&page->xthread_free); do { mi_block_set_next(page, block, mi_tf_block(tf_old)); - tf_new = mi_tf_create(block, true /* always owned: try to claim it if abandoned */); + tf_new = mi_tf_create(block, true /* always use owned: try to claim it if the page is abandoned */); } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_old, tf_new)); // todo: release is enough? // and atomically try to collect the page if it was abandoned const bool is_owned_now = !mi_tf_is_owned(tf_old); if (is_owned_now) { mi_assert_internal(mi_page_is_abandoned(page)); - mi_free_try_collect_mt(page); + mi_free_try_collect_mt(page,block); } } @@ -194,18 +194,20 @@ void mi_free(void* p) mi_attr_noexcept // ------------------------------------------------------ -static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page) mi_attr_noexcept { +static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept { mi_assert_internal(mi_page_is_owned(page)); mi_assert_internal(mi_page_is_abandoned(page)); // we own the page now.. // safe to collect the thread atomic free list - _mi_page_free_collect(page, false); // update `used` count + // use the `_partly` version to avoid atomic operations since we already have the `mt_free` pointing into the thread free list + _mi_page_free_collect_partly(page, mt_free); + #if MI_DEBUG > 1 if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); } #endif - // 1. free if the page is free now + // 1. 
free if the page is free now (this is updated by `_mi_page_free_collect_partly`) if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) diff --git a/src/heap.c b/src/heap.c index 1ae7e99f..10c65ff2 100644 --- a/src/heap.c +++ b/src/heap.c @@ -115,14 +115,14 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) // collect retired pages _mi_heap_collect_retired(heap, force); - + // if (_mi_is_main_thread()) { mi_debug_show_arenas(true, false, false); } - + // collect all pages owned by this thread mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL); - // collect arenas (this is program wide so don't force purges on abandonment of threads) - //mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1); + // collect arenas (this is program wide so don't force purges on abandonment of threads) + //mi_atomic_storei64_release(&heap->tld->subproc->purge_expire, 1); _mi_arenas_collect(collect == MI_FORCE /* force purge? */, collect >= MI_FORCE /* visit all? */, heap->tld); } diff --git a/src/page.c b/src/page.c index af1d5072..ccb4445b 100644 --- a/src/page.c +++ b/src/page.c @@ -137,9 +137,39 @@ bool _mi_page_is_valid(mi_page_t* page) { Page collect the `local_free` and `thread_free` lists ----------------------------------------------------------- */ -// Collect the local `thread_free` list using an atomic exchange. -static void _mi_page_thread_free_collect(mi_page_t* page) +static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) { + if (head == NULL) return; + + // find the last block in the list -- also to get a proper use count (without data races) + size_t max_count = page->capacity; // cannot collect more than capacity + size_t count = 1; + mi_block_t* last = head; + mi_block_t* next; + while ((next = mi_block_next(page, last)) != NULL && count <= max_count) { + count++; + last = next; + } + + // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free) + if (count > max_count) { + _mi_error_message(EFAULT, "corrupted thread-free list\n"); + return; // the thread-free items cannot be freed + } + + // and append the current local free list + mi_block_set_next(page, last, page->local_free); + page->local_free = head; + + // update counts now + mi_assert_internal(count <= UINT16_MAX); + page->used = page->used - (uint16_t)count; +} + +// Collect the local `thread_free` list using an atomic exchange. +static void mi_page_thread_free_collect(mi_page_t* page) +{ + // atomically capture the thread free list mi_block_t* head; mi_thread_free_t tfreex; mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); @@ -150,35 +180,15 @@ static void _mi_page_thread_free_collect(mi_page_t* page) } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough? 
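  // at this point the whole thread-free list has been captured: the CAS above swapped it for NULL
  // (keeping the owned bit via `mi_tf_is_owned`), and `head` is non-NULL since an empty list
  // returns early inside the loop.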
mi_assert_internal(head != NULL); - // find the tail -- also to get a proper count (without data races) - size_t max_count = page->capacity; // cannot collect more than capacity - size_t count = 1; - mi_block_t* tail = head; - mi_block_t* next; - while( (next = mi_block_next(page,tail)) != NULL && count <= max_count) { - count++; - tail = next; - } - - // if `count > max_count` there was a memory corruption (possibly infinite list due to double multi-threaded free) - if (count > max_count) { - _mi_error_message(EFAULT, "corrupted thread-free list\n"); - return; // the thread-free items cannot be freed - } - - // and append the current local free list - mi_block_set_next(page,tail, page->local_free); - page->local_free = head; - - // update counts now - page->used -= (uint16_t)count; + // and move it to the local list + mi_page_thread_collect_to_local(page, head); } void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(page!=NULL); // collect the thread free list - _mi_page_thread_free_collect(page); + mi_page_thread_free_collect(page); // and the local free list if (page->local_free != NULL) { @@ -205,6 +215,23 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(!force || page->local_free == NULL); } +// collect elements in the thread-free list starting at `head`. +void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) { + if (head == NULL) return; + mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point at it (and we want to avoid atomic ops) + if (next != NULL) { + mi_page_thread_collect_to_local(page, next); + if (page->local_free != NULL && page->free == NULL) { + page->free = page->local_free; + page->local_free = NULL; + page->free_is_zero = false; + } + } + if (page->used == 1) { + // all elements are free'd since we skipped the `head` element itself + _mi_page_free_collect(page, false); // collect the final element + } +} /* ----------------------------------------------------------- @@ -333,9 +360,8 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { // abandon full pages _mi_page_abandon(page, pq); } - else { + else if (!mi_page_is_in_full(page)) { // put full pages in a heap local queue - if (mi_page_is_in_full(page)) return; mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page); _mi_page_free_collect(page, false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set } From b0c8d86c41066832d35db85952d65f483b1fecf6 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 4 Feb 2025 15:03:27 -0800 Subject: [PATCH 218/264] refactor mi_free_try_collect_mt --- src/free.c | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/src/free.c b/src/free.c index ebcf08ab..5e83ad95 100644 --- a/src/free.c +++ b/src/free.c @@ -217,12 +217,13 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* return; } + const bool too_full = mi_page_is_used_at_frac(page, 8); // more than 7/8th of the page is in use? + // 2. if the page is not too full, we can try to reclaim it for ourselves // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. 
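  // (the speedup mainly comes from the freeing thread adopting the abandoned page: subsequent frees
  //  of its blocks then become cheap local frees instead of atomic pushes onto `xthread_free`, and
  //  the page can be used again for local allocation.)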
- if (_mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && - page->block_size <= MI_SMALL_MAX_OBJ_SIZE && // only for small sized blocks - !mi_page_is_used_at_frac(page,8) // and not too full - // && !mi_page_is_abandoned_mapped(page) + if (!too_full && + _mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && + page->block_size <= MI_SMALL_MAX_OBJ_SIZE // only for small sized blocks ) { // the page has still some blocks in use (but not too many) @@ -252,7 +253,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* } // 3. if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!mi_page_is_used_at_frac(page,8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!too_full && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arenas_page_try_reabandon_to_mapped(page)) { From 8fc8da5d81bcee92650752d473603ea42a6fb203 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Tue, 4 Feb 2025 17:54:49 -0800 Subject: [PATCH 219/264] use thread local stats for abandoned statistics to reduce contention --- include/mimalloc/internal.h | 3 ++- include/mimalloc/types.h | 13 ++++++++++--- src/arena.c | 25 +++++++++++++------------ src/init.c | 12 ++++++++++++ src/page.c | 2 +- src/stats.c | 6 ++++++ 6 files changed, 44 insertions(+), 17 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c1e55ddc..92f02788 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -116,6 +116,7 @@ mi_subproc_t* _mi_subproc_main(void); mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; size_t _mi_thread_seq_id(void) mi_attr_noexcept; +mi_tld_t* _mi_thread_tld(void) mi_attr_noexcept; void _mi_heap_guarded_init(mi_heap_t* heap); // os.c @@ -171,7 +172,7 @@ void _mi_arenas_unsafe_destroy_all(mi_tld_t* tld); mi_page_t* _mi_arenas_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment); void _mi_arenas_page_free(mi_page_t* page); -void _mi_arenas_page_abandon(mi_page_t* page); +void _mi_arenas_page_abandon(mi_page_t* page, mi_tld_t* tld); void _mi_arenas_page_unabandon(mi_page_t* page); bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page); diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 0bf5722b..6ed17f09 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -544,13 +544,20 @@ void __mi_stat_counter_increase_mt(mi_stat_counter_t* stat, size_t amount); #define mi_subproc_stat_adjust_increase(subproc,stat,amnt,b) __mi_stat_adjust_increase_mt( &(subproc)->stats.stat, amnt, b) #define mi_subproc_stat_adjust_decrease(subproc,stat,amnt,b) __mi_stat_adjust_decrease_mt( &(subproc)->stats.stat, amnt, b) +#define mi_tld_stat_counter_increase(tld,stat,amount) __mi_stat_counter_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_increase(tld,stat,amount) __mi_stat_increase( &(tld)->stats.stat, amount) +#define mi_tld_stat_decrease(tld,stat,amount) __mi_stat_decrease( &(tld)->stats.stat, amount) +#define mi_tld_stat_adjust_increase(tld,stat,amnt,b) __mi_stat_adjust_increase( &(tld)->stats.stat, amnt, b) +#define mi_tld_stat_adjust_decrease(tld,stat,amnt,b) __mi_stat_adjust_decrease( &(tld)->stats.stat, amnt, b) + + #define 
mi_os_stat_counter_increase(stat,amount) mi_subproc_stat_counter_increase(_mi_subproc(),stat,amount) #define mi_os_stat_increase(stat,amount) mi_subproc_stat_increase(_mi_subproc(),stat,amount) #define mi_os_stat_decrease(stat,amount) mi_subproc_stat_decrease(_mi_subproc(),stat,amount) -#define mi_heap_stat_counter_increase(heap,stat,amount) __mi_stat_counter_increase( &(heap)->tld->stats.stat, amount) -#define mi_heap_stat_increase(heap,stat,amount) __mi_stat_increase( &(heap)->tld->stats.stat, amount) -#define mi_heap_stat_decrease(heap,stat,amount) __mi_stat_decrease( &(heap)->tld->stats.stat, amount) +#define mi_heap_stat_counter_increase(heap,stat,amount) mi_tld_stat_counter_increase(heap->tld, stat, amount) +#define mi_heap_stat_increase(heap,stat,amount) mi_tld_stat_increase( heap->tld, stat, amount) +#define mi_heap_stat_decrease(heap,stat,amount) mi_tld_stat_decrease( heap->tld, stat, amount) #define mi_debug_heap_stat_counter_increase(heap,stat,amount) mi_debug_stat_counter_increase( (heap)->tld->stats.stat, amount) #define mi_debug_heap_stat_increase(heap,stat,amount) mi_debug_stat_increase( (heap)->tld->stats.stat, amount) diff --git a/src/arena.c b/src/arena.c index e111a417..ca2ea164 100644 --- a/src/arena.c +++ b/src/arena.c @@ -563,8 +563,9 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_ mi_assert_internal(mi_page_is_abandoned(page)); mi_assert_internal(mi_arena_has_page(arena,page)); mi_atomic_decrement_relaxed(&subproc->abandoned_count[bin]); - mi_subproc_stat_decrease( arena->subproc, pages_abandoned, 1); - mi_subproc_stat_counter_increase(arena->subproc, pages_reclaim_on_alloc, 1); + mi_tld_t* tld = _mi_thread_tld(); + mi_tld_stat_decrease( tld, pages_abandoned, 1); + mi_tld_stat_counter_increase( tld, pages_reclaim_on_alloc, 1); _mi_page_free_collect(page, false); // update `used` count mi_assert_internal(mi_bbitmap_is_clearN(arena->slices_free, slice_index, slice_count)); @@ -855,7 +856,7 @@ void _mi_arenas_page_free(mi_page_t* page) { Arena abandon ----------------------------------------------------------- */ -void _mi_arenas_page_abandon(mi_page_t* page) { +void _mi_arenas_page_abandon(mi_page_t* page, mi_tld_t* tld) { mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN)); mi_assert_internal(_mi_ptr_page(page)==page); mi_assert_internal(mi_page_is_owned(page)); @@ -878,7 +879,7 @@ void _mi_arenas_page_abandon(mi_page_t* page) { const bool wasclear = mi_bitmap_set(arena->pages_abandoned[bin], slice_index); MI_UNUSED(wasclear); mi_assert_internal(wasclear); mi_atomic_increment_relaxed(&arena->subproc->abandoned_count[bin]); - mi_subproc_stat_increase(arena->subproc, pages_abandoned, 1); + mi_tld_stat_increase(tld, pages_abandoned, 1); } else { // page is full (or a singleton), or the page is OS/externally allocated @@ -894,7 +895,7 @@ void _mi_arenas_page_abandon(mi_page_t* page) { subproc->os_abandoned_pages = page; } } - mi_subproc_stat_increase(_mi_subproc(), pages_abandoned, 1); + mi_tld_stat_increase(tld, pages_abandoned, 1); } _mi_page_unown(page); } @@ -912,10 +913,10 @@ bool _mi_arenas_page_try_reabandon_to_mapped(mi_page_t* page) { return false; } else { - mi_subproc_t* subproc = _mi_subproc(); - mi_subproc_stat_counter_increase( subproc, pages_reabandon_full, 1); - mi_subproc_stat_adjust_decrease( subproc, pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh - _mi_arenas_page_abandon(page); + mi_tld_t* tld = _mi_thread_tld(); + mi_tld_stat_counter_increase( tld, pages_reabandon_full, 1); + 
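    // note: the `mi_tld_stat_*` macros update thread-local statistics (merged later via `mi_stats_add`),
    // which avoids contended atomic updates on the shared subproc counters.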
mi_tld_stat_adjust_decrease( tld, pages_abandoned, 1, true /* on alloc */); // adjust as we are not abandoning fresh + _mi_arenas_page_abandon(page,tld); return true; } } @@ -942,14 +943,14 @@ void _mi_arenas_page_unabandon(mi_page_t* page) { mi_bitmap_clear_once_set(arena->pages_abandoned[bin], slice_index); mi_page_clear_abandoned_mapped(page); mi_atomic_decrement_relaxed(&arena->subproc->abandoned_count[bin]); - mi_subproc_stat_decrease(arena->subproc, pages_abandoned, 1); + mi_tld_stat_decrease(_mi_thread_tld(), pages_abandoned, 1); } else { // page is full (or a singleton), page is OS allocated - mi_subproc_t* subproc = _mi_subproc(); - mi_subproc_stat_decrease(_mi_subproc(), pages_abandoned, 1); + mi_tld_stat_decrease(_mi_thread_tld(), pages_abandoned, 1); // if not an arena page, remove from the subproc os pages list if (page->memid.memkind != MI_MEM_ARENA && mi_option_is_enabled(mi_option_visit_abandoned)) { + mi_subproc_t* subproc = _mi_subproc(); mi_lock(&subproc->os_abandoned_pages_lock) { if (page->prev != NULL) { page->prev->next = page->next; } if (page->next != NULL) { page->next->prev = page->prev; } diff --git a/src/init.c b/src/init.c index 33c9794d..ced30104 100644 --- a/src/init.c +++ b/src/init.c @@ -357,6 +357,18 @@ mi_subproc_t* _mi_subproc(void) { } +mi_tld_t* _mi_thread_tld(void) mi_attr_noexcept { + // should work without doing initialization (as it may be called from `_mi_tld -> mi_tld_alloc ... -> os_alloc -> _mi_subproc()` + mi_heap_t* heap = mi_prim_get_default_heap(); + if (heap == NULL) { + return &tld_empty; + } + else { + return heap->tld; + } +} + + /* ----------------------------------------------------------- Sub process ----------------------------------------------------------- */ diff --git a/src/page.c b/src/page.c index ccb4445b..dc3a6365 100644 --- a/src/page.c +++ b/src/page.c @@ -280,7 +280,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { mi_page_queue_remove(pq, page); mi_tld_t* tld = page->heap->tld; mi_page_set_heap(page, NULL); - _mi_arenas_page_abandon(page); + _mi_arenas_page_abandon(page,tld); _mi_arenas_collect(false, false, tld); // allow purging } } diff --git a/src/stats.c b/src/stats.c index 057dc093..d8450a84 100644 --- a/src/stats.c +++ b/src/stats.c @@ -152,6 +152,12 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { mi_stat_counter_add(&stats->normal_count, &src->normal_count, 1); mi_stat_counter_add(&stats->huge_count, &src->huge_count, 1); mi_stat_counter_add(&stats->guarded_alloc_count, &src->guarded_alloc_count, 1); + + mi_stat_counter_add(&stats->pages_extended, &src->pages_extended, 1); + mi_stat_counter_add(&stats->pages_reclaim_on_alloc, &src->pages_reclaim_on_alloc, 1); + mi_stat_counter_add(&stats->pages_reclaim_on_free, &src->pages_reclaim_on_free, 1); + mi_stat_counter_add(&stats->pages_reabandon_full, &src->pages_reabandon_full, 1); + mi_stat_counter_add(&stats->pages_unabandon_busy_wait, &src->pages_unabandon_busy_wait, 1); #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { if (src->normal_bins[i].allocated > 0 || src->normal_bins[i].freed > 0) { From df172843d13e357f58ea0e2bf9a9c5b5f54ad070 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 4 Feb 2025 20:15:38 -0800 Subject: [PATCH 220/264] call page_free_collect less often from a page search --- src/page.c | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/src/page.c b/src/page.c index dc3a6365..4b0c810c 100644 --- a/src/page.c +++ b/src/page.c @@ -175,7 +175,7 @@ static void 
mi_page_thread_free_collect(mi_page_t* page) mi_thread_free_t tfree = mi_atomic_load_relaxed(&page->xthread_free); do { head = mi_tf_block(tfree); - if (head == NULL) return; // return if the list is empty + if mi_likely(head == NULL) return; // return if the list is empty tfreex = mi_tf_create(NULL,mi_tf_is_owned(tfree)); // set the thread free list to NULL } while (!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tfree, tfreex)); // release is enough? mi_assert_internal(head != NULL);
@@ -717,14 +717,16 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m count++; #endif candidate_limit--; - - // collect freed blocks by us and other threads - _mi_page_free_collect(page, false); - + // search up to N pages for a best candidate // is the local free list non-empty? - const bool immediate_available = mi_page_immediate_available(page); + bool immediate_available = mi_page_immediate_available(page); + if (!immediate_available) { + // collect freed blocks by us and other threads so we get a proper use count + _mi_page_free_collect(page, false); + immediate_available = mi_page_immediate_available(page); + } // if the page is completely full, move it to the `mi_pages_full` // queue so we don't visit long-lived pages too often.
@@ -742,7 +744,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page_candidate = page; candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates); } - else if (mi_page_all_free(page_candidate)) { + else if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); page_candidate = page; }
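As a stand-alone sketch of the detach-then-walk pattern used by `mi_page_thread_free_collect` above (plain C11 atomics, a simplified block type, and without mimalloc's owned-bit packing in `mi_thread_free_t`; illustrative only, not code from the patch series):

#include <stdatomic.h>
#include <stddef.h>

typedef struct block_s { struct block_s* next; } block_t;

// Atomically take the whole cross-thread free list; returns its head, or NULL if empty.
static block_t* detach_thread_free(_Atomic(block_t*)* thread_free) {
  block_t* head = atomic_load_explicit(thread_free, memory_order_relaxed);
  do {
    if (head == NULL) return NULL;            // nothing to collect
  } while (!atomic_compare_exchange_weak_explicit(thread_free, &head, NULL,
                                                  memory_order_acq_rel, memory_order_acquire));
  return head;                                // the caller now owns the detached list
}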
From 27895ce35df45276b7fbb54cb9e800df6065ddd5 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Feb 2025 14:25:36 -0800 Subject: [PATCH 222/264] fix guard page size calculation in secure mode --- src/arena.c | 19 +++++++++++-------- 1 file changed, 11 insertions(+), 8 deletions(-)
diff --git a/src/arena.c b/src/arena.c index ca2ea164..3e2fc583 100644 --- a/src/arena.c +++ b/src/arena.c
@@ -160,12 +160,15 @@ static mi_arena_t* mi_page_arena(mi_page_t* page, size_t* slice_index, size_t* s return mi_arena_from_memid(page->memid, slice_index, slice_count); } -static size_t mi_memid_size(mi_memid_t memid) { - if (memid.memkind == MI_MEM_ARENA) { - return memid.mem.arena.slice_count * MI_ARENA_SLICE_SIZE; +static size_t mi_page_full_size(mi_page_t* page) { + if (page->memid.memkind == MI_MEM_ARENA) { + return page->memid.mem.arena.slice_count * MI_ARENA_SLICE_SIZE; } - else if (mi_memid_is_os(memid) || memid.memkind == MI_MEM_EXTERNAL) { - return memid.mem.os.size; + else if (mi_memid_is_os(page->memid) || page->memid.memkind == MI_MEM_EXTERNAL) { + mi_assert_internal((uint8_t*)page->memid.mem.os.base <= (uint8_t*)page); + const ptrdiff_t presize = (uint8_t*)page - (uint8_t*)page->memid.mem.os.base; + mi_assert_internal((ptrdiff_t)page->memid.mem.os.size >= presize); + return (presize > page->memid.mem.os.size ? 0 : page->memid.mem.os.size - presize); } else { return 0;
@@ -820,7 +823,7 @@ void _mi_arenas_page_free(mi_page_t* page) { // we must do this since we may later allocate large spans over this page and cannot have a guard page in between #if MI_SECURE >= 2 if (!page->memid.is_pinned) { - _mi_os_secure_guard_page_reset_before((uint8_t*)page + mi_memid_size(page->memid)); + _mi_os_secure_guard_page_reset_before((uint8_t*)page + mi_page_full_size(page)); } #endif
@@ -831,7 +834,7 @@ void _mi_arenas_page_free(mi_page_t* page) { mi_bitmap_clear(arena->pages, page->memid.mem.arena.slice_index); if (page->slice_committed > 0) { // if committed on-demand, set the commit bits to account commit properly - mi_assert_internal(mi_memid_size(page->memid) >= page->slice_committed); + mi_assert_internal(mi_page_full_size(page) >= page->slice_committed); const size_t total_slices = page->slice_committed / MI_ARENA_SLICE_SIZE; // conservative //mi_assert_internal(mi_bitmap_is_clearN(arena->slices_committed, page->memid.mem.arena.slice_index, total_slices)); mi_assert_internal(page->memid.mem.arena.slice_count >= total_slices);
@@ -849,7 +852,7 @@ void _mi_arenas_page_free(mi_page_t* page) { mi_assert_internal(mi_bitmap_is_setN(arena->slices_committed, page->memid.mem.arena.slice_index, page->memid.mem.arena.slice_count)); } } - _mi_arenas_free(page, mi_memid_size(page->memid), page->memid); + _mi_arenas_free(page, mi_page_full_size(page), page->memid); } /* -----------------------------------------------------------
From 5fbba3f20c0b28bd477cf359df9bdd6c8143e1ce Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Feb 2025 14:27:36 -0800 Subject: [PATCH 223/264] fix sign of comparison --- src/arena.c | 2 +- src/options.c | 4 ++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arena.c b/src/arena.c index 3e2fc583..78b13749 100644 --- a/src/arena.c
+++ b/src/arena.c @@ -168,7 +168,7 @@ static size_t mi_page_full_size(mi_page_t* page) { mi_assert_internal((uint8_t*)page->memid.mem.os.base <= (uint8_t*)page); const ptrdiff_t presize = (uint8_t*)page - (uint8_t*)page->memid.mem.os.base; mi_assert_internal((ptrdiff_t)page->memid.mem.os.size >= presize); - return (presize > page->memid.mem.os.size ? 0 : page->memid.mem.os.size - presize); + return (presize > (ptrdiff_t)page->memid.mem.os.size ? 0 : page->memid.mem.os.size - presize); } else { return 0; diff --git a/src/options.c b/src/options.c index 7b643092..9ebb0b6a 100644 --- a/src/options.c +++ b/src/options.c @@ -144,7 +144,7 @@ static mi_option_desc_t options[_mi_option_last] = #else { 1, UNINIT, MI_OPTION(eager_commit_delay) }, // the first N segments per thread are not eagerly committed (but per page in the segment on demand) #endif - { 2500,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds + { 1000,UNINIT, MI_OPTION_LEGACY(purge_delay,reset_delay) }, // purge delay in milli-seconds { 0, UNINIT, MI_OPTION(use_numa_nodes) }, // 0 = use available numa nodes, otherwise use at most N nodes. { 0, UNINIT, MI_OPTION_LEGACY(disallow_os_alloc,limit_os_alloc) }, // 1 = do not use OS memory for allocation (but only reserved arenas) { 100, UNINIT, MI_OPTION(os_tag) }, // only apple specific for now but might serve more or less related purpose @@ -168,7 +168,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_GUARDED_SAMPLE_RATE, UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, - { 1, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free + { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free { 2, UNINIT, MI_OPTION(page_full_retain) }, { 4, UNINIT, MI_OPTION(page_max_candidates) }, { 0, UNINIT, MI_OPTION(max_vabits) }, From 5aa679cdee122f59a6ceac9aae8cbd4181379ef1 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Feb 2025 15:41:37 -0800 Subject: [PATCH 224/264] make page_reclaim_on_free 0 by default; but allow reclaim_on_free if the page was originally in this heap (just as in v2 with the full queue) --- src/free.c | 47 ++++++++++++++++++++++------------------------- src/heap.c | 2 +- src/init.c | 2 +- src/options.c | 10 +++++----- src/page.c | 13 +++++++------ 5 files changed, 36 insertions(+), 38 deletions(-) diff --git a/src/free.c b/src/free.c index 5e83ad95..b1827f1e 100644 --- a/src/free.c +++ b/src/free.c @@ -217,43 +217,40 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* return; } - const bool too_full = mi_page_is_used_at_frac(page, 8); // more than 7/8th of the page is in use? - // 2. if the page is not too full, we can try to reclaim it for ourselves - // note: this seems a bad idea but it speeds up some benchmarks (like `larson`) quite a bit. - if (!too_full && - _mi_option_get_fast(mi_option_page_reclaim_on_free) != 0 && - page->block_size <= MI_SMALL_MAX_OBJ_SIZE // only for small sized blocks - ) + // note: + // we only reclaim if the page originated from our heap (the heap field is preserved on abandonment) + // to avoid claiming arbitrary object sizes and limit indefinite expansion. 
+ // this helps benchmarks like `larson` + const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free); + if (reclaim_on_free >= 0 && page->block_size <= MI_SMALL_MAX_OBJ_SIZE) // only for small sized blocks { // the page has still some blocks in use (but not too many) // reclaim in our heap if compatible, or otherwise abandon again // todo: optimize this check further? // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* const heap = mi_prim_get_default_heap(); - if (mi_heap_is_initialized(heap)) // we did not already terminate our thread - { - mi_heap_t* const tagheap = _mi_heap_by_tag(heap, page->heap_tag); - if ((tagheap != NULL) && // don't reclaim across heap object types - (tagheap->allow_page_reclaim) && // and we are allowed to reclaim abandoned pages - // (page->subproc == tagheap->tld->subproc) && // don't reclaim across sub-processes; todo: make this check faster (integrate with _mi_heap_by_tag ? ) - (_mi_arena_memid_is_suitable(page->memid, tagheap->exclusive_arena)) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - if (mi_page_queue(tagheap, page->block_size)->first != NULL) { // don't reclaim for a block_size we don't use - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arenas_page_unabandon(page); - _mi_heap_page_reclaim(tagheap, page); - mi_heap_stat_counter_increase(tagheap, pages_reclaim_on_free, 1); - return; - } + mi_heap_t* heap = mi_prim_get_default_heap(); + if (heap != page->heap) { + if (mi_heap_is_initialized(heap)) { + heap = _mi_heap_by_tag(heap, page->heap_tag); } } + if (heap != NULL && heap->allow_page_reclaim && + (heap == page->heap || (reclaim_on_free == 1 && !mi_page_is_used_at_frac(page, 8))) && // only reclaim if we were the originating heap, or if reclaim_on_free == 1 and the pages is not too full + _mi_arena_memid_is_suitable(page->memid,heap->exclusive_arena) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) + ) + { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arenas_page_unabandon(page); + _mi_heap_page_reclaim(heap, page); + mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); + return; + } } // 3. 
if the page is unmapped, try to reabandon so it can possibly be mapped and found for allocations - if (!too_full && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page + if (!mi_page_is_used_at_frac(page, 8) && // only reabandon if a full page starts to have enough blocks available to prevent immediate re-abandon of a full page !mi_page_is_abandoned_mapped(page) && page->memid.memkind == MI_MEM_ARENA && _mi_arenas_page_try_reabandon_to_mapped(page)) { diff --git a/src/heap.c b/src/heap.c index 10c65ff2..5ac79996 100644 --- a/src/heap.c +++ b/src/heap.c @@ -175,7 +175,7 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, heap->memid = memid; heap->tld = tld; // avoid reading the thread-local tld during initialization heap->exclusive_arena = _mi_arena_from_id(arena_id); - heap->allow_page_reclaim = (!allow_destroy && mi_option_is_enabled(mi_option_page_reclaim_on_free)); + heap->allow_page_reclaim = (!allow_destroy && mi_option_get(mi_option_page_reclaim_on_free) >= 0); heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0); heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); heap->tag = heap_tag; diff --git a/src/init.c b/src/init.c index ced30104..d5bfe935 100644 --- a/src/init.c +++ b/src/init.c @@ -259,7 +259,7 @@ static void mi_heap_main_init(void) { //heap_main.keys[0] = _mi_heap_random_next(&heap_main); //heap_main.keys[1] = _mi_heap_random_next(&heap_main); _mi_heap_guarded_init(&heap_main); - heap_main.allow_page_reclaim = mi_option_is_enabled(mi_option_page_reclaim_on_free); + heap_main.allow_page_reclaim = (mi_option_get(mi_option_page_reclaim_on_free) >= 0); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); heap_main.page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); } diff --git a/src/options.c b/src/options.c index 9ebb0b6a..9caffbd3 100644 --- a/src/options.c +++ b/src/options.c @@ -168,13 +168,13 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_GUARDED_SAMPLE_RATE, UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, - { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free - { 2, UNINIT, MI_OPTION(page_full_retain) }, - { 4, UNINIT, MI_OPTION(page_max_candidates) }, - { 0, UNINIT, MI_OPTION(max_vabits) }, + { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free: -1 = disable completely, 0 = only reclaim into the originating heap, 1 = reclaim on free across heaps + { 2, UNINIT, MI_OPTION(page_full_retain) }, // number of (small) pages to retain in the free page queues + { 4, UNINIT, MI_OPTION(page_max_candidates) }, // max search to find a best page candidate + { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? 
- { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, + { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index 4b0c810c..2a51bea6 100644 --- a/src/page.c +++ b/src/page.c @@ -278,10 +278,11 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { } else { mi_page_queue_remove(pq, page); - mi_tld_t* tld = page->heap->tld; - mi_page_set_heap(page, NULL); - _mi_arenas_page_abandon(page,tld); - _mi_arenas_collect(false, false, tld); // allow purging + mi_heap_t* heap = page->heap; + mi_page_set_heap(page, NULL); + page->heap = heap; // dont set heap to NULL so we can reclaim_on_free within the same heap + _mi_arenas_page_abandon(page, heap->tld); + _mi_arenas_collect(false, false, heap->tld); // allow purging } } @@ -717,7 +718,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m count++; #endif candidate_limit--; - + // search up to N pages for a best candidate // is the local free list non-empty? @@ -744,7 +745,7 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m page_candidate = page; candidate_limit = _mi_option_get_fast(mi_option_page_max_candidates); } - else if (mi_page_all_free(page_candidate)) { + else if (mi_page_all_free(page_candidate)) { _mi_page_free(page_candidate, pq); page_candidate = page; } From 1657bfb453cc3c08dbe612e3499fb01f1e6d97c6 Mon Sep 17 00:00:00 2001 From: daanx Date: Wed, 5 Feb 2025 16:01:45 -0800 Subject: [PATCH 225/264] clarify control flow and comments in page reclaim_on_free --- src/free.c | 36 ++++++++++++++++++------------------ 1 file changed, 18 insertions(+), 18 deletions(-) diff --git a/src/free.c b/src/free.c index b1827f1e..c584e150 100644 --- a/src/free.c +++ b/src/free.c @@ -217,17 +217,13 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* return; } - // 2. if the page is not too full, we can try to reclaim it for ourselves - // note: - // we only reclaim if the page originated from our heap (the heap field is preserved on abandonment) - // to avoid claiming arbitrary object sizes and limit indefinite expansion. - // this helps benchmarks like `larson` + // 2. we can try to reclaim the page for ourselves + // note: we only reclaim if the page originated from our heap (the heap field is preserved on abandonment) + // to avoid claiming arbitrary object sizes and limit indefinite expansion. This helps benchmarks like `larson` const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free); if (reclaim_on_free >= 0 && page->block_size <= MI_SMALL_MAX_OBJ_SIZE) // only for small sized blocks { - // the page has still some blocks in use (but not too many) - // reclaim in our heap if compatible, or otherwise abandon again - // todo: optimize this check further? + // get our heap (with the right tag) // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) mi_heap_t* heap = mi_prim_get_default_heap(); @@ -236,16 +232,20 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* heap = _mi_heap_by_tag(heap, page->heap_tag); } } - if (heap != NULL && heap->allow_page_reclaim && - (heap == page->heap || (reclaim_on_free == 1 && !mi_page_is_used_at_frac(page, 8))) && // only reclaim if we were the originating heap, or if reclaim_on_free == 1 and the pages is not too full - _mi_arena_memid_is_suitable(page->memid,heap->exclusive_arena) // don't reclaim across unsuitable arena's; todo: inline arena_is_suitable (?) - ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arenas_page_unabandon(page); - _mi_heap_page_reclaim(heap, page); - mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); - return; + // can we reclaim? + if (heap != NULL && heap->allow_page_reclaim) { + if (heap == page->heap || // only reclaim if we were the originating heap, + (reclaim_on_free == 1 && // OR if the reclaim option across heaps is enabled + !mi_page_is_used_at_frac(page, 8) && // and the page is not too full + _mi_arena_memid_is_suitable(page->memid, heap->exclusive_arena)) // and the memory is suitable + ) + { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arenas_page_unabandon(page); + _mi_heap_page_reclaim(heap, page); + mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); + return; + } } } From 515047b676c43b0de8a7b547716500aeea69793a Mon Sep 17 00:00:00 2001 From: Daan Date: Wed, 5 Feb 2025 20:55:21 -0800 Subject: [PATCH 226/264] improve free on macos --- include/mimalloc/internal.h | 4 ++-- src/free.c | 31 ++++++++++++++++++++----------- src/page-map.c | 8 ++++---- 3 files changed, 26 insertions(+), 17 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 92f02788..25e30f10 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -492,7 +492,7 @@ static inline mi_page_t* _mi_unchecked_ptr_page(const void* p) { // 2-level page map: // double indirection, but low commit and low virtual reserve. // -// the page-map is usually 4 MiB and points to sub maps of 64 KiB. +// the page-map is usually 4 MiB (for 48 bits virtual addresses) and points to sub maps of 64 KiB. // the page-map is committed on-demand (in 64 KiB parts) (and sub-maps are committed on-demand as well) // one sub page-map = 64 KiB => covers 2^(16-3) * 2^16 = 2^29 = 512 MiB address space // the page-map needs 48-(16+13) = 19 bits => 2^19 sub map pointers = 4 MiB size. 
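Spelled out, the size arithmetic in the page-map comment above works as follows (a sketch with made-up macro names, not mimalloc's actual constants):

#define VA_BITS       48                     // assumed user virtual address bits
#define SLICE_SHIFT   16                     // one page-map entry covers a 64 KiB slice
#define SUB_MAP_SIZE  (1UL << 16)            // one sub map is 64 KiB
#define SUB_ENTRIES   (SUB_MAP_SIZE >> 3)    // 8-byte mi_page_t* entries => 2^13 per sub map
// one sub map covers 2^13 * 2^16 = 2^29 bytes = 512 MiB of address space
#define TOP_ENTRIES   (1UL << (VA_BITS - (SLICE_SHIFT + 13)))   // 2^19 sub-map pointers
// top-level size = 2^19 pointers * 8 bytes = 4 MiB, matching the sizes quoted above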
@@ -519,7 +519,7 @@ static inline mi_page_t* _mi_checked_ptr_page(const void* p) { size_t sub_idx; const size_t idx = _mi_page_map_index(p, &sub_idx); mi_page_t** const sub = _mi_page_map[idx]; - if mi_unlikely(sub == NULL) return NULL; + if mi_unlikely(sub == NULL) return (mi_page_t*)&_mi_page_empty; return sub[sub_idx]; } diff --git a/src/free.c b/src/free.c index c584e150..266faad8 100644 --- a/src/free.c +++ b/src/free.c @@ -123,6 +123,10 @@ static void mi_decl_noinline mi_free_generic_local(mi_page_t* page, void* p) mi_ // free a pointer owned by another thread (page parameter comes first for better codegen) static void mi_decl_noinline mi_free_generic_mt(mi_page_t* page, void* p) mi_attr_noexcept { if (p==NULL) return; // a NULL pointer is seen as abandoned (tid==0) with a full flag set + #if !MI_PAGE_MAP_FLAT + if (page==&_mi_page_empty) return; // an invalid pointer may lead to using the empty page + #endif + mi_assert_internal(p!=NULL && page != NULL && page != &_mi_page_empty); mi_block_t* const block = _mi_page_ptr_unalign(page, p); // don't check `has_aligned` flag to avoid a race (issue #865) mi_block_check_unguard(page, block, p); mi_free_block_mt(page, block); @@ -135,10 +139,9 @@ void mi_decl_noinline _mi_free_generic(mi_page_t* page, bool is_local, void* p) } -// Get the segment data belonging to a pointer -// This is just a single `and` in release mode but does further checks in debug mode -// (and secure mode) to see if this was a valid pointer. -static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) +// Get the page belonging to a pointer +// Does further checks in debug mode to see if this was a valid pointer. +static inline mi_page_t* mi_validate_ptr_page(const void* p, const char* msg) { MI_UNUSED_RELEASE(msg); #if MI_DEBUG @@ -146,9 +149,14 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) _mi_error_message(EINVAL, "%s: invalid (unaligned) pointer: %p\n", msg, p); return NULL; } - mi_page_t* const page = _mi_safe_ptr_page(p); - if (page == NULL && p != NULL) { - _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); + mi_page_t* page = _mi_safe_ptr_page(p); + if (page == NULL) { + if (p != NULL) { + _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p); + } + #if !MI_PAGE_MAP_FLAT + page = (mi_page_t*)&_mi_page_empty; + #endif } return page; #else @@ -160,12 +168,13 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg) // Fast path written carefully to prevent register spilling on the stack void mi_free(void* p) mi_attr_noexcept { - mi_page_t* const page = mi_checked_ptr_page(p,"mi_free"); + mi_page_t* const page = mi_validate_ptr_page(p,"mi_free"); - #if MI_PAGE_MAP_FLAT // if not flat, NULL will point to `_mi_page_empty` and get to `mi_free_generic_mt` + #if MI_PAGE_MAP_FLAT // if not flat, p==NULL leads to `_mi_page_empty` which leads to `mi_free_generic_mt` if mi_unlikely(page==NULL) return; #endif - + mi_assert_internal(page!=NULL); + const mi_threadid_t xtid = (_mi_prim_thread_id() ^ mi_page_xthread_id(page)); if mi_likely(xtid == 0) { // `tid == mi_page_thread_id(page) && mi_page_flags(page) == 0` // thread-local, aligned, and not a full page @@ -283,7 +292,7 @@ static size_t mi_decl_noinline mi_page_usable_aligned_size_of(const mi_page_t* p } static inline size_t _mi_usable_size(const void* p, const char* msg) mi_attr_noexcept { - const mi_page_t* const page = mi_checked_ptr_page(p,msg); + const mi_page_t* const page = mi_validate_ptr_page(p,msg); if 
mi_unlikely(page==NULL) return 0; if mi_likely(!mi_page_has_aligned(page)) { const mi_block_t* block = (const mi_block_t*)p; diff --git a/src/page-map.c b/src/page-map.c index 2b610935..74c22e90 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -206,7 +206,7 @@ bool _mi_page_map_init(void) { if (!mi_page_map_memid.initially_committed) { _mi_os_commit(&_mi_page_map[0], os_page_size, NULL); // commit first part of the map } - _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved 2 subs at the end already + _mi_page_map[0] = (mi_page_t**)((uint8_t*)_mi_page_map + page_map_size); // we reserved 2 sub maps at the end already if (!mi_page_map_memid.initially_committed) { _mi_os_commit(_mi_page_map[0], os_page_size, NULL); // only first OS page } @@ -315,10 +315,10 @@ void _mi_page_map_unregister_range(void* start, size_t size) { mi_page_map_set_range(NULL, idx, sub_idx, slice_count); // todo: avoid committing if not already committed? } -// Return the empty page for the NULL pointer to match the behaviour of `_mi_ptr_page` +// Return NULL for invalid pointers mi_page_t* _mi_safe_ptr_page(const void* p) { + if (p==NULL) return NULL; if mi_unlikely(p >= mi_page_map_max_address) return NULL; - if (p == NULL) return (mi_page_t*)&_mi_page_empty; // to match `_mi_ptr_page` (see `mi_free` as well) size_t sub_idx; const size_t idx = _mi_page_map_index(p,&sub_idx); if mi_unlikely(!mi_page_map_is_committed(idx,NULL)) return NULL; @@ -328,7 +328,7 @@ mi_page_t* _mi_safe_ptr_page(const void* p) { } mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept { - return (p != NULL && _mi_safe_ptr_page(p) != NULL); + return (_mi_safe_ptr_page(p) != NULL); } #endif From 3d767ebef69a43d5fc3fab8c16b2eaa3395371f2 Mon Sep 17 00:00:00 2001 From: Daan Date: Wed, 5 Feb 2025 21:20:44 -0800 Subject: [PATCH 227/264] use regular free in zone_free on macos --- include/mimalloc.h | 4 ++-- src/options.c | 2 +- src/prim/osx/alloc-override-zone.c | 3 ++- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index 46335619..be28f17a 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -394,8 +394,8 @@ typedef enum mi_option_e { mi_option_guarded_precise, // disregard minimal alignment requirement to always place guarded blocks exactly in front of a guard page (=0) mi_option_guarded_sample_rate, // 1 out of N allocations in the min/max range will be guarded (=1000) mi_option_guarded_sample_seed, // can be set to allow for a (more) deterministic re-execution when a guard page is triggered (=0) - mi_option_page_reclaim_on_free, // allow to reclaim an abandoned segment on a free (=1) - mi_option_page_full_retain, // retain N full pages per size class (=2) + mi_option_page_reclaim_on_free, // reclaim abandoned pages on a free (=0). 
-1 disallowr always, 0 allows if the page originated from the current heap, 1 allow always + mi_option_page_full_retain, // retain N full (small) pages per size class (=2) mi_option_page_max_candidates, // max candidate pages to consider for allocation (=4) mi_option_max_vabits, // max user space virtual address bits to consider (=48) mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) diff --git a/src/options.c b/src/options.c index 9caffbd3..485beb48 100644 --- a/src/options.c +++ b/src/options.c @@ -168,7 +168,7 @@ static mi_option_desc_t options[_mi_option_last] = { MI_DEFAULT_GUARDED_SAMPLE_RATE, UNINIT, MI_OPTION(guarded_sample_rate)}, // 1 out of N allocations in the min/max range will be guarded (=4000) { 0, UNINIT, MI_OPTION(guarded_sample_seed)}, - { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim an abandoned segment on a free: -1 = disable completely, 0 = only reclaim into the originating heap, 1 = reclaim on free across heaps + { 0, UNINIT, MI_OPTION_LEGACY(page_reclaim_on_free, abandoned_reclaim_on_free) },// reclaim abandoned pages on a free: -1 = disable completely, 0 = only reclaim into the originating heap, 1 = reclaim on free across heaps { 2, UNINIT, MI_OPTION(page_full_retain) }, // number of (small) pages to retain in the free page queues { 4, UNINIT, MI_OPTION(page_max_candidates) }, // max search to find a best page candidate { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits diff --git a/src/prim/osx/alloc-override-zone.c b/src/prim/osx/alloc-override-zone.c index d3af170d..a8f5fbc6 100644 --- a/src/prim/osx/alloc-override-zone.c +++ b/src/prim/osx/alloc-override-zone.c @@ -64,7 +64,8 @@ static void* zone_valloc(malloc_zone_t* zone, size_t size) { static void zone_free(malloc_zone_t* zone, void* p) { MI_UNUSED(zone); - mi_cfree(p); + // mi_cfree(p); // checked free as `zone_free` may be called with invalid pointers + mi_free(p); // with the page_map and pagemap_commit=1 we can use the regular free } static void* zone_realloc(malloc_zone_t* zone, void* p, size_t newsize) { From 4c562f392a536fa180e48441a76881e15db6ff13 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 6 Feb 2025 11:53:22 -0800 Subject: [PATCH 228/264] allow page reclaim on free to the originating heap also within a threadpool --- src/free.c | 5 +++-- src/heap.c | 6 +++--- src/page.c | 6 +++--- 3 files changed, 9 insertions(+), 8 deletions(-) diff --git a/src/free.c b/src/free.c index 266faad8..3fdb35aa 100644 --- a/src/free.c +++ b/src/free.c @@ -220,7 +220,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* if (mi_page_all_free(page)) { // first remove it from the abandoned pages in the arena (if mapped, this waits for any readers to finish) - _mi_arenas_page_unabandon(page); + _mi_arenas_page_unabandon(page); // we can free the page directly _mi_arenas_page_free(page); return; @@ -244,8 +244,9 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* // can we reclaim? 
if (heap != NULL && heap->allow_page_reclaim) { if (heap == page->heap || // only reclaim if we were the originating heap, - (reclaim_on_free == 1 && // OR if the reclaim option across heaps is enabled + (reclaim_on_free == 1 && // OR if the reclaim across heaps is allowed !mi_page_is_used_at_frac(page, 8) && // and the page is not too full + !heap->tld->is_in_threadpool && // and not part of a threadpool _mi_arena_memid_is_suitable(page->memid, heap->exclusive_arena)) // and the memory is suitable ) { diff --git a/src/heap.c b/src/heap.c index 5ac79996..daad8afc 100644 --- a/src/heap.c +++ b/src/heap.c @@ -181,10 +181,10 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy, heap->tag = heap_tag; if (heap->tld->is_in_threadpool) { // if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap. - heap->allow_page_reclaim = false; - // .. but abandoning is good in this case: quarter the full page retain (possibly to 0) + // this is checked in `free.c:mi_free_try_collect_mt` + // .. but abandoning is good in this case: halve the full page retain (possibly to 0) // (so blocked threads do not hold on to too much memory) - if (heap->page_full_retain >= 0) { + if (heap->page_full_retain > 0) { heap->page_full_retain = heap->page_full_retain / 4; } } diff --git a/src/page.c b/src/page.c index 2a51bea6..b3dabb41 100644 --- a/src/page.c +++ b/src/page.c @@ -279,7 +279,7 @@ void _mi_page_abandon(mi_page_t* page, mi_page_queue_t* pq) { else { mi_page_queue_remove(pq, page); mi_heap_t* heap = page->heap; - mi_page_set_heap(page, NULL); + mi_page_set_heap(page, NULL); page->heap = heap; // dont set heap to NULL so we can reclaim_on_free within the same heap _mi_arenas_page_abandon(page, heap->tld); _mi_arenas_collect(false, false, heap->tld); // allow purging @@ -358,11 +358,11 @@ static void mi_page_to_full(mi_page_t* page, mi_page_queue_t* pq) { mi_heap_t* heap = mi_page_heap(page); if (heap->allow_page_abandon) { - // abandon full pages + // abandon full pages (this is the usual case in order to allow for sharing of memory between heaps) _mi_page_abandon(page, pq); } else if (!mi_page_is_in_full(page)) { - // put full pages in a heap local queue + // put full pages in a heap local queue (this is for heaps that cannot abandon, for example, if the heap can be destroyed) mi_page_queue_enqueue_from(&mi_page_heap(page)->pages[MI_BIN_FULL], pq, page); _mi_page_free_collect(page, false); // try to collect right away in case another thread freed just before MI_USE_DELAYED_FREE was set } From 64aaf9d88f507c60ffc9ede4c8aea3b512867456 Mon Sep 17 00:00:00 2001 From: Daan Date: Thu, 6 Feb 2025 17:08:06 -0800 Subject: [PATCH 229/264] fix performance bug in mi_bchunk_try_find _and_clearNX --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 8a7a9442..d1719c3b 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -804,7 +804,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const size_t post = mi_bfield_clz(~b); if (post > 0) { const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1])); - if (post + pre <= n) { + if (post + pre >= n) { // it fits -- try to claim it atomically const size_t cidx = (i*MI_BFIELD_BITS) + (MI_BFIELD_BITS - post); if (mi_bchunk_try_clearNX(chunk, cidx, n, NULL)) { From 7931678899281766f6fb03678928e615bfbcd571 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 6 Feb 2025 22:59:14 -0800 Subject: [PATCH 
230/264] further optimize mi_bchunk_try_find_and_clearNX --- include/mimalloc/bits.h | 8 ++++++-- src/bitmap.c | 14 ++++++++------ src/options.c | 2 +- src/page.c | 4 ++-- 4 files changed, 17 insertions(+), 11 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 64875e9d..d4632441 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -199,6 +199,8 @@ static inline size_t mi_ctz(size_t x) { size_t r; __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return _tzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); @@ -221,6 +223,8 @@ static inline size_t mi_clz(size_t x) { size_t r; __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return _lzcnt_u64(x); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS); @@ -254,7 +258,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { bool is_zero; __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); return !is_zero; - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false); #else @@ -271,7 +275,7 @@ static inline bool mi_bsr(size_t x, size_t* idx) { bool is_zero; __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); return !is_zero; - #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false); #else diff --git a/src/bitmap.c b/src/bitmap.c index d1719c3b..0b13e2ec 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -773,9 +773,10 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); size_t idx; + // is there a range inside the field? while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit - if (idx + n > MI_BFIELD_BITS) break; // too short, maybe cross over, or continue with the next field + if (idx + n > MI_BFIELD_BITS) break; // too short: maybe cross over, or continue with the next field const size_t bmask = mask<>idx == mask); @@ -792,15 +793,16 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, } } else { - // advance - const size_t ones = mi_bfield_ctz(~(b>>idx)); // skip all ones (since it didn't fit the mask) - mi_assert_internal(ones>0); - b = b & ~mi_bfield_mask(ones, idx); // clear the ones + // advance by clearing the least run of ones, for example, with n>=4, idx=2: + // b = 1111 1101 1010 1100 + // .. 
+ (1< 0) { const size_t pre = mi_bfield_ctz(~mi_atomic_load_relaxed(&chunk->bfields[i+1])); diff --git a/src/options.c b/src/options.c index 485beb48..d1bdd716 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); diff --git a/src/page.c b/src/page.c index b3dabb41..4e1f683c 100644 --- a/src/page.c +++ b/src/page.c @@ -137,7 +137,7 @@ bool _mi_page_is_valid(mi_page_t* page) { Page collect the `local_free` and `thread_free` lists ----------------------------------------------------------- */ -static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) +static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) { if (head == NULL) return; @@ -167,7 +167,7 @@ static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) } // Collect the local `thread_free` list using an atomic exchange. -static void mi_page_thread_free_collect(mi_page_t* page) +static mi_decl_noinline void mi_page_thread_free_collect(mi_page_t* page) { // atomically capture the thread free list mi_block_t* head; From 9053cf0cd25e7a59750eb974012c0f371ce3e312 Mon Sep 17 00:00:00 2001 From: Sergey Markelov Date: Fri, 7 Feb 2025 12:35:59 -0700 Subject: [PATCH 231/264] prim: fix dev3 UWP build (#1005) --- src/prim/windows/prim.c | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index 0916a7ea..f91925fc 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -127,9 +127,11 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) config->has_partial_free = false; config->has_virtual_reserve = true; // windows version - const DWORD win_version = GetVersion(); - win_major_version = (DWORD)(LOBYTE(LOWORD(win_version))); - win_minor_version = (DWORD)(HIBYTE(LOWORD(win_version))); + OSVERSIONINFOW version{sizeof(version)}; + if (GetVersionExW(&version)) { + win_major_version = version.dwMajorVersion; + win_minor_version = version.dwMinorVersion; + } // get the page size SYSTEM_INFO si; GetSystemInfo(&si); From ca25fb3d17a1326f89a13c4c01d5a6d67b973af2 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 7 Feb 2025 17:38:53 -0800 Subject: [PATCH 232/264] avoid reload on clearing mask --- src/bitmap.c | 55 ++++++++++++++++++++++++++-------------------------- 1 file changed, 28 insertions(+), 27 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index 0b13e2ec..c096bd4a 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -165,25 +165,31 @@ static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already // Tries to clear a mask atomically, and returns true if the mask bits atomically transitioned from mask to 0 // and false otherwise (leaving the bit field as is). // `all_clear` is set to `true` if the new bfield became zero. 
-static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, bool* all_clear) { +static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) { mi_assert_internal(mask != 0); - mi_bfield_t old = mi_atomic_load_relaxed(b); - do { - if ((old&mask) != mask) { - // the mask bits are no longer set - if (all_clear != NULL) { *all_clear = (old==0); } + mi_assert_internal((expect & mask) == mask); + // try to atomically clear the mask bits + while mi_unlikely(!mi_atomic_cas_strong_acq_rel(b, &expect, expect & ~mask)) { + if ((expect & mask) != mask) { + if (all_clear != NULL) { *all_clear = (expect == 0); } return false; } - } while (!mi_atomic_cas_weak_acq_rel(b, &old, old&~mask)); // try to atomically clear the mask bits - if (all_clear != NULL) { *all_clear = ((old&~mask) == 0); } + } + if (all_clear != NULL) { *all_clear = ((expect & ~mask) == 0); } return true; } +static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask, bool* all_clear) { + mi_assert_internal(mask != 0); + const mi_bfield_t expect = mi_atomic_load_relaxed(b); + return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear); +} + // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. // `all_clear` is set to true if the new bfield became zero (and false otherwise) -static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)*b, size_t idx, bool* all_clear) { +static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[chunk_idx]); size_t idx; - if (!allow_allset && (~b == 0)) return false; if (mi_bfield_find_least_bit(b, &idx)) { // find the least bit - if mi_likely(mi_bfield_atomic_try_clear(&chunk->bfields[chunk_idx], idx, NULL)) { // clear it atomically + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], mi_bfield_mask(1,idx), b, NULL)) { // clear it atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); return true; @@ -565,7 +570,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((_tzcnt_u32(mask)%8) == 0); // tzcnt == 0, 8, 16, or 24 const size_t chunk_idx = _tzcnt_u32(mask) / 8; - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } @@ -600,7 +605,7 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx mi_assert_internal((_tzcnt_u64(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. chunk_idx = mi_ctz(mask) / 8; #endif - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } @@ -621,17 +626,13 @@ static inline bool mi_bchunk_try_find_and_clear(mi_bchunk_t* chunk, size_t* pidx if (mask==0) return false; mi_assert_internal((mi_ctz(mask)%8) == 0); // tzcnt == 0, 8, 16, 24 , .. 
const size_t chunk_idx = mi_ctz(mask) / 8; - if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, chunk_idx, pidx)) return true; // try again // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } #else - // try first to find a field that is not all set (to reduce fragmentation) (not needed for binned bitmaps) - // for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - // if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, false /* don't consider allset fields */)) return true; - // } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx, true)) return true; + if (mi_bchunk_try_find_and_clear_at(chunk, i, pidx)) return true; } return false; #endif @@ -643,9 +644,8 @@ static inline bool mi_bchunk_try_find_and_clear_1(mi_bchunk_t* chunk, size_t n, } #if !(MI_OPT_SIMD && defined(__AVX2__) && (MI_BCHUNK_BITS==512)) -static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx, bool allow_all_set) { +static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t chunk_idx, size_t* pidx) { const mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[chunk_idx]); - if (!allow_all_set && (~b == 0)) return false; // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = ((~b - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F @@ -655,7 +655,7 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c if (mi_bfield_find_least_bit(has_set8, &idx)) { // find least 1-bit mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); mi_assert_internal((idx%8)==0); - if mi_likely(mi_bfield_atomic_try_clear8(&chunk->bfields[chunk_idx], idx, NULL)) { // unset the byte atomically + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[chunk_idx], (mi_bfield_t)0xFF << idx, b, NULL)) { // unset the byte atomically *pidx = (chunk_idx*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx + 8 <= MI_BCHUNK_BITS); return true; @@ -701,7 +701,7 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; // } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, true /* allow allset fields */)) return true; + if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true; } return false; #endif @@ -771,7 +771,8 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const mi_bfield_t mask = mi_bfield_mask(n, 0); // for all fields in the chunk for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { - mi_bfield_t b = mi_atomic_load_relaxed(&chunk->bfields[i]); + mi_bfield_t b0 = mi_atomic_load_relaxed(&chunk->bfields[i]); + mi_bfield_t b = b0; size_t idx; // is there a range inside the field? 
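The point of the new `_of` variant is to reuse a bfield value the caller has already loaded as the CAS expected value, instead of re-loading it inside the helper. The same pattern as a stand-alone sketch (plain C11 atomics on a 64-bit field rather than mimalloc's atomic wrappers and `mi_bfield_t`; illustrative only):

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

// Clear `mask` only if all of its bits are currently set; `expect` is the value the
// caller already read from `*field`, so no extra atomic load is needed here.
static bool try_clear_mask_of(_Atomic uint64_t* field, uint64_t mask, uint64_t expect) {
  do {
    if ((expect & mask) != mask) return false;   // some mask bit was already cleared
  } while (!atomic_compare_exchange_weak(field, &expect, expect & ~mask));
  return true;                                   // bits transitioned from `mask` to 0
}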
@@ -781,7 +782,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, const size_t bmask = mask<>idx == mask); if ((b&bmask) == bmask) { // found a match with all bits set, try clearing atomically - if mi_likely(mi_bfield_atomic_try_clear_mask(&chunk->bfields[i], bmask, NULL)) { + if mi_likely(mi_bfield_atomic_try_clear_mask_of(&chunk->bfields[i], bmask, b0, NULL)) { *pidx = (i*MI_BFIELD_BITS) + idx; mi_assert_internal(*pidx < MI_BCHUNK_BITS); mi_assert_internal(*pidx + n <= MI_BCHUNK_BITS); @@ -789,7 +790,7 @@ mi_decl_noinline static bool mi_bchunk_try_find_and_clearNX(mi_bchunk_t* chunk, } else { // if we failed to atomically commit, reload b and try again from the start - b = mi_atomic_load_acquire(&chunk->bfields[i]); + b = b0 = mi_atomic_load_acquire(&chunk->bfields[i]); } } else { From 9b7914fd3fb165a8caebc3a37179eee2447ecd93 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:35:21 -0800 Subject: [PATCH 233/264] fix bug in mi_page_free_collect_partly where the tail of the free list was kept --- src/page.c | 40 +++++++++++++++++++++------------------- 1 file changed, 21 insertions(+), 19 deletions(-) diff --git a/src/page.c b/src/page.c index 4e1f683c..f25d0d9b 100644 --- a/src/page.c +++ b/src/page.c @@ -137,7 +137,7 @@ bool _mi_page_is_valid(mi_page_t* page) { Page collect the `local_free` and `thread_free` lists ----------------------------------------------------------- */ -static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) +static void mi_page_thread_collect_to_local(mi_page_t* page, mi_block_t* head) { if (head == NULL) return; @@ -167,7 +167,7 @@ static mi_decl_noinline void mi_page_thread_collect_to_local(mi_page_t* page, mi } // Collect the local `thread_free` list using an atomic exchange. -static mi_decl_noinline void mi_page_thread_free_collect(mi_page_t* page) +static void mi_page_thread_free_collect(mi_page_t* page) { // atomically capture the thread free list mi_block_t* head; @@ -215,11 +215,17 @@ void _mi_page_free_collect(mi_page_t* page, bool force) { mi_assert_internal(!force || page->local_free == NULL); } -// collect elements in the thread-free list starting at `head`. +// Collect elements in the thread-free list starting at `head`. This is an optimized +// version of `_mi_page_free_collect` to be used from `free.c:_mi_free_collect_mt` that avoids atomic access to `xthread_free`. +// +// `head` must be in the `xthread_free` list. It will not collect `head` itself +// so the `used` count is not fully updated in general. However, if the `head` is +// the last remaining element, it will be collected and the used count will become `0` (so `mi_page_all_free` becomes true). 
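// (Worked example for the comment above; illustrative only, not a line of the patch:
//  if `xthread_free` is `head -> b1 -> b2`, the call moves `b1` and `b2` to the local
//  free list and leaves `head` pending, so `used` drops by 2 rather than 3; only when
//  `head` is the last remaining block does the final `_mi_page_free_collect` run and
//  `used` reach 0.)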
void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) { if (head == NULL) return; - mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point at it (and we want to avoid atomic ops) + mi_block_t* next = mi_block_next(page,head); // we cannot collect the head element itself as `page->thread_free` may point to it (and we want to avoid atomic ops) if (next != NULL) { + mi_block_set_next(page, head, NULL); mi_page_thread_collect_to_local(page, next); if (page->local_free != NULL && page->free == NULL) { page->free = page->local_free; @@ -229,6 +235,8 @@ void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head) { } if (page->used == 1) { // all elements are free'd since we skipped the `head` element itself + mi_assert_internal(mi_tf_block(mi_atomic_load_relaxed(&page->xthread_free)) == head); + mi_assert_internal(mi_block_next(page,head) == NULL); _mi_page_free_collect(page, false); // collect the final element } } @@ -816,31 +824,25 @@ static mi_decl_noinline mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, m // Find a page with free blocks of `size`. -static inline mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) { +static mi_page_t* mi_find_free_page(mi_heap_t* heap, mi_page_queue_t* pq) { // mi_page_queue_t* pq = mi_page_queue(heap, size); mi_assert_internal(!mi_page_queue_is_huge(pq)); // check the first page: we even do this with candidate search or otherwise we re-search every time mi_page_t* page = pq->first; - if (page != NULL) { - #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness + if mi_likely(page != NULL && mi_page_immediate_available(page)) { + #if (MI_SECURE>=3) // in secure mode, we extend half the time to increase randomness if (page->capacity < page->reserved && ((_mi_heap_random_next(heap) & 1) == 1)) { mi_page_extend_free(heap, page); mi_assert_internal(mi_page_immediate_available(page)); } - else - #endif - { - _mi_page_free_collect(page,false); - } - - if (mi_page_immediate_available(page)) { - page->retire_expire = 0; - return page; // fast path - } + #endif + page->retire_expire = 0; + return page; // fast path + } + else { + return mi_page_queue_find_free_ex(heap, pq, true); } - - return mi_page_queue_find_free_ex(heap, pq, true); } From bc7fe059a6d87cb01a58c8f604f5b7764813c659 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:35:52 -0800 Subject: [PATCH 234/264] improve performance of mi_free_collect_mt by specializing mi_page_unown --- src/free.c | 33 +++++++++++++++++++++++++++++++-- 1 file changed, 31 insertions(+), 2 deletions(-) diff --git a/src/free.c b/src/free.c index 3fdb35aa..1df10728 100644 --- a/src/free.c +++ b/src/free.c @@ -201,7 +201,7 @@ void mi_free(void* p) mi_attr_noexcept // ------------------------------------------------------ // Multi-threaded Free (`_mt`) // ------------------------------------------------------ - +static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free); static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept { mi_assert_internal(mi_page_is_owned(page)); @@ -269,7 +269,36 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* // not reclaimed or free'd, unown again - _mi_page_unown(page); + // _mi_page_unown(page); + mi_page_unown_from_free(page, mt_free); +} + + +// release ownership of a page. 
This may free the page if all (other) blocks were concurrently +// freed in the meantime. Returns true if the page was freed. +// This is a specialized version of `mi_page_unown` to (try to) avoid calling `mi_page_free_collect` again. +static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free) { + mi_assert_internal(mi_page_is_owned(page)); + mi_assert_internal(mi_page_is_abandoned(page)); + mi_assert_internal(mt_free != NULL); + mi_assert_internal(page->used > 1); + mi_thread_free_t tf_expect = mi_tf_create(mt_free, true); + mi_thread_free_t tf_new = mi_tf_create(mt_free, false); + while mi_unlikely(!mi_atomic_cas_weak_acq_rel(&page->xthread_free, &tf_expect, tf_new)) { + mi_assert_internal(mi_tf_is_owned(tf_expect)); + while (mi_tf_block(tf_expect) != NULL) { + _mi_page_free_collect(page,false); // update used + if (mi_page_all_free(page)) { // it may become free just before unowning it + _mi_arenas_page_unabandon(page); + _mi_arenas_page_free(page); + return true; + } + tf_expect = mi_atomic_load_relaxed(&page->xthread_free); + } + mi_assert_internal(mi_tf_block(tf_expect)==NULL); + tf_new = mi_tf_create(NULL, false); + } + return false; } From 2017181a6913e174f875c85c250dba3144ac9f04 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:36:09 -0800 Subject: [PATCH 235/264] improve performance of clearNX --- src/bitmap.c | 18 ++---------------- src/bitmap.h | 4 ---- 2 files changed, 2 insertions(+), 20 deletions(-) diff --git a/src/bitmap.c b/src/bitmap.c index c096bd4a..623f921d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -167,14 +167,13 @@ static inline bool mi_bfield_atomic_setX(_Atomic(mi_bfield_t)*b, size_t* already // `all_clear` is set to `true` if the new bfield became zero. static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi_bfield_t mask, mi_bfield_t expect, bool* all_clear) { mi_assert_internal(mask != 0); - mi_assert_internal((expect & mask) == mask); // try to atomically clear the mask bits - while mi_unlikely(!mi_atomic_cas_strong_acq_rel(b, &expect, expect & ~mask)) { + do { if ((expect & mask) != mask) { if (all_clear != NULL) { *all_clear = (expect == 0); } return false; } - } + } while (!mi_atomic_cas_weak_acq_rel(b, &expect, expect & ~mask)); if (all_clear != NULL) { *all_clear = ((expect & ~mask) == 0); } return true; } @@ -696,10 +695,6 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clear8(mi_bchunk_t* chunk, s // note: there must be an atomic release/acquire in between or otherwise the registers may not be reloaded } } #else - // first skip allset fields to reduce fragmentation (not needed for binned bitmaps) - // for(int i = 0; i < MI_BCHUNK_FIELDS; i++) { - // if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx, false /* don't allow allset fields */)) return true; - // } for (int i = 0; i < MI_BCHUNK_FIELDS; i++) { if (mi_bchunk_try_find_and_clear8_at(chunk, i, pidx)) return true; } @@ -892,15 +887,6 @@ static mi_decl_noinline bool mi_bchunk_try_find_and_clearN_(mi_bchunk_t* chunk, } -//static inline bool mi_bchunk_try_find_and_clearN(mi_bchunk_t* chunk, size_t n, size_t* pidx) { -// if (n==1) return mi_bchunk_try_find_and_clear(chunk, pidx); // small pages -// if (n==8) return mi_bchunk_try_find_and_clear8(chunk, pidx); // medium pages -// // if (n==MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearX(chunk, pidx); // large pages -// if (n==0 || n > MI_BCHUNK_BITS) return false; // cannot be more than a chunk -// if (n<=MI_BFIELD_BITS) return mi_bchunk_try_find_and_clearNX(chunk, n, pidx); -// 
return mi_bchunk_try_find_and_clearN_(chunk, n, pidx); -//} - // ------- mi_bchunk_clear_once_set --------------------------------------- diff --git a/src/bitmap.h b/src/bitmap.h index 9afdffce..b17d83e5 100644 --- a/src/bitmap.h +++ b/src/bitmap.h @@ -271,10 +271,6 @@ void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); // `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! bool mi_bbitmap_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); -// Clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 1's to 0's -// `n` cannot cross chunk boundaries (and `n <= MI_BCHUNK_BITS`)! -bool mi_bbitmap_clearN(mi_bbitmap_t* bbitmap, size_t idx, size_t n); - // Is a sequence of n bits already all set/cleared? bool mi_bbitmap_is_xsetN(mi_xset_t set, mi_bbitmap_t* bbitmap, size_t idx, size_t n); From 2048fa2d17684dde6a588a3aa444149b0cb1d842 Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 09:53:00 -0800 Subject: [PATCH 236/264] fix comments --- include/mimalloc/types.h | 4 ++-- src/bitmap.c | 3 ++- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 6ed17f09..29d6fde9 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -99,7 +99,7 @@ terms of the MIT license. A copy of the license can be found in the file #define MI_ENCODE_FREELIST 1 #endif -// Enable large pages for objects between 64KiB and 256KiB. +// Enable large pages for objects between 64KiB and 512KiB. // Disabled by default as for many workloads the block sizes above 64 KiB are quite random which can lead to too many partially used large pages. #ifndef MI_ENABLE_LARGE_PAGES #define MI_ENABLE_LARGE_PAGES 0 @@ -342,7 +342,7 @@ typedef struct mi_page_s { #define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 8 KiB #if MI_ENABLE_LARGE_PAGES #define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB -#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 256 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` +#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 512KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #else #define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB #define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // <= 64 KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` diff --git a/src/bitmap.c b/src/bitmap.c index 623f921d..b458d5e8 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -184,7 +184,7 @@ static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_b return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear); } - +/* // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. 
// `all_clear` is set to true if the new bfield became zero (and false otherwise) @@ -203,6 +203,7 @@ static inline bool mi_bfield_atomic_try_clear8(_Atomic(mi_bfield_t)*b, size_t id const mi_bfield_t mask = ((mi_bfield_t)0xFF)< Date: Sat, 8 Feb 2025 11:51:18 -0800 Subject: [PATCH 237/264] set the option commit_on_demand back to 2 as we only do this for medium/large pages --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index d1bdd716..485beb48 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? - { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); From c7f7c23dc15a27abb6a26e78fd7b3c073f43b388 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sat, 8 Feb 2025 12:43:00 -0800 Subject: [PATCH 238/264] make C compatible --- src/prim/windows/prim.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/prim/windows/prim.c b/src/prim/windows/prim.c index f91925fc..31ef0e94 100644 --- a/src/prim/windows/prim.c +++ b/src/prim/windows/prim.c @@ -127,10 +127,10 @@ void _mi_prim_mem_init( mi_os_mem_config_t* config ) config->has_partial_free = false; config->has_virtual_reserve = true; // windows version - OSVERSIONINFOW version{sizeof(version)}; + OSVERSIONINFOW version; _mi_memzero_var(version); if (GetVersionExW(&version)) { - win_major_version = version.dwMajorVersion; - win_minor_version = version.dwMinorVersion; + win_major_version = version.dwMajorVersion; + win_minor_version = version.dwMinorVersion; } // get the page size SYSTEM_INFO si; From 9dd753d2c0aee48b38a56d513ae01231ca6901ac Mon Sep 17 00:00:00 2001 From: daanx Date: Sat, 8 Feb 2025 13:12:19 -0800 Subject: [PATCH 239/264] add comment --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index b458d5e8..f3030153 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -169,7 +169,7 @@ static inline bool mi_bfield_atomic_try_clear_mask_of(_Atomic(mi_bfield_t)*b, mi mi_assert_internal(mask != 0); // try to atomically clear the mask bits do { - if ((expect & mask) != mask) { + if ((expect & mask) != mask) { // are all bits still set? 
if (all_clear != NULL) { *all_clear = (expect == 0); } return false; } From 06ade47b05672ff33481ba4dd3d4b0f6aa7aefc2 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 8 Feb 2025 23:26:45 -0800 Subject: [PATCH 240/264] fix is_huge definition --- include/mimalloc/internal.h | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 25e30f10..151c81a3 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -644,8 +644,9 @@ static inline bool mi_page_is_used_at_frac(const mi_page_t* page, uint16_t n) { static inline bool mi_page_is_huge(const mi_page_t* page) { - return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || - (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page)); + return (mi_page_is_singleton(page) && + (page->block_size > MI_LARGE_MAX_OBJ_SIZE || + (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page))); } static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size) { From fe5258a179bcc25f010e2012df9f7ab3e52cff97 Mon Sep 17 00:00:00 2001 From: Daan Date: Sat, 8 Feb 2025 23:33:16 -0800 Subject: [PATCH 241/264] change process initialization order (potential fix for issue #1007) --- src/init.c | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/src/init.c b/src/init.c index d5bfe935..31b0d271 100644 --- a/src/init.c +++ b/src/init.c @@ -246,8 +246,6 @@ static void mi_tld_main_init(void) { // Initialization of the (statically allocated) main heap, and the main tld and subproc. static void mi_heap_main_init(void) { if (heap_main.cookie == 0) { - mi_subproc_main_init(); - mi_tld_main_init(); // heap heap_main.cookie = 1; #if defined(__APPLE__) || defined(_WIN32) && !defined(MI_SHARED_LIB) @@ -262,6 +260,9 @@ static void mi_heap_main_init(void) { heap_main.allow_page_reclaim = (mi_option_get(mi_option_page_reclaim_on_free) >= 0); heap_main.allow_page_abandon = (mi_option_get(mi_option_page_full_retain) >= 0); heap_main.page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32); + + mi_subproc_main_init(); + mi_tld_main_init(); } } @@ -666,14 +667,16 @@ void mi_process_init(void) mi_attr_noexcept { if (!mi_atomic_once(&process_init)) return; _mi_process_is_initialized = true; _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id()); - mi_process_setup_auto_thread_done(); - + mi_detect_cpu_features(); - mi_subproc_main_init(); - mi_tld_main_init(); - mi_heap_main_init(); _mi_os_init(); _mi_page_map_init(); + mi_heap_main_init(); + mi_tld_main_init(); + // the following two can potentially allocate (on freeBSD for locks and thread keys) + mi_subproc_main_init(); + mi_process_setup_auto_thread_done(); + #if MI_DEBUG _mi_verbose_message("debug level : %d\n", MI_DEBUG); #endif From d8c119cc4fb1e717261a3f3a875ffeddf5528462 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Sun, 9 Feb 2025 08:56:22 -0800 Subject: [PATCH 242/264] add mi_decl_maybe_unused; fix compilation with OPT_SIMD (issue #1009) --- ide/vs2022/mimalloc-lib.vcxproj | 8 +++++++ ide/vs2022/mimalloc-override-dll.vcxproj | 8 +++++++ include/mimalloc/internal.h | 27 ++++++++++++++++-------- src/bitmap.c | 19 ++++++----------- 4 files changed, 41 insertions(+), 21 deletions(-) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index b0547769..035adf8d 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -178,6 +178,7 @@ CompileAsCpp false stdcpp20 + 
/Zc:__cplusplus %(AdditionalOptions) @@ -197,6 +198,7 @@ CompileAsCpp false stdcpp20 + /Zc:__cplusplus %(AdditionalOptions) @@ -224,6 +226,7 @@ CompileAsCpp false stdcpp20 + /Zc:__cplusplus %(AdditionalOptions) @@ -251,6 +254,7 @@ CompileAsCpp false stdcpp20 + /Zc:__cplusplus %(AdditionalOptions) @@ -283,6 +287,7 @@ CompileAsCpp true stdcpp20 + /Zc:__cplusplus %(AdditionalOptions) true @@ -312,6 +317,7 @@ true stdcpp20 AdvancedVectorExtensions2 + /Zc:__cplusplus %(AdditionalOptions) true @@ -348,6 +354,7 @@ stdcpp20 CPUExtensionRequirementsARMv81 Sync + /Zc:__cplusplus %(AdditionalOptions) true @@ -384,6 +391,7 @@ stdcpp20 CPUExtensionRequirementsARMv81 Sync + /Zc:__cplusplus %(AdditionalOptions) true diff --git a/ide/vs2022/mimalloc-override-dll.vcxproj b/ide/vs2022/mimalloc-override-dll.vcxproj index be69716f..3c2ef98f 100644 --- a/ide/vs2022/mimalloc-override-dll.vcxproj +++ b/ide/vs2022/mimalloc-override-dll.vcxproj @@ -174,6 +174,7 @@ MultiThreadedDebugDLL false CompileAsCpp + /Zc:__cplusplus %(AdditionalOptions) $(ProjectDir)\..\..\bin\mimalloc-redirect32.lib;%(AdditionalDependencies) @@ -204,6 +205,7 @@ MultiThreadedDebugDLL false CompileAsCpp + /Zc:__cplusplus %(AdditionalOptions) $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies) @@ -234,6 +236,7 @@ MultiThreadedDebugDLL false CompileAsCpp + /Zc:__cplusplus %(AdditionalOptions) $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64.lib;%(AdditionalDependencies) @@ -264,6 +267,7 @@ MultiThreadedDebugDLL false CompileAsCpp + /Zc:__cplusplus %(AdditionalOptions) $(ProjectDir)\..\..\bin\mimalloc-redirect-arm64ec.lib;%(AdditionalDependencies) @@ -298,6 +302,7 @@ MultiThreadedDLL CompileAsCpp false + /Zc:__cplusplus %(AdditionalOptions) true @@ -332,6 +337,7 @@ MultiThreadedDLL CompileAsCpp false + /Zc:__cplusplus %(AdditionalOptions) true @@ -367,6 +373,7 @@ CompileAsCpp false CPUExtensionRequirementsARMv81 + /Zc:__cplusplus %(AdditionalOptions) true @@ -402,6 +409,7 @@ CompileAsCpp false CPUExtensionRequirementsARMv81 + /Zc:__cplusplus %(AdditionalOptions) true diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 151c81a3..a76f7baf 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -18,11 +18,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "track.h" #include "bits.h" -#if (MI_DEBUG>0) -#define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) -#else -#define mi_trace_message(...) -#endif +#define mi_decl_cache_align mi_decl_align(64) #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) @@ -52,19 +48,32 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_decl_hidden #endif -#define mi_decl_cache_align mi_decl_align(64) +#if (defined(__GNUC__) && (__GNUC__ >= 7)) || defined(__clang__) // includes clang and icc +#define mi_decl_maybe_unused __attribute__((unused)) +#elif __cplusplus >= 201703L // c++17 +#define mi_decl_maybe_unused [[maybe_unused]] +#else +#define mi_decl_maybe_unused +#endif + +#if defined(__cplusplus) +#define mi_decl_externc extern "C" +#else +#define mi_decl_externc +#endif #if defined(__EMSCRIPTEN__) && !defined(__wasi__) #define __wasi__ #endif -#if defined(__cplusplus) -#define mi_decl_externc extern "C" +#if (MI_DEBUG>0) +#define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) #else -#define mi_decl_externc +#define mi_trace_message(...) 
#endif + // "libc.c" #include int _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args); diff --git a/src/bitmap.c b/src/bitmap.c index f3030153..6214980b 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -184,26 +184,23 @@ static inline bool mi_bfield_atomic_try_clear_mask(_Atomic(mi_bfield_t)* b, mi_b return mi_bfield_atomic_try_clear_mask_of(b, mask, expect, all_clear); } -/* // Tries to clear a bit atomically. Returns `true` if the bit transitioned from 1 to 0 // and `false` otherwise leaving the bfield `b` as-is. // `all_clear` is set to true if the new bfield became zero (and false otherwise) -static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) { +mi_decl_maybe_unused static inline bool mi_bfield_atomic_try_clear(_Atomic(mi_bfield_t)* b, size_t idx, bool* all_clear) { mi_assert_internal(idx < MI_BFIELD_BITS); const mi_bfield_t mask = mi_bfield_one()<bfields[chunk_idx]); // has_set8 has low bit in each byte set if the byte in x == 0xFF const mi_bfield_t has_set8 = @@ -663,7 +659,6 @@ static inline bool mi_bchunk_try_find_and_clear8_at(mi_bchunk_t* chunk, size_t c } return false; } -#endif // find least aligned byte in a chunk with all bits set, and try unset it atomically // set `*pidx` to its bit index (0 <= *pidx < MI_BCHUNK_BITS) on success. From f3c86bd976c9f2004d86aa3ec6cf889ab71be4d4 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 9 Feb 2025 18:38:15 -0800 Subject: [PATCH 243/264] add simd test in azure pipeline --- azure-pipelines.yml | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index 5393035e..fc00dc8c 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -89,6 +89,11 @@ jobs: CXX: clang++ BuildType: release-clang cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + Release SIMD Clang: + CC: clang + CXX: clang++ + BuildType: release-simd-clang + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON Secure Clang: CC: clang CXX: clang++ @@ -148,6 +153,9 @@ jobs: Release: BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release + Release SIMD: + BuildType: release-simd + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON Secure: BuildType: secure cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON From 11f4da7ea53a13b8bd26614ca7e418c22810e73a Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 9 Feb 2025 18:46:55 -0800 Subject: [PATCH 244/264] add simd test on windows --- azure-pipelines.yml | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/azure-pipelines.yml b/azure-pipelines.yml index fc00dc8c..2ab709ff 100644 --- a/azure-pipelines.yml +++ b/azure-pipelines.yml @@ -30,6 +30,10 @@ jobs: BuildType: release cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release MSBuildConfiguration: Release + Release SIMD: + BuildType: release-simd + cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON -DMI_WIN_USE_FIXED_TLS=ON + MSBuildConfiguration: Release Secure: BuildType: secure cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON From a1cb38b70ebc1c9517cc003d52910d1de2d8d2b4 Mon Sep 17 00:00:00 2001 From: Daan Date: Sun, 9 Feb 2025 19:10:33 -0800 Subject: [PATCH 245/264] fix link error in debug mode in test --- test/test-stress.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/test/test-stress.c b/test/test-stress.c index 6fbd8d0e..d3b8bcc5 100644 --- a/test/test-stress.c +++ b/test/test-stress.c @@ -261,7 +261,9 @@ static void test_stress(void) { #if !defined(NDEBUG) || defined(MI_TSAN) if ((n + 1) % 10 == 0) { printf("- iterations left: 
%3d\n", ITER - (n + 1)); + #ifndef USE_STD_MALLOC mi_debug_show_arenas(); + #endif //mi_collect(true); //mi_debug_show_arenas(); } From 89d629317f986d2ef7605ced9fa5ec011adc1594 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 10 Feb 2025 12:45:38 -0800 Subject: [PATCH 246/264] limit page_reclaim to page queues of less than 4 pages; make page_commit_on_demand 0 by default. --- src/free.c | 10 +++++++++- src/options.c | 4 ++-- 2 files changed, 11 insertions(+), 3 deletions(-) diff --git a/src/free.c b/src/free.c index 1df10728..9ca71499 100644 --- a/src/free.c +++ b/src/free.c @@ -202,6 +202,14 @@ void mi_free(void* p) mi_attr_noexcept // Multi-threaded Free (`_mt`) // ------------------------------------------------------ static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free); +static bool inline mi_page_queue_len_is_atmost( mi_heap_t* heap, size_t block_size, size_t atmost) { + mi_page_queue_t* const pq = mi_page_queue(heap,block_size); + mi_assert_internal(pq!=NULL); + for(mi_page_t* p = pq->first; p!=NULL; p = p->next, atmost--) { + if (atmost == 0) { return false; } + } + return true; +} static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* mt_free) mi_attr_noexcept { mi_assert_internal(mi_page_is_owned(page)); @@ -243,7 +251,7 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* } // can we reclaim? if (heap != NULL && heap->allow_page_reclaim) { - if (heap == page->heap || // only reclaim if we were the originating heap, + if ((heap == page->heap && mi_page_queue_len_is_atmost(heap, page->block_size, 4)) || // only reclaim if we were the originating heap, and we have at most N pages already (reclaim_on_free == 1 && // OR if the reclaim across heaps is allowed !mi_page_is_used_at_frac(page, 8) && // and the page is not too full !heap->tld->is_in_threadpool && // and not part of a threadpool diff --git a/src/options.c b/src/options.c index a61c2dc2..d1bdd716 100644 --- a/src/options.c +++ b/src/options.c @@ -172,9 +172,9 @@ static mi_option_desc_t options[_mi_option_last] = { 2, UNINIT, MI_OPTION(page_full_retain) }, // number of (small) pages to retain in the free page queues { 4, UNINIT, MI_OPTION(page_max_candidates) }, // max search to find a best page candidate { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits - { MI_DEFAULT_PAGEMAP_COMMIT, + { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? 
- { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); From c820259f3b886062b29ff607dac00226eb3c93e3 Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 10 Feb 2025 21:25:30 -0800 Subject: [PATCH 247/264] fix heap_main declaration --- src/init.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/init.c b/src/init.c index 2cd77cb2..5bedab85 100644 --- a/src/init.c +++ b/src/init.c @@ -138,7 +138,7 @@ mi_decl_cache_align const mi_heap_t _mi_heap_empty = { MI_MEMID_STATIC }; -static mi_decl_cache_align mi_heap_t heap_main; +extern mi_decl_hidden mi_decl_cache_align mi_heap_t heap_main; static mi_decl_cache_align mi_tld_t tld_main = { 0, // thread_id @@ -153,7 +153,7 @@ static mi_decl_cache_align mi_tld_t tld_main = { MI_MEMID_STATIC // memid }; -static mi_decl_cache_align mi_heap_t heap_main = { +mi_decl_cache_align mi_heap_t heap_main = { &tld_main, // thread local data NULL, // exclusive arena 0, // initial cookie From 69a5fbb1f3f9fdd9361d8a33677d5573c7db5f72 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 11 Feb 2025 09:32:13 -0800 Subject: [PATCH 248/264] avoid overflow in max address calculation on 32-bit (issue #1010) --- src/page-map.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/page-map.c b/src/page-map.c index 74c22e90..44d6de4a 100644 --- a/src/page-map.c +++ b/src/page-map.c @@ -40,7 +40,7 @@ bool _mi_page_map_init(void) { } // Allocate the page map and commit bits - mi_page_map_max_address = (void*)(MI_PU(1) << vbits); + mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits)); const size_t page_map_size = (MI_ZU(1) << (vbits - MI_ARENA_SLICE_SHIFT)); const bool commit = (page_map_size <= 1*MI_MiB || mi_option_is_enabled(mi_option_pagemap_commit)); // _mi_os_has_overcommit(); // commit on-access on Linux systems? const size_t commit_bits = _mi_divide_up(page_map_size, MI_PAGE_MAP_ENTRIES_PER_COMMIT_BIT); @@ -183,7 +183,7 @@ bool _mi_page_map_init(void) { // Allocate the page map and commit bits mi_assert(MI_MAX_VABITS >= vbits); - mi_page_map_max_address = (void*)(MI_PU(1) << vbits); + mi_page_map_max_address = (void*)(vbits >= MI_SIZE_BITS ? (SIZE_MAX - MI_ARENA_SLICE_SIZE + 1) : (MI_PU(1) << vbits)); const size_t page_map_count = (MI_ZU(1) << (vbits - MI_PAGE_MAP_SUB_SHIFT - MI_ARENA_SLICE_SHIFT)); mi_assert(page_map_count <= MI_PAGE_MAP_COUNT); const size_t os_page_size = _mi_os_page_size(); From 63b8f8f753dae22b5179d639e78f047da07baed6 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 11 Feb 2025 09:47:03 -0800 Subject: [PATCH 249/264] fix assertion condition --- src/arena-meta.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/arena-meta.c b/src/arena-meta.c index ff50ea60..530e42cb 100644 --- a/src/arena-meta.c +++ b/src/arena-meta.c @@ -25,9 +25,9 @@ terms of the MIT license. 
A copy of the license can be found in the file #define MI_META_PAGE_SIZE MI_ARENA_SLICE_SIZE #define MI_META_PAGE_ALIGN MI_ARENA_SLICE_ALIGN -#define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE > 4k (even on 32-bit) +#define MI_META_BLOCK_SIZE (128) // large enough such that META_MAX_SIZE >= 4k (even on 32-bit) #define MI_META_BLOCK_ALIGN MI_META_BLOCK_SIZE -#define MI_META_BLOCKS_PER_PAGE (MI_ARENA_SLICE_SIZE / MI_META_BLOCK_SIZE) // 1024 +#define MI_META_BLOCKS_PER_PAGE (MI_META_PAGE_SIZE / MI_META_BLOCK_SIZE) // 512 #define MI_META_MAX_SIZE (MI_BCHUNK_SIZE * MI_META_BLOCK_SIZE) typedef struct mi_meta_page_s { @@ -150,7 +150,7 @@ mi_decl_noinline void _mi_meta_free(void* p, size_t size, mi_memid_t memid) { const size_t block_idx = memid.mem.meta.block_index; mi_meta_page_t* mpage = (mi_meta_page_t*)memid.mem.meta.meta_page; mi_assert_internal(mi_meta_page_of_ptr(p,NULL) == mpage); - mi_assert_internal(block_idx + block_count < MI_META_BLOCKS_PER_PAGE); + mi_assert_internal(block_idx + block_count <= MI_META_BLOCKS_PER_PAGE); mi_assert_internal(mi_bbitmap_is_clearN(&mpage->blocks_free, block_idx, block_count)); // we zero on free (and on the initial page allocation) so we don't need a "dirty" map _mi_memzero_aligned(mi_meta_block_start(mpage, block_idx), block_count*MI_META_BLOCK_SIZE); From 44a4c83fbfda403ae25dd436fed4adf3197a62b3 Mon Sep 17 00:00:00 2001 From: daanx Date: Tue, 11 Feb 2025 13:56:58 -0800 Subject: [PATCH 250/264] maintain count in pagequeue for constant time test in free.c --- include/mimalloc/internal.h | 1 + include/mimalloc/types.h | 1 + src/free.c | 5 ++++- src/heap.c | 4 ++++ src/init.c | 2 +- src/page-queue.c | 37 +++++++++++++++++++++++++++++++++++-- 6 files changed, 46 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index c9f69a26..b45f7565 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -213,6 +213,7 @@ void _mi_deferred_free(mi_heap_t* heap, bool force); void _mi_page_free_collect(mi_page_t* page, bool force); void _mi_page_free_collect_partly(mi_page_t* page, mi_block_t* head); void _mi_page_init(mi_heap_t* heap, mi_page_t* page); +bool _mi_page_queue_is_valid(mi_heap_t* heap, const mi_page_queue_t* pq); size_t _mi_bin_size(uint8_t bin); // for stats uint8_t _mi_bin(size_t size); // for stats diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 5059ecd1..a743546e 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -389,6 +389,7 @@ typedef struct mi_tld_s mi_tld_t; typedef struct mi_page_queue_s { mi_page_t* first; mi_page_t* last; + size_t count; size_t block_size; } mi_page_queue_t; diff --git a/src/free.c b/src/free.c index 9ca71499..418acd02 100644 --- a/src/free.c +++ b/src/free.c @@ -202,13 +202,16 @@ void mi_free(void* p) mi_attr_noexcept // Multi-threaded Free (`_mt`) // ------------------------------------------------------ static bool mi_page_unown_from_free(mi_page_t* page, mi_block_t* mt_free); -static bool inline mi_page_queue_len_is_atmost( mi_heap_t* heap, size_t block_size, size_t atmost) { +static inline bool mi_page_queue_len_is_atmost( mi_heap_t* heap, size_t block_size, size_t atmost) { mi_page_queue_t* const pq = mi_page_queue(heap,block_size); mi_assert_internal(pq!=NULL); + return (pq->count <= atmost); + /* for(mi_page_t* p = pq->first; p!=NULL; p = p->next, atmost--) { if (atmost == 0) { return false; } } return true; + */ } static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, 
mi_block_t* mt_free) mi_attr_noexcept { diff --git a/src/heap.c b/src/heap.c index daad8afc..116d0589 100644 --- a/src/heap.c +++ b/src/heap.c @@ -63,6 +63,9 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ static bool mi_heap_is_valid(mi_heap_t* heap) { mi_assert_internal(heap!=NULL); mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL); + for (int bin = 0; bin < MI_BIN_COUNT; bin++) { + mi_assert_internal(_mi_page_queue_is_valid(heap, &heap->pages[bin])); + } return true; } #endif @@ -106,6 +109,7 @@ static bool mi_heap_page_collect(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_t static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect) { if (heap==NULL || !mi_heap_is_initialized(heap)) return; + mi_assert_expensive(mi_heap_is_valid(heap)); const bool force = (collect >= MI_FORCE); _mi_deferred_free(heap, force); diff --git a/src/init.c b/src/init.c index 5bedab85..4cac1c18 100644 --- a/src/init.c +++ b/src/init.c @@ -50,7 +50,7 @@ const mi_page_t _mi_page_empty = { // Empty page queues for every bin -#define QNULL(sz) { NULL, NULL, (sz)*sizeof(uintptr_t) } +#define QNULL(sz) { NULL, NULL, 0, (sz)*sizeof(uintptr_t) } #define MI_PAGE_QUEUES_EMPTY \ { QNULL(1), \ QNULL( 1), QNULL( 2), QNULL( 3), QNULL( 4), QNULL( 5), QNULL( 6), QNULL( 7), QNULL( 8), /* 8 */ \ diff --git a/src/page-queue.c b/src/page-queue.c index 9e3aaacc..5365c0b7 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -49,6 +49,10 @@ static inline bool mi_page_queue_is_special(const mi_page_queue_t* pq) { return (pq->block_size > MI_LARGE_MAX_OBJ_SIZE); } +static inline size_t mi_page_queue_count(const mi_page_queue_t* pq) { + return pq->count; +} + /* ----------------------------------------------------------- Bins ----------------------------------------------------------- */ @@ -142,6 +146,25 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* } #endif +bool _mi_page_queue_is_valid(mi_heap_t* heap, const mi_page_queue_t* pq) { + if (pq==NULL) return false; + size_t count = 0; + mi_page_t* prev = NULL; + for (mi_page_t* page = pq->first; page != NULL; page = page->next) { + mi_assert_internal(page->prev == prev); + mi_assert_internal(mi_page_block_size(page) == pq->block_size); + mi_assert_internal(page->heap == heap); + if (page->next == NULL) { + mi_assert_internal(pq->last == page); + } + count++; + prev = page; + } + mi_assert_internal(pq->count == count); + return true; +} + + static mi_page_queue_t* mi_heap_page_queue_of(mi_heap_t* heap, const mi_page_t* page) { mi_assert_internal(heap!=NULL); uint8_t bin = (mi_page_is_in_full(page) ? MI_BIN_FULL : (mi_page_is_huge(page) ? 
MI_BIN_HUGE : mi_bin(mi_page_block_size(page)))); @@ -211,6 +234,7 @@ static bool mi_page_queue_is_empty(mi_page_queue_t* queue) { static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_assert_internal(page != NULL); mi_assert_expensive(mi_page_queue_contains(queue, page)); + mi_assert_internal(queue->count >= 1); mi_assert_internal(mi_page_block_size(page) == queue->block_size || (mi_page_is_huge(page) && mi_page_queue_is_huge(queue)) || (mi_page_is_in_full(page) && mi_page_queue_is_full(queue))); @@ -225,6 +249,7 @@ static void mi_page_queue_remove(mi_page_queue_t* queue, mi_page_t* page) { mi_heap_queue_first_update(heap,queue); } heap->page_count--; + queue->count--; page->next = NULL; page->prev = NULL; mi_page_set_in_full(page,false); @@ -253,6 +278,7 @@ static void mi_page_queue_push(mi_heap_t* heap, mi_page_queue_t* queue, mi_page_ else { queue->first = queue->last = page; } + queue->count++; // update direct mi_heap_queue_first_update(heap, queue); @@ -279,6 +305,7 @@ static void mi_page_queue_push_at_end(mi_heap_t* heap, mi_page_queue_t* queue, m else { queue->first = queue->last = page; } + queue->count++; // update direct if (queue->first == page) { @@ -298,6 +325,7 @@ static void mi_page_queue_move_to_front(mi_heap_t* heap, mi_page_queue_t* queue, static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* from, bool enqueue_at_end, mi_page_t* page) { mi_assert_internal(page != NULL); + mi_assert_internal(from->count >= 1); mi_assert_expensive(mi_page_queue_contains(from, page)); mi_assert_expensive(!mi_page_queue_contains(to, page)); const size_t bsize = mi_page_block_size(page); @@ -320,8 +348,10 @@ static void mi_page_queue_enqueue_from_ex(mi_page_queue_t* to, mi_page_queue_t* mi_assert_internal(mi_heap_contains_queue(heap, from)); mi_heap_queue_first_update(heap, from); } + from->count--; // insert into `to` + to->count++; if (enqueue_at_end) { // enqueue at the end page->prev = to->last; @@ -378,15 +408,16 @@ static void mi_page_queue_enqueue_from_full(mi_page_queue_t* to, mi_page_queue_t size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append) { mi_assert_internal(mi_heap_contains_queue(heap,pq)); mi_assert_internal(pq->block_size == append->block_size); - + if (append->first==NULL) return 0; - + // set append pages to new heap and count size_t count = 0; for (mi_page_t* page = append->first; page != NULL; page = page->next) { mi_page_set_heap(page, heap); count++; } + mi_assert_internal(count == append->count); if (pq->last==NULL) { // take over afresh @@ -403,5 +434,7 @@ size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue append->first->prev = pq->last; pq->last = append->last; } + pq->count += append->count; + return count; } From 0cbdcfac94780061af20b3c39e9f21ab41ddd400 Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 11 Feb 2025 16:07:35 -0800 Subject: [PATCH 251/264] fix signed warning --- src/heap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/heap.c b/src/heap.c index 57bb2f52..ac67698a 100644 --- a/src/heap.c +++ b/src/heap.c @@ -63,7 +63,7 @@ static bool mi_heap_page_is_valid(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_ static bool mi_heap_is_valid(mi_heap_t* heap) { mi_assert_internal(heap!=NULL); mi_heap_visit_pages(heap, &mi_heap_page_is_valid, NULL, NULL); - for (int bin = 0; bin < MI_BIN_COUNT; bin++) { + for (size_t bin = 0; bin < MI_BIN_COUNT; bin++) { mi_assert_internal(_mi_page_queue_is_valid(heap, &heap->pages[bin])); } 
return true; From cd2763aa3dbea905231798cec23c1ba0eaa1f7f7 Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 11 Feb 2025 16:27:25 -0800 Subject: [PATCH 252/264] fix compile warnings and assertion --- src/page-queue.c | 15 ++++++++++++--- 1 file changed, 12 insertions(+), 3 deletions(-) diff --git a/src/page-queue.c b/src/page-queue.c index 1ffbbf2a..6e8b0853 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -141,12 +141,21 @@ static bool mi_heap_contains_queue(const mi_heap_t* heap, const mi_page_queue_t* #endif bool _mi_page_queue_is_valid(mi_heap_t* heap, const mi_page_queue_t* pq) { + MI_UNUSED_RELEASE(heap); if (pq==NULL) return false; - size_t count = 0; - mi_page_t* prev = NULL; + size_t count = 0; MI_UNUSED_RELEASE(count); + mi_page_t* prev = NULL; MI_UNUSED_RELEASE(prev); for (mi_page_t* page = pq->first; page != NULL; page = page->next) { mi_assert_internal(page->prev == prev); - mi_assert_internal(mi_page_block_size(page) == pq->block_size); + if (mi_page_is_in_full(page)) { + mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 2); + } + else if (mi_page_is_huge(page)) { + mi_assert_internal(_mi_wsize_from_size(pq->block_size) == MI_LARGE_MAX_OBJ_WSIZE + 1); + } + else { + mi_assert_internal(mi_page_block_size(page) == pq->block_size); + } mi_assert_internal(page->heap == heap); if (page->next == NULL) { mi_assert_internal(pq->last == page); From 2775be9bed98f20a8856bd2dd0b893ea41eacc0f Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 11 Feb 2025 16:28:08 -0800 Subject: [PATCH 253/264] disable page commit_on_demand by default --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index 485beb48..d1bdd716 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,7 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? 
- { 2, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) }; static void mi_option_init(mi_option_desc_t* desc); From 62848bd0722121da50ea6181a0c685c45518394f Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 17 Feb 2025 14:19:56 -0800 Subject: [PATCH 254/264] remove -mtune flag for now (issue #1010) --- CMakeLists.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3bb15e8c..c9217001 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -437,15 +437,15 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999) set(MI_OPT_ARCH_FLAGS "") if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES) - list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a;-mtune=native") + list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a") endif() if("x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_x86_64;-march=haswell;-Xarch_x86_64;-mavx2") endif() elseif(MI_ARCH STREQUAL "x64") - set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2;-mtune=native") # fast bit scan (since 2013) + set(MI_OPT_ARCH_FLAGS "-march=haswell;-mavx2") # fast bit scan (since 2013) elseif(MI_ARCH STREQUAL "arm64") - set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a;-mtune=native") # fast atomics (since 2016) + set(MI_OPT_ARCH_FLAGS "-march=armv8.1-a") # fast atomics (since 2016) endif() endif() endif() From f8857a5189bd95d4db427b884c78fab14b3e18fb Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 18 Feb 2025 06:33:55 -0800 Subject: [PATCH 255/264] fix mi_bsr to not use lzcnt directly --- include/mimalloc/bits.h | 12 +++--------- 1 file changed, 3 insertions(+), 9 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index d4632441..89ec7296 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -231,9 +231,8 @@ static inline size_t mi_clz(size_t x) { #elif mi_has_builtinz(clz) return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS); #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) - if (x==0) return MI_SIZE_BITS; - size_t r; - __asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + size_t r = MI_SIZE_BITS; // bsr leaves destination unmodified if the argument is 0 (see ) + __asm ("bsr\t%1, %0" : "+r"(r) : "r"(x) : "cc"); return (MI_SIZE_BITS - 1 - r); #else #define MI_HAS_FAST_BITSCAN 0 @@ -270,12 +269,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsr(size_t x, size_t* idx) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) - // on x64 the carry flag is set on zero which gives better codegen - bool is_zero; - __asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc"); - return !is_zero; - #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? 
(*idx = (size_t)i, true) : false); #else From dce6ec8b41711621f017b9219bafb601dd04e3aa Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 18 Feb 2025 06:45:12 -0800 Subject: [PATCH 256/264] fix find_highest_bit --- src/bitmap.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/bitmap.c b/src/bitmap.c index 1c28fe44..3907e91d 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -49,7 +49,7 @@ static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bfield_find_highest_bit(mi_bfield_t x, size_t* idx) { - return mi_bsf(x, idx); + return mi_bsr(x, idx); } From 7e611f7545a70b4db7c561f04688ce483bf6a37b Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 18 Feb 2025 16:04:50 -0800 Subject: [PATCH 257/264] merge from dev --- src/prim/unix/prim.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c index 1c33288e..04d931d7 100644 --- a/src/prim/unix/prim.c +++ b/src/prim/unix/prim.c @@ -57,7 +57,7 @@ terms of the MIT license. A copy of the license can be found in the file #include #endif -#if defined(__linux__) || defined(__FreeBSD__) +#if (defined(__linux__) && !defined(__ANDROID__)) || defined(__FreeBSD__) #define MI_HAS_SYSCALL_H #include #endif From a7f11cd2b03c45c71f21e7ce493c6e81bb07e644 Mon Sep 17 00:00:00 2001 From: daanx Date: Thu, 20 Feb 2025 14:39:35 -0800 Subject: [PATCH 258/264] define mi_clz/ctz in portable way on x64 that does not require BMI1 (issue #1016) --- include/mimalloc/bits.h | 43 ++++++++++++++++++++++++----------------- 1 file changed, 25 insertions(+), 18 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 89ec7296..9b1d75f7 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -195,19 +195,24 @@ size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); static inline size_t mi_ctz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0 - size_t r; - __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + #if defined(__GNUC__) && MI_ARCH_X64 + // tzcnt is interpreted as bsf if BMI1 is not supported (pre-haswell) + // tzcnt sets carry-flag on zero, while bsf sets the zero-flag + // tzcnt sets the result to MI_SIZE_BITS if the argument 0 + // bsf leaves destination _unmodified_ if the argument is 0 (both AMD and Intel now, see ) + // so we always initialize r to MI_SIZE_BITS to work correctly on all cpu's without branching + size_t r = MI_SIZE_BITS; + __asm ("tzcnt\t%1, %0" : "+r"(r) : "r"(x) : "cc"); // use '+r' to keep the assignment to r in case this becomes bsf on older cpu's return r; - #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) - return _tzcnt_u64(x); + #elif mi_has_builtinz(ctz) + return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); + #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) + return (x!=0 ? _tzcnt_u64(x) : MI_SIZE_BITS); // ensure it still works on older cpu's as well #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); - #elif mi_has_builtinz(ctz) - return (x!=0 ? 
(size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); - #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) - size_t r = MI_SIZE_BITS; // bsf leaves destination unmodified if the argument is 0 (see ) + #elif defined(__GNUC__) && MI_ARCH_X86 + size_t r = MI_SIZE_BITS; __asm ("bsf\t%1, %0" : "+r"(r) : "r"(x) : "cc"); return r; #elif MI_HAS_FAST_POPCOUNT @@ -219,20 +224,21 @@ static inline size_t mi_ctz(size_t x) { } static inline size_t mi_clz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 + // we don't optimize to lzcnt as there are still non BMI1 cpu's around (like Intel Celeron, see issue #1016) + // on pre-haswell cpu's lzcnt gets executed as bsr which is not equivalent (at it returns the bit position) + #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 size_t r; __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; - #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) - return _lzcnt_u64(x); + #elif mi_has_builtinz(clz) + return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS); #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS); - #elif mi_has_builtinz(clz) - return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS); #elif defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86) - size_t r = MI_SIZE_BITS; // bsr leaves destination unmodified if the argument is 0 (see ) - __asm ("bsr\t%1, %0" : "+r"(r) : "r"(x) : "cc"); + if (x==0) return MI_SIZE_BITS; + size_t r; + __asm ("bsr\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return (MI_SIZE_BITS - 1 - r); #else #define MI_HAS_FAST_BITSCAN 0 @@ -252,12 +258,13 @@ static inline size_t mi_clz(size_t x) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsf(size_t x, size_t* idx) { - #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) + // see note in `mi_ctz` + #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); return !is_zero; - #elif 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanForward)(&i, x) ? 
(*idx = (size_t)i, true) : false); #else From b59c1f8ce4ba7d88a1782118cfd5ac6d7dbee876 Mon Sep 17 00:00:00 2001 From: daanx Date: Mon, 24 Feb 2025 13:48:21 -0800 Subject: [PATCH 259/264] update comments --- include/mimalloc/bits.h | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 9b1d75f7..335fbab7 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -197,9 +197,9 @@ size_t _mi_ctz_generic(size_t x); static inline size_t mi_ctz(size_t x) { #if defined(__GNUC__) && MI_ARCH_X64 // tzcnt is interpreted as bsf if BMI1 is not supported (pre-haswell) - // tzcnt sets carry-flag on zero, while bsf sets the zero-flag - // tzcnt sets the result to MI_SIZE_BITS if the argument 0 - // bsf leaves destination _unmodified_ if the argument is 0 (both AMD and Intel now, see ) + // if the argument is zero: + // - tzcnt: sets carry-flag, and returns MI_SIZE_BITS + // - bsf : sets zero-flag, and leaves the destination _unmodified_ (on both AMD and Intel now, see ) // so we always initialize r to MI_SIZE_BITS to work correctly on all cpu's without branching size_t r = MI_SIZE_BITS; __asm ("tzcnt\t%1, %0" : "+r"(r) : "r"(x) : "cc"); // use '+r' to keep the assignment to r in case this becomes bsf on older cpu's @@ -207,7 +207,7 @@ static inline size_t mi_ctz(size_t x) { #elif mi_has_builtinz(ctz) return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS); #elif defined(_MSC_VER) && MI_ARCH_X64 && defined(__BMI1__) - return (x!=0 ? _tzcnt_u64(x) : MI_SIZE_BITS); // ensure it still works on older cpu's as well + return (x!=0 ? _tzcnt_u64(x) : MI_SIZE_BITS); // ensure it still works on non-BMI1 cpu's as well #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long idx; return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS); @@ -224,7 +224,7 @@ static inline size_t mi_ctz(size_t x) { } static inline size_t mi_clz(size_t x) { - // we don't optimize to lzcnt as there are still non BMI1 cpu's around (like Intel Celeron, see issue #1016) + // we don't optimize anymore to lzcnt as there are still non BMI1 cpu's around (like Intel Celeron, see issue #1016) // on pre-haswell cpu's lzcnt gets executed as bsr which is not equivalent (at it returns the bit position) #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 size_t r; @@ -258,7 +258,7 @@ static inline size_t mi_clz(size_t x) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). 
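// Worked example of the zero-argument behaviour the portable `mi_ctz` earlier in this
// header relies on (a sketch restating the comments in this patch, not an added guarantee):
//
//   size_t r = MI_SIZE_BITS;                            // pre-set, 64 on a 64-bit target
//   __asm ("tzcnt\t%1, %0" : "+r"(r) : "r"(x) : "cc");  // decodes as `bsf` on pre-BMI1 cpu's
//   // x == 8: tzcnt and bsf both write 3 into r
//   // x == 0: tzcnt writes MI_SIZE_BITS into r; bsf leaves r unmodified, so r stays MI_SIZE_BITS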
static inline bool mi_bsf(size_t x, size_t* idx) { - // see note in `mi_ctz` + // we don't optimize anymore to lzcnt so we run correctly on older cpu's as well #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; From c3fc75e0ff27d72880906a2d48d5f21f6a195402 Mon Sep 17 00:00:00 2001 From: Daan Leijen Date: Fri, 28 Feb 2025 16:26:45 -0800 Subject: [PATCH 260/264] update clz/ctz for BMI1 --- include/mimalloc/bits.h | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h index 335fbab7..fc56e8ea 100644 --- a/include/mimalloc/bits.h +++ b/include/mimalloc/bits.h @@ -195,7 +195,11 @@ size_t _mi_clz_generic(size_t x); size_t _mi_ctz_generic(size_t x); static inline size_t mi_ctz(size_t x) { - #if defined(__GNUC__) && MI_ARCH_X64 + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + size_t r; + __asm ("tzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_X64 // tzcnt is interpreted as bsf if BMI1 is not supported (pre-haswell) // if the argument is zero: // - tzcnt: sets carry-flag, and returns MI_SIZE_BITS @@ -226,7 +230,7 @@ static inline size_t mi_ctz(size_t x) { static inline size_t mi_clz(size_t x) { // we don't optimize anymore to lzcnt as there are still non BMI1 cpu's around (like Intel Celeron, see issue #1016) // on pre-haswell cpu's lzcnt gets executed as bsr which is not equivalent (at it returns the bit position) - #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0 size_t r; __asm ("lzcnt\t%1, %0" : "=r"(r) : "r"(x) : "cc"); return r; @@ -259,7 +263,7 @@ static inline size_t mi_clz(size_t x) { // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsf(size_t x, size_t* idx) { // we don't optimize anymore to lzcnt so we run correctly on older cpu's as well - #if 0 && defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) && (!defined(__clang_major__) || __clang_major__ >= 9) // on x64 the carry flag is set on zero which gives better codegen bool is_zero; __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); @@ -276,7 +280,7 @@ static inline bool mi_bsf(size_t x, size_t* idx) { // return false if `x==0` (with `*idx` undefined) and true otherwise, // with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). static inline bool mi_bsr(size_t x, size_t* idx) { - #if 0 && defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) unsigned long i; return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? 
(*idx = (size_t)i, true) : false); #else From 6fce7b90a477b14605f3e301fe2e2adcc009b6c8 Mon Sep 17 00:00:00 2001 From: daanx Date: Fri, 28 Feb 2025 16:51:59 -0800 Subject: [PATCH 261/264] reduce object class sizes (/8), add max reclaim queue size --- include/mimalloc.h | 1 + include/mimalloc/types.h | 4 ++-- src/free.c | 51 +++++++++++++++++++++------------------- src/options.c | 3 ++- 4 files changed, 32 insertions(+), 27 deletions(-) diff --git a/include/mimalloc.h b/include/mimalloc.h index b14cba52..1a544b6f 100644 --- a/include/mimalloc.h +++ b/include/mimalloc.h @@ -404,6 +404,7 @@ typedef enum mi_option_e { mi_option_max_vabits, // max user space virtual address bits to consider (=48) mi_option_pagemap_commit, // commit the full pagemap (to always catch invalid pointer uses) (=0) mi_option_page_commit_on_demand, // commit page memory on-demand + mi_option_page_reclaim_max, // don't reclaim pages if we already own N pages (in that size class) (=16) _mi_option_last, // legacy option names mi_option_large_os_pages = mi_option_allow_large_os_pages, diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index a743546e..ba3c43fa 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -339,9 +339,9 @@ typedef struct mi_page_s { #endif // The max object size are checked to not waste more than 12.5% internally over the page sizes. -#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 16 KiB +#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < ~8 KiB #if MI_ENABLE_LARGE_PAGES -#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/4) // < 128 KiB +#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB #define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 512KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin` #else #define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/4) // <= 128 KiB diff --git a/src/free.c b/src/free.c index 418acd02..12bb8e26 100644 --- a/src/free.c +++ b/src/free.c @@ -239,33 +239,36 @@ static void mi_decl_noinline mi_free_try_collect_mt(mi_page_t* page, mi_block_t* // 2. we can try to reclaim the page for ourselves // note: we only reclaim if the page originated from our heap (the heap field is preserved on abandonment) - // to avoid claiming arbitrary object sizes and limit indefinite expansion. This helps benchmarks like `larson` - const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free); - if (reclaim_on_free >= 0 && page->block_size <= MI_SMALL_MAX_OBJ_SIZE) // only for small sized blocks + // to avoid claiming arbitrary object sizes and limit indefinite expansion. This helps benchmarks like `larson` + if (page->block_size <= MI_SMALL_MAX_OBJ_SIZE) // only for small sized blocks { - // get our heap (with the right tag) - // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should - // not reinitialize the heap for this thread. 
(can happen due to thread-local destructors for example -- issue #944) - mi_heap_t* heap = mi_prim_get_default_heap(); - if (heap != page->heap) { - if (mi_heap_is_initialized(heap)) { - heap = _mi_heap_by_tag(heap, page->heap_tag); + const long reclaim_on_free = _mi_option_get_fast(mi_option_page_reclaim_on_free); + if (reclaim_on_free >= 0) { // and reclaiming is allowed + // get our heap (with the right tag) + // note: don't use `mi_heap_get_default()` as we may just have terminated this thread and we should + // not reinitialize the heap for this thread. (can happen due to thread-local destructors for example -- issue #944) + mi_heap_t* heap = mi_prim_get_default_heap(); + if (heap != page->heap) { + if (mi_heap_is_initialized(heap)) { + heap = _mi_heap_by_tag(heap, page->heap_tag); + } } - } - // can we reclaim? - if (heap != NULL && heap->allow_page_reclaim) { - if ((heap == page->heap && mi_page_queue_len_is_atmost(heap, page->block_size, 4)) || // only reclaim if we were the originating heap, and we have at most N pages already + // can we reclaim into this heap? + if (heap != NULL && heap->allow_page_reclaim) { + const long reclaim_max = _mi_option_get_fast(mi_option_page_reclaim_max); + if ((heap == page->heap && mi_page_queue_len_is_atmost(heap, page->block_size, reclaim_max)) || // only reclaim if we were the originating heap, and we have at most N pages already (reclaim_on_free == 1 && // OR if the reclaim across heaps is allowed - !mi_page_is_used_at_frac(page, 8) && // and the page is not too full - !heap->tld->is_in_threadpool && // and not part of a threadpool - _mi_arena_memid_is_suitable(page->memid, heap->exclusive_arena)) // and the memory is suitable - ) - { - // first remove it from the abandoned pages in the arena -- this waits for any readers to finish - _mi_arenas_page_unabandon(page); - _mi_heap_page_reclaim(heap, page); - mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); - return; + !mi_page_is_used_at_frac(page, 8) && // and the page is not too full + !heap->tld->is_in_threadpool && // and not part of a threadpool + _mi_arena_memid_is_suitable(page->memid, heap->exclusive_arena)) // and the memory is suitable + ) + { + // first remove it from the abandoned pages in the arena -- this waits for any readers to finish + _mi_arenas_page_unabandon(page); + _mi_heap_page_reclaim(heap, page); + mi_heap_stat_counter_increase(heap, pages_reclaim_on_free, 1); + return; + } } } } diff --git a/src/options.c b/src/options.c index d1bdd716..b8028afe 100644 --- a/src/options.c +++ b/src/options.c @@ -174,7 +174,8 @@ static mi_option_desc_t options[_mi_option_last] = { 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits { MI_DEFAULT_PAGEMAP_COMMIT, UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront? 
- { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this on overcommit systems (like Linux)) + { 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this only on overcommit systems (like Linux)) + { 16, UNINIT, MI_OPTION(page_reclaim_max) }, // don't reclaim pages if we already own N pages (in that size class) }; static void mi_option_init(mi_option_desc_t* desc); From 632fe6d8c8f87c54b0b37869fd3db1d41425b38b Mon Sep 17 00:00:00 2001 From: Daan Date: Mon, 3 Mar 2025 17:19:20 -0800 Subject: [PATCH 262/264] add MI_WIN_DBG_EXTS option for cmake --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index 3c47671d..52844552 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -40,6 +40,8 @@ option(MI_INSTALL_TOPLEVEL "Install directly into $CMAKE_INSTALL_PREFIX instead option(MI_NO_THP "Disable transparent huge pages support on Linux/Android for the mimalloc process only" OFF) option(MI_EXTRA_CPPDEFS "Extra pre-processor definitions (use as `-DMI_EXTRA_CPPDEFS=\"opt1=val1;opt2=val2\"`)" "") +option(MI_WIN_DBG_EXTS "Build with windows debugger extension points") + # negated options for vcpkg features option(MI_NO_USE_CXX "Use plain C compilation (has priority over MI_USE_CXX)" OFF) option(MI_NO_OPT_ARCH "Do not use architecture specific optimizations (like '-march=armv8.1-a' for example) (has priority over MI_OPT_ARCH)" OFF) @@ -512,6 +514,9 @@ endfunction() if(WIN32) list(APPEND mi_libraries psapi shell32 user32 advapi32 bcrypt) + if(MI_WIN_DBG_EXTS) + list(APPEND mi_libraries dbgeng) # todo: only for the dll? + endif() else() find_link_library("pthread" MI_LIB_PTHREAD) if(MI_LIB_PTHREAD) From a6302f47680193b8c8e3afba571bcea87f0775e4 Mon Sep 17 00:00:00 2001 From: Gustavo Varo Date: Tue, 4 Mar 2025 08:57:24 -0500 Subject: [PATCH 263/264] Add barebones of MiMalloc WinDbg extension --- CMakeLists.txt | 15 ++ ide/vs2022/mimalloc-lib.vcxproj | 5 +- ide/vs2022/mimalloc-lib.vcxproj.filters | 3 + ide/vs2022/mimalloc-override-dll.vcxproj | 4 +- .../mimalloc-override-dll.vcxproj.filters | 3 + src/prim/windows/windbg/mimalloc_dbg.cpp | 146 ++++++++++++++++++ 6 files changed, 173 insertions(+), 3 deletions(-) create mode 100644 src/prim/windows/windbg/mimalloc_dbg.cpp diff --git a/CMakeLists.txt b/CMakeLists.txt index 52844552..2dba9ba1 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -74,6 +74,10 @@ set(mi_sources src/stats.c src/prim/prim.c) +if(WIN32 AND MI_WIN_DBG_EXTS) + list(APPEND mi_sources src/prim/windows/windbg/mimalloc_dbg.cpp) +endif() + set(mi_cflags "") set(mi_cflags_static "") # extra flags for a static library build set(mi_cflags_dynamic "") # extra flags for a shared-object library build @@ -255,6 +259,17 @@ if(MI_TRACK_ETW) endif() endif() +if(MI_WIN_DBG_EXTS) + if(NOT WIN32) + set(MI_WIN_DBG_EXTS OFF) + message(WARNING "Can only enable Windows debbuger extension support on Windows (MI_WIN_DBG_EXTS=OFF)") + endif() + if(MI_WIN_DBG_EXTS) + message(STATUS "Compile with Windows debbuger extension support (MI_WIN_DBG_EXTS=ON)") + list(APPEND mi_defines MI_WIN_DBG_EXTS=1) + endif() +endif() + if(MI_GUARDED) message(STATUS "Compile guard pages behind certain object allocations (MI_GUARDED=ON)") list(APPEND mi_defines MI_GUARDED=1) diff --git a/ide/vs2022/mimalloc-lib.vcxproj b/ide/vs2022/mimalloc-lib.vcxproj index b4bf013e..b5bc9677 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj +++ b/ide/vs2022/mimalloc-lib.vcxproj @@ -211,8 +211,7 @@ - - + dbgeng.lib @@ -477,6 +476,7 @@ 
true + @@ -493,6 +493,7 @@ + diff --git a/ide/vs2022/mimalloc-lib.vcxproj.filters b/ide/vs2022/mimalloc-lib.vcxproj.filters index 6825f113..c2d7db7b 100644 --- a/ide/vs2022/mimalloc-lib.vcxproj.filters +++ b/ide/vs2022/mimalloc-lib.vcxproj.filters @@ -61,6 +61,9 @@ Sources + + Sources + diff --git a/ide/vs2022/mimalloc-override-dll.vcxproj b/ide/vs2022/mimalloc-override-dll.vcxproj index 556d7926..3904e344 100644 --- a/ide/vs2022/mimalloc-override-dll.vcxproj +++ b/ide/vs2022/mimalloc-override-dll.vcxproj @@ -208,7 +208,7 @@ /Zc:__cplusplus %(AdditionalOptions) - $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;%(AdditionalDependencies) + $(ProjectDir)\..\..\bin\mimalloc-redirect.lib;dbgeng.lib;%(AdditionalDependencies) @@ -441,6 +441,7 @@ + @@ -506,6 +507,7 @@ true + diff --git a/ide/vs2022/mimalloc-override-dll.vcxproj.filters b/ide/vs2022/mimalloc-override-dll.vcxproj.filters index ebcf545a..e09e6bb1 100644 --- a/ide/vs2022/mimalloc-override-dll.vcxproj.filters +++ b/ide/vs2022/mimalloc-override-dll.vcxproj.filters @@ -61,6 +61,9 @@ Sources + + Sources + diff --git a/src/prim/windows/windbg/mimalloc_dbg.cpp b/src/prim/windows/windbg/mimalloc_dbg.cpp new file mode 100644 index 00000000..5bc76372 --- /dev/null +++ b/src/prim/windows/windbg/mimalloc_dbg.cpp @@ -0,0 +1,146 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) Microsoft Research +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +#include +#include +#include +#include +#include +#include + +#include "mimalloc.h" +#include "mimalloc/internal.h" + +ULONG64 g_MiMallocBase = 0; +IDebugClient* g_DebugClient = nullptr; +IDebugControl* g_DebugControl = nullptr; +IDebugSymbols3* g_DebugSymbols = nullptr; +IDebugDataSpaces* g_DataSpaces = nullptr; + +// Function to find mimalloc.dll base address at startup +HRESULT FindMimallocBase() +{ + if (g_DebugSymbols == nullptr) + { + return E_FAIL; + } + + return g_DebugSymbols->GetModuleByModuleName("mimalloc", 0, NULL, &g_MiMallocBase); +} + +// Entry point for the extension +extern "C" __declspec(dllexport) HRESULT CALLBACK DebugExtensionInitialize(PULONG version, PULONG flags) +{ + UNREFERENCED_PARAMETER(flags); + + // Ensure Version is valid + if (!version) + { + return E_INVALIDARG; + } + + // Set the version + *version = DEBUG_EXTENSION_VERSION(1, 0); + + HRESULT hr = DebugCreate(__uuidof(IDebugClient), (void**)&g_DebugClient); + if (FAILED(hr)) + { + return hr; + } + + // Query for the IDebugControl interface + hr = g_DebugClient->QueryInterface(__uuidof(IDebugControl), (void**)&g_DebugControl); + if (FAILED(hr)) + { + g_DebugClient->Release(); + + return hr; + } + + hr = g_DebugClient->QueryInterface(__uuidof(IDebugSymbols3), (void**)&g_DebugSymbols); + if (FAILED(hr)) + { + g_DebugControl->Release(); + g_DebugClient->Release(); + + return hr; + } + + hr = g_DebugClient->QueryInterface(__uuidof(IDebugDataSpaces), (void**)&g_DataSpaces); + if (FAILED(hr)) + { + g_DebugSymbols->Release(); + g_DebugControl->Release(); + g_DebugClient->Release(); + + return hr; + } + + // Find mimalloc base address at startup + hr = FindMimallocBase(); + if (FAILED(hr) || g_MiMallocBase == 0) + { + return E_FAIL; // Prevent extension from loading + } + + mi_register_output( + [](const char* msg, void* arg) { + 
g_DebugControl->Output(DEBUG_OUTPUT_ERROR, msg); + g_DebugControl->Output(DEBUG_OUTPUT_ERROR, "\n"); + }, + nullptr); + + g_DebugControl->Output(DEBUG_OUTPUT_NORMAL, "mimalloc.dll base address found: 0x%llx\n", g_MiMallocBase); + + return S_OK; +} + +// Notifies the extension that a debug event has occurred +extern "C" __declspec(dllexport) void CALLBACK DebugExtensionNotify(ULONG notify, ULONG64 argument) +{ + UNREFERENCED_PARAMETER(notify); + UNREFERENCED_PARAMETER(argument); +} + +// Uninitializes the extension +extern "C" __declspec(dllexport) void CALLBACK DebugExtensionUninitialize() +{ + if (g_DebugSymbols) + { + g_DebugSymbols->Release(); + g_DebugSymbols = nullptr; + } + + if (g_DebugControl) + { + g_DebugControl->Release(); + g_DebugControl = nullptr; + } + + if (g_DebugClient) + { + g_DebugClient->Release(); + g_DebugClient = nullptr; + } +} + +// Sample command: !mi_help +extern "C" __declspec(dllexport) HRESULT CALLBACK mi_help(PDEBUG_CLIENT Client, PCSTR args) +{ + UNREFERENCED_PARAMETER(args); + + // Print Help + g_DebugControl->Output(DEBUG_OUTPUT_NORMAL, "Hello from MiMalloc WinDbg Extension!\n"); + + return S_OK; +} + +extern "C" __declspec(dllexport) HRESULT CALLBACK mi_dump_arenas(PDEBUG_CLIENT client, PCSTR args) +{ + mi_debug_show_arenas(); + return S_OK; +} \ No newline at end of file From fcc76cb95cb2d7fbd7644d41dbe0d3d71333c4cb Mon Sep 17 00:00:00 2001 From: Daan Date: Tue, 4 Mar 2025 11:37:41 -0800 Subject: [PATCH 264/264] fix options printing when verbose is off --- src/options.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/options.c b/src/options.c index bf6cf437..864ef273 100644 --- a/src/options.c +++ b/src/options.c @@ -202,7 +202,7 @@ void _mi_options_init(void) { } } #endif - if (!mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } + if (mi_option_is_enabled(mi_option_verbose)) { mi_options_print(); } } #define mi_stringifyx(str) #str // and stringify
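
Note on the new `page_reclaim_max` option introduced in PATCH 261/264: per the `free.c` change, a block freed from another thread only triggers reclaiming the abandoned page back into the originating heap if that heap owns at most `page_reclaim_max` pages in that size class (default 16). Below is a minimal sketch of tuning this at run time; it assumes the public `mi_option_set`/`mi_option_get` API from `mimalloc.h`, and the value 32 is only an illustrative threshold, not a recommended setting. Options of this kind can normally also be set through the matching environment variable (here `MIMALLOC_PAGE_RECLAIM_MAX`).

#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  // Allow free() to reclaim an abandoned page into this heap as long as the
  // heap owns at most 32 pages of that size class (the patch's default is 16).
  // The value 32 is an arbitrary example for illustration.
  mi_option_set(mi_option_page_reclaim_max, 32);

  void* p = mi_malloc(64);   // small object, well within MI_SMALL_MAX_OBJ_SIZE
  mi_free(p);

  printf("page_reclaim_max = %ld\n", mi_option_get(mi_option_page_reclaim_max));
  return 0;
}

With `MIMALLOC_VERBOSE=1` the option table (including the new entry) is printed at startup; that is the code path the final patch (264/264) corrects, so the table is now printed when verbose is enabled rather than when it is disabled.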