diff --git a/ide/vs2022/mimalloc.vcxproj b/ide/vs2022/mimalloc.vcxproj index dddab777..138acf39 100644 --- a/ide/vs2022/mimalloc.vcxproj +++ b/ide/vs2022/mimalloc.vcxproj @@ -120,6 +120,7 @@ CompileAsCpp false stdcpp20 + AdvancedVectorExtensions2 @@ -219,7 +220,6 @@ true true - false @@ -252,17 +252,21 @@ + + + + diff --git a/ide/vs2022/mimalloc.vcxproj.filters b/ide/vs2022/mimalloc.vcxproj.filters index 54ee0fcb..48958be1 100644 --- a/ide/vs2022/mimalloc.vcxproj.filters +++ b/ide/vs2022/mimalloc.vcxproj.filters @@ -13,9 +13,6 @@ Sources - - Sources - Sources @@ -64,6 +61,12 @@ Sources + + Sources + + + Sources + @@ -93,6 +96,12 @@ Headers + + Headers + + + Headers + diff --git a/include/mimalloc/bits.h b/include/mimalloc/bits.h new file mode 100644 index 00000000..642f0f9c --- /dev/null +++ b/include/mimalloc/bits.h @@ -0,0 +1,313 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- + Bit operation, and platform dependent definition (MI_INTPTR_SIZE etc) +---------------------------------------------------------------------------- */ + +#pragma once +#ifndef MI_BITS_H +#define MI_BITS_H + + +// ------------------------------------------------------ +// Size of a pointer. +// We assume that `sizeof(void*)==sizeof(intptr_t)` +// and it holds for all platforms we know of. +// +// However, the C standard only requires that: +// p == (void*)((intptr_t)p)) +// but we also need: +// i == (intptr_t)((void*)i) +// or otherwise one might define an intptr_t type that is larger than a pointer... 
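Aside: the pointer-size assumption stated above can also be checked at compile time. A minimal sketch (not part of this patch), assuming a C11 compiler where `_Static_assert` is available:

    #include <stdint.h>

    // mimalloc relies on pointers and intptr_t having the same size; making the
    // assumption explicit turns a silent mis-configuration into a compile error.
    _Static_assert(sizeof(void*) == sizeof(intptr_t),
                   "mimalloc assumes sizeof(void*) == sizeof(intptr_t)");
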
+// ------------------------------------------------------ + +#if INTPTR_MAX > INT64_MAX +# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) +#elif INTPTR_MAX == INT64_MAX +# define MI_INTPTR_SHIFT (3) +#elif INTPTR_MAX == INT32_MAX +# define MI_INTPTR_SHIFT (2) +#else +#error platform pointers must be 32, 64, or 128 bits +#endif + +#if SIZE_MAX == UINT64_MAX +# define MI_SIZE_SHIFT (3) +typedef int64_t mi_ssize_t; +#elif SIZE_MAX == UINT32_MAX +# define MI_SIZE_SHIFT (2) +typedef int32_t mi_ssize_t; +#else +#error platform objects must be 32 or 64 bits +#endif + +#if (SIZE_MAX/2) > LONG_MAX +# define MI_ZU(x) x##ULL +# define MI_ZI(x) x##LL +#else +# define MI_ZU(x) x##UL +# define MI_ZI(x) x##L +#endif + +#define MI_INTPTR_SIZE (1< +#endif +#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) +#include +#endif + +#if defined(__AVX2__) && !defined(__BMI2__) // msvc +#define __BMI2__ 1 +#endif +#if (defined(__AVX2__) || defined(__BMI2__)) && !defined(__BMI1__) // msvc +#define __BMI1__ 1 +#endif + +/* -------------------------------------------------------------------------------- + Builtin's +-------------------------------------------------------------------------------- */ + +#ifndef __has_builtin +#define __has_builtin(x) 0 +#endif + +#define mi_builtin(name) __builtin_##name +#define mi_has_builtin(name) __has_builtin(__builtin_##name) + +#if (LONG_MAX == INT32_MAX) +#define mi_builtin32(name) mi_builtin(name##l) +#define mi_has_builtin32(name) mi_has_builtin(name##l) +#else +#define mi_builtin32(name) mi_builtin(name) +#define mi_has_builtin32(name) mi_has_builtin(name) +#endif +#if (LONG_MAX == INT64_MAX) +#define mi_builtin64(name) mi_builtin(name##l) +#define mi_has_builtin64(name) mi_has_builtin(name##l) +#else +#define mi_builtin64(name) mi_builtin(name##ll) +#define mi_has_builtin64(name) mi_has_builtin(name##ll) +#endif + +#if (MI_SIZE_BITS == 32) +#define mi_builtin_size(name) mi_builtin32(name) +#define mi_has_builtin_size(name) mi_has_builtin32(name) +#elif (MI_SIZE_BITS == 64) +#define mi_builtin_size(name) mi_builtin64(name) +#define mi_has_builtin_size(name) mi_has_builtin64(name) +#endif + + +/* -------------------------------------------------------------------------------- + Count trailing/leading zero's +-------------------------------------------------------------------------------- */ + +size_t _mi_clz_generic(size_t x); +size_t _mi_ctz_generic(size_t x); + +static inline size_t mi_ctz(size_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + uint64_t r; + __asm volatile ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + #elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_tzcnt_u64(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + #if MI_SIZE_BITS==32 + return (_BitScanForward(&idx, x) ? (size_t)idx : 32); + #else + return (_BitScanForward64(&idx, x) ? (size_t)idx : 64); + #endif + #elif mi_has_builtin_size(ctz) + return (x!=0 ? 
(size_t)mi_builtin_size(ctz)(x) : MI_SIZE_BITS); + #else + #define MI_HAS_FAST_BITSCAN 0 + return _mi_ctz_generic(x); + #endif +} + +static inline size_t mi_clz(size_t x) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + uint64_t r; + __asm volatile ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_ARM64 + uint64_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc"); + return r; + #elif defined(__GNUC__) && MI_ARCH_RISCV + size_t r; + __asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : ); + return r; + #elif MI_ARCH_X64 && defined(__BMI1__) + return (size_t)_lzcnt_u64(x); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long idx; + #if MI_SIZE_BITS==32 + return (_BitScanReverse(&idx, x) ? 31 - (size_t)idx : 32); + #else + return (_BitScanReverse64(&idx, x) ? 63 - (size_t)idx : 64); + #endif + #elif mi_has_builtin_size(clz) + return (x!=0 ? (size_t)mi_builtin_size(clz)(x) : MI_SIZE_BITS); + #else + #define MI_HAS_FAST_BITSCAN 0 + return _mi_clz_generic(x); + #endif +} + +#ifndef MI_HAS_FAST_BITSCAN +#define MI_HAS_FAST_BITSCAN 1 +#endif + +/* -------------------------------------------------------------------------------- + find trailing/leading zero (bit scan forward/reverse) +-------------------------------------------------------------------------------- */ + +// Bit scan forward: find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bsf(size_t x, size_t* idx) { + #if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) + // on x64 the carry flag is set on zero which gives better codegen + bool is_zero; + __asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" ); + return !is_zero; + #else + *idx = mi_ctz(x); + return (x!=0); + #endif +} + +// Bit scan reverse: find the most significant bit that is set +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bsr(size_t x, size_t* idx) { + #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + #if MI_SIZE_BITS==32 + return (_BitScanReverse(&i, x) ? (*idx = i, true) : false); + #else + return (_BitScanReverse64(&i, x) ? (*idx = i, true) : false); + #endif + #else + const size_t r = mi_clz(x); + *idx = (~r & (MI_SIZE_BITS - 1)); + return (x!=0); + #endif +} + + +/* -------------------------------------------------------------------------------- + find least/most significant bit position +-------------------------------------------------------------------------------- */ + +// Find most significant bit index, or MI_SIZE_BITS if 0 +static inline size_t mi_find_msb(size_t x) { + #if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + unsigned long i; + #if MI_SIZE_BITS==32 + return (_BitScanReverse(&i, x) ? i : 32); + #else + return (_BitScanReverse64(&i, x) ? i : 64); + #endif + #else + return (x==0 ? 
MI_SIZE_BITS : MI_SIZE_BITS - 1 - mi_clz(x)); + #endif +} + +// Find least significant bit index, or MI_SIZE_BITS if 0 (this equals `mi_ctz`, count trailing zero's) +static inline size_t mi_find_lsb(size_t x) { + return mi_ctz(x); +} + + +/* -------------------------------------------------------------------------------- + rotate +-------------------------------------------------------------------------------- */ + +static inline size_t mi_rotr(size_t x, size_t r) { + #if (mi_has_builtin(rotateright64) && MI_SIZE_BITS==64) + return mi_builtin(rotateright64)(x,r); + #elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32) + return mi_builtin(rotateright32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if MI_BFIELD_SIZE==4 + return _lrotr(x,(int)r); + #else + return _rotr64(x,(int)r); + #endif + #else + // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); + return (x >> rshift) | (x << ((-rshift) & (MI_SIZE_BITS-1))); + #endif +} + +static inline size_t mi_rotl(size_t x, size_t r) { + #if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64) + return mi_builtin(rotateleft64)(x,r); + #elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32) + return mi_builtin(rotateleft32)(x,r); + #elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32) + #if MI_SIZE_BITS==32 + return _lrotl(x,(int)r); + #else + return _rotl64(x,(int)r); + #endif + #else + // The term `(-rshift)&(MI_BFIELD_BITS-1)` is written instead of `MI_BFIELD_BITS - rshift` to + // avoid UB when `rshift==0`. See + const unsigned int rshift = (unsigned int)(r) & (MI_SIZE_BITS-1); + return (x << rshift) | (x >> ((-rshift) & (MI_SIZE_BITS-1))) + #endif +} + +#endif // MI_BITS_H diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h index 716386d2..b997099e 100644 --- a/include/mimalloc/internal.h +++ b/include/mimalloc/internal.h @@ -16,6 +16,7 @@ terms of the MIT license. A copy of the license can be found in the file #include "types.h" #include "track.h" +#include "bits.h" #if (MI_DEBUG>0) #define mi_trace_message(...) _mi_trace_message(__VA_ARGS__) @@ -23,26 +24,28 @@ terms of the MIT license. A copy of the license can be found in the file #define mi_trace_message(...) 
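Side note on the `mi_rotr`/`mi_rotl` fallback above: the masked shift count exists solely to avoid undefined behavior when the rotation amount is zero. A small sketch (not part of this patch), assuming a 64-bit `size_t` (`MI_SIZE_BITS == 64`):

    #include <stdint.h>

    // A naive rotate-left would compute `x >> (64 - r)`, which is UB for r == 0
    // (shift count equal to the width). Masking keeps both shift counts in 0..63,
    // and r == 0 degenerates to (x << 0) | (x >> 0) == x, the intended identity.
    static inline uint64_t rotl64_sketch(uint64_t x, unsigned r) {
      const unsigned s = r & 63;              // rotation modulo the word size
      return (x << s) | (x >> ((-s) & 63));   // (-s) & 63 == (64 - s) % 64, never 64
    }
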
#endif -#define MI_CACHE_LINE 64 #if defined(_MSC_VER) #pragma warning(disable:4127) // suppress constant conditional warning (due to MI_SECURE paths) #pragma warning(disable:26812) // unscoped enum warning #define mi_decl_noinline __declspec(noinline) #define mi_decl_thread __declspec(thread) -#define mi_decl_cache_align __declspec(align(MI_CACHE_LINE)) +#define mi_decl_align(a) __declspec(align(a)) #define mi_decl_weak #elif (defined(__GNUC__) && (__GNUC__ >= 3)) || defined(__clang__) // includes clang and icc #define mi_decl_noinline __attribute__((noinline)) #define mi_decl_thread __thread -#define mi_decl_cache_align __attribute__((aligned(MI_CACHE_LINE))) +#define mi_decl_align(a) __attribute__((aligned(a))) #define mi_decl_weak __attribute__((weak)) #else #define mi_decl_noinline #define mi_decl_thread __thread // hope for the best :-) -#define mi_decl_cache_align +#define mi_decl_align(a) #define mi_decl_weak #endif +#define mi_decl_cache_align mi_decl_align(64) + + #if defined(__EMSCRIPTEN__) && !defined(__wasi__) #define __wasi__ #endif @@ -89,6 +92,7 @@ void _mi_thread_done(mi_heap_t* heap); void _mi_thread_data_collect(void); void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap); mi_threadid_t _mi_thread_id(void) mi_attr_noexcept; +size_t _mi_thread_seq_id(void) mi_attr_noexcept; mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id); void _mi_heap_guarded_init(mi_heap_t* heap); @@ -96,6 +100,7 @@ void _mi_heap_guarded_init(mi_heap_t* heap); // os.c void _mi_os_init(void); // called from process init void* _mi_os_alloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); +void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats); void _mi_os_free(void* p, size_t size, mi_memid_t memid, mi_stats_t* stats); void _mi_os_free_ex(void* p, size_t size, bool still_committed, mi_memid_t memid, mi_stats_t* stats); @@ -675,15 +680,6 @@ static inline bool mi_is_in_same_page(const void* p, const void* q) { return (idxp == idxq); } -static inline uintptr_t mi_rotl(uintptr_t x, uintptr_t shift) { - shift %= MI_INTPTR_BITS; - return (shift==0 ? x : ((x << shift) | (x >> (MI_INTPTR_BITS - shift)))); -} -static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) { - shift %= MI_INTPTR_BITS; - return (shift==0 ? x : ((x >> shift) | (x << (MI_INTPTR_BITS - shift)))); -} - static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) { void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]); return (p==null ? 
NULL : p); @@ -821,112 +817,6 @@ static inline size_t _mi_os_numa_node_count(void) { } - -// ----------------------------------------------------------------------- -// Count bits: trailing or leading zeros (with MI_INTPTR_BITS on all zero) -// ----------------------------------------------------------------------- - -#if defined(__GNUC__) - -#include // LONG_MAX -#define MI_HAVE_FAST_BITSCAN -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (INTPTR_MAX == LONG_MAX) - return __builtin_clzl(x); -#else - return __builtin_clzll(x); -#endif -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (INTPTR_MAX == LONG_MAX) - return __builtin_ctzl(x); -#else - return __builtin_ctzll(x); -#endif -} - -#elif defined(_MSC_VER) - -#include // LONG_MAX -#include // BitScanReverse64 -#define MI_HAVE_FAST_BITSCAN -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; - unsigned long idx; -#if (INTPTR_MAX == LONG_MAX) - _BitScanReverse(&idx, x); -#else - _BitScanReverse64(&idx, x); -#endif - return ((MI_INTPTR_BITS - 1) - idx); -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; - unsigned long idx; -#if (INTPTR_MAX == LONG_MAX) - _BitScanForward(&idx, x); -#else - _BitScanForward64(&idx, x); -#endif - return idx; -} - -#else -static inline size_t mi_ctz32(uint32_t x) { - // de Bruijn multiplication, see - static const unsigned char debruijn[32] = { - 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, - 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 - }; - if (x==0) return 32; - return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; -} -static inline size_t mi_clz32(uint32_t x) { - // de Bruijn multiplication, see - static const uint8_t debruijn[32] = { - 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, - 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 - }; - if (x==0) return 32; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; -} - -static inline size_t mi_clz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_clz32((uint32_t)x); -#else - size_t count = mi_clz32((uint32_t)(x >> 32)); - if (count < 32) return count; - return (32 + mi_clz32((uint32_t)x)); -#endif -} -static inline size_t mi_ctz(uintptr_t x) { - if (x==0) return MI_INTPTR_BITS; -#if (MI_INTPTR_BITS <= 32) - return mi_ctz32((uint32_t)x); -#else - size_t count = mi_ctz32((uint32_t)x); - if (count < 32) return count; - return (32 + mi_ctz32((uint32_t)(x>>32))); -#endif -} - -#endif - -// "bit scan reverse": Return index of the highest bit (or MI_INTPTR_BITS if `x` is zero) -static inline size_t mi_bsr(uintptr_t x) { - return (x==0 ? MI_INTPTR_BITS : MI_INTPTR_BITS - 1 - mi_clz(x)); -} - - // --------------------------------------------------------------------------------- // Provide our own `_mi_memcpy` for potential performance optimizations. 
// @@ -947,20 +837,20 @@ static inline void _mi_memcpy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } } -static inline void _mi_memzero(void* dst, size_t n) { +static inline void _mi_memset(void* dst, int val, size_t n) { if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) { - __stosb((unsigned char*)dst, 0, n); + __stosb((unsigned char*)dst, (uint8_t)val, n); } else { - memset(dst, 0, n); + memset(dst, val, n); } } #else static inline void _mi_memcpy(void* dst, const void* src, size_t n) { memcpy(dst, src, n); } -static inline void _mi_memzero(void* dst, size_t n) { - memset(dst, 0, n); +static inline void _mi_memset(void* dst, int val, size_t n) { + memset(dst, val, n); } #endif @@ -978,10 +868,10 @@ static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { _mi_memcpy(adst, asrc, n); } -static inline void _mi_memzero_aligned(void* dst, size_t n) { +static inline void _mi_memset_aligned(void* dst, int val, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); void* adst = __builtin_assume_aligned(dst, MI_INTPTR_SIZE); - _mi_memzero(adst, n); + _mi_memset(adst, val, n); } #else // Default fallback on `_mi_memcpy` @@ -990,11 +880,19 @@ static inline void _mi_memcpy_aligned(void* dst, const void* src, size_t n) { _mi_memcpy(dst, src, n); } -static inline void _mi_memzero_aligned(void* dst, size_t n) { +static inline void _mi_memset_aligned(void* dst, int val, size_t n) { mi_assert_internal((uintptr_t)dst % MI_INTPTR_SIZE == 0); - _mi_memzero(dst, n); + _mi_memset(dst, val, n); } #endif +static inline void _mi_memzero(void* dst, size_t n) { + _mi_memset(dst, 0, n); +} + +static inline void _mi_memzero_aligned(void* dst, size_t n) { + _mi_memset_aligned(dst, 0, n); +} + #endif diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h index 56715df4..8a627438 100644 --- a/include/mimalloc/prim.h +++ b/include/mimalloc/prim.h @@ -369,7 +369,4 @@ static inline mi_heap_t* mi_prim_get_default_heap(void) { #endif // mi_prim_get_default_heap() - - - #endif // MIMALLOC_PRIM_H diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h index 44074450..e8705991 100644 --- a/include/mimalloc/types.h +++ b/include/mimalloc/types.h @@ -23,6 +23,7 @@ terms of the MIT license. A copy of the license can be found in the file #include // ptrdiff_t #include // uintptr_t, uint16_t, etc +#include "bits.h" // bit ops, size defines #include "atomic.h" // _Atomic #ifdef _MSC_VER @@ -106,61 +107,6 @@ terms of the MIT license. A copy of the license can be found in the file // #define MI_HUGE_PAGE_ABANDON 1 -// ------------------------------------------------------ -// Platform specific values -// ------------------------------------------------------ - -// ------------------------------------------------------ -// Size of a pointer. -// We assume that `sizeof(void*)==sizeof(intptr_t)` -// and it holds for all platforms we know of. -// -// However, the C standard only requires that: -// p == (void*)((intptr_t)p)) -// but we also need: -// i == (intptr_t)((void*)i) -// or otherwise one might define an intptr_t type that is larger than a pointer... 
-// ------------------------------------------------------ - -#if INTPTR_MAX > INT64_MAX -# define MI_INTPTR_SHIFT (4) // assume 128-bit (as on arm CHERI for example) -#elif INTPTR_MAX == INT64_MAX -# define MI_INTPTR_SHIFT (3) -#elif INTPTR_MAX == INT32_MAX -# define MI_INTPTR_SHIFT (2) -#else -#error platform pointers must be 32, 64, or 128 bits -#endif - -#if SIZE_MAX == UINT64_MAX -# define MI_SIZE_SHIFT (3) -typedef int64_t mi_ssize_t; -#elif SIZE_MAX == UINT32_MAX -# define MI_SIZE_SHIFT (2) -typedef int32_t mi_ssize_t; -#else -#error platform objects must be 32 or 64 bits -#endif - -#if (SIZE_MAX/2) > LONG_MAX -# define MI_ZU(x) x##ULL -# define MI_ZI(x) x##LL -#else -# define MI_ZU(x) x##UL -# define MI_ZI(x) x##L -#endif - -#define MI_INTPTR_SIZE (1<= 655360) #error "mimalloc internal: define more bins" @@ -461,8 +410,6 @@ typedef struct mi_page_queue_s { size_t block_size; } mi_page_queue_t; -#define MI_BIN_FULL (MI_BIN_HUGE+1) - // Random context typedef struct mi_random_cxt_s { uint32_t input[16]; diff --git a/src/bitmap.c b/src/bitmap.c index 976ba72c..3e6311dc 100644 --- a/src/bitmap.c +++ b/src/bitmap.c @@ -18,6 +18,7 @@ between the fields. (This is used in arena allocation) #include "mimalloc.h" #include "mimalloc/internal.h" +#include "mimalloc/bits.h" #include "bitmap.h" /* ----------------------------------------------------------- @@ -53,7 +54,7 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ const size_t mask = mi_bitmap_mask_(count, 0); const size_t bitidx_max = MI_BITMAP_FIELD_BITS - count; -#ifdef MI_HAVE_FAST_BITSCAN +#if MI_HAS_FAST_BITSCAN size_t bitidx = mi_ctz(~map); // quickly find the first zero bit if possible #else size_t bitidx = 0; // otherwise start at 0 @@ -79,7 +80,7 @@ bool _mi_bitmap_try_find_claim_field(mi_bitmap_t bitmap, size_t idx, const size_ } else { // on to the next bit range -#ifdef MI_HAVE_FAST_BITSCAN +#if MI_HAS_FAST_BITSCAN mi_assert_internal(mapm != 0); const size_t shift = (count == 1 ? 1 : (MI_INTPTR_BITS - mi_clz(mapm) - bitidx)); mi_assert_internal(shift > 0 && shift <= count); @@ -146,7 +147,7 @@ static bool mi_bitmap_is_claimedx(mi_bitmap_t bitmap, size_t bitmap_fields, size return ((field & mask) == mask); } -// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. +// Try to set `count` bits at `bitmap_idx` from 0 to 1 atomically. // Returns `true` if successful when all previous `count` bits were 0. 
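A brief note on the bitmap.c hunk above: `bits.h` now always defines `MI_HAS_FAST_BITSCAN` (as `1`, or as `0` when only the generic fallback is compiled), so the guards change from presence tests with `#ifdef` to value tests with `#if`. A minimal sketch (not part of this patch) of why the distinction matters:

    #include <stddef.h>
    #include "mimalloc/bits.h"   // always defines MI_HAS_FAST_BITSCAN as 0 or 1

    // Find the first zero bit in `map`. With the always-defined macro, an #ifdef
    // test would take the fast path even when its value is 0; #if tests the value.
    static size_t first_zero_bit(size_t map) {
    #if MI_HAS_FAST_BITSCAN
      return mi_ctz(~map);                                            // hardware bit-scan
    #else
      size_t idx = 0;
      while (idx < MI_SIZE_BITS && ((map >> idx) & 1) != 0) { idx++; } // generic scan
      return idx;
    #endif
    }
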
bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count, mi_bitmap_index_t bitmap_idx) { const size_t idx = mi_bitmap_index_field(bitmap_idx); @@ -154,9 +155,9 @@ bool _mi_bitmap_try_claim(mi_bitmap_t bitmap, size_t bitmap_fields, size_t count const size_t mask = mi_bitmap_mask_(count, bitidx); mi_assert_internal(bitmap_fields > idx); MI_UNUSED(bitmap_fields); size_t expected = mi_atomic_load_relaxed(&bitmap[idx]); - do { + do { if ((expected & mask) != 0) return false; - } + } while (!mi_atomic_cas_strong_acq_rel(&bitmap[idx], &expected, expected | mask)); mi_assert_internal((expected & mask) == 0); return true; @@ -194,7 +195,7 @@ static bool mi_bitmap_try_find_claim_field_across(mi_bitmap_t bitmap, size_t bit if (initial == 0) return false; if (initial >= count) return _mi_bitmap_try_find_claim_field(bitmap, idx, count, bitmap_idx); // no need to cross fields (this case won't happen for us) if (_mi_divide_up(count - initial, MI_BITMAP_FIELD_BITS) >= (bitmap_fields - idx)) return false; // not enough entries - + // scan ahead size_t found = initial; size_t mask = 0; // mask bits for the final field diff --git a/src/init.c b/src/init.c index a90818a4..2544f097 100644 --- a/src/init.c +++ b/src/init.c @@ -124,6 +124,18 @@ mi_threadid_t _mi_thread_id(void) mi_attr_noexcept { return _mi_prim_thread_id(); } +// Thread sequence number +static _Atomic(size_t) mi_tcount; +static mi_decl_thread size_t mi_tseq; + +size_t _mi_thread_seq_id(void) mi_attr_noexcept { + size_t tseq = mi_tseq; + if (tseq == 0) { + mi_tseq = tseq = mi_atomic_add_acq_rel(&mi_tcount,1); + } + return tseq; +} + // the thread-local default heap for allocation mi_decl_thread mi_heap_t* _mi_heap_default = (mi_heap_t*)&_mi_heap_empty; @@ -169,8 +181,8 @@ mi_stats_t _mi_stats_main = { MI_STATS_NULL }; #if MI_GUARDED mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { heap->guarded_sample_seed = seed; - if (heap->guarded_sample_seed == 0) { - heap->guarded_sample_seed = _mi_heap_random_next(heap); + if (heap->guarded_sample_seed == 0) { + heap->guarded_sample_seed = _mi_heap_random_next(heap); } heap->guarded_sample_rate = sample_rate; if (heap->guarded_sample_rate >= 1) { @@ -188,9 +200,9 @@ void _mi_heap_guarded_init(mi_heap_t* heap) { mi_heap_guarded_set_sample_rate(heap, (size_t)mi_option_get_clamp(mi_option_guarded_sample_rate, 0, LONG_MAX), (size_t)mi_option_get(mi_option_guarded_sample_seed)); - mi_heap_guarded_set_size_bound(heap, + mi_heap_guarded_set_size_bound(heap, (size_t)mi_option_get_clamp(mi_option_guarded_min, 0, LONG_MAX), - (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); + (size_t)mi_option_get_clamp(mi_option_guarded_max, 0, LONG_MAX) ); } #else mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed) { @@ -602,7 +614,7 @@ static void mi_detect_cpu_features(void) { } #else static void mi_detect_cpu_features(void) { - // nothing + // nothing } #endif diff --git a/src/libc.c b/src/libc.c index ce541f1b..05ed7b02 100644 --- a/src/libc.c +++ b/src/libc.c @@ -7,7 +7,7 @@ terms of the MIT license. 
A copy of the license can be found in the file // -------------------------------------------------------- // This module defines various std libc functions to reduce -// the dependency on libc, and also prevent errors caused +// the dependency on libc, and also prevent errors caused // by some libc implementations when called before `main` // executes (due to malloc redirection) // -------------------------------------------------------- @@ -83,7 +83,7 @@ bool _mi_getenv(const char* name, char* result, size_t result_size) { // Define our own limited `_mi_vsnprintf` and `_mi_snprintf` // This is mostly to avoid calling these when libc is not yet // initialized (and to reduce dependencies) -// +// // format: d i, p x u, s // prec: z l ll L // width: 10 @@ -130,7 +130,7 @@ static void mi_out_alignright(char fill, char* start, size_t len, size_t extra, } -static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) +static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* end) { if (x == 0 || base == 0 || base > 16) { if (prefix != 0) { mi_outc(prefix, out, end); } @@ -144,8 +144,8 @@ static void mi_out_num(uintmax_t x, size_t base, char prefix, char** out, char* mi_outc((digit <= 9 ? '0' + digit : 'A' + digit - 10),out,end); x = x / base; } - if (prefix != 0) { - mi_outc(prefix, out, end); + if (prefix != 0) { + mi_outc(prefix, out, end); } size_t len = *out - start; // and reverse in-place @@ -181,7 +181,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { size_t width = 0; char numtype = 'd'; char numplus = 0; - bool alignright = true; + bool alignright = true; if (c == '+' || c == ' ') { numplus = c; MI_NEXTC(); } if (c == '-') { alignright = false; MI_NEXTC(); } if (c == '0') { fill = '0'; MI_NEXTC(); } @@ -191,7 +191,7 @@ void _mi_vsnprintf(char* buf, size_t bufsize, const char* fmt, va_list args) { width = (10 * width) + (c - '0'); MI_NEXTC(); } if (c == 0) break; // extra check due to while - } + } if (c == 'z' || c == 't' || c == 'L') { numtype = c; MI_NEXTC(); } else if (c == 'l') { numtype = c; MI_NEXTC(); @@ -273,3 +273,56 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) 
{ _mi_vsnprintf(buf, buflen, fmt, args); va_end(args); } + + + +// -------------------------------------------------------- +// generic trailing and leading zero count +// -------------------------------------------------------- + +static inline size_t mi_ctz_generic32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8, + 31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9 + }; + if (x==0) return 32; + return debruijn[((x & -(int32_t)x) * 0x077CB531UL) >> 27]; +} + +static inline size_t mi_clz_generic32(uint32_t x) { + // de Bruijn multiplication, see + static const uint8_t debruijn[32] = { + 31, 22, 30, 21, 18, 10, 29, 2, 20, 17, 15, 13, 9, 6, 28, 1, + 23, 19, 11, 3, 16, 14, 7, 24, 12, 4, 8, 25, 5, 26, 27, 0 + }; + if (x==0) return 32; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + return debruijn[(uint32_t)(x * 0x07C4ACDDUL) >> 27]; +} + +size_t _mi_clz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_clz_generic32((uint32_t)x); + #else + const size_t count = mi_clz_generic32((uint32_t)(x >> 32)); + if (count < 32) return count; + return (32 + mi_clz_generic32((uint32_t)x)); + #endif +} + +size_t _mi_ctz_generic(size_t x) { + if (x==0) return MI_SIZE_BITS; + #if (MI_SIZE_BITS <= 32) + return mi_ctz_generic32((uint32_t)x); + #else + const size_t count = mi_ctz_generic32((uint32_t)x); + if (count < 32) return count; + return (32 + mi_ctz_generic32((uint32_t)(x>>32))); + #endif +} diff --git a/src/os.c b/src/os.c index a7130b90..36b167cb 100644 --- a/src/os.c +++ b/src/os.c @@ -359,6 +359,18 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo return p; } +void* _mi_os_zalloc(size_t size, mi_memid_t* memid, mi_stats_t* stats) { + void* p = _mi_os_alloc(size, memid, &_mi_stats_main); + if (p == NULL) return NULL; + + // zero the OS memory if needed + if (!memid->initially_zero) { + _mi_memzero_aligned(p, size); + memid->initially_zero = true; + } + return p; +} + /* ----------------------------------------------------------- OS aligned allocation with an offset. This is used for large alignments > MI_BLOCK_ALIGNMENT_MAX. We use a large mimalloc diff --git a/src/page-queue.c b/src/page-queue.c index 9796f3dc..0a791adb 100644 --- a/src/page-queue.c +++ b/src/page-queue.c @@ -83,9 +83,10 @@ static inline uint8_t mi_bin(size_t size) { #if defined(MI_ALIGN4W) if (wsize <= 16) { wsize = (wsize+3)&~3; } // round to 4x word sizes #endif - wsize--; - // find the highest bit - uint8_t b = (uint8_t)mi_bsr(wsize); // note: wsize != 0 + wsize--; + mi_assert_internal(wsize!=0); + // find the highest bit position + uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin diff --git a/src/xarena.c b/src/xarena.c new file mode 100644 index 00000000..42943f84 --- /dev/null +++ b/src/xarena.c @@ -0,0 +1,1777 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024, Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
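A note on the page-queue.c hunk above: the new expression is the direct clz-based form of the removed `mi_bsr`, and the bin is still derived from the top three bits of the word count. A worked sketch (not part of this patch; the bin formula below paraphrases surrounding code the hunk does not show, and uses the GCC/Clang builtin in place of `mi_clz` to stay self-contained):

    #include <stdint.h>
    #include <stddef.h>

    // For wsize > 8 words: the highest bit b picks a power-of-two range and the next
    // two bits pick one of four sub-bins, giving at most ~12.5% internal fragmentation.
    static uint8_t bin_of_wsize_sketch(size_t wsize) {   // assumes wsize > 8
      wsize--;
      const uint8_t b = (uint8_t)(63 - __builtin_clzll(wsize));       // == MI_SIZE_BITS-1 - mi_clz(wsize)
      return (uint8_t)((b << 2) + ((wsize >> (b - 2)) & 0x03) - 3);   // -3 keeps numbering contiguous with the exact bins for 1..8 words
    }
    // Example: a 520-byte block -> 65 words -> bin_of_wsize_sketch(65) == 21
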
+-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +"Arenas" are fixed area's of OS memory from which we can allocate +large blocks (>= MI_ARENA_MIN_BLOCK_SIZE, 4MiB). +In contrast to the rest of mimalloc, the arenas are shared between +threads and need to be accessed using atomic operations. + +Arenas are also used to for huge OS page (1GiB) reservations or for reserving +OS memory upfront which can be improve performance or is sometimes needed +on embedded devices. We can also employ this with WASI or `sbrk` systems +to reserve large arenas upfront and be able to reuse the memory more effectively. + +The arena allocation needs to be thread safe and we use an atomic bitmap to allocate. +-----------------------------------------------------------------------------*/ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "xbitmap.h" + + +/* ----------------------------------------------------------- + Arena allocation +----------------------------------------------------------- */ + +#define MI_ARENA_BLOCK_SIZE (MI_SMALL_PAGE_SIZE) // 64KiB +#define MI_ARENA_BLOCK_ALIGN (MI_ARENA_BLOCK_SIZE) // 64KiB +#define MI_ARENA_BIN_COUNT (MI_BIN_COUNT) + +#define MI_ARENA_MIN_OBJ_SIZE MI_ARENA_BLOCK_SIZE +#define MI_ARENA_MAX_OBJ_SIZE (MI_BITMAP_CHUNK_BITS * MI_ARENA_BLOCK_SIZE) // for now, cannot cross chunk boundaries + +// A memory arena descriptor +typedef struct mi_arena_s { + mi_arena_id_t id; // arena id; 0 for non-specific + mi_memid_t memid; // memid of the memory area + // _Atomic(uint8_t*) start; // the start of the memory area + // size_t meta_size; // size of the arena structure itself (including its bitmaps) + // mi_memid_t meta_memid; // memid of the arena structure itself (OS or static allocation) + size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`) + int numa_node; // associated NUMA node + bool exclusive; // only allow allocations if specifically for this arena + bool is_large; // memory area consists of large- or huge OS pages (always committed) + mi_lock_t abandoned_visit_lock; // lock is only used when abandoned segments are being visited + _Atomic(mi_msecs_t) purge_expire; // expiration time when blocks should be decommitted from `blocks_decommit`. + + mi_bitmap_t blocks_free; // is the block free? + mi_bitmap_t blocks_committed; // is the block committed? (i.e. accessible) + mi_bitmap_t blocks_purge; // can the block be purged? (block in purge => block in free) + mi_bitmap_t blocks_dirty; // is the block potentially non-zero? + mi_bitmap_t blocks_abandoned[MI_BIN_COUNT]; // abandoned pages per size bin (a set bit means the start of the page) + // the full queue contains abandoned full pages +} mi_arena_t; + +#define MI_MAX_ARENAS (1024) // Limited for now (and takes up .bss) + +// The available arenas +static mi_decl_cache_align _Atomic(mi_arena_t*) mi_arenas[MI_MAX_ARENAS]; +static mi_decl_cache_align _Atomic(size_t) mi_arena_count; // = 0 + + +/* ----------------------------------------------------------- + Arena id's + id = arena_index + 1 +----------------------------------------------------------- */ + +size_t mi_arena_id_index(mi_arena_id_t id) { + return (size_t)(id <= 0 ? 
MI_MAX_ARENAS : id - 1); +} + +static mi_arena_id_t mi_arena_id_create(size_t arena_index) { + mi_assert_internal(arena_index < MI_MAX_ARENAS); + return (int)arena_index + 1; +} + +mi_arena_id_t _mi_arena_id_none(void) { + return 0; +} + +static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { + return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || + (arena_id == req_arena_id)); +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + if (memid.memkind == MI_MEM_ARENA) { + return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + } + else { + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + } +} + +size_t mi_arena_get_count(void) { + return mi_atomic_load_relaxed(&mi_arena_count); +} + +mi_arena_t* mi_arena_from_index(size_t idx) { + mi_assert_internal(idx < mi_arena_get_count()); + return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); +} + + + +/* ----------------------------------------------------------- + Util +----------------------------------------------------------- */ + +// Blocks needed for a given byte size +static size_t mi_block_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); +} + +// Byte size of a number of blocks +static size_t mi_size_of_blocks(size_t bcount) { + return (bcount * MI_ARENA_BLOCK_SIZE); +} + +// Size of an arena +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_size_of_blocks(arena->block_count); +} + +static size_t mi_arena_info_blocks(void) { + const size_t os_page_size = _mi_os_page_size(); + const size_t info_size = _mi_align_up(sizeof(mi_arena_t), os_page_size) + os_page_size; // + guard page + const size_t info_blocks = mi_block_count_of_size(info_size); + return info_blocks; +} + + +// Start of the arena memory area +static uint8_t* mi_arena_start(mi_arena_t* arena) { + return ((uint8_t*)arena); +} + +// Start of a block +void* mi_arena_block_start(mi_arena_t* arena, size_t block_index) { + return (mi_arena_start(arena) + mi_size_of_blocks(block_index)); +} + +// Arena area +void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { + if (size != NULL) *size = 0; + const size_t arena_index = mi_arena_id_index(arena_id); + if (arena_index >= MI_MAX_ARENAS) return NULL; + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + if (arena == NULL) return NULL; + if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + return mi_arena_start(arena); +} + + +// Create an arena memid +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, size_t block_index) { + mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); + memid.mem.arena.id = id; + memid.mem.arena.block_index = block_index; + memid.mem.arena.is_exclusive = is_exclusive; + return memid; +} + +// returns if the arena is exclusive +bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, size_t* block_index) { + mi_assert_internal(memid.memkind == MI_MEM_ARENA); + *arena_index = mi_arena_id_index(memid.mem.arena.id); + *block_index = memid.mem.arena.block_index; + return memid.mem.arena.is_exclusive; +} + + + +/* ----------------------------------------------------------- + Arena Allocation +----------------------------------------------------------- */ + +static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, + bool commit, size_t 
tseq, mi_memid_t* memid, mi_os_tld_t* tld) +{ + MI_UNUSED(arena_index); + mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); + + size_t block_index; + if (!mi_bitmap_try_find_and_clearN(&arena->blocks_free, tseq, needed_bcount, &block_index)) return NULL; + + // claimed it! + void* p = mi_arena_block_start(arena, block_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, block_index); + memid->is_pinned = arena->memid.is_pinned; + + // set the dirty bits + if (arena->memid.initially_zero) { + memid->initially_zero = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, block_index, needed_bcount, NULL); + } + + // set commit state + if (commit) { + // commit requested, but the range may not be committed as a whole: ensure it is committed now + memid->initially_committed = true; + + bool all_already_committed; + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount, &all_already_committed); + if (!all_already_committed) { + bool commit_zero = false; + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + memid->initially_committed = false; + } + else { + if (commit_zero) { memid->initially_zero = true; } + } + } + } + else { + // no need to commit, but check if already fully committed + memid->initially_committed = mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_index, needed_bcount); + } + + return p; +} + +// allocate in a speficic arena +static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, + size_t size, size_t alignment, + bool commit, bool allow_large, mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; + + const size_t bcount = mi_block_count_of_size(size); + const size_t arena_index = mi_arena_id_index(arena_id); + mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); + mi_assert_internal(size <= mi_size_of_blocks(bcount)); + + // Check arena suitability + mi_arena_t* arena = mi_arena_from_index(arena_index); + if (arena == NULL) return NULL; + if (!allow_large && arena->is_large) return NULL; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; + if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (match_numa_node) { if (!numa_suitable) return NULL; } + else { if (numa_suitable) return NULL; } + } + + // try to allocate + void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, tseq, memid, tld); + mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); + return p; +} + + +// allocate from an arena with fallback to the OS +static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, size_t tseq, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert(alignment <= MI_ARENA_BLOCK_ALIGN); + if (alignment > MI_ARENA_BLOCK_ALIGN) return NULL; + + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + if mi_likely(max_arena == 0) return NULL; + + if (req_arena_id != _mi_arena_id_none()) { + // try a specific arena if requested + if (mi_arena_id_index(req_arena_id) < max_arena) { + void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, 
memid, tld); + if (p != NULL) return p; + } + } + else { + // try numa affine allocation + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + + // try from another numa node instead.. + if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + } + } + return NULL; +} + +// try to reserve a fresh arena space +static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t* arena_id) +{ + if (_mi_preloading()) return false; // use OS only while pre loading + if (req_arena_id != _mi_arena_id_none()) return false; + + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + if (arena_count > (MI_MAX_ARENAS - 4)) return false; + + // calc reserve + size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); + if (arena_reserve == 0) return false; + + if (!_mi_os_has_virtual_reserve()) { + arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) + } + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + + if (arena_count >= 8 && arena_count <= 128) { + // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } + } + + // check arena bounds + const size_t min_reserve = mi_size_of_blocks(mi_arena_info_blocks() + 1); + const size_t max_reserve = MI_BITMAP_MAX_BITS * MI_ARENA_BLOCK_SIZE; + if (arena_reserve < min_reserve) { + arena_reserve = min_reserve; + } + else if (arena_reserve > max_reserve) { + arena_reserve = max_reserve; + } + + if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size + + // commit eagerly? + bool arena_commit = false; + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } + + return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); +} + + +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + size_t tseq = _mi_thread_seq_id(); + *memid = _mi_memid_none(); + + const int numa_node = _mi_os_numa_node(tld); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? 
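A note on `mi_arena_reserve` above: new arenas grow exponentially so that long-running processes end up with a few large arenas rather than many small ones. A hedged sketch of the size progression (not part of this patch), assuming the `arena_reserve` option is left at its usual 1 GiB default:

    #include <stddef.h>

    // Reserve size for the next arena when between 8 and 128 arenas already exist:
    // the base is doubled for every 8 existing arenas, clamped at a 2^16 multiplier
    // (mirrors `1 << _mi_clamp(arena_count/8, 0, 16)`, with the overflow check left out).
    static size_t next_arena_reserve_sketch(size_t arena_count, size_t base_reserve) {
      size_t shift = arena_count / 8;
      if (shift > 16) { shift = 16; }
      return base_reserve << shift;   // count 8 -> 2 GiB, 64 -> 256 GiB, 128 -> 64 TiB (1 GiB base)
    }
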
+ if (size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && alignment <= MI_ARENA_BLOCK_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, tseq, memid, tld); + if (p != NULL) return p; + } + } + } + } + + // if we cannot use OS allocation, return NULL + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { + errno = ENOMEM; + return NULL; + } + + // finally, fall back to the OS + if (align_offset > 0) { + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + } + else { + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + } +} + +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); +} + + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats); +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats); + +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(committed_size <= size); + if (p==NULL) return; + if (size==0) return; + const bool all_committed = (committed_size == size); + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) 
+ mi_track_mem_undefined(p, size); + + if (mi_memkind_is_os(memid.memkind)) { + // was a direct OS allocation, pass through + if (!all_committed && committed_size > 0) { + // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + _mi_os_free(p, size, memid, stats); + } + else if (memid.memkind == MI_MEM_ARENA) { + // allocated in an arena + size_t arena_idx; + size_t block_idx; + mi_arena_memid_indices(memid, &arena_idx, &block_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + const size_t blocks = mi_block_count_of_size(size); + + // checks + if (arena == NULL) { + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + mi_assert_internal(block_idx < arena->block_count); + mi_assert_internal(block_idx > mi_arena_info_blocks()); + if (block_idx <= mi_arena_info_blocks() || block_idx > arena->block_count) { + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + + // potentially decommit + if (arena->memid.is_pinned || arena->memid.initially_committed) { + mi_assert_internal(all_committed); + } + else { + if (!all_committed) { + // mark the entire range as no longer committed (so we recommit the full range when re-using) + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + mi_track_mem_noaccess(p, size); + if (committed_size > 0) { + // if partially committed, adjust the committed stats (is it will be recommitted when re-using) + // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + // note: if not all committed, it may be that the purge will reset/decommit the entire range + // that contains already decommitted parts. Since purge consistently uses reset or decommit that + // works (as we should never reset decommitted parts). + } + // (delay) purge the entire range + mi_arena_schedule_purge(arena, block_idx, blocks, stats); + } + + // and make it available to others again + bool all_inuse = mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_free, block_idx, blocks, NULL); + if (!all_inuse) { + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); + return; + }; + } + else { + // arena was none, external, or static; nothing to do + mi_assert_internal(memid.memkind < MI_MEM_OS); + } + + // purge expired decommits + mi_arenas_try_purge(false, false, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. 
+static void mi_arenas_unsafe_destroy(void) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + size_t new_max_arena = 0; + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); + if (mi_memkind_is_os(arena->memid.memkind)) { + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + _mi_os_free(mi_arena_start(arena), mi_arena_size(arena), arena->memid, &_mi_stats_main); + } + } + } + + // try to lower the max arena. + size_t expected = max_arena; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); +} + +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { + mi_arenas_unsafe_destroy(); + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && mi_arena_start(arena) <= (const uint8_t*)p && mi_arena_start(arena) + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + return true; + } + } + return false; +} + + +/* ----------------------------------------------------------- + Add an arena. +----------------------------------------------------------- */ + +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { + mi_assert_internal(arena != NULL); + mi_assert_internal(arena->block_count > 0); + if (arena_id != NULL) { *arena_id = -1; } + + size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + if (i >= MI_MAX_ARENAS) { + mi_atomic_decrement_acq_rel(&mi_arena_count); + return false; + } + _mi_stat_counter_increase(&stats->arena_count,1); + arena->id = mi_arena_id_create(i); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena->id; } + return true; +} + +static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +{ + mi_assert(!is_large || memid.initially_committed && memid.is_pinned); + mi_assert(_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)); + mi_assert(start!=NULL); + if (start==NULL) return false; + if (!_mi_is_aligned(start,MI_ARENA_BLOCK_SIZE)) { + // todo: use alignment in memid to align to blocksize first? 
+ _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_BLOCK_SIZE/MI_KiB, start); + return false; + } + + if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); } + + const size_t info_blocks = mi_arena_info_blocks(); + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; // divide down + if (bcount < info_blocks+1) { + _mi_warning_message("cannot use OS memory since it is not large enough (size %zu KiB, minimum required is %zu KiB)", size/MI_KiB, mi_size_of_blocks(info_blocks+1)/MI_KiB); + return false; + } + if (bcount > MI_BITMAP_MAX_BITS) { + // todo: allow larger areas (either by splitting it up in arena's or having larger arena's) + _mi_warning_message("cannot use OS memory since it is too large (size %zu MiB, maximum is %zu MiB)", size/MI_MiB, mi_size_of_blocks(MI_BITMAP_MAX_BITS)/MI_MiB); + return false; + } + mi_arena_t* arena = (mi_arena_t*)start; + + // commit & zero if needed + bool is_zero = memid.initially_zero; + if (!memid.initially_committed) { + _mi_os_commit(arena, mi_size_of_blocks(info_blocks), &is_zero, &_mi_stats_main); + } + if (!is_zero) { + _mi_memzero(arena, mi_size_of_blocks(info_blocks)); + } + + // init + arena->id = _mi_arena_id_none(); + arena->memid = memid; + arena->exclusive = exclusive; + arena->block_count = bcount; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->is_large = is_large; + arena->purge_expire = 0; + mi_lock_init(&arena->abandoned_visit_lock); + + // init bitmaps + mi_bitmap_init(&arena->blocks_free,true); + mi_bitmap_init(&arena->blocks_committed,true); + mi_bitmap_init(&arena->blocks_dirty,true); + mi_bitmap_init(&arena->blocks_purge,true); + for( int i = 0; i < MI_ARENA_BIN_COUNT; i++) { + mi_bitmap_init(&arena->blocks_abandoned[i],true); + } + + // reserve our meta info (and reserve blocks outside the memory area) + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_free, info_blocks /* start */, arena->block_count - info_blocks); + if (memid.initially_committed) { + mi_bitmap_unsafe_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, arena->block_count); + } + else { + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_committed, 0, info_blocks, NULL); + } + mi_bitmap_xsetN(MI_BIT_SET, &arena->blocks_dirty, 0, info_blocks, NULL); + + return mi_arena_add(arena, arena_id, &_mi_stats_main); +} + + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return mi_manage_os_memory_ex2(start, size, is_large, numa_node, exclusive, memid, arena_id); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + mi_memid_t memid; + void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + if (start == NULL) return ENOMEM; + const bool is_large = memid.is_pinned; // todo: use separate is_large field? 
+ if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); + return ENOMEM; + } + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + return 0; +} + + +// Manage a range of regular OS memory +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { + return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { + return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); +} + + +/* ----------------------------------------------------------- + Debugging +----------------------------------------------------------- */ +static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) { + size_t bit_set_count = 0; + for (int bit = 0; bit < MI_BFIELD_BITS; bit++) { + bool is_set = ((((mi_bfield_t)1 << bit) & field) != 0); + if (is_set) bit_set_count++; + buf[bit] = (is_set ? 'x' : '.'); + } + return bit_set_count; +} + +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_t* bitmap) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bit_count = 0; + size_t bit_set_count = 0; + for (int i = 0; i < MI_BFIELD_BITS && bit_count < block_count; i++) { + char buf[MI_BITMAP_CHUNK_BITS + 1]; + mi_bitmap_chunk_t* chunk = &bitmap->chunks[i]; + for (int j = 0; j < MI_BITMAP_CHUNK_FIELDS; j++) { + if (bit_count < block_count) { + bit_set_count += mi_debug_show_bfield(chunk->bfields[j], buf + j*MI_BFIELD_BITS); + } + else { + _mi_memset(buf + j*MI_BFIELD_BITS, ' ', MI_BFIELD_BITS); + } + bit_count += MI_BFIELD_BITS; + } + buf[MI_BITMAP_CHUNK_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); + } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, bit_set_count); + return bit_set_count; +} + +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { + MI_UNUSED(show_abandoned); + size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t free_total = 0; + size_t block_total = 0; + //size_t abandoned_total = 0; + size_t purge_total = 0; + for (size_t i = 0; i < max_arenas; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; + block_total += arena->block_count; + _mi_verbose_message("arena %zu: %zu blocks%s\n", i, arena->block_count, (arena->memid.is_pinned ? 
", pinned" : "")); + if (show_inuse) { + free_total += mi_debug_show_bitmap(" ", "free blocks", arena->block_count, &arena->blocks_free); + } + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, &arena->blocks_committed); + // todo: abandoned blocks + if (show_purge) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, &arena->blocks_purge); + } + } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", block_total - free_total); + // if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); +} + + +/* ----------------------------------------------------------- + Reserve a huge page arena. +----------------------------------------------------------- */ +// reserve at a specific numa node +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = -1; + if (pages==0) return 0; + if (numa_node < -1) numa_node = -1; + if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); + size_t hsize = 0; + size_t pages_reserved = 0; + mi_memid_t memid; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); + if (p==NULL || pages_reserved==0) { + _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); + return ENOMEM; + } + _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); + + if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + _mi_os_free(p, hsize, memid, &_mi_stats_main); + return ENOMEM; + } + return 0; +} + +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { + return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); +} + +// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { + if (pages == 0) return 0; + + // pages per numa node + size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) numa_count = 1; + const size_t pages_per = pages / numa_count; + const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs==0 ? 
0 : (timeout_msecs / numa_count) + 50); + + // reserve evenly among numa nodes + for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + size_t node_pages = pages_per; // can be 0 + if (numa_node < pages_mod) node_pages++; + int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); + if (err) return err; + if (pages < node_pages) { + pages = 0; + } + else { + pages -= node_pages; + } + } + + return 0; +} + +int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + MI_UNUSED(max_secs); + _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); + if (pages_reserved != NULL) *pages_reserved = 0; + int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); + if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; + return err; +} + + + +/* ----------------------------------------------------------- + Arena purge +----------------------------------------------------------- */ + +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} + +// reset or decommit in an arena and update the committed/decommit bitmaps +// assumes we own the area (i.e. blocks_free is claimed by us) +static void mi_arena_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(!arena->memid.is_pinned); + const size_t size = mi_size_of_blocks(blocks); + void* const p = mi_arena_block_start(arena, block_idx); + bool needs_recommit; + if (mi_bitmap_is_xsetN(MI_BIT_SET, &arena->blocks_committed, block_idx, blocks)) { + // all blocks are committed, we can purge freely + needs_recommit = _mi_os_purge(p, size, stats); + } + else { + // some blocks are not committed -- this can happen when a partially committed block is freed + // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // and also undo the decommit stats (as it was already adjusted) + mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); + if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + } + + // clear the purged blocks + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_purge, blocks, block_idx, NULL); + + // update committed bitmap + if (needs_recommit) { + mi_bitmap_xsetN(MI_BIT_CLEAR, &arena->blocks_committed, blocks, block_idx, NULL); + } +} + + +// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. +// Note: assumes we (still) own the area as we may purge immediately +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t block_idx, size_t blocks, mi_stats_t* stats) { + const long delay = mi_arena_purge_delay(); + if (delay < 0) return; // is purging allowed at all? 
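The effective delay combines the two options multiplicatively; a small tuning sketch (the values are illustrative, not the library defaults, and `mi_arena_purge_delay` itself is internal to this file):

mi_option_set(mi_option_purge_delay, 25);        // base purge delay in milliseconds
mi_option_set(mi_option_arena_purge_mult, 10);   // extra multiplier applied for arena purges
long arena_delay = mi_arena_purge_delay();       // 25 * 10 = 250ms with these values
// arena_delay < 0: purging disabled, == 0: purge immediately on free, > 0: defer by that many ms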
+ + if (_mi_preloading() || delay == 0) { + // decommit directly + mi_arena_purge(arena, block_idx, blocks, stats); + } + else { + // schedule decommit + _mi_error_message(EFAULT, "purging not yet implemented\n"); + } +} + + +static void mi_arenas_try_purge(bool force, bool visit_all, mi_stats_t* stats) { + if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + if (max_arena == 0) return; + + _mi_error_message(EFAULT, "purging not yet implemented\n"); + MI_UNUSED(stats); + MI_UNUSED(visit_all); + MI_UNUSED(force); +} + + +#if 0 + +#define MI_IN_ARENA_C +#include "arena-abandon.c" +#undef MI_IN_ARENA_C + +/* ----------------------------------------------------------- + Arena id's + id = arena_index + 1 +----------------------------------------------------------- */ + +size_t mi_arena_id_index(mi_arena_id_t id) { + return (size_t)(id <= 0 ? MI_MAX_ARENAS : id - 1); +} + +static mi_arena_id_t mi_arena_id_create(size_t arena_index) { + mi_assert_internal(arena_index < MI_MAX_ARENAS); + return (int)arena_index + 1; +} + +mi_arena_id_t _mi_arena_id_none(void) { + return 0; +} + +static bool mi_arena_id_is_suitable(mi_arena_id_t arena_id, bool arena_is_exclusive, mi_arena_id_t req_arena_id) { + return ((!arena_is_exclusive && req_arena_id == _mi_arena_id_none()) || + (arena_id == req_arena_id)); +} + +bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_id_t request_arena_id) { + if (memid.memkind == MI_MEM_ARENA) { + return mi_arena_id_is_suitable(memid.mem.arena.id, memid.mem.arena.is_exclusive, request_arena_id); + } + else { + return mi_arena_id_is_suitable(_mi_arena_id_none(), false, request_arena_id); + } +} + +size_t mi_arena_get_count(void) { + return mi_atomic_load_relaxed(&mi_arena_count); +} + +mi_arena_t* mi_arena_from_index(size_t idx) { + mi_assert_internal(idx < mi_arena_get_count()); + return mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[idx]); +} + + +/* ----------------------------------------------------------- + Arena allocations get a (currently) 16-bit memory id where the + lower 8 bits are the arena id, and the upper bits the block index. 
+----------------------------------------------------------- */ + +static size_t mi_block_count_of_size(size_t size) { + return _mi_divide_up(size, MI_ARENA_BLOCK_SIZE); +} + +static size_t mi_size_of_blocks(size_t bcount) { + return (bcount * MI_ARENA_BLOCK_SIZE); +} + +static size_t mi_arena_size(mi_arena_t* arena) { + return mi_size_of_blocks(arena->block_count); +} + +static mi_memid_t mi_memid_create_arena(mi_arena_id_t id, bool is_exclusive, mi_bitmap_index_t bitmap_index) { + mi_memid_t memid = _mi_memid_create(MI_MEM_ARENA); + memid.mem.arena.id = id; + memid.mem.arena.block_index = bitmap_index; + memid.mem.arena.is_exclusive = is_exclusive; + return memid; +} + +bool mi_arena_memid_indices(mi_memid_t memid, size_t* arena_index, mi_bitmap_index_t* bitmap_index) { + mi_assert_internal(memid.memkind == MI_MEM_ARENA); + *arena_index = mi_arena_id_index(memid.mem.arena.id); + *bitmap_index = memid.mem.arena.block_index; + return memid.mem.arena.is_exclusive; +} + + + +/* ----------------------------------------------------------- + Special static area for mimalloc internal structures + to avoid OS calls (for example, for the arena metadata (~= 256b)) +----------------------------------------------------------- */ + +#define MI_ARENA_STATIC_MAX ((MI_INTPTR_SIZE/2)*MI_KiB) // 4 KiB on 64-bit + +static mi_decl_cache_align uint8_t mi_arena_static[MI_ARENA_STATIC_MAX]; // must be cache aligned, see issue #895 +static mi_decl_cache_align _Atomic(size_t) mi_arena_static_top; + +static void* mi_arena_static_zalloc(size_t size, size_t alignment, mi_memid_t* memid) { + *memid = _mi_memid_none(); + if (size == 0 || size > MI_ARENA_STATIC_MAX) return NULL; + const size_t toplow = mi_atomic_load_relaxed(&mi_arena_static_top); + if ((toplow + size) > MI_ARENA_STATIC_MAX) return NULL; + + // try to claim space + if (alignment < MI_MAX_ALIGN_SIZE) { alignment = MI_MAX_ALIGN_SIZE; } + const size_t oversize = size + alignment - 1; + if (toplow + oversize > MI_ARENA_STATIC_MAX) return NULL; + const size_t oldtop = mi_atomic_add_acq_rel(&mi_arena_static_top, oversize); + size_t top = oldtop + oversize; + if (top > MI_ARENA_STATIC_MAX) { + // try to roll back, ok if this fails + mi_atomic_cas_strong_acq_rel(&mi_arena_static_top, &top, oldtop); + return NULL; + } + + // success + *memid = _mi_memid_create(MI_MEM_STATIC); + memid->initially_zero = true; + const size_t start = _mi_align_up(oldtop, alignment); + uint8_t* const p = &mi_arena_static[start]; + _mi_memzero_aligned(p, size); + return p; +} + +void* _mi_arena_meta_zalloc(size_t size, mi_memid_t* memid) { + *memid = _mi_memid_none(); + + // try static + void* p = mi_arena_static_zalloc(size, MI_MAX_ALIGN_SIZE, memid); + if (p != NULL) return p; + + // or fall back to the OS + p = _mi_os_alloc(size, memid, &_mi_stats_main); + if (p == NULL) return NULL; + + // zero the OS memory if needed + if (!memid->initially_zero) { + _mi_memzero_aligned(p, size); + memid->initially_zero = true; + } + return p; +} + +void _mi_arena_meta_free(void* p, mi_memid_t memid, size_t size) { + if (mi_memkind_is_os(memid.memkind)) { + _mi_os_free(p, size, memid, &_mi_stats_main); + } + else { + mi_assert(memid.memkind == MI_MEM_STATIC); + } +} + +void* mi_arena_block_start(mi_arena_t* arena, mi_bitmap_index_t bindex) { + return (arena->start + mi_size_of_blocks(mi_bitmap_index_bit(bindex))); +} + + +/* ----------------------------------------------------------- + Thread safe allocation in an arena +----------------------------------------------------------- */ + +// claim 
the `blocks_inuse` bits +static bool mi_arena_try_claim(mi_arena_t* arena, size_t blocks, size_t block_idx, mi_stats_t* stats) +{ + size_t idx = 0; // mi_atomic_load_relaxed(&arena->search_idx); // start from last search; ok to be relaxed as the exact start does not matter + if (_mi_bitmap_try_find_from_claim_across(arena->blocks_inuse, arena->field_count, idx, blocks, bitmap_idx, stats)) { + mi_atomic_store_relaxed(&arena->search_idx, mi_bitmap_index_field(*bitmap_idx)); // start search from found location next time around + return true; + }; + return false; +} + + +/* ----------------------------------------------------------- + Arena Allocation +----------------------------------------------------------- */ + +static mi_decl_noinline void* mi_arena_try_alloc_at(mi_arena_t* arena, size_t arena_index, size_t needed_bcount, + bool commit, mi_memid_t* memid, mi_os_tld_t* tld) +{ + MI_UNUSED(arena_index); + mi_assert_internal(mi_arena_id_index(arena->id) == arena_index); + + mi_bitmap_index_t bitmap_index; + if (!mi_arena_try_claim(arena, needed_bcount, &bitmap_index, tld->stats)) return NULL; + + // claimed it! + void* p = mi_arena_block_start(arena, bitmap_index); + *memid = mi_memid_create_arena(arena->id, arena->exclusive, bitmap_index); + memid->is_pinned = arena->memid.is_pinned; + + // none of the claimed blocks should be scheduled for a decommit + if (arena->blocks_purge != NULL) { + // this is thread safe as a potential purge only decommits parts that are not yet claimed as used (in `blocks_inuse`). + _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, needed_bcount, bitmap_index); + } + + // set the dirty bits (todo: no need for an atomic op here?) + if (arena->memid.initially_zero && arena->blocks_dirty != NULL) { + memid->initially_zero = _mi_bitmap_claim_across(arena->blocks_dirty, arena->field_count, needed_bcount, bitmap_index, NULL); + } + + // set commit state + if (arena->blocks_committed == NULL) { + // always committed + memid->initially_committed = true; + } + else if (commit) { + // commit requested, but the range may not be committed as a whole: ensure it is committed now + memid->initially_committed = true; + bool any_uncommitted; + _mi_bitmap_claim_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index, &any_uncommitted); + if (any_uncommitted) { + bool commit_zero = false; + if (!_mi_os_commit(p, mi_size_of_blocks(needed_bcount), &commit_zero, tld->stats)) { + memid->initially_committed = false; + } + else { + if (commit_zero) { memid->initially_zero = true; } + } + } + } + else { + // no need to commit, but check if already fully committed + memid->initially_committed = _mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, needed_bcount, bitmap_index); + } + + return p; +} + +// allocate in a speficic arena +static void* mi_arena_try_alloc_at_id(mi_arena_id_t arena_id, bool match_numa_node, int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) +{ + MI_UNUSED_RELEASE(alignment); + mi_assert(alignment <= MI_SEGMENT_ALIGN); + const size_t bcount = mi_block_count_of_size(size); + const size_t arena_index = mi_arena_id_index(arena_id); + mi_assert_internal(arena_index < mi_atomic_load_relaxed(&mi_arena_count)); + mi_assert_internal(size <= mi_size_of_blocks(bcount)); + + // Check arena suitability + mi_arena_t* arena = mi_arena_from_index(arena_index); + if (arena == NULL) return NULL; + if (!allow_large && 
arena->is_large) return NULL; + if (!mi_arena_id_is_suitable(arena->id, arena->exclusive, req_arena_id)) return NULL; + if (req_arena_id == _mi_arena_id_none()) { // in not specific, check numa affinity + const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node); + if (match_numa_node) { if (!numa_suitable) return NULL; } + else { if (numa_suitable) return NULL; } + } + + // try to allocate + void* p = mi_arena_try_alloc_at(arena, arena_index, bcount, commit, memid, tld); + mi_assert_internal(p == NULL || _mi_is_aligned(p, alignment)); + return p; +} + + +// allocate from an arena with fallback to the OS +static mi_decl_noinline void* mi_arena_try_alloc(int numa_node, size_t size, size_t alignment, + bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld ) +{ + MI_UNUSED(alignment); + mi_assert_internal(alignment <= MI_SEGMENT_ALIGN); + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + if mi_likely(max_arena == 0) return NULL; + + if (req_arena_id != _mi_arena_id_none()) { + // try a specific arena if requested + if (mi_arena_id_index(req_arena_id) < max_arena) { + void* p = mi_arena_try_alloc_at_id(req_arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + else { + // try numa affine allocation + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + + // try from another numa node instead.. + if (numa_node >= 0) { // if numa_node was < 0 (no specific affinity requested), all arena's have been tried already + for (size_t i = 0; i < max_arena; i++) { + void* p = mi_arena_try_alloc_at_id(mi_arena_id_create(i), false /* only proceed if not numa local */, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + } + return NULL; +} + +// try to reserve a fresh arena space +static bool mi_arena_reserve(size_t req_size, bool allow_large, mi_arena_id_t req_arena_id, mi_arena_id_t *arena_id) +{ + if (_mi_preloading()) return false; // use OS only while pre loading + if (req_arena_id != _mi_arena_id_none()) return false; + + const size_t arena_count = mi_atomic_load_acquire(&mi_arena_count); + if (arena_count > (MI_MAX_ARENAS - 4)) return false; + + size_t arena_reserve = mi_option_get_size(mi_option_arena_reserve); + if (arena_reserve == 0) return false; + + if (!_mi_os_has_virtual_reserve()) { + arena_reserve = arena_reserve/4; // be conservative if virtual reserve is not supported (for WASM for example) + } + arena_reserve = _mi_align_up(arena_reserve, MI_ARENA_BLOCK_SIZE); + arena_reserve = _mi_align_up(arena_reserve, MI_SEGMENT_SIZE); + if (arena_count >= 8 && arena_count <= 128) { + // scale up the arena sizes exponentially every 8 entries (128 entries get to 589TiB) + const size_t multiplier = (size_t)1 << _mi_clamp(arena_count/8, 0, 16 ); + size_t reserve = 0; + if (!mi_mul_overflow(multiplier, arena_reserve, &reserve)) { + arena_reserve = reserve; + } + } + if (arena_reserve < req_size) return false; // should be able to at least handle the current allocation size + + // commit eagerly? 
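The decision below is driven by `mi_option_arena_eager_commit`: a value of 2 commits eagerly only when the OS uses overcommit, 1 always commits eagerly, and any other value leaves commit on demand. For example, to force fully committed arenas (illustrative, not a recommended setting):

mi_option_set(mi_option_arena_eager_commit, 1);   // always commit new arenas up front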
+ bool arena_commit = false; + if (mi_option_get(mi_option_arena_eager_commit) == 2) { arena_commit = _mi_os_has_overcommit(); } + else if (mi_option_get(mi_option_arena_eager_commit) == 1) { arena_commit = true; } + + return (mi_reserve_os_memory_ex(arena_reserve, arena_commit, allow_large, false /* exclusive? */, arena_id) == 0); +} + + +void* _mi_arena_alloc_aligned(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, + mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + mi_assert_internal(memid != NULL && tld != NULL); + mi_assert_internal(size > 0); + *memid = _mi_memid_none(); + + const int numa_node = _mi_os_numa_node(tld); // current numa node + + // try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data) + if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) || req_arena_id != _mi_arena_id_none()) { // is arena allocation allowed? + if (size >= MI_ARENA_MIN_OBJ_SIZE && alignment <= MI_SEGMENT_ALIGN && align_offset == 0) { + void* p = mi_arena_try_alloc(numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + + // otherwise, try to first eagerly reserve a new arena + if (req_arena_id == _mi_arena_id_none()) { + mi_arena_id_t arena_id = 0; + if (mi_arena_reserve(size, allow_large, req_arena_id, &arena_id)) { + // and try allocate in there + mi_assert_internal(req_arena_id == _mi_arena_id_none()); + p = mi_arena_try_alloc_at_id(arena_id, true, numa_node, size, alignment, commit, allow_large, req_arena_id, memid, tld); + if (p != NULL) return p; + } + } + } + } + + // if we cannot use OS allocation, return NULL + if (mi_option_is_enabled(mi_option_disallow_os_alloc) || req_arena_id != _mi_arena_id_none()) { + errno = ENOMEM; + return NULL; + } + + // finally, fall back to the OS + if (align_offset > 0) { + return _mi_os_alloc_aligned_at_offset(size, alignment, align_offset, commit, allow_large, memid, tld->stats); + } + else { + return _mi_os_alloc_aligned(size, alignment, commit, allow_large, memid, tld->stats); + } +} + +void* _mi_arena_alloc(size_t size, bool commit, bool allow_large, mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_os_tld_t* tld) +{ + return _mi_arena_alloc_aligned(size, MI_ARENA_BLOCK_SIZE, 0, commit, allow_large, req_arena_id, memid, tld); +} + + +void* mi_arena_area(mi_arena_id_t arena_id, size_t* size) { + if (size != NULL) *size = 0; + size_t arena_index = mi_arena_id_index(arena_id); + if (arena_index >= MI_MAX_ARENAS) return NULL; + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[arena_index]); + if (arena == NULL) return NULL; + if (size != NULL) { *size = mi_size_of_blocks(arena->block_count); } + return arena->start; +} + + +/* ----------------------------------------------------------- + Arena purge +----------------------------------------------------------- */ + +static long mi_arena_purge_delay(void) { + // <0 = no purging allowed, 0=immediate purging, >0=milli-second delay + return (mi_option_get(mi_option_purge_delay) * mi_option_get(mi_option_arena_purge_mult)); +} + +// reset or decommit in an arena and update the committed/decommit bitmaps +// assumes we own the area (i.e. 
blocks_in_use is claimed by us) +static void mi_arena_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(arena->blocks_committed != NULL); + mi_assert_internal(arena->blocks_purge != NULL); + mi_assert_internal(!arena->memid.is_pinned); + const size_t size = mi_size_of_blocks(blocks); + void* const p = mi_arena_block_start(arena, bitmap_idx); + bool needs_recommit; + if (_mi_bitmap_is_claimed_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx)) { + // all blocks are committed, we can purge freely + needs_recommit = _mi_os_purge(p, size, stats); + } + else { + // some blocks are not committed -- this can happen when a partially committed block is freed + // in `_mi_arena_free` and it is conservatively marked as uncommitted but still scheduled for a purge + // we need to ensure we do not try to reset (as that may be invalid for uncommitted memory), + // and also undo the decommit stats (as it was already adjusted) + mi_assert_internal(mi_option_is_enabled(mi_option_purge_decommits)); + needs_recommit = _mi_os_purge_ex(p, size, false /* allow reset? */, stats); + if (needs_recommit) { _mi_stat_increase(&_mi_stats_main.committed, size); } + } + + // clear the purged blocks + _mi_bitmap_unclaim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx); + // update committed bitmap + if (needs_recommit) { + _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + } +} + +// Schedule a purge. This is usually delayed to avoid repeated decommit/commit calls. +// Note: assumes we (still) own the area as we may purge immediately +static void mi_arena_schedule_purge(mi_arena_t* arena, size_t bitmap_idx, size_t blocks, mi_stats_t* stats) { + mi_assert_internal(arena->blocks_purge != NULL); + const long delay = mi_arena_purge_delay(); + if (delay < 0) return; // is purging allowed at all? + + if (_mi_preloading() || delay == 0) { + // decommit directly + mi_arena_purge(arena, bitmap_idx, blocks, stats); + } + else { + // schedule decommit + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire != 0) { + mi_atomic_addi64_acq_rel(&arena->purge_expire, (mi_msecs_t)(delay/10)); // add smallish extra delay + } + else { + mi_atomic_storei64_release(&arena->purge_expire, _mi_clock_now() + delay); + } + _mi_bitmap_claim_across(arena->blocks_purge, arena->field_count, blocks, bitmap_idx, NULL); + } +} + +// purge a range of blocks +// return true if the full range was purged. +// assumes we own the area (i.e. 
blocks_in_use is claimed by us) +static bool mi_arena_purge_range(mi_arena_t* arena, size_t idx, size_t startseqx, size_t bitlen, size_t purge, mi_stats_t* stats) { + const size_t endidx = startseqx + bitlen; + size_t bitseqx = startseqx; + bool all_purged = false; + while (bitseqx < endidx) { + // count consecutive ones in the purge mask + size_t count = 0; + while (bitseqx + count < endidx && (purge & ((size_t)1 << (bitseqx + count))) != 0) { + count++; + } + if (count > 0) { + // found range to be purged + const mi_bitmap_index_t range_idx = mi_bitmap_index_create(idx, bitseqx); + mi_arena_purge(arena, range_idx, count, stats); + if (count == bitlen) { + all_purged = true; + } + } + bitseqx += (count+1); // +1 to skip the zero bit (or end) + } + return all_purged; +} + +// returns true if anything was purged +static bool mi_arena_try_purge(mi_arena_t* arena, mi_msecs_t now, bool force, mi_stats_t* stats) +{ + if (arena->memid.is_pinned || arena->blocks_purge == NULL) return false; + mi_msecs_t expire = mi_atomic_loadi64_relaxed(&arena->purge_expire); + if (expire == 0) return false; + if (!force && expire > now) return false; + + // reset expire (if not already set concurrently) + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire, &expire, (mi_msecs_t)0); + + // potential purges scheduled, walk through the bitmap + bool any_purged = false; + bool full_purge = true; + for (size_t i = 0; i < arena->field_count; i++) { + size_t purge = mi_atomic_load_relaxed(&arena->blocks_purge[i]); + if (purge != 0) { + size_t bitseqx = 0; + while (bitseqx < MI_BITMAP_FIELD_BITS) { + // find consecutive range of ones in the purge mask + size_t bitlen = 0; + while (bitseqx + bitlen < MI_BITMAP_FIELD_BITS && (purge & ((size_t)1 << (bitseqx + bitlen))) != 0) { + bitlen++; + } + // temporarily claim the purge range as "in-use" to be thread-safe with allocation + // try to claim the longest range of corresponding in_use bits + const mi_bitmap_index_t bitmap_index = mi_bitmap_index_create(i, bitseqx); + while( bitlen > 0 ) { + if (_mi_bitmap_try_claim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index)) { + break; + } + bitlen--; + } + // actual claimed bits at `in_use` + if (bitlen > 0) { + // read purge again now that we have the in_use bits + purge = mi_atomic_load_acquire(&arena->blocks_purge[i]); + if (!mi_arena_purge_range(arena, i, bitseqx, bitlen, purge, stats)) { + full_purge = false; + } + any_purged = true; + // release the claimed `in_use` bits again + _mi_bitmap_unclaim(arena->blocks_inuse, arena->field_count, bitlen, bitmap_index); + } + bitseqx += (bitlen+1); // +1 to skip the zero (or end) + } // while bitseqx + } // purge != 0 + } + // if not fully purged, make sure to purge again in the future + if (!full_purge) { + const long delay = mi_arena_purge_delay(); + mi_msecs_t expected = 0; + mi_atomic_casi64_strong_acq_rel(&arena->purge_expire,&expected,_mi_clock_now() + delay); + } + return any_purged; +} + +static void mi_arenas_try_purge( bool force, bool visit_all, mi_stats_t* stats ) { + if (_mi_preloading() || mi_arena_purge_delay() <= 0) return; // nothing will be scheduled + + const size_t max_arena = mi_atomic_load_acquire(&mi_arena_count); + if (max_arena == 0) return; + + // allow only one thread to purge at a time + static mi_atomic_guard_t purge_guard; + mi_atomic_guard(&purge_guard) + { + mi_msecs_t now = _mi_clock_now(); + size_t max_purge_count = (visit_all ? 
max_arena : 1); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + if (mi_arena_try_purge(arena, now, force, stats)) { + if (max_purge_count <= 1) break; + max_purge_count--; + } + } + } + } +} + + +/* ----------------------------------------------------------- + Arena free +----------------------------------------------------------- */ + +void _mi_arena_free(void* p, size_t size, size_t committed_size, mi_memid_t memid, mi_stats_t* stats) { + mi_assert_internal(size > 0 && stats != NULL); + mi_assert_internal(committed_size <= size); + if (p==NULL) return; + if (size==0) return; + const bool all_committed = (committed_size == size); + + // need to set all memory to undefined as some parts may still be marked as no_access (like padding etc.) + mi_track_mem_undefined(p,size); + + if (mi_memkind_is_os(memid.memkind)) { + // was a direct OS allocation, pass through + if (!all_committed && committed_size > 0) { + // if partially committed, adjust the committed stats (as `_mi_os_free` will increase decommit by the full size) + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + _mi_os_free(p, size, memid, stats); + } + else if (memid.memkind == MI_MEM_ARENA) { + // allocated in an arena + size_t arena_idx; + size_t bitmap_idx; + mi_arena_memid_indices(memid, &arena_idx, &bitmap_idx); + mi_assert_internal(arena_idx < MI_MAX_ARENAS); + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t,&mi_arenas[arena_idx]); + mi_assert_internal(arena != NULL); + const size_t blocks = mi_block_count_of_size(size); + + // checks + if (arena == NULL) { + _mi_error_message(EINVAL, "trying to free from an invalid arena: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + mi_assert_internal(arena->field_count > mi_bitmap_index_field(bitmap_idx)); + if (arena->field_count <= mi_bitmap_index_field(bitmap_idx)) { + _mi_error_message(EINVAL, "trying to free from an invalid arena block: %p, size %zu, memid: 0x%zx\n", p, size, memid); + return; + } + + // potentially decommit + if (arena->memid.is_pinned || arena->blocks_committed == NULL) { + mi_assert_internal(all_committed); + } + else { + mi_assert_internal(arena->blocks_committed != NULL); + mi_assert_internal(arena->blocks_purge != NULL); + + if (!all_committed) { + // mark the entire range as no longer committed (so we recommit the full range when re-using) + _mi_bitmap_unclaim_across(arena->blocks_committed, arena->field_count, blocks, bitmap_idx); + mi_track_mem_noaccess(p,size); + if (committed_size > 0) { + // if partially committed, adjust the committed stats (is it will be recommitted when re-using) + // in the delayed purge, we now need to not count a decommit if the range is not marked as committed. + _mi_stat_decrease(&_mi_stats_main.committed, committed_size); + } + // note: if not all committed, it may be that the purge will reset/decommit the entire range + // that contains already decommitted parts. Since purge consistently uses reset or decommit that + // works (as we should never reset decommitted parts). 
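      // illustrative example (not from the patch): freeing a 4-block range of which only 1
      // block is still committed clears `blocks_committed` for all 4 blocks and decreases the
      // committed statistic by 1 block here; the delayed purge later compensates its own stat
      // adjustment (see mi_arena_purge), so the net decrease is exactly the 1 committed block.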
+ } + // (delay) purge the entire range + mi_arena_schedule_purge(arena, bitmap_idx, blocks, stats); + } + + // and make it available to others again + bool all_inuse = _mi_bitmap_unclaim_across(arena->blocks_inuse, arena->field_count, blocks, bitmap_idx); + if (!all_inuse) { + _mi_error_message(EAGAIN, "trying to free an already freed arena block: %p, size %zu\n", p, size); + return; + }; + } + else { + // arena was none, external, or static; nothing to do + mi_assert_internal(memid.memkind < MI_MEM_OS); + } + + // purge expired decommits + mi_arenas_try_purge(false, false, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +static void mi_arenas_unsafe_destroy(void) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + size_t new_max_arena = 0; + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_acquire(mi_arena_t, &mi_arenas[i]); + if (arena != NULL) { + mi_lock_done(&arena->abandoned_visit_lock); + if (arena->start != NULL && mi_memkind_is_os(arena->memid.memkind)) { + mi_atomic_store_ptr_release(mi_arena_t, &mi_arenas[i], NULL); + _mi_os_free(arena->start, mi_arena_size(arena), arena->memid, &_mi_stats_main); + } + else { + new_max_arena = i; + } + _mi_arena_meta_free(arena, arena->meta_memid, arena->meta_size); + } + } + + // try to lower the max arena. + size_t expected = max_arena; + mi_atomic_cas_strong_acq_rel(&mi_arena_count, &expected, new_max_arena); +} + +// Purge the arenas; if `force_purge` is true, amenable parts are purged even if not yet expired +void _mi_arenas_collect(bool force_purge, mi_stats_t* stats) { + mi_arenas_try_purge(force_purge, force_purge /* visit all? */, stats); +} + +// destroy owned arenas; this is unsafe and should only be done using `mi_option_destroy_on_exit` +// for dynamic libraries that are unloaded and need to release all their allocated memory. +void _mi_arena_unsafe_destroy_all(mi_stats_t* stats) { + mi_arenas_unsafe_destroy(); + _mi_arenas_collect(true /* force purge */, stats); // purge non-owned arenas +} + +// Is a pointer inside any of our arenas? +bool _mi_arena_contains(const void* p) { + const size_t max_arena = mi_atomic_load_relaxed(&mi_arena_count); + for (size_t i = 0; i < max_arena; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena != NULL && arena->start <= (const uint8_t*)p && arena->start + mi_size_of_blocks(arena->block_count) > (const uint8_t*)p) { + return true; + } + } + return false; +} + +/* ----------------------------------------------------------- + Add an arena. 
+----------------------------------------------------------- */ + +static bool mi_arena_add(mi_arena_t* arena, mi_arena_id_t* arena_id, mi_stats_t* stats) { + mi_assert_internal(arena != NULL); + mi_assert_internal((uintptr_t)mi_atomic_load_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0); + mi_assert_internal(arena->block_count > 0); + if (arena_id != NULL) { *arena_id = -1; } + + size_t i = mi_atomic_increment_acq_rel(&mi_arena_count); + if (i >= MI_MAX_ARENAS) { + mi_atomic_decrement_acq_rel(&mi_arena_count); + return false; + } + _mi_stat_counter_increase(&stats->arena_count,1); + arena->id = mi_arena_id_create(i); + mi_atomic_store_ptr_release(mi_arena_t,&mi_arenas[i], arena); + if (arena_id != NULL) { *arena_id = arena->id; } + return true; +} + +static bool mi_manage_os_memory_ex2(void* start, size_t size, bool is_large, int numa_node, bool exclusive, mi_memid_t memid, mi_arena_id_t* arena_id) mi_attr_noexcept +{ + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + if (size < MI_ARENA_BLOCK_SIZE) return false; + + if (is_large) { + mi_assert_internal(memid.initially_committed && memid.is_pinned); + } + + const size_t bcount = size / MI_ARENA_BLOCK_SIZE; + const size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS); + const size_t bitmaps = (memid.is_pinned ? 3 : 5); + const size_t asize = sizeof(mi_arena_t) + (bitmaps*fields*sizeof(mi_bitmap_field_t)); + mi_memid_t meta_memid; + mi_arena_t* arena = (mi_arena_t*)_mi_arena_meta_zalloc(asize, &meta_memid); + if (arena == NULL) return false; + + // already zero'd due to zalloc + // _mi_memzero(arena, asize); + arena->id = _mi_arena_id_none(); + arena->memid = memid; + arena->exclusive = exclusive; + arena->meta_size = asize; + arena->meta_memid = meta_memid; + arena->block_count = bcount; + arena->field_count = fields; + arena->start = (uint8_t*)start; + arena->numa_node = numa_node; // TODO: or get the current numa node if -1? (now it allows anyone to allocate on -1) + arena->is_large = is_large; + arena->purge_expire = 0; + arena->search_idx = 0; + mi_lock_init(&arena->abandoned_visit_lock); + // consecutive bitmaps + arena->blocks_dirty = &arena->blocks_inuse[fields]; // just after inuse bitmap + arena->blocks_abandoned = &arena->blocks_inuse[2 * fields]; // just after dirty bitmap + arena->blocks_committed = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[3*fields]); // just after abandoned bitmap + arena->blocks_purge = (arena->memid.is_pinned ? NULL : &arena->blocks_inuse[4*fields]); // just after committed bitmap + // initialize committed bitmap? 
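  // (yes: if the OS memory was handed to us already committed, mark every field as fully
  //  committed up front; the lazily-committed case needs no work here since the metadata
  //  was zero-initialized by _mi_arena_meta_zalloc)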
+ if (arena->blocks_committed != NULL && arena->memid.initially_committed) { + memset((void*)arena->blocks_committed, 0xFF, fields*sizeof(mi_bitmap_field_t)); // cast to void* to avoid atomic warning + } + + // and claim leftover blocks if needed (so we never allocate there) + ptrdiff_t post = (fields * MI_BITMAP_FIELD_BITS) - bcount; + mi_assert_internal(post >= 0); + if (post > 0) { + // don't use leftover bits at the end + mi_bitmap_index_t postseqx = mi_bitmap_index_create(fields - 1, MI_BITMAP_FIELD_BITS - post); + _mi_bitmap_claim(arena->blocks_inuse, fields, post, postseqx, NULL); + } + return mi_arena_add(arena, arena_id, &_mi_stats_main); + +} + +bool mi_manage_os_memory_ex(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + mi_memid_t memid = _mi_memid_create(MI_MEM_EXTERNAL); + memid.initially_committed = is_committed; + memid.initially_zero = is_zero; + memid.is_pinned = is_large; + return mi_manage_os_memory_ex2(start,size,is_large,numa_node,exclusive,memid, arena_id); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory_ex(size_t size, bool commit, bool allow_large, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = _mi_arena_id_none(); + size = _mi_align_up(size, MI_ARENA_BLOCK_SIZE); // at least one block + mi_memid_t memid; + void* start = _mi_os_alloc_aligned(size, MI_SEGMENT_ALIGN, commit, allow_large, &memid, &_mi_stats_main); + if (start == NULL) return ENOMEM; + const bool is_large = memid.is_pinned; // todo: use separate is_large field? + if (!mi_manage_os_memory_ex2(start, size, is_large, -1 /* numa node */, exclusive, memid, arena_id)) { + _mi_os_free_ex(start, size, commit, memid, &_mi_stats_main); + _mi_verbose_message("failed to reserve %zu KiB memory\n", _mi_divide_up(size, 1024)); + return ENOMEM; + } + _mi_verbose_message("reserved %zu KiB memory%s\n", _mi_divide_up(size, 1024), is_large ? " (in large os pages)" : ""); + return 0; +} + + +// Manage a range of regular OS memory +bool mi_manage_os_memory(void* start, size_t size, bool is_committed, bool is_large, bool is_zero, int numa_node) mi_attr_noexcept { + return mi_manage_os_memory_ex(start, size, is_committed, is_large, is_zero, numa_node, false /* exclusive? */, NULL); +} + +// Reserve a range of regular OS memory +int mi_reserve_os_memory(size_t size, bool commit, bool allow_large) mi_attr_noexcept { + return mi_reserve_os_memory_ex(size, commit, allow_large, false, NULL); +} + + +/* ----------------------------------------------------------- + Debugging +----------------------------------------------------------- */ + +static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t block_count, mi_bitmap_field_t* fields, size_t field_count ) { + _mi_verbose_message("%s%s:\n", prefix, header); + size_t bcount = 0; + size_t inuse_count = 0; + for (size_t i = 0; i < field_count; i++) { + char buf[MI_BITMAP_FIELD_BITS + 1]; + uintptr_t field = mi_atomic_load_relaxed(&fields[i]); + for (size_t bit = 0; bit < MI_BITMAP_FIELD_BITS; bit++, bcount++) { + if (bcount < block_count) { + bool inuse = ((((uintptr_t)1 << bit) & field) != 0); + if (inuse) inuse_count++; + buf[bit] = (inuse ? 
'x' : '.'); + } + else { + buf[bit] = ' '; + } + } + buf[MI_BITMAP_FIELD_BITS] = 0; + _mi_verbose_message("%s %s\n", prefix, buf); + } + _mi_verbose_message("%s total ('x'): %zu\n", prefix, inuse_count); + return inuse_count; +} + +void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge) mi_attr_noexcept { + size_t max_arenas = mi_atomic_load_relaxed(&mi_arena_count); + size_t inuse_total = 0; + size_t abandoned_total = 0; + size_t purge_total = 0; + for (size_t i = 0; i < max_arenas; i++) { + mi_arena_t* arena = mi_atomic_load_ptr_relaxed(mi_arena_t, &mi_arenas[i]); + if (arena == NULL) break; + _mi_verbose_message("arena %zu: %zu blocks of size %zuMiB (in %zu fields) %s\n", i, arena->block_count, MI_ARENA_BLOCK_SIZE / MI_MiB, arena->field_count, (arena->memid.is_pinned ? ", pinned" : "")); + if (show_inuse) { + inuse_total += mi_debug_show_bitmap(" ", "inuse blocks", arena->block_count, arena->blocks_inuse, arena->field_count); + } + if (arena->blocks_committed != NULL) { + mi_debug_show_bitmap(" ", "committed blocks", arena->block_count, arena->blocks_committed, arena->field_count); + } + if (show_abandoned) { + abandoned_total += mi_debug_show_bitmap(" ", "abandoned blocks", arena->block_count, arena->blocks_abandoned, arena->field_count); + } + if (show_purge && arena->blocks_purge != NULL) { + purge_total += mi_debug_show_bitmap(" ", "purgeable blocks", arena->block_count, arena->blocks_purge, arena->field_count); + } + } + if (show_inuse) _mi_verbose_message("total inuse blocks : %zu\n", inuse_total); + if (show_abandoned) _mi_verbose_message("total abandoned blocks: %zu\n", abandoned_total); + if (show_purge) _mi_verbose_message("total purgeable blocks: %zu\n", purge_total); +} + + +/* ----------------------------------------------------------- + Reserve a huge page arena. +----------------------------------------------------------- */ +// reserve at a specific numa node +int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept { + if (arena_id != NULL) *arena_id = -1; + if (pages==0) return 0; + if (numa_node < -1) numa_node = -1; + if (numa_node >= 0) numa_node = numa_node % _mi_os_numa_node_count(); + size_t hsize = 0; + size_t pages_reserved = 0; + mi_memid_t memid; + void* p = _mi_os_alloc_huge_os_pages(pages, numa_node, timeout_msecs, &pages_reserved, &hsize, &memid); + if (p==NULL || pages_reserved==0) { + _mi_warning_message("failed to reserve %zu GiB huge pages\n", pages); + return ENOMEM; + } + _mi_verbose_message("numa node %i: reserved %zu GiB huge pages (of the %zu GiB requested)\n", numa_node, pages_reserved, pages); + + if (!mi_manage_os_memory_ex2(p, hsize, true, numa_node, exclusive, memid, arena_id)) { + _mi_os_free(p, hsize, memid, &_mi_stats_main); + return ENOMEM; + } + return 0; +} + +int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msecs) mi_attr_noexcept { + return mi_reserve_huge_os_pages_at_ex(pages, numa_node, timeout_msecs, false, NULL); +} + +// reserve huge pages evenly among the given number of numa nodes (or use the available ones as detected) +int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t timeout_msecs) mi_attr_noexcept { + if (pages == 0) return 0; + + // pages per numa node + size_t numa_count = (numa_nodes > 0 ? 
numa_nodes : _mi_os_numa_node_count()); + if (numa_count <= 0) numa_count = 1; + const size_t pages_per = pages / numa_count; + const size_t pages_mod = pages % numa_count; + const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50); + + // reserve evenly among numa nodes + for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) { + size_t node_pages = pages_per; // can be 0 + if (numa_node < pages_mod) node_pages++; + int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per); + if (err) return err; + if (pages < node_pages) { + pages = 0; + } + else { + pages -= node_pages; + } + } + + return 0; +} + +int mi_reserve_huge_os_pages(size_t pages, double max_secs, size_t* pages_reserved) mi_attr_noexcept { + MI_UNUSED(max_secs); + _mi_warning_message("mi_reserve_huge_os_pages is deprecated: use mi_reserve_huge_os_pages_interleave/at instead\n"); + if (pages_reserved != NULL) *pages_reserved = 0; + int err = mi_reserve_huge_os_pages_interleave(pages, 0, (size_t)(max_secs * 1000.0)); + if (err==0 && pages_reserved!=NULL) *pages_reserved = pages; + return err; +} + + +#endif \ No newline at end of file diff --git a/src/xbitmap.c b/src/xbitmap.c new file mode 100644 index 00000000..68525c84 --- /dev/null +++ b/src/xbitmap.c @@ -0,0 +1,599 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2024 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. +-----------------------------------------------------------------------------*/ + +/* ---------------------------------------------------------------------------- +Concurrent bitmap that can set/reset sequences of bits atomically +---------------------------------------------------------------------------- */ + +#include "mimalloc.h" +#include "mimalloc/internal.h" +#include "mimalloc/bits.h" +#include "xbitmap.h" + +/* -------------------------------------------------------------------------------- + bfields +-------------------------------------------------------------------------------- */ + +static inline size_t mi_bfield_ctz(mi_bfield_t x) { + return mi_ctz(x); +} + +static inline size_t mi_bfield_clz(mi_bfield_t x) { + return mi_clz(x); +} + +// find the least significant bit that is set (i.e. count trailing zero's) +// return false if `x==0` (with `*idx` undefined) and true otherwise, +// with the `idx` is set to the bit index (`0 <= *idx < MI_BFIELD_BITS`). +static inline bool mi_bfield_find_least_bit(mi_bfield_t x, size_t* idx) { + return mi_bsf(x,idx); +} + +static inline mi_bfield_t mi_bfield_rotate_right(mi_bfield_t x, size_t r) { + return mi_rotr(x,r); +} + +// Set/clear a bit atomically. Returns `true` if the bit transitioned from 0 to 1 (or 1 to 0). +static inline bool mi_bfield_atomic_xset(mi_bit_t set, _Atomic(mi_bfield_t)*b, size_t idx) { + mi_assert_internal(idx < MI_BFIELD_BITS); + const mi_bfield_t mask = ((mi_bfield_t)1)<bfields[i], idx); +} + +static bool mi_bitmap_chunk_try_xset8(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t byte_idx ) { + mi_assert_internal(byte_idx*8 < MI_BITMAP_CHUNK_BITS); + const size_t i = byte_idx / MI_BFIELD_SIZE; + const size_t ibyte_idx = byte_idx % MI_BFIELD_SIZE; + return mi_bfield_atomic_try_xset8( set, &chunk->bfields[i], ibyte_idx); +} + +// Set/clear a sequence of `n` bits within a chunk. 
Returns true if all bits transitioned from 0 to 1 (or 1 to 0) +static bool mi_bitmap_chunk_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n, bool* palready_xset) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_transition = true; + bool all_already_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask, &already_xset); + all_already_xset = all_already_xset && already_xset; + // next field + field++; + idx = 0; + n -= m; + } + *palready_xset = all_already_xset; + return all_transition; +} + +// Check if a sequence of `n` bits within a chunk are all set/cleared. +static bool mi_bitmap_chunk_is_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + bool all_xset = true; + size_t idx = cidx % MI_BFIELD_BITS; + size_t field = cidx / MI_BFIELD_BITS; + while (n > 0) { + size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(idx + m <= MI_BFIELD_BITS); + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask); + // next field + field++; + idx = 0; + n -= m; + } + return all_xset; +} + +// Try to atomically set/clear a sequence of `n` bits within a chunk. Returns true if all bits transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving all bit fields as is. +static bool mi_bitmap_chunk_try_xsetN(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t cidx, size_t n) { + mi_assert_internal(cidx + n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(n>0); + if (n==0) return true; + size_t start_idx = cidx % MI_BFIELD_BITS; + size_t start_field = cidx / MI_BFIELD_BITS; + size_t end_field = MI_BITMAP_CHUNK_FIELDS; + size_t mask_mid = 0; + size_t mask_end = 0; + + // first field + size_t field = start_field; + size_t m = MI_BFIELD_BITS - start_idx; // m is the bits to xset in this field + if (m > n) { m = n; } + mi_assert_internal(start_idx + m <= MI_BFIELD_BITS); + mi_assert_internal(start_field < MI_BITMAP_CHUNK_FIELDS); + const size_t mask_start = (m == MI_BFIELD_BITS ? ~MI_ZU(0) : ((MI_ZU(1)<bfields[field], mask_start)) return false; + + // done? + n -= m; + if (n==0) return true; + + // continue with mid fields and last field: if these fail we need to recover by unsetting previous fields + + // mid fields + while (n >= MI_BFIELD_BITS) { + field++; + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + mask_mid = ~MI_ZU(0); + if (!mi_bfield_atomic_try_xset_mask(set, &chunk->bfields[field], mask_mid)) goto restore; + n -= MI_BFIELD_BITS; + } + + // last field + if (n > 0) { + mi_assert_internal(n < MI_BFIELD_BITS); + field++; + mi_assert_internal(field < MI_BITMAP_CHUNK_FIELDS); + end_field = field; + mask_end = (MI_ZU(1)<bfields[field], mask_end)) goto restore; + } + + return true; + +restore: + // field is on the field that failed to set atomically; we need to restore all previous fields + mi_assert_internal(field > start_field); + while( field > start_field) { + field--; + const size_t mask = (field == start_field ? mask_start : (field == end_field ? 
mask_end : mask_mid)); + bool already_xset; + mi_bfield_atomic_xset_mask(!set, &chunk->bfields[field], mask, &already_xset); + } + return false; +} + + +// find least 1-bit in a chunk and try unset it atomically +// set `*pidx` to thi bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_clear(mi_bitmap_chunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while(true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + if (_mm256_testz_si256(vec,vec)) return false; // vec == 0 ? + const __m256i vcmp = _mm256_cmpeq_epi64(vec, _mm256_setzero_si256()); // (elem64 == 0 ? -1 : 0) + const uint32_t mask = ~_mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte (so each 8 bits in the mask will be all 1 or all 0) + mi_assert_internal(mask != 0); + const size_t chunk_idx = _tzcnt_u32(mask) / 8; // tzcnt == 0, 8, 16, or 24 + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + size_t cidx; + if (mi_bfield_find_least_bit(chunk->bfields[chunk_idx],&cidx)) { // find the bit that is set + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[chunk_idx], cidx)) { // unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(chunk->bfields[i],&idx)) { // find least 1-bit + if mi_likely(mi_bfield_atomic_try_xset(MI_BIT_CLEAR,&chunk->bfields[i],idx)) { // try unset atomically + *pidx = (i*MI_BFIELD_BITS + idx); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + } + } + return false; + #endif +} + + +// find least byte in a chunk with all bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success. +// todo: try neon version +static inline bool mi_bitmap_chunk_find_and_try_clear8(mi_bitmap_chunk_t* chunk, size_t* pidx) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + while(true) { + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + const __m256i vcmp = _mm256_cmpeq_epi8(vec, _mm256_set1_epi64x(~0)); // (byte == ~0 ? 
-1 : 0) + const uint32_t mask = _mm256_movemask_epi8(vcmp); // mask of most significant bit of each byte + if (mask == 0) return false; + const size_t i = _tzcnt_u32(mask); + mi_assert_internal(8*i < MI_BITMAP_CHUNK_BITS); + const size_t chunk_idx = i / MI_BFIELD_SIZE; + const size_t byte_idx = i % MI_BFIELD_SIZE; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[chunk_idx],byte_idx)) { // try to unset atomically + *pidx = (chunk_idx*MI_BFIELD_BITS) + (byte_idx*8); + mi_assert_internal(*pidx < MI_BITMAP_CHUNK_BITS); + return true; + } + // try again + } + #else + size_t idx; + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + const mi_bfield_t x = chunk->bfields[i]; + // has_set8 has low bit in each byte set if the byte in x == 0xFF + const mi_bfield_t has_set8 = ((~x - MI_BFIELD_LO_BIT8) & // high bit set if byte in x is 0xFF or < 0x7F + (x & MI_BFIELD_HI_BIT8)) // high bit set if byte in x is >= 0x80 + >> 7; // shift high bit to low bit + size_t idx; + if mi_unlikely(mi_bfield_find_least_bit(has_set8,&idx)) { // find least 1-bit + mi_assert_internal(idx <= (MI_BFIELD_BITS - 8)); + mi_assert_internal((idx%8)==0); + const size_t byte_idx = idx/8; + if mi_likely(mi_bfield_atomic_try_xset8(MI_BIT_CLEAR,&chunk->bfields[i],byte_idx)) { // unset the byte atomically + *pidx = (i*MI_BFIELD_BITS) + idx; + mi_assert_internal(*pidx + 8 <= MI_BITMAP_CHUNK_BITS); + return true; + } + // else continue + } + } + return false; + #endif +} + + +// find a sequence of `n` bits in a chunk with all `n` bits set, and try unset it atomically +// set `*pidx` to its bit index (0 <= *pidx <= MI_BITMAP_CHUNK_BITS - n) on success. +// todo: try avx2 and neon version +// todo: allow spanning across bfield boundaries? +static inline bool mi_bitmap_chunk_find_and_try_clearN(mi_bitmap_chunk_t* chunk, size_t n, size_t* pidx) { + if (n == 0 || n > MI_BFIELD_BITS) return false; // TODO: allow larger? + const mi_bfield_t mask = (n==MI_BFIELD_BITS ? ~((mi_bfield_t)0) : (((mi_bfield_t)1) << n)-1); + for(int i = 0; i < MI_BITMAP_CHUNK_FIELDS; i++) { + mi_bfield_t b = chunk->bfields[i]; + size_t bshift = 0; + size_t idx; + while (mi_bfield_find_least_bit(b, &idx)) { // find least 1-bit + b >>= idx; + bshift += idx; + if (bshift + n >= MI_BFIELD_BITS) break; + + if ((b&mask) == mask) { // found a match + mi_assert_internal( ((mask << bshift) >> bshift) == mask ); + if mi_likely(mi_bfield_atomic_try_xset_mask(MI_BIT_CLEAR,&chunk->bfields[i],mask<bfields[i] >> bshift); + } + } + else { + // advance + const size_t ones = mi_bfield_ctz(~b); // skip all ones (since it didn't fit the mask) + mi_assert_internal(ones>0); + bshift += ones; + b >>= ones; + } + } + } + return false; +} + + +// are all bits in a bitmap chunk set? +static bool mi_bitmap_chunk_all_are_set(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_test_all_ones(vec); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x & chunk->bfields[i]; + } + return (~x == 0); + #endif +} + +// are all bits in a bitmap chunk clear? 
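// (used by the find-and-clear searches further below to lazily clear a chunk's bit in the
//  bitmap's `any_set` summary once the chunk is observed to be empty)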
+static bool mi_bitmap_chunk_all_are_clear(mi_bitmap_chunk_t* chunk) { + #if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256) + const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields); + return _mm256_testz_si256( vec, vec ); + #else + // written like this for vectorization + mi_bfield_t x = chunk->bfields[0]; + for(int i = 1; i < MI_BITMAP_CHUNK_FIELDS; i++) { + x = x | chunk->bfields[i]; + } + return (x == 0); + #endif +} + +/* -------------------------------------------------------------------------------- + bitmap +-------------------------------------------------------------------------------- */ +// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true +void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero) { + if (!already_zero) { + _mi_memzero_aligned(bitmap, sizeof(*bitmap)); + } +} + +// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread. +void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(idx + n<=MI_BITMAP_MAX_BITS); + + // first chunk + size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + size_t m = MI_BITMAP_CHUNK_BITS - cidx; + if (m > n) { m = n; } + bool already_xset; + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, m, &already_xset); + + // n can be large so use memset for efficiency for all in-between chunks + chunk_idx++; + n -= m; + const size_t mid_chunks = n / MI_BITMAP_CHUNK_BITS; + if (mid_chunks > 0) { + _mi_memset(&bitmap->chunks[chunk_idx], (set ? ~0 : 0), MI_BITMAP_CHUNK_BITS/8); + chunk_idx += mid_chunks; + n -= mid_chunks * MI_BITMAP_CHUNK_BITS; + } + + // last chunk + if (n > 0) { + mi_assert_internal(n < MI_BITMAP_CHUNK_BITS); + mi_assert_internal(chunk_idx < MI_BITMAP_CHUNK_FIELDS); + mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], 0, n, &already_xset); + } +} + + +// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0), +// and false otherwise leaving the bitmask as is. +bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + return mi_bitmap_chunk_try_xset( set, &bitmap->chunks[chunk_idx], cidx); +} + +// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0) +// and false otherwise leaving the bitmask as is. +bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx) { + mi_assert_internal(idx < MI_BITMAP_MAX_BITS); + mi_assert_internal(idx%8 == 0); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t byte_idx = (idx % MI_BITMAP_CHUNK_BITS)/8; + return mi_bitmap_chunk_try_xset8( set, &bitmap->chunks[chunk_idx],byte_idx); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's) +// and false otherwise leaving the bitmask as is. +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! 
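// Unlike mi_bitmap_xsetN below, the `try` variant either transitions the whole run atomically
// or leaves the bitmap unchanged (a partially updated chunk is rolled back). A hypothetical
// caller (not part of this patch) could schedule a purge range only if it was not pending yet:
//   if (mi_bitmap_try_xsetN(MI_BIT_SET, &arena->blocks_purge, block_idx, blocks)) { /* newly scheduled */ }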
+bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + if (n==1) { return mi_bitmap_try_xset(set,bitmap,idx); } + if (n==8) { return mi_bitmap_try_xset8(set,bitmap,idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_try_xsetN( set, &bitmap->chunks[chunk_idx], cidx, n); +} + +// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's). +// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)! +bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + bool local_already_xset; + if (already_xset==NULL) { already_xset = &local_already_xset; } + // if (n==1) { return mi_bitmap_xset(set, bitmap, idx); } + // if (n==8) { return mi_bitmap_xset8(set, bitmap, idx); } + + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n, already_xset); +} + +// Is a sequence of n bits already all set/cleared? +bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n) { + mi_assert_internal(n>0); + mi_assert_internal(n<=MI_BITMAP_CHUNK_BITS); + mi_assert_internal(idx + n <= MI_BITMAP_MAX_BITS); + const size_t chunk_idx = idx / MI_BITMAP_CHUNK_BITS; + const size_t cidx = idx % MI_BITMAP_CHUNK_BITS; + mi_assert_internal(cidx + n <= MI_BITMAP_CHUNK_BITS); // don't cross chunks (for now) + if (cidx + n > MI_BITMAP_CHUNK_BITS) { n = MI_BITMAP_CHUNK_BITS - cidx; } // paranoia + return mi_bitmap_chunk_is_xsetN(set, &bitmap->chunks[chunk_idx], cidx, n); +} + + +#define mi_bitmap_forall_set_chunks(bitmap,start,decl_chunk_idx) \ + { size_t _set_idx; \ + size_t _start = start % MI_BFIELD_BITS; \ + mi_bfield_t _any_set = mi_bfield_rotate_right(bitmap->any_set, _start); \ + while (mi_bfield_find_least_bit(_any_set,&_set_idx)) { \ + decl_chunk_idx = (_set_idx + _start) % MI_BFIELD_BITS; + +#define mi_bitmap_forall_set_chunks_end() \ + _start += _set_idx+1; /* so chunk_idx stays valid */ \ + _any_set >>= _set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */ \ + _any_set >>= 1; \ + } \ + } + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). 
+bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx < MI_BITMAP_MAX_BITS); + return true; + } + else { + // we may find that all are unset only on a second iteration but that is ok as + // _any_set is a conservative approximation. + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } + } + mi_bitmap_forall_set_chunks_end(); + return false; +} + + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. +bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ) { + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clear8(&bitmap->chunks[chunk_idx],&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-8); + mi_assert_internal((*pidx % 8) == 0); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } + } + mi_bitmap_forall_set_chunks_end(); + return false; +} + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ) { + // TODO: allow at least MI_BITMAP_CHUNK_BITS and probably larger + // TODO: allow spanning across chunk boundaries + if (n == 0 || n > MI_BFIELD_BITS) return false; + mi_bitmap_forall_set_chunks(bitmap,start,size_t chunk_idx) + { + size_t cidx; + if mi_likely(mi_bitmap_chunk_find_and_try_clearN(&bitmap->chunks[chunk_idx],n,&cidx)) { + *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx; + mi_assert_internal(*pidx <= MI_BITMAP_MAX_BITS-n); + return true; + } + else { + if (mi_bitmap_chunk_all_are_clear(&bitmap->chunks[chunk_idx])) { + mi_bfield_atomic_xset(MI_BIT_CLEAR,&bitmap->any_set,chunk_idx); + } + } + } + mi_bitmap_forall_set_chunks_end(); + return false; +} diff --git a/src/xbitmap.h b/src/xbitmap.h new file mode 100644 index 00000000..869db2a2 --- /dev/null +++ b/src/xbitmap.h @@ -0,0 +1,94 @@ +/* ---------------------------------------------------------------------------- +Copyright (c) 2019-2023 Microsoft Research, Daan Leijen +This is free software; you can redistribute it and/or modify it under the +terms of the MIT license. A copy of the license can be found in the file +"LICENSE" at the root of this distribution. 
+-----------------------------------------------------------------------------*/
+
+/* ----------------------------------------------------------------------------
+Concurrent bitmap that can set/reset sequences of bits atomically
+---------------------------------------------------------------------------- */
+#pragma once
+#ifndef MI_XBITMAP_H
+#define MI_XBITMAP_H
+
+/* --------------------------------------------------------------------------------
+  Definitions
+-------------------------------------------------------------------------------- */
+
+typedef size_t mi_bfield_t;
+
+#define MI_BFIELD_BITS_SHIFT          (MI_SIZE_SHIFT+3)
+#define MI_BFIELD_BITS                (1 << MI_BFIELD_BITS_SHIFT)
+#define MI_BFIELD_SIZE                (MI_BFIELD_BITS/8)
+#define MI_BFIELD_BITS_MOD_MASK       (MI_BFIELD_BITS - 1)
+#define MI_BFIELD_LO_BIT8             ((~((mi_bfield_t)0))/0xFF)    // 0x01010101 ..
+#define MI_BFIELD_HI_BIT8             (MI_BFIELD_LO_BIT8 << 7)      // 0x80808080 ..
+
+#define MI_BITMAP_CHUNK_BITS_SHIFT    (8)   // 2^8 = 256 bits per chunk
+#define MI_BITMAP_CHUNK_BITS          (1 << MI_BITMAP_CHUNK_BITS_SHIFT)
+#define MI_BITMAP_CHUNK_FIELDS        (MI_BITMAP_CHUNK_BITS / MI_BFIELD_BITS)
+#define MI_BITMAP_CHUNK_BITS_MOD_MASK (MI_BITMAP_CHUNK_BITS - 1)
+
+typedef mi_decl_align(32) struct mi_bitmap_chunk_s {
+  _Atomic(mi_bfield_t) bfields[MI_BITMAP_CHUNK_FIELDS];
+} mi_bitmap_chunk_t;
+
+
+typedef mi_decl_align(32) struct mi_bitmap_s {
+  mi_bitmap_chunk_t    chunks[MI_BFIELD_BITS];
+  _Atomic(mi_bfield_t) any_set;
+} mi_bitmap_t;
+
+#define MI_BITMAP_MAX_BITS  (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS)   // 16k bits on 64bit, 8k bits on 32bit
+
+/* --------------------------------------------------------------------------------
+  Bitmap
+-------------------------------------------------------------------------------- */
+
+typedef bool mi_bit_t;
+#define MI_BIT_SET    (true)
+#define MI_BIT_CLEAR  (false)
+
+// initialize a bitmap to all unset; avoid a mem_zero if `already_zero` is true
+void mi_bitmap_init(mi_bitmap_t* bitmap, bool already_zero);
+
+// Set/clear a sequence of `n` bits in the bitmap (and can cross chunks). Not atomic so only use if local to a thread.
+void mi_bitmap_unsafe_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
+
+// Set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from all 0's to 1's (or all 1's to 0's).
+// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
+// If `already_xset` is not NULL, it is set to true if all the bits were already all set/cleared.
+bool mi_bitmap_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n, bool* already_xset);
+
+// Is a sequence of n bits already all set/cleared?
+bool mi_bitmap_is_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n);
+
+// Try to set/clear a bit in the bitmap; returns `true` if atomically transitioned from 0 to 1 (or 1 to 0)
+// and false otherwise leaving the bitmask as is.
+mi_decl_nodiscard bool mi_bitmap_try_xset(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx);
+
+// Try to set/clear a byte in the bitmap; returns `true` if atomically transitioned from 0 to 0xFF (or 0xFF to 0)
+// and false otherwise leaving the bitmask as is.
+mi_decl_nodiscard bool mi_bitmap_try_xset8(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx);
+
+// Try to set/clear a sequence of `n` bits in the bitmap; returns `true` if atomically transitioned from 0's to 1's (or 1's to 0's)
+// and false otherwise leaving the bitmask as is.
+// `n` cannot cross chunk boundaries (and `n <= MI_BITMAP_CHUNK_BITS`)!
+mi_decl_nodiscard bool mi_bitmap_try_xsetN(mi_bit_t set, mi_bitmap_t* bitmap, size_t idx, size_t n); + +// Find a set bit in a bitmap and atomically unset it. Returns true on success, +// and in that case sets the index: `0 <= *pidx < MI_BITMAP_MAX_BITS`. +// The low `MI_BFIELD_BITS` of start are used to set the start point of the search +// (to reduce thread contention). +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear(mi_bitmap_t* bitmap, size_t* pidx, size_t start); + +// Find a byte in the bitmap with all bits set (0xFF) and atomically unset it to zero. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-8`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t start, size_t* pidx ); + +// Find a sequence of `n` bits in the bitmap with all bits set, and atomically unset all. +// Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`. +mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t start, size_t n, size_t* pidx ); + +#endif // MI_XBITMAP_H diff --git a/test/main-override-static.c b/test/main-override-static.c index 4ad76d6a..a8e30f69 100644 --- a/test/main-override-static.c +++ b/test/main-override-static.c @@ -7,6 +7,8 @@ #include #include // redefines malloc etc. +static void mi_bins(void); + static void double_free1(); static void double_free2(); static void corrupt_free(); @@ -33,7 +35,7 @@ int main() { // corrupt_free(); // block_overflow1(); // block_overflow2(); - test_canary_leak(); + // test_canary_leak(); // test_aslr(); // invalid_free(); // test_reserved(); @@ -41,6 +43,9 @@ int main() { // test_heap_walk(); // alloc_huge(); + mi_bins(); + + void* p1 = malloc(78); void* p2 = malloc(24); free(p1); @@ -73,7 +78,7 @@ int main() { static void invalid_free() { free((void*)0xBADBEEF); - realloc((void*)0xBADBEEF,10); + realloc((void*)0xBADBEEF, 10); } static void block_overflow1() { @@ -171,7 +176,7 @@ static void test_process_info(void) { size_t peak_commit = 0; size_t page_faults = 0; for (int i = 0; i < 100000; i++) { - void* p = calloc(100,10); + void* p = calloc(100, 10); free(p); } mi_process_info(&elapsed, &user_msecs, &system_msecs, ¤t_rss, &peak_rss, ¤t_commit, &peak_commit, &page_faults); @@ -229,8 +234,8 @@ static void test_heap_walk(void) { } static void test_canary_leak(void) { - char* p = mi_mallocn_tp(char,23); - for(int i = 0; i < 23; i++) { + char* p = mi_mallocn_tp(char, 23); + for (int i = 0; i < 23; i++) { p[i] = '0'+i; } puts(p); @@ -248,15 +253,15 @@ static void test_canary_leak(void) { static void test_large_pages(void) { mi_memid_t memid; - #if 0 +#if 0 size_t pages_reserved; size_t page_size; uint8_t* p = (uint8_t*)_mi_os_alloc_huge_os_pages(1, -1, 30000, &pages_reserved, &page_size, &memid); const size_t req_size = pages_reserved * page_size; - #else +#else const size_t req_size = 64*MI_MiB; - uint8_t* p = (uint8_t*)_mi_os_alloc(req_size,&memid,NULL); - #endif + uint8_t* p = (uint8_t*)_mi_os_alloc(req_size, &memid, NULL); +#endif p[0] = 1; @@ -276,63 +281,16 @@ static void test_large_pages(void) { // bin size experiments // ------------------------------ -#if 0 +#if 1 #include #include +#include -#define MI_INTPTR_SIZE 8 #define MI_LARGE_WSIZE_MAX (4*1024*1024 / MI_INTPTR_SIZE) #define MI_BIN_HUGE 100 //#define MI_ALIGN2W -// Bit scan reverse: return the index of the highest bit. 
-static inline uint8_t mi_bsr32(uint32_t x); - -#if defined(_MSC_VER) -#include -#include -static inline uint8_t mi_bsr32(uint32_t x) { - uint32_t idx; - _BitScanReverse((DWORD*)&idx, x); - return idx; -} -#elif defined(__GNUC__) || defined(__clang__) -static inline uint8_t mi_bsr32(uint32_t x) { - return (31 - __builtin_clz(x)); -} -#else -static inline uint8_t mi_bsr32(uint32_t x) { - // de Bruijn multiplication, see - static const uint8_t debruijn[32] = { - 31, 0, 22, 1, 28, 23, 18, 2, 29, 26, 24, 10, 19, 7, 3, 12, - 30, 21, 27, 17, 25, 9, 6, 11, 20, 16, 8, 5, 15, 4, 14, 13, - }; - x |= x >> 1; - x |= x >> 2; - x |= x >> 4; - x |= x >> 8; - x |= x >> 16; - x++; - return debruijn[(x*0x076be629) >> 27]; -} -#endif - -/* -// Bit scan reverse: return the index of the highest bit. -uint8_t _mi_bsr(uintptr_t x) { - if (x == 0) return 0; - #if MI_INTPTR_SIZE==8 - uint32_t hi = (x >> 32); - return (hi == 0 ? mi_bsr32((uint32_t)x) : 32 + mi_bsr32(hi)); - #elif MI_INTPTR_SIZE==4 - return mi_bsr32(x); - #else - # error "define bsr for non-32 or 64-bit platforms" - #endif -} -*/ - static inline size_t _mi_wsize_from_size(size_t size) { return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t); @@ -370,7 +328,9 @@ extern inline uint8_t _mi_bin8(size_t size) { #endif wsize--; // find the highest bit - uint8_t b = mi_bsr32((uint32_t)wsize); + size_t idx; + mi_bsr(wsize, &idx); + uint8_t b = (uint8_t)idx; // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). // - adjust with 3 because we use do not round the first 8 sizes // which each get an exact bin @@ -402,44 +362,79 @@ static inline uint8_t _mi_bin4(size_t size) { bin = MI_BIN_HUGE; } else { - uint8_t b = mi_bsr32((uint32_t)wsize); + size_t idx; + mi_bsr(wsize, &idx); + uint8_t b = (uint8_t)idx; bin = ((b << 1) + (uint8_t)((wsize >> (b - 1)) & 0x01)) + 3; } return bin; } -static size_t _mi_binx4(size_t bsize) { - if (bsize==0) return 0; - uint8_t b = mi_bsr32((uint32_t)bsize); - if (b <= 1) return bsize; - size_t bin = ((b << 1) | (bsize >> (b - 1))&0x01); +static size_t _mi_binx4(size_t wsize) { + size_t bin; + if (wsize <= 1) { + bin = 1; + } + else if (wsize <= 8) { + // bin = (wsize+1)&~1; // round to double word sizes + bin = (uint8_t)wsize; + } + else { + size_t idx; + mi_bsr(wsize, &idx); + uint8_t b = (uint8_t)idx; + if (b <= 1) return wsize; + bin = ((b << 1) | (wsize >> (b - 1))&0x01) + 3; + } return bin; } static size_t _mi_binx8(size_t bsize) { if (bsize<=1) return bsize; - uint8_t b = mi_bsr32((uint32_t)bsize); + size_t idx; + mi_bsr(bsize, &idx); + uint8_t b = (uint8_t)idx; if (b <= 2) return bsize; size_t bin = ((b << 2) | (bsize >> (b - 2))&0x03) - 5; return bin; } + +static inline size_t mi_bin(size_t wsize) { + uint8_t bin; + if (wsize <= 1) { + bin = 1; + } + else if (wsize <= 8) { + // bin = (wsize+1)&~1; // round to double word sizes + bin = (uint8_t)wsize; + } + else { + wsize--; + assert(wsize>0); + // find the highest bit + uint8_t b = (uint8_t)(MI_SIZE_BITS - 1 - mi_clz(wsize)); + + // and use the top 3 bits to determine the bin (~12.5% worst internal fragmentation). 
+    // - adjust with 3 because we do not round the first 8 sizes
+    //   which each get an exact bin
+    bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
+  }
+  return bin;
+}
+
+
 static void mi_bins(void) {
   //printf("  QNULL(1), /* 0 */ \\\n  ");
   size_t last_bin = 0;
-  size_t min_bsize = 0;
-  size_t last_bsize = 0;
-  for (size_t bsize = 1; bsize < 2*1024; bsize++) {
-    size_t size = bsize * 64 * 1024;
-    size_t bin = _mi_binx8(bsize);
+  for (size_t wsize = 1; wsize <= (4*1024*1024) / 8 + 1024; wsize++) {
+    size_t bin = mi_bin(wsize);
     if (bin != last_bin) {
-      printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_bsize, last_bsize, last_bin);
-      //printf("QNULL(%6zd), ", wsize);
-      //if (last_bin%8 == 0) printf("/* %i */ \\\n  ", last_bin);
+      //printf("min bsize: %6zd, max bsize: %6zd, bin: %6zd\n", min_wsize, last_wsize, last_bin);
+      printf("QNULL(%6zd), ", wsize-1);
+      if (last_bin%8 == 0) printf("/* %zu */ \\\n  ", last_bin);
       last_bin = bin;
-      min_bsize = bsize;
     }
-    last_bsize = bsize;
   }
 }
 #endif
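The `mi_bin` experiment above maps a request size in words to a size class by keeping the top three bits of `wsize-1`, so each power-of-two range is split into four bins while the first eight sizes get exact bins. The following standalone sketch is an illustration only (the `bin_of` helper is hypothetical; GCC/Clang builtins and a 64-bit `size_t` are assumed); it reproduces the same computation and prints a few resulting bin boundaries.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static uint8_t bin_of(size_t wsize) {
  if (wsize <= 1) return 1;
  if (wsize <= 8) return (uint8_t)wsize;               // the first 8 sizes get an exact bin
  wsize--;
  uint8_t b = (uint8_t)(63 - __builtin_clzll(wsize));  // index of the highest set bit
  return (uint8_t)((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03) - 3);
}

int main(void) {
  printf("%d %d\n", bin_of(57), bin_of(64));   // both 20: wsizes 57..64 share one bin
  printf("%d %d\n", bin_of(65), bin_of(80));   // both 21: wsizes 65..80 share the next bin
  printf("%d\n", bin_of(100));                 // 23
  return 0;
}

Since each bin spans a quarter of its power-of-two range, rounding a request up to its bin wastes only a bounded fraction of the allocated block.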
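The non-AVX2 path of `mi_bitmap_chunk_find_and_try_clear8` earlier in the patch locates a fully set byte with a SWAR (SIMD within a register) trick instead of a byte-by-byte loop. A minimal sketch of that trick, assuming a 64-bit field and the GCC/Clang `__builtin_ctzll` builtin (the `lowest_full_byte` helper is hypothetical):

#include <stdint.h>
#include <stdio.h>

#define LO_BIT8  ((~(uint64_t)0) / 0xFF)   // 0x0101010101010101
#define HI_BIT8  (LO_BIT8 << 7)            // 0x8080808080808080

// return the index (0..7) of the lowest byte equal to 0xFF, or -1 if there is none
static int lowest_full_byte(uint64_t x) {
  // a byte of `~x` is zero exactly when the corresponding byte of `x` is 0xFF;
  // the classic has-zero-byte trick then marks the high bit of such bytes
  uint64_t marks = ((~x - LO_BIT8) & (x & HI_BIT8)) >> 7;   // low bit set per 0xFF byte
  if (marks == 0) return -1;
  return __builtin_ctzll(marks) / 8;   // byte index of the lowest mark
}

int main(void) {
  printf("%d\n", lowest_full_byte(0x00FF00000000FF00ULL));  // prints 1 (byte 1 is 0xFF)
  printf("%d\n", lowest_full_byte(0));                      // prints -1
  return 0;
}

Borrow propagation in `~x - LO_BIT8` can mark additional bytes above the lowest 0xFF byte but never below it, which is harmless here: the chunk code takes the least set bit and then relies on `mi_bfield_atomic_try_xset8`, which only reports success when the whole byte actually transitions.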
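`mi_bitmap_chunk_find_and_try_clearN` above scans one bfield at a time for `n` consecutive set bits by alternating two steps: skip zeros with a count-trailing-zeros, then either match an `n`-bit mask or skip the whole run of ones when it is too short. A standalone sketch of that scan over a single 64-bit word, without the atomics (the `find_ones_run` helper is hypothetical):

#include <stdint.h>
#include <stdio.h>

// return the lowest bit position of a run of `n` consecutive 1-bits in `b`, or -1
static int find_ones_run(uint64_t b, unsigned n) {
  if (n == 0 || n > 64) return -1;
  const uint64_t mask = (n == 64 ? ~(uint64_t)0 : (((uint64_t)1 << n) - 1));
  unsigned shift = 0;
  while (b != 0) {
    unsigned idx = (unsigned)__builtin_ctzll(b);   // skip zeros up to the next 1-bit
    b >>= idx;
    shift += idx;
    if (shift + n > 64) return -1;                 // a run of n can no longer fit
    if ((b & mask) == mask) return (int)shift;     // found n consecutive ones
    unsigned ones = (unsigned)__builtin_ctzll(~b); // run is too short: skip it entirely
    b >>= ones;
    shift += ones;
  }
  return -1;
}

int main(void) {
  printf("%d\n", find_ones_run(0x00000000000F0F00ULL, 4));  // prints 8
  printf("%d\n", find_ones_run(0x0F00000000000000ULL, 5));  // prints -1
  return 0;
}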
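`mi_bitmap_unsafe_xsetN` above splits a range that may cross chunk boundaries into a partial first chunk, zero or more full middle chunks that are filled with a memset, and a partial last chunk. A small worked example of that split with made-up numbers (256-bit chunks assumed):

#include <stddef.h>
#include <stdio.h>

#define CHUNK_BITS 256

int main(void) {
  size_t idx = 300, n = 900;                // example range [300, 1200)
  size_t chunk_idx = idx / CHUNK_BITS;      // 1
  size_t cidx      = idx % CHUNK_BITS;      // 44
  size_t m = CHUNK_BITS - cidx;             // 212 bits left in the first chunk
  if (m > n) m = n;
  printf("first:  chunk %zu, bits [%zu,%zu)\n", chunk_idx, cidx, cidx + m);
  chunk_idx++; n -= m;                      // 688 bits remaining
  size_t mid = n / CHUNK_BITS;              // 2 full chunks, handled with one memset
  printf("middle: %zu full chunk(s) starting at chunk %zu\n", mid, chunk_idx);
  chunk_idx += mid; n -= mid * CHUNK_BITS;  // 176 bits remaining
  if (n > 0) printf("last:   chunk %zu, bits [0,%zu)\n", chunk_idx, n);
  return 0;
}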
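The `mi_bitmap_forall_set_chunks` macro above starts scanning the `any_set` summary word near a caller-supplied `start` position by rotating the word right, and maps each bit it finds back to a chunk index so that different threads tend to probe different chunks first. A standalone sketch of that rotate-and-scan order (hypothetical helpers, GCC/Clang `__builtin_ctzll` assumed):

#include <stdint.h>
#include <stdio.h>

static uint64_t rotr64(uint64_t x, unsigned r) {
  r &= 63;
  return (r == 0 ? x : (x >> r) | (x << (64 - r)));
}

// visit the set bits of `any_set` in round-robin order, beginning near `start`
static void forall_set_bits_from(uint64_t any_set, unsigned start) {
  unsigned base = start % 64;
  uint64_t rot = rotr64(any_set, base);
  while (rot != 0) {
    unsigned i = (unsigned)__builtin_ctzll(rot);  // next set bit in rotated order
    unsigned chunk = (i + base) % 64;             // map back to the original position
    printf("visit chunk %u\n", chunk);
    base += i + 1;                                // keep the mapping valid
    rot >>= i; rot >>= 1;                         // two shifts avoid UB when i == 63
  }
}

int main(void) {
  // bits 3, 10 and 62 set; starting near bit 8 visits 10, then 62, then wraps to 3
  forall_set_bits_from((1ULL << 3) | (1ULL << 10) | (1ULL << 62), 8);
  return 0;
}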
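The `try_xset` family declared in `xbitmap.h` succeeds only when the bit (or byte, or run of bits) actually makes the 0 to 1 (or 1 to 0) transition, and otherwise leaves the bitmap untouched. For a single bit that contract amounts to a compare-exchange loop; the sketch below uses C11 atomics and a hypothetical `try_set_bit` helper, not the patch's `mi_bfield_atomic_try_xset`:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

// set bit `bit` only if it is currently 0; report whether we made the transition
static bool try_set_bit(_Atomic(uint64_t)* field, unsigned bit) {
  const uint64_t mask = (uint64_t)1 << bit;
  uint64_t old = atomic_load_explicit(field, memory_order_relaxed);
  do {
    if ((old & mask) != 0) return false;   // already set: leave the field as is
  } while (!atomic_compare_exchange_weak_explicit(field, &old, old | mask,
                                                  memory_order_acq_rel,
                                                  memory_order_relaxed));
  return true;                             // we performed the 0 -> 1 transition
}

int main(void) {
  _Atomic(uint64_t) field = 0;
  bool first  = try_set_bit(&field, 5);    // true: transitions 0 -> 1
  bool second = try_set_bit(&field, 5);    // false: the bit is already set
  printf("%d %d\n", first, second);        // prints "1 0"
  return 0;
}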