merge from dev3

Daan 2025-03-06 21:06:46 -08:00
commit 8edce30c17
23 changed files with 357 additions and 262 deletions

View file

@ -14,8 +14,7 @@ option(MI_XMALLOC "Enable abort() call on memory allocation failure by
option(MI_SHOW_ERRORS "Show error and warning messages by default (only enabled by default in DEBUG mode)" OFF)
option(MI_GUARDED "Build with guard pages behind certain object allocations (implies MI_NO_PADDING=ON)" OFF)
option(MI_USE_CXX "Use the C++ compiler to compile the library (instead of the C compiler)" OFF)
option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" ON)
option(MI_OPT_ARCH "Only for optimized builds: turn on architecture specific optimizations (for x64: '-march=haswell;-mavx2' (2013), for arm64: '-march=armv8.1-a' (2016))" OFF)
option(MI_OPT_SIMD "Use SIMD instructions (requires MI_OPT_ARCH to be enabled)" OFF)
option(MI_SEE_ASM "Generate assembly files" OFF)
option(MI_OSX_INTERPOSE "Use interpose to override standard malloc on macOS" ON)
@ -125,9 +124,44 @@ if("${CMAKE_BINARY_DIR}" MATCHES ".*(S|s)ecure$")
set(MI_SECURE "ON")
endif()
# Determine architecture
set(MI_OPT_ARCH_FLAGS "")
set(MI_ARCH "unknown")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|i[3456]86)$" OR CMAKE_GENERATOR_PLATFORM MATCHES "^(x86|Win32)$")
set(MI_ARCH "x86")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64" OR "x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) # must be before arm64
set(MI_ARCH "x64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv[89].?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
set(MI_ARCH "arm64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567]|ARM)$")
set(MI_ARCH "arm32")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$")
if(CMAKE_SIZEOF_VOID_P==4)
set(MI_ARCH "riscv32")
else()
set(MI_ARCH "riscv64")
endif()
else()
set(MI_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()
message(STATUS "Architecture: ${MI_ARCH}") # (${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_GENERATOR_PLATFORM}, ${CMAKE_GENERATOR})")
# negative overrides (mainly to support vcpkg features)
if(MI_NO_USE_CXX)
set(MI_USE_CXX "OFF")
endif()
if(MI_NO_OPT_ARCH)
set(MI_OPT_ARCH "OFF")
elseif(MI_ARCH STREQUAL "arm64")
set(MI_OPT_ARCH "ON") # enable armv8.1-a by default on arm64 unless MI_NO_OPT_ARCH is set
endif()
# -----------------------------------------------------------------------------
# Process options
# -----------------------------------------------------------------------------
if(CMAKE_C_COMPILER_ID STREQUAL "Clang" AND CMAKE_CXX_COMPILER_FRONTEND_VARIANT STREQUAL "MSVC")
set(MI_CLANG_CL "ON")
endif()
@ -147,32 +181,16 @@ if(CMAKE_C_COMPILER_ID MATCHES "Intel")
list(APPEND mi_cflags -Wall)
endif()
# negative overrides (mainly to support vcpkg features)
if(MI_NO_USE_CXX)
set(MI_USE_CXX "OFF")
endif()
if(MI_NO_OPT_ARCH)
set(MI_OPT_ARCH "OFF")
endif()
if(MSVC)
add_compile_options(/std:c++20)
add_compile_options(/std:c++20)
elseif(CMAKE_CXX_COMPILER_ID MATCHES "Clang" OR CMAKE_CXX_COMPILER_ID MATCHES "GNU")
add_compile_options(-std=c++20)
add_compile_options(-std=c++20)
endif()
if(CMAKE_C_COMPILER_ID MATCHES "MSVC|Intel")
set(MI_USE_CXX "ON")
endif()
if(CMAKE_BUILD_TYPE MATCHES "Release|RelWithDebInfo")
if (NOT MI_OPT_ARCH)
message(STATUS "Architecture specific optimizations are disabled (MI_OPT_ARCH=OFF)")
endif()
#else()
# set(MI_OPT_ARCH OFF)
endif()
if(MI_OVERRIDE)
message(STATUS "Override standard malloc (MI_OVERRIDE=ON)")
if(APPLE)
@ -397,28 +415,6 @@ if(MI_WIN_USE_FIXED_TLS)
list(APPEND mi_defines MI_WIN_USE_FIXED_TLS=1)
endif()
# Determine architecture
set(MI_OPT_ARCH_FLAGS "")
set(MI_ARCH "unknown")
if(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86|i[3456]86)$" OR CMAKE_GENERATOR_PLATFORM MATCHES "^(x86|Win32)$")
set(MI_ARCH "x86")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(x86_64|x64|amd64|AMD64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "x64" OR "x86_64" IN_LIST CMAKE_OSX_ARCHITECTURES) # must be before arm64
set(MI_ARCH "x64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(aarch64|arm64|armv[89].?|ARM64)$" OR CMAKE_GENERATOR_PLATFORM STREQUAL "ARM64" OR "arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
set(MI_ARCH "arm64")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(arm|armv[34567]|ARM)$")
set(MI_ARCH "arm32")
elseif(CMAKE_SYSTEM_PROCESSOR MATCHES "^(riscv|riscv32|riscv64)$")
if(CMAKE_SIZEOF_VOID_P==4)
set(MI_ARCH "riscv32")
else()
set(MI_ARCH "riscv64")
endif()
else()
set(MI_ARCH ${CMAKE_SYSTEM_PROCESSOR})
endif()
message(STATUS "Architecture: ${MI_ARCH}") # (${CMAKE_SYSTEM_PROCESSOR}, ${CMAKE_GENERATOR_PLATFORM}, ${CMAKE_GENERATOR})")
# Check /proc/cpuinfo for an SV39 MMU and limit the virtual address bits.
# (this will skip the aligned hinting in that case. Issue #939, #949)
if (EXISTS /proc/cpuinfo)
@ -475,7 +471,6 @@ endif()
if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM_NAME MATCHES "Haiku")
if(MI_OPT_ARCH)
if(APPLE AND CMAKE_C_COMPILER_ID STREQUAL "AppleClang" AND CMAKE_OSX_ARCHITECTURES) # to support multi-arch binaries (#999)
set(MI_OPT_ARCH_FLAGS "")
if("arm64" IN_LIST CMAKE_OSX_ARCHITECTURES)
list(APPEND MI_OPT_ARCH_FLAGS "-Xarch_arm64;-march=armv8.1-a")
endif()
@ -502,7 +497,7 @@ if (MSVC AND MSVC_VERSION GREATER_EQUAL 1914) # vs2017+
endif()
if(MINGW)
add_definitions(-D_WIN32_WINNT=0x601) # issue #976
add_definitions(-D_WIN32_WINNT=0x600) # issue #976
endif()
if(MI_OPT_ARCH_FLAGS)

View file

@ -32,7 +32,7 @@ jobs:
MSBuildConfiguration: Release
Release SIMD:
BuildType: release-simd
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON -DMI_WIN_USE_FIXED_TLS=ON
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_ARCH=ON -DMI_OPT_SIMD=ON -DMI_WIN_USE_FIXED_TLS=ON
MSBuildConfiguration: Release
Secure:
BuildType: secure
@ -97,7 +97,7 @@ jobs:
CC: clang
CXX: clang++
BuildType: release-simd-clang
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_ARCH=ON -DMI_OPT_SIMD=ON
Secure Clang:
CC: clang
CXX: clang++
@ -159,7 +159,7 @@ jobs:
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release
Release SIMD:
BuildType: release-simd
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_SIMD=ON
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_OPT_ARCH=ON -DMI_OPT_SIMD=ON
Secure:
BuildType: secure
cmakeExtraArgs: -DCMAKE_BUILD_TYPE=Release -DMI_SECURE=ON

View file

@ -1,6 +1,6 @@
set(mi_version_major 3)
set(mi_version_minor 0)
set(mi_version_patch 2)
set(mi_version_patch 3)
set(mi_version ${mi_version_major}.${mi_version_minor})
set(PACKAGE_VERSION ${mi_version})

View file

@ -18,6 +18,8 @@ vcpkg_check_features(OUT_FEATURE_OPTIONS FEATURE_OPTIONS
guarded MI_GUARDED
secure MI_SECURE
override MI_OVERRIDE
optarch MI_OPT_ARCH
optsimd MI_OPT_SIMD
xmalloc MI_XMALLOC
asm MI_SEE_ASM
)
@ -26,16 +28,14 @@ string(COMPARE EQUAL "${VCPKG_LIBRARY_LINKAGE}" "dynamic" MI_BUILD_SHARED)
vcpkg_cmake_configure(
SOURCE_PATH "${SOURCE_PATH}"
OPTIONS_RELEASE
-DMI_OPT_ARCH=ON
OPTIONS
-DMI_USE_CXX=ON
-DMI_BUILD_TESTS=OFF
-DMI_BUILD_OBJECT=ON
${FEATURE_OPTIONS}
-DMI_BUILD_STATIC=${MI_BUILD_STATIC}
-DMI_BUILD_SHARED=${MI_BUILD_SHARED}
-DMI_INSTALL_TOPLEVEL=ON
${FEATURE_OPTIONS}
)
vcpkg_cmake_install()

View file

@ -26,9 +26,18 @@
"secure": {
"description": "Use full security mitigations (like guard pages and randomization)"
},
"guarded": {
"description": "Use build that support guard pages after objects controlled with MIMALLOC_GUARDED_SAMPLE_RATE"
},
"xmalloc": {
"description": "If out-of-memory, call abort() instead of returning NULL"
},
"optarch": {
"description": "Use architecture specific optimizations (on x64: '-march=haswell;-mavx2', on arm64: '-march=armv8.1-a')"
},
"optsimd": {
"description": "Allow use of SIMD instructions (avx2 or neon) (requires 'optarch' to be enabled)"
},
"asm": {
"description": "Generate assembly files"
}

View file

@ -315,7 +315,7 @@
<CompileAs>CompileAsCpp</CompileAs>
<IntrinsicFunctions>true</IntrinsicFunctions>
<LanguageStandard>stdcpp20</LanguageStandard>
<EnableEnhancedInstructionSet>AdvancedVectorExtensions2</EnableEnhancedInstructionSet>
<EnableEnhancedInstructionSet>StreamingSIMDExtensions</EnableEnhancedInstructionSet>
<AdditionalOptions>/Zc:__cplusplus %(AdditionalOptions)</AdditionalOptions>
</ClCompile>
<Link>

View file

@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
#define MI_MALLOC_VERSION 302 // major + 2 digits minor
#define MI_MALLOC_VERSION 303 // major + 2 digits minor
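(As a small illustration, not part of the diff: the macro packs the release number as the major version followed by two more digits, 303 here for the v3.0.3 series, and `mi_version()` reports the same value at runtime.)

```c
#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  const int v = mi_version();                        // equals MI_MALLOC_VERSION, e.g. 303
  printf("mimalloc v%d.%02d\n", v / 100, v % 100);   // 303 -> "v3.03" (the v3.0.3 release)
  return 0;
}
```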
// ------------------------------------------------------
// Compiler specific attributes
@ -266,7 +266,7 @@ typedef bool (mi_cdecl mi_block_visit_fun)(const mi_heap_t* heap, const mi_heap_
mi_decl_export bool mi_heap_visit_blocks(const mi_heap_t* heap, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);
// Experimental
// Advanced
mi_decl_nodiscard mi_decl_export bool mi_is_in_heap_region(const void* p) mi_attr_noexcept;
mi_decl_nodiscard mi_decl_export bool mi_is_redirected(void) mi_attr_noexcept;
@ -279,7 +279,7 @@ mi_decl_export bool mi_manage_os_memory(void* start, size_t size, bool is_commi
mi_decl_export void mi_debug_show_arenas(void) mi_attr_noexcept;
mi_decl_export void mi_arenas_print(void) mi_attr_noexcept;
// Experimental: heaps associated with specific memory arena's
// Advanced: heaps associated with specific memory arena's
typedef void* mi_arena_id_t;
mi_decl_export void* mi_arena_area(mi_arena_id_t arena_id, size_t* size);
mi_decl_export int mi_reserve_huge_os_pages_at_ex(size_t pages, int numa_node, size_t timeout_msecs, bool exclusive, mi_arena_id_t* arena_id) mi_attr_noexcept;
@ -292,7 +292,7 @@ mi_decl_nodiscard mi_decl_export mi_heap_t* mi_heap_new_in_arena(mi_arena_id_t a
#endif
// Experimental: allow sub-processes whose memory areas stay separated (and no reclamation between them)
// Advanced: allow sub-processes whose memory areas stay separated (and no reclamation between them)
// Used for example for separate interpreters in one process.
typedef void* mi_subproc_id_t;
mi_decl_export mi_subproc_id_t mi_subproc_main(void);
@ -300,10 +300,15 @@ mi_decl_export mi_subproc_id_t mi_subproc_new(void);
mi_decl_export void mi_subproc_delete(mi_subproc_id_t subproc);
mi_decl_export void mi_subproc_add_current_thread(mi_subproc_id_t subproc); // this should be called right after a thread is created (and no allocation has taken place yet)
// Experimental: visit abandoned heap areas (that are not owned by a specific heap)
// Advanced: visit abandoned heap areas (that are not owned by a specific heap)
mi_decl_export bool mi_abandoned_visit_blocks(mi_subproc_id_t subproc_id, int heap_tag, bool visit_blocks, mi_block_visit_fun* visitor, void* arg);
// Experimental: set numa-affinity of a heap
mi_decl_export void mi_heap_set_numa_affinity(mi_heap_t* heap, int numa_node);
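(A minimal usage sketch of the new affinity call, not from this commit; it assumes a multi-node machine, and `-1` clears the preference.)

```c
#include <mimalloc.h>

void numa_example(void) {
  mi_heap_t* heap = mi_heap_new();
  mi_heap_set_numa_affinity(heap, 0);    // prefer arenas/pages on NUMA node 0
  void* p = mi_heap_malloc(heap, 128);   // fresh pages for this heap favor node 0
  mi_free(p);
  mi_heap_set_numa_affinity(heap, -1);   // back to "no preference"
  mi_heap_delete(heap);
}
```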
// Experimental: objects followed by a guard page.
// Setting the sample rate on a specific heap can be used to test parts of the program more
// specifically (in combination with `mi_heap_set_default`).
// A sample rate of 0 disables guarded objects, while 1 uses a guard page for every object.
// A seed of 0 uses a random start point. Only objects within the size bound are eligible for guard pages.
mi_decl_export void mi_heap_guarded_set_sample_rate(mi_heap_t* heap, size_t sample_rate, size_t seed);
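(For example, a guarded build (`MI_GUARDED=ON`) could exercise a suspect module like this; a sketch only, with hypothetical rate and seed values.)

```c
#include <mimalloc.h>

void run_with_guard_pages(void) {
  mi_heap_t* heap = mi_heap_new();
  mi_heap_guarded_set_sample_rate(heap, 1 /* guard every object */, 0 /* random seed */);
  mi_heap_t* prev = mi_heap_set_default(heap);
  // ... run the code under test: an overflow past a sampled object faults immediately ...
  mi_heap_set_default(prev);
  mi_heap_delete(heap);
}
```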
@ -324,13 +329,6 @@ mi_decl_export void mi_collect_reduce(size_t target_thread_owned) mi_attr_noexce
// experimental
//mi_decl_export void* mi_os_alloc(size_t size, bool commit, size_t* full_size);
//mi_decl_export void* mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, void** base, size_t* full_size);
//mi_decl_export void* mi_os_alloc_aligned_allow_large(size_t size, size_t alignment, bool commit, bool* is_committed, bool* is_pinned, void** base, size_t* full_size);
//mi_decl_export void mi_os_free(void* p, size_t size);
//mi_decl_export void mi_os_commit(void* p, size_t size);
//mi_decl_export void mi_os_decommit(void* p, size_t size);
mi_decl_export bool mi_arena_unload(mi_arena_id_t arena_id, void** base, size_t* accessed_size, size_t* size);
mi_decl_export bool mi_arena_reload(void* start, size_t size, mi_arena_id_t* arena_id);
mi_decl_export bool mi_heap_reload(mi_heap_t* heap, mi_arena_id_t arena);

View file

@ -134,6 +134,12 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub);
static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) {
return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
}
static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) {
const int64_t add = mi_atomic_load_relaxed((_Atomic(int64_t)*)padd);
if (add != 0) {
mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed));
}
}
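(A hypothetical illustration of the new helper, names below are not from mimalloc: merge a per-thread counter into a global one, skipping the read-modify-write when the increment is zero.)

```c
static volatile int64_t total_allocated;   // global counter, updated with relaxed atomics

static void merge_thread_allocated(const volatile int64_t* thread_allocated) {
  // adds *thread_allocated into total_allocated; a zero increment costs only a relaxed load
  mi_atomic_void_addi64_relaxed(&total_allocated, thread_allocated);
}
```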
static inline void mi_atomic_maxi64_relaxed(volatile int64_t* p, int64_t x) {
int64_t current = mi_atomic_load_relaxed((_Atomic(int64_t)*)p);
while (current < x && !mi_atomic_cas_weak_release((_Atomic(int64_t)*)p, &current, x)) { /* nothing */ };

View file

@ -90,7 +90,7 @@ typedef int32_t mi_ssize_t;
#endif
#endif
#if MI_ARCH_X64 && defined(__AVX2__)
#if (MI_ARCH_X86 || MI_ARCH_X64)
#include <immintrin.h>
#elif MI_ARCH_ARM64 && MI_OPT_SIMD
#include <arm_neon.h>
@ -134,6 +134,18 @@ typedef int32_t mi_ssize_t;
Builtin's
-------------------------------------------------------------------------------- */
#if defined(__GNUC__) || defined(__clang__)
#define mi_unlikely(x) (__builtin_expect(!!(x),false))
#define mi_likely(x) (__builtin_expect(!!(x),true))
#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
#define mi_unlikely(x) (x) [[unlikely]]
#define mi_likely(x) (x) [[likely]]
#else
#define mi_unlikely(x) (x)
#define mi_likely(x) (x)
#endif
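(These branch-hint macros are moved in this commit; they are removed from their old location further down in the diff. They are written as a prefix on `if`; a small sketch of that style, assuming the definitions above are in scope.)

```c
static int checked_div(int a, int b) {
  if mi_unlikely(b == 0) { return 0; }   // hint: the error path is rare
  return a / b;                          // the compiler favors this fast path
}
```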
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
@ -171,14 +183,25 @@ typedef int32_t mi_ssize_t;
-------------------------------------------------------------------------------- */
size_t _mi_popcount_generic(size_t x);
extern bool _mi_cpu_has_popcnt;
static inline size_t mi_popcount(size_t x) {
#if mi_has_builtinz(popcount)
#if defined(__GNUC__) && (MI_ARCH_X64 || MI_ARCH_X86)
#if !defined(__BMI1__)
if mi_unlikely(!_mi_cpu_has_popcnt) { return _mi_popcount_generic(x); }
#endif
size_t r;
__asm ("popcnt\t%1,%0" : "=r"(r) : "r"(x) : "cc");
return r;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86)
#if !defined(__BMI1__)
if mi_unlikely(!_mi_cpu_has_popcnt) { return _mi_popcount_generic(x); }
#endif
return (size_t)mi_msc_builtinz(__popcnt)(x);
#elif defined(_MSC_VER) && MI_ARCH_ARM64
return (size_t)mi_msc_builtinz(__popcnt)(x);
#elif mi_has_builtinz(popcount)
return mi_builtinz(popcount)(x);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
return mi_msc_builtinz(__popcnt)(x);
#elif MI_ARCH_X64 && defined(__BMI1__)
return (size_t)_mm_popcnt_u64(x);
#else
#define MI_HAS_FAST_POPCOUNT 0
return (x<=1 ? x : _mi_popcount_generic(x));

View file

@ -159,6 +159,8 @@ bool _mi_os_secure_guard_page_set_before(void* addr, bool is_pinned);
bool _mi_os_secure_guard_page_reset_at(void* addr);
bool _mi_os_secure_guard_page_reset_before(void* addr);
int _mi_os_numa_node(void);
int _mi_os_numa_node_count(void);
void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allow_large, mi_memid_t* memid);
void* _mi_os_alloc_aligned_at_offset(size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_large, mi_memid_t* memid);
@ -174,8 +176,8 @@ mi_arena_id_t _mi_arena_id_none(void);
mi_arena_t* _mi_arena_from_id(mi_arena_id_t id);
bool _mi_arena_memid_is_suitable(mi_memid_t memid, mi_arena_t* request_arena);
void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid);
void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid);
void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid);
void* _mi_arenas_alloc_aligned(mi_subproc_t* subproc, size_t size, size_t alignment, size_t align_offset, bool commit, bool allow_pinned, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid);
void _mi_arenas_free(void* p, size_t size, mi_memid_t memid);
bool _mi_arenas_contain(const void* p);
void _mi_arenas_collect(bool force_purge, bool visit_all, mi_tld_t* tld);
@ -254,25 +256,6 @@ bool _mi_page_is_valid(mi_page_t* page);
#endif
// ------------------------------------------------------
// Branches
// ------------------------------------------------------
#if defined(__GNUC__) || defined(__clang__)
#define mi_unlikely(x) (__builtin_expect(!!(x),false))
#define mi_likely(x) (__builtin_expect(!!(x),true))
#elif (defined(__cplusplus) && (__cplusplus >= 202002L)) || (defined(_MSVC_LANG) && _MSVC_LANG >= 202002L)
#define mi_unlikely(x) (x) [[unlikely]]
#define mi_likely(x) (x) [[likely]]
#else
#define mi_unlikely(x) (x)
#define mi_likely(x) (x)
#endif
#ifndef __has_builtin
#define __has_builtin(x) 0
#endif
/* -----------------------------------------------------------
Assertions
@ -1026,24 +1009,6 @@ static inline uintptr_t _mi_random_shuffle(uintptr_t x) {
return x;
}
// -------------------------------------------------------------------
// Optimize numa node access for the common case (= one node)
// -------------------------------------------------------------------
int _mi_os_numa_node_get(void);
size_t _mi_os_numa_node_count_get(void);
extern mi_decl_hidden _Atomic(size_t) _mi_numa_node_count;
static inline int _mi_os_numa_node(void) {
if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; }
else return _mi_os_numa_node_get();
}
static inline size_t _mi_os_numa_node_count(void) {
const size_t count = mi_atomic_load_relaxed(&_mi_numa_node_count);
if mi_likely(count > 0) { return count; }
else return _mi_os_numa_node_count_get();
}
// ---------------------------------------------------------------------------------
// Provide our own `_mi_memcpy` for potential performance optimizations.
@ -1053,10 +1018,10 @@ static inline size_t _mi_os_numa_node_count(void) {
// (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017). See also issue #201 and pr #253.
// ---------------------------------------------------------------------------------
#if !MI_TRACK_ENABLED && defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
#if !MI_TRACK_ENABLED && defined(_WIN32) && (MI_ARCH_X64 || MI_ARCH_X86)
extern bool _mi_cpu_has_fsrm;
extern bool _mi_cpu_has_erms;
static inline void _mi_memcpy(void* dst, const void* src, size_t n) {
if ((_mi_cpu_has_fsrm && n <= 128) || (_mi_cpu_has_erms && n > 128)) {
__movsb((unsigned char*)dst, (const unsigned char*)src, n);

View file

@ -343,10 +343,10 @@ typedef struct mi_page_s {
// The max object sizes are checked to not waste more than 12.5% internally over the page sizes.
#define MI_SMALL_MAX_OBJ_SIZE ((MI_SMALL_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < ~8 KiB
#if MI_ENABLE_LARGE_PAGES
#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < 64 KiB
#define MI_MEDIUM_MAX_OBJ_SIZE ((MI_MEDIUM_PAGE_SIZE-MI_PAGE_INFO_SIZE)/8) // < ~64 KiB
#define MI_LARGE_MAX_OBJ_SIZE (MI_LARGE_PAGE_SIZE/8) // <= 512KiB // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
#else
#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/4) // <= 128 KiB
#define MI_MEDIUM_MAX_OBJ_SIZE (MI_MEDIUM_PAGE_SIZE/8) // <= 64 KiB
#define MI_LARGE_MAX_OBJ_SIZE MI_MEDIUM_MAX_OBJ_SIZE // note: this must be a nice power of 2 or we get rounding issues with `_mi_bin`
#endif
#define MI_LARGE_MAX_OBJ_WSIZE (MI_LARGE_MAX_OBJ_SIZE/MI_SIZE_SIZE)
@ -424,6 +424,7 @@ typedef struct mi_padding_s {
struct mi_heap_s {
mi_tld_t* tld; // thread-local data
mi_arena_t* exclusive_arena; // if the heap should only allocate from a specific arena (or NULL)
int numa_node; // preferred numa node (or -1 for no preference)
uintptr_t cookie; // random cookie to verify pointers (see `_mi_ptr_cookie`)
mi_random_ctx_t random; // random number context used for secure allocation
size_t page_count; // total number of pages in the `pages` queues.
@ -485,6 +486,7 @@ typedef int64_t mi_msecs_t;
struct mi_tld_s {
mi_threadid_t thread_id; // thread id of this thread
size_t thread_seq; // thread sequence id (linear count of created threads)
int numa_node; // thread preferred numa node
mi_subproc_t* subproc; // sub-process this thread belongs to.
mi_heap_t* heap_backing; // backing heap of this thread (cannot be deleted)
mi_heap_t* heaps; // list of heaps in this thread (so we can abandon all when the thread terminates)

View file

@ -12,8 +12,9 @@ is a general purpose allocator with excellent [performance](#performance) charac
Initially developed by Daan Leijen for the runtime systems of the
[Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
Latest release tag: `v2.1.9` (2025-01-03).
Latest v1 tag: `v1.8.9` (2024-01-03).
Latest release : `v3.0.2` (beta) (2025-03-06).
Latest v2 release: `v2.2.2` (2025-03-06).
Latest v1 release: `v1.9.2` (2024-03-06).
mimalloc is a drop-in replacement for `malloc` and can be used in other programs
without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
@ -71,22 +72,28 @@ Enjoy!
### Branches
* `master`: latest stable release (based on `dev2`).
* `dev`: development branch for mimalloc v1. Use this branch for submitting PR's.
* `master`: latest stable release (still based on `dev2`).
* `dev`: development branch for mimalloc v1. Use this branch for submitting PR's.
* `dev2`: development branch for mimalloc v2. This branch is downstream of `dev`
(and is essentially equal to `dev` except for `src/segment.c`). Uses larger sliced segments to manage
mimalloc pages what can reduce fragmentation.
* `dev3`: development branch for mimalloc v3-alpha. This branch is downstream of `dev`. This is still experimental,
but simplifies previous versions by having no segments any more. This improves sharing of memory
between threads, and on certain large workloads uses less memory with less fragmentation.
(and is essentially equal to `dev` except for `src/segment.c`). Uses larger sliced segments to manage
mimalloc pages that can reduce fragmentation.
* `dev3`: development branch for mimalloc v3-beta. This branch is downstream of `dev`. This version
simplifies the lock-free ownership of previous versions, has no thread-local segments any more.
This improves sharing of memory between threads, and on certain large workloads may use less memory
with less fragmentation.
### Releases
* 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes.
Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`.
Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile.
Upgrade mimalloc-redirect to v1.3.2. `MI_OPT_ARCH` is off by default now but still assumes armv8.1-a on arm64
for fast atomic operations. Add QNX support.
* 2025-01-03, `v1.8.9`, `v2.1.9`, `v3.0.1-alpha`: Interim release. Support Windows arm64. New [guarded](#guarded) build that can place OS
guard pages behind objects to catch buffer overflows as they occur.
Many small fixes: build on Windows arm64, cygwin, riscV, and dragonfly; fix Windows static library initialization to account for
thread local destructors (in Rust/C++); macOS tag change; macOS TLS slot fix; improve stats;
consistent mimalloc.dll on Windows (instead of mimalloc-override.dll); fix mimalloc-redirect on Win11 H2;
consistent `mimalloc.dll` on Windows (instead of `mimalloc-override.dll`); fix mimalloc-redirect on Win11 H2;
add 0-byte to canary; upstream CPython fixes; reduce .bss size; allow fixed TLS slot on Windows for improved performance.
* 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches
from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches.
@ -167,7 +174,7 @@ mimalloc is used in various large scale low-latency services and programs, for e
Open `ide/vs2022/mimalloc.sln` in Visual Studio 2022 and build.
The `mimalloc-lib` project builds a static library (in `out/msvc-x64`), while the
`mimalloc-override-dll` project builds a DLL for overriding malloc
`mimalloc-override-dll` project builds DLL for overriding malloc
in the entire program.
## Linux, macOS, BSD, etc.
@ -240,13 +247,13 @@ on Windows to build with the `clang-cl` compiler directly:
```
## Single source
## Single Source
You can also directly build the single `src/static.c` file as part of your project without
needing `cmake` at all. Make sure to also add the mimalloc `include` directory to the include path.
# Using the library
# Using the Library
The preferred usage is including `<mimalloc.h>`, linking with
the shared- or static library, and using the `mi_malloc` API exclusively for allocation. For example,
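A minimal sketch of such a program (assuming the library is installed and linked, e.g. with `-lmimalloc`):

```c
#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  void* p = mi_malloc(64);
  printf("mimalloc v%d allocated 64 bytes at %p\n", mi_version(), p);
  mi_free(p);
  return 0;
}
```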
@ -474,7 +481,7 @@ Note that certain security restrictions may apply when doing this from
the [shell](https://stackoverflow.com/questions/43941322/dyld-insert-libraries-ignored-when-calling-application-through-bash).
# Windows Override
### Dynamic Override on Windows
<span id="override_on_windows">We use a separate redirection DLL to override mimalloc on Windows</span>
such that we redirect all malloc/free calls that go through the (dynamic) C runtime allocator,

View file

@ -64,11 +64,11 @@ static void* mi_meta_block_start( mi_meta_page_t* mpage, size_t block_idx ) {
// allocate a fresh meta page and add it to the global list.
static mi_meta_page_t* mi_meta_page_zalloc(void) {
// allocate a fresh arena slice
// note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again..
// note: careful with _mi_subproc as it may recurse into mi_tld and meta_page_zalloc again.. (same with _mi_os_numa_node()...)
mi_memid_t memid;
uint8_t* base = (uint8_t*)_mi_arenas_alloc_aligned(_mi_subproc(), MI_META_PAGE_SIZE, MI_META_PAGE_ALIGN, 0,
true /* commit*/, (MI_SECURE==0) /* allow large? */,
NULL /* req arena */, 0 /* thread_seq */, &memid);
NULL /* req arena */, 0 /* thread_seq */, -1 /* numa node */, &memid);
if (base == NULL) return NULL;
mi_assert_internal(_mi_is_aligned(base,MI_META_PAGE_ALIGN));
if (!memid.initially_zero) {

View file

@ -335,12 +335,13 @@ static bool mi_arena_reserve(mi_subproc_t* subproc, size_t req_size, bool allow_
Arena iteration
----------------------------------------------------------- */
static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, int numa_node, bool allow_pinned) {
static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena, bool match_numa, int numa_node, bool allow_pinned) {
if (!allow_pinned && arena->memid.is_pinned) return false;
if (!mi_arena_id_is_suitable(arena, req_arena)) return false;
if (req_arena == NULL) { // if not specific, check numa affinity
const bool numa_suitable = (numa_node < 0 || arena->numa_node < 0 || arena->numa_node == numa_node);
if (!numa_suitable) return false;
if (match_numa) { if (!numa_suitable) return false; }
else { if (numa_suitable) return false; }
}
return true;
}
@ -375,9 +376,9 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena
} \
}
#define mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, name_arena) \
#define mi_forall_suitable_arenas(subproc, req_arena, tseq, match_numa, numa_node, allow_large, name_arena) \
mi_forall_arenas(subproc, req_arena,tseq,name_arena) { \
if (mi_arena_is_suitable(name_arena, req_arena, -1 /* todo: numa node */, allow_large)) { \
if (mi_arena_is_suitable(name_arena, req_arena, match_numa, numa_node, allow_large)) { \
#define mi_forall_suitable_arenas_end() \
}} \
@ -390,19 +391,28 @@ static inline bool mi_arena_is_suitable(mi_arena_t* arena, mi_arena_t* req_arena
// allocate slices from the arenas
static mi_decl_noinline void* mi_arenas_try_find_free(
mi_subproc_t* subproc, size_t slice_count, size_t alignment,
bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid)
bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
{
mi_assert_internal(slice_count <= mi_slice_count_of_size(MI_ARENA_MAX_OBJ_SIZE));
mi_assert(alignment <= MI_ARENA_SLICE_ALIGN);
if (alignment > MI_ARENA_SLICE_ALIGN) return NULL;
// search arena's
mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena)
// search arena's
mi_forall_suitable_arenas(subproc, req_arena, tseq, true /* only numa matching */, numa_node, allow_large, arena)
{
void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid);
if (p != NULL) return p;
}
mi_forall_suitable_arenas_end();
if (numa_node < 0) return NULL;
// search again but now regardless of preferred numa affinity
mi_forall_suitable_arenas(subproc, req_arena, tseq, false /* numa non-matching now */, numa_node, allow_large, arena)
{
void* p = mi_arena_try_alloc_at(arena, slice_count, commit, tseq, memid);
if (p != NULL) return p;
}
mi_forall_suitable_arenas_end();
return NULL;
}
@ -411,14 +421,14 @@ static mi_decl_noinline void* mi_arenas_try_alloc(
mi_subproc_t* subproc,
size_t slice_count, size_t alignment,
bool commit, bool allow_large,
mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid)
mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
{
mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES);
mi_assert(alignment <= MI_ARENA_SLICE_ALIGN);
void* p;
// try to find free slices in the arena's
p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid);
p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid);
if (p != NULL) return p;
// did we need a specific arena?
@ -441,7 +451,7 @@ static mi_decl_noinline void* mi_arenas_try_alloc(
}
// try once more to allocate in the new arena
mi_assert_internal(req_arena == NULL);
p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, memid);
p = mi_arenas_try_find_free(subproc, slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid);
if (p != NULL) return p;
return NULL;
@ -472,21 +482,18 @@ static void* mi_arena_os_alloc_aligned(
void* _mi_arenas_alloc_aligned( mi_subproc_t* subproc,
size_t size, size_t alignment, size_t align_offset,
bool commit, bool allow_large,
mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid)
mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
{
mi_assert_internal(memid != NULL);
mi_assert_internal(size > 0);
// *memid = _mi_memid_none();
// const int numa_node = _mi_os_numa_node(&tld->os); // current numa node
// try to allocate in an arena if the alignment is small enough and the object is not too small (as for heap meta data)
if (!mi_option_is_enabled(mi_option_disallow_arena_alloc) && // is arena allocation allowed?
size >= MI_ARENA_MIN_OBJ_SIZE && size <= MI_ARENA_MAX_OBJ_SIZE && // and not too small/large
alignment <= MI_ARENA_SLICE_ALIGN && align_offset == 0) // and good alignment
{
const size_t slice_count = mi_slice_count_of_size(size);
void* p = mi_arenas_try_alloc(subproc,slice_count, alignment, commit, allow_large, req_arena, tseq, memid);
void* p = mi_arenas_try_alloc(subproc,slice_count, alignment, commit, allow_large, req_arena, tseq, numa_node, memid);
if (p != NULL) return p;
}
@ -495,9 +502,9 @@ void* _mi_arenas_alloc_aligned( mi_subproc_t* subproc,
return p;
}
void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, mi_memid_t* memid)
void* _mi_arenas_alloc(mi_subproc_t* subproc, size_t size, bool commit, bool allow_large, mi_arena_t* req_arena, size_t tseq, int numa_node, mi_memid_t* memid)
{
return _mi_arenas_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, memid);
return _mi_arenas_alloc_aligned(subproc, size, MI_ARENA_SLICE_SIZE, 0, commit, allow_large, req_arena, tseq, numa_node, memid);
}
@ -547,7 +554,9 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_
// search arena's
const bool allow_large = true;
mi_forall_suitable_arenas(subproc, req_arena, tseq, allow_large, arena)
const int any_numa = -1;
const bool match_numa = true;
mi_forall_suitable_arenas(subproc, req_arena, tseq, match_numa, any_numa, allow_large, arena)
{
size_t slice_index;
mi_bitmap_t* const bitmap = arena->pages_abandoned[bin];
@ -582,7 +591,7 @@ static mi_page_t* mi_arenas_page_try_find_abandoned(mi_subproc_t* subproc, size_
// Allocate a fresh page
static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice_count, size_t block_size, size_t block_alignment,
mi_arena_t* req_arena, size_t tseq, bool commit)
mi_arena_t* req_arena, size_t tseq, int numa_node, bool commit)
{
const bool allow_large = (MI_SECURE < 2); // 2 = guard page at end of each arena page
const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN);
@ -596,7 +605,7 @@ static mi_page_t* mi_arenas_page_alloc_fresh(mi_subproc_t* subproc, size_t slice
!os_align && // not large alignment
slice_count <= MI_ARENA_MAX_OBJ_SLICES) // and not too large
{
page = (mi_page_t*)mi_arenas_try_alloc(subproc, slice_count, page_alignment, commit, allow_large, req_arena, tseq, &memid);
page = (mi_page_t*)mi_arenas_try_alloc(subproc, slice_count, page_alignment, commit, allow_large, req_arena, tseq, numa_node, &memid);
if (page != NULL) {
mi_assert_internal(mi_bitmap_is_clearN(memid.mem.arena.arena->pages, memid.mem.arena.slice_index, memid.mem.arena.slice_count));
mi_bitmap_set(memid.mem.arena.arena->pages, memid.mem.arena.slice_index);
@ -727,7 +736,7 @@ static mi_page_t* mi_arenas_page_regular_alloc(mi_heap_t* heap, size_t slice_cou
const long commit_on_demand = mi_option_get(mi_option_page_commit_on_demand);
const bool commit = (slice_count <= mi_slice_count_of_size(MI_PAGE_MIN_COMMIT_SIZE) || // always commit small pages
(commit_on_demand == 2 && _mi_os_has_overcommit()) || (commit_on_demand == 0));
page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, commit);
page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, 1, req_arena, tld->thread_seq, heap->numa_node, commit);
if (page != NULL) {
mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count);
_mi_page_init(heap, page);
@ -749,7 +758,7 @@ static mi_page_t* mi_arenas_page_singleton_alloc(mi_heap_t* heap, size_t block_s
const size_t slice_count = mi_slice_count_of_size(_mi_align_up(info_size + block_size, _mi_os_secure_guard_page_size()) + _mi_os_secure_guard_page_size());
#endif
mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq, true /* commit singletons always */);
mi_page_t* page = mi_arenas_page_alloc_fresh(tld->subproc, slice_count, block_size, block_alignment, req_arena, tld->thread_seq, heap->numa_node, true /* commit singletons always */);
if (page == NULL) return NULL;
mi_assert(page->reserved == 1);
@ -1375,7 +1384,7 @@ static size_t mi_debug_show_page_bfield(mi_bfield_t field, char* buf, size_t* k,
return bit_set_count;
}
static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) {
static size_t mi_debug_show_chunks(const char* header1, const char* header2, const char* header3, size_t slice_count, size_t chunk_count, mi_bchunk_t* chunks, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) {
_mi_raw_message("\x1B[37m%s%s%s (use/commit: \x1B[31m0 - 25%%\x1B[33m - 50%%\x1B[36m - 75%%\x1B[32m - 100%%\x1B[0m)\n", header1, header2, header3);
const size_t fields_per_line = (narrow ? 2 : 4);
size_t bit_count = 0;
@ -1391,11 +1400,12 @@ static size_t mi_debug_show_chunks(const char* header1, const char* header2, con
char chunk_kind = ' ';
if (chunk_bins != NULL) {
switch (mi_atomic_load_relaxed(&chunk_bins[i])) {
switch (mi_bbitmap_debug_get_bin(chunk_bins,i)) {
case MI_BBIN_SMALL: chunk_kind = 'S'; break;
case MI_BBIN_MEDIUM: chunk_kind = 'M'; break;
case MI_BBIN_LARGE: chunk_kind = 'L'; break;
case MI_BBIN_OTHER: chunk_kind = 'X'; break;
default: chunk_kind = ' '; break; // suppress warning
// case MI_BBIN_NONE: chunk_kind = 'N'; break;
}
}
@ -1432,7 +1442,7 @@ static size_t mi_debug_show_chunks(const char* header1, const char* header2, con
return bit_set_count;
}
static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count, mi_bitmap_t* bitmap, _Atomic(uint8_t)* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) {
static size_t mi_debug_show_bitmap_binned(const char* header1, const char* header2, const char* header3, size_t slice_count, mi_bitmap_t* bitmap, mi_bchunkmap_t* chunk_bins, bool invert, mi_arena_t* arena, bool narrow) {
return mi_debug_show_chunks(header1, header2, header3, slice_count, mi_bitmap_chunk_count(bitmap), &bitmap->chunks[0], chunk_bins, invert, arena, narrow);
}
@ -1463,7 +1473,7 @@ static void mi_debug_show_arenas_ex(bool show_pages, bool narrow) mi_attr_noexce
const char* header1 = "pages (p:page, f:full, s:singleton, P,F,S:not abandoned, i:arena-info, m:meta-data, ~:free-purgable, _:free-committed, .:free-reserved)";
const char* header2 = (narrow ? "\n " : " ");
const char* header3 = "(chunk bin: S:small, M : medium, L : large, X : other)";
page_total += mi_debug_show_bitmap_binned(header1, header2, header3, arena->slice_count, arena->pages, arena->slices_free->chunk_bins, false, arena, narrow);
page_total += mi_debug_show_bitmap_binned(header1, header2, header3, arena->slice_count, arena->pages, arena->slices_free->chunkmap_bins, false, arena, narrow);
}
}
// if (show_inuse) _mi_raw_message("total inuse slices : %zu\n", slice_total - free_total);
@ -1515,17 +1525,17 @@ int mi_reserve_huge_os_pages_interleave(size_t pages, size_t numa_nodes, size_t
if (pages == 0) return 0;
// pages per numa node
size_t numa_count = (numa_nodes > 0 ? numa_nodes : _mi_os_numa_node_count());
if (numa_count <= 0) numa_count = 1;
int numa_count = (numa_nodes > 0 && numa_nodes <= INT_MAX ? (int)numa_nodes : _mi_os_numa_node_count());
if (numa_count <= 0) { numa_count = 1; }
const size_t pages_per = pages / numa_count;
const size_t pages_mod = pages % numa_count;
const size_t timeout_per = (timeout_msecs==0 ? 0 : (timeout_msecs / numa_count) + 50);
// reserve evenly among numa nodes
for (size_t numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
for (int numa_node = 0; numa_node < numa_count && pages > 0; numa_node++) {
size_t node_pages = pages_per; // can be 0
if (numa_node < pages_mod) node_pages++;
int err = mi_reserve_huge_os_pages_at(node_pages, (int)numa_node, timeout_per);
if ((size_t)numa_node < pages_mod) { node_pages++; }
int err = mi_reserve_huge_os_pages_at(node_pages, numa_node, timeout_per);
if (err) return err;
if (pages < node_pages) {
pages = 0;
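(For reference, a usage sketch, not part of the diff, of the public entry point this function implements: reserve 8 huge OS pages spread over the detected NUMA nodes.)

```c
#include <mimalloc.h>

int reserve_huge_pages(void) {
  // pages=8, numa_nodes=0 (use the detected node count), timeout_msecs=0 (no timeout, per the check above)
  int err = mi_reserve_huge_os_pages_interleave(8, 0, 0);
  return err;   // 0 on success, an errno-style code otherwise
}
```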

View file

@ -218,39 +218,39 @@ static inline bool mi_bfield_atomic_try_clearX(_Atomic(mi_bfield_t)*b, bool* all
// ------- mi_bfield_atomic_is_set ---------------------------------------
// Check if a bit is set
static inline bool mi_bfield_atomic_is_set(_Atomic(mi_bfield_t)*b, const size_t idx) {
static inline bool mi_bfield_atomic_is_set(const _Atomic(mi_bfield_t)*b, const size_t idx) {
const mi_bfield_t x = mi_atomic_load_relaxed(b);
return ((x & mi_bfield_mask(1,idx)) != 0);
}
// Check if a bit is clear
static inline bool mi_bfield_atomic_is_clear(_Atomic(mi_bfield_t)*b, const size_t idx) {
static inline bool mi_bfield_atomic_is_clear(const _Atomic(mi_bfield_t)*b, const size_t idx) {
const mi_bfield_t x = mi_atomic_load_relaxed(b);
return ((x & mi_bfield_mask(1, idx)) == 0);
}
// Check if a bit is xset
static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, _Atomic(mi_bfield_t)*b, const size_t idx) {
static inline bool mi_bfield_atomic_is_xset(mi_xset_t set, const _Atomic(mi_bfield_t)*b, const size_t idx) {
if (set) return mi_bfield_atomic_is_set(b, idx);
else return mi_bfield_atomic_is_clear(b, idx);
}
// Check if all bits corresponding to a mask are set.
static inline bool mi_bfield_atomic_is_set_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
static inline bool mi_bfield_atomic_is_set_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
mi_assert_internal(mask != 0);
const mi_bfield_t x = mi_atomic_load_relaxed(b);
return ((x & mask) == mask);
}
// Check if all bits corresponding to a mask are clear.
static inline bool mi_bfield_atomic_is_clear_mask(_Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
static inline bool mi_bfield_atomic_is_clear_mask(const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
mi_assert_internal(mask != 0);
const mi_bfield_t x = mi_atomic_load_relaxed(b);
return ((x & mask) == 0);
}
// Check if all bits corresponding to a mask are set/cleared.
static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
static inline bool mi_bfield_atomic_is_xset_mask(mi_xset_t set, const _Atomic(mi_bfield_t)* b, mi_bfield_t mask) {
mi_assert_internal(mask != 0);
if (set) return mi_bfield_atomic_is_set_mask(b, mask);
else return mi_bfield_atomic_is_clear_mask(b, mask);
@ -371,7 +371,7 @@ static inline bool mi_bchunk_clearN(mi_bchunk_t* chunk, size_t cidx, size_t n, b
// Check if a sequence of `n` bits within a chunk are all set/cleared.
// This can cross bfield's
mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) {
mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, const mi_bchunk_t* chunk, size_t field_idx, size_t idx, size_t n) {
mi_assert_internal((field_idx*MI_BFIELD_BITS) + idx + n <= MI_BCHUNK_BITS);
while (n > 0) {
size_t m = MI_BFIELD_BITS - idx; // m is the bits to xset in this field
@ -391,7 +391,7 @@ mi_decl_noinline static bool mi_bchunk_is_xsetN_(mi_xset_t set, mi_bchunk_t* chu
}
// Check if a sequence of `n` bits within a chunk are all set/cleared.
static inline bool mi_bchunk_is_xsetN(mi_xset_t set, mi_bchunk_t* chunk, size_t cidx, size_t n) {
static inline bool mi_bchunk_is_xsetN(mi_xset_t set, const mi_bchunk_t* chunk, size_t cidx, size_t n) {
mi_assert_internal(cidx + n <= MI_BCHUNK_BITS);
mi_assert_internal(n>0);
if (n==0) return true;
@ -1413,7 +1413,23 @@ void mi_bbitmap_unsafe_setN(mi_bbitmap_t* bbitmap, size_t idx, size_t n) {
// Assign a specific size bin to a chunk
static void mi_bbitmap_set_chunk_bin(mi_bbitmap_t* bbitmap, size_t chunk_idx, mi_bbin_t bin) {
mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
mi_atomic_store_release(&bbitmap->chunk_bins[chunk_idx], (uint8_t)bin);
for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin < MI_BBIN_NONE; ibin = mi_bbin_inc(ibin)) {
if (ibin == bin) {
mi_bchunk_set(& bbitmap->chunkmap_bins[ibin], chunk_idx, NULL);
}
else {
mi_bchunk_clear(&bbitmap->chunkmap_bins[ibin], chunk_idx, NULL);
}
}
}
mi_bbin_t mi_bbitmap_debug_get_bin(const mi_bchunkmap_t* chunkmap_bins, size_t chunk_idx) {
for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin < MI_BBIN_NONE; ibin = mi_bbin_inc(ibin)) {
if (mi_bchunk_is_xsetN(MI_BIT_SET, &chunkmap_bins[ibin], chunk_idx, 1)) {
return ibin;
}
}
return MI_BBIN_NONE;
}
// Track the index of the highest chunk that is accessed.
@ -1541,56 +1557,65 @@ static inline bool mi_bbitmap_try_find_and_clear_generic(mi_bbitmap_t* bbitmap,
mi_assert_internal(MI_BFIELD_BITS >= MI_BCHUNK_FIELDS);
const mi_bfield_t cmap_mask = mi_bfield_mask(cmap_max_count,0);
const size_t cmap_cycle = cmap_acc+1;
const mi_bbin_t bbin = mi_bbin_of(n);
// visit bins from smallest to largest (to reduce fragmentation on the larger blocks)
for(mi_bbin_t bin = MI_BBIN_SMALL; bin <= bbin; bin = mi_bbin_inc(bin)) // no need to traverse for MI_BBIN_NONE as anyone can allocate in MI_BBIN_SMALL
// (int bin = bbin; bin >= MI_BBIN_SMALL; bin--) // visit bins from largest size bin up to the NONE bin
const mi_bbin_t bbin = mi_bbin_of(n);
// visit each cmap entry
size_t cmap_idx = 0;
mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X)
{
size_t cmap_idx = 0;
mi_bfield_cycle_iterate(cmap_mask, tseq, cmap_cycle, cmap_idx, X)
{
// don't search into non-accessed memory until we tried other size bins as well
if (bin < bbin && cmap_idx > cmap_acc)
// (bin > MI_BBIN_SMALL && cmap_idx > cmap_acc) // large to small
{
break;
}
// and for each chunkmap entry we iterate over its bits to find the chunks
const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]);
const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits);
if (cmap_entry == 0) continue;
// and for each chunkmap entry we iterate over its bits to find the chunks
const mi_bfield_t cmap_entry = mi_atomic_load_relaxed(&bbitmap->chunkmap.bfields[cmap_idx]);
const size_t cmap_entry_cycle = (cmap_idx != cmap_acc ? MI_BFIELD_BITS : cmap_acc_bits);
// get size bin masks
mi_bfield_t cmap_bins[MI_BBIN_COUNT] = { 0 };
cmap_bins[MI_BBIN_NONE] = cmap_entry;
for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin < MI_BBIN_NONE; ibin = mi_bbin_inc(ibin)) {
const mi_bfield_t cmap_bin = mi_atomic_load_relaxed(&bbitmap->chunkmap_bins[ibin].bfields[cmap_idx]);
cmap_bins[ibin] = cmap_bin & cmap_entry;
cmap_bins[MI_BBIN_NONE] &= ~cmap_bin; // clear bits that are in an assigned size bin
}
// consider only chunks for a particular size bin at a time
// this picks the best bin only within a cmap entry (~ 1GiB address space), but avoids multiple
// iterations through all entries.
mi_assert_internal(bbin < MI_BBIN_NONE);
for (mi_bbin_t ibin = MI_BBIN_SMALL; ibin <= MI_BBIN_NONE;
// skip from bbin to NONE (so, say, a SMALL will never be placed in a OTHER, MEDIUM, or LARGE chunk to reduce fragmentation)
ibin = (ibin == bbin ? MI_BBIN_NONE : mi_bbin_inc(ibin)))
{
mi_assert_internal(ibin < MI_BBIN_COUNT);
const mi_bfield_t cmap_bin = cmap_bins[ibin];
size_t eidx = 0;
mi_bfield_cycle_iterate(cmap_entry, tseq%8, cmap_entry_cycle, eidx, Y) // reduce the tseq to 8 bins to reduce using extra memory (see `mstress`)
mi_bfield_cycle_iterate(cmap_bin, tseq, cmap_entry_cycle, eidx, Y)
{
mi_assert_internal(eidx <= MI_BFIELD_BITS);
// assertion doesn't quite hold as the max_accessed may be out-of-date
// mi_assert_internal(cmap_entry_cycle > eidx || ibin == MI_BBIN_NONE);
// get the chunk
const size_t chunk_idx = cmap_idx*MI_BFIELD_BITS + eidx;
mi_assert_internal(chunk_idx < mi_bbitmap_chunk_count(bbitmap));
// only in the current size class!
const mi_bbin_t chunk_bin = (mi_bbin_t)mi_atomic_load_relaxed(&bbitmap->chunk_bins[chunk_idx]);
if ((mi_bbin_t)bin == chunk_bin || (bin == bbin && chunk_bin == MI_BBIN_NONE)) // only allow NONE at the final run
// ((mi_bbin_t)bin == chunk_bin || (bin <= MI_BBIN_SMALL && chunk_bin <= MI_BBIN_SMALL)) { largest to smallest
{
mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx];
size_t cidx;
if ((*on_find)(chunk, n, &cidx)) {
if (cidx==0 && chunk_bin == MI_BBIN_NONE) { // only the first determines the size bin
// this chunk is now reserved for the `bbin` size class
mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin);
}
*pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap));
return true;
}
else {
/* we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation. */
mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx);
mi_bchunk_t* chunk = &bbitmap->chunks[chunk_idx];
size_t cidx;
if ((*on_find)(chunk, n, &cidx)) {
if (cidx==0 && ibin == MI_BBIN_NONE) { // only the first block determines the size bin
// this chunk is now reserved for the `bbin` size class
mi_bbitmap_set_chunk_bin(bbitmap, chunk_idx, bbin);
}
*pidx = (chunk_idx * MI_BCHUNK_BITS) + cidx;
mi_assert_internal(*pidx + n <= mi_bbitmap_max_bits(bbitmap));
return true;
}
else {
// todo: should _on_find_ return a boolean if there is a chance all are clear, to avoid calling `try_clear`?
// we may find that all are cleared only on a second iteration but that is ok as the chunkmap is a conservative approximation.
mi_bbitmap_chunkmap_try_clear(bbitmap, chunk_idx);
}
}
mi_bfield_cycle_iterate_end(Y);
}
mi_bfield_cycle_iterate_end(X);
}
mi_bfield_cycle_iterate_end(X);
return false;
}

View file

@ -215,18 +215,24 @@ bool _mi_bitmap_forall_setc_ranges(mi_bitmap_t* bitmap, mi_forall_set_fun_t* vis
// Size bins; larger bins are allowed to go into smaller bins.
// SMALL can only be in small (and NONE), so they cannot fragment the larger bins.
typedef enum mi_bbin_e {
MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free)
MI_BBIN_SMALL, // slice_count == 1
MI_BBIN_OTHER, // slice_count: any other from the other bins, and 1 <= slice_count <= MI_BCHUNK_BITS
MI_BBIN_MEDIUM, // slice_count == 8
MI_BBIN_LARGE, // slice_count == MI_BFIELD_BITS -- only used if MI_ENABLE_LARGE_PAGES is 1
MI_BBIN_NONE, // no bin assigned yet (the chunk is completely free)
MI_BBIN_COUNT
} mi_bbin_t;
static inline mi_bbin_t mi_bbin_inc(mi_bbin_t bbin) {
mi_assert_internal(bbin < MI_BBIN_COUNT);
return (mi_bbin_t)((int)bbin + 1);
}
static inline mi_bbin_t mi_bbin_dec(mi_bbin_t bbin) {
mi_assert_internal(bbin > MI_BBIN_NONE);
return (mi_bbin_t)((int)bbin - 1);
}
static inline mi_bbin_t mi_bbin_of(size_t slice_count) {
if (slice_count==1) return MI_BBIN_SMALL;
if (slice_count==8) return MI_BBIN_MEDIUM;
@ -241,8 +247,8 @@ typedef mi_decl_align(MI_BCHUNK_SIZE) struct mi_bbitmap_s {
_Atomic(size_t) chunk_count; // total count of chunks (0 < N <= MI_BCHUNKMAP_BITS)
_Atomic(size_t) chunk_max_accessed; // max chunk index that was once cleared or set
size_t _padding[MI_BCHUNK_SIZE/MI_SIZE_SIZE - 2]; // suppress warning on msvc
mi_bchunkmap_t chunkmap;
_Atomic(uint8_t) chunk_bins[MI_BITMAP_MAX_CHUNK_COUNT]; // 512b
mi_bchunkmap_t chunkmap;
mi_bchunkmap_t chunkmap_bins[MI_BBIN_COUNT - 1]; // chunkmaps with bit set if the chunk is in that size class (excluding MI_BBIN_NONE)
mi_bchunk_t chunks[MI_BITMAP_DEFAULT_CHUNK_COUNT]; // usually dynamic MI_BITMAP_MAX_CHUNK_COUNT
} mi_bbitmap_t;
@ -255,6 +261,8 @@ static inline size_t mi_bbitmap_max_bits(const mi_bbitmap_t* bbitmap) {
return (mi_bbitmap_chunk_count(bbitmap) * MI_BCHUNK_BITS);
}
mi_bbin_t mi_bbitmap_debug_get_bin(const mi_bchunk_t* chunkmap_bins, size_t chunk_idx);
size_t mi_bbitmap_size(size_t bit_count, size_t* chunk_count);

View file

@ -182,12 +182,13 @@ void _mi_heap_init(mi_heap_t* heap, mi_arena_id_t arena_id, bool allow_destroy,
mi_memid_t memid = heap->memid;
_mi_memcpy_aligned(heap, &_mi_heap_empty, sizeof(mi_heap_t));
heap->memid = memid;
heap->tld = tld; // avoid reading the thread-local tld during initialization
heap->tld = tld; // avoid reading the thread-local tld during initialization
heap->tag = heap_tag;
heap->numa_node = tld->numa_node;
heap->exclusive_arena = _mi_arena_from_id(arena_id);
heap->allow_page_reclaim = (!allow_destroy && mi_option_get(mi_option_page_reclaim_on_free) >= 0);
heap->allow_page_abandon = (!allow_destroy && mi_option_get(mi_option_page_full_retain) >= 0);
heap->page_full_retain = mi_option_get_clamp(mi_option_page_full_retain, -1, 32);
heap->tag = heap_tag;
if (heap->tld->is_in_threadpool) {
// if we run as part of a thread pool it is better to not arbitrarily reclaim abandoned pages into our heap.
// this is checked in `free.c:mi_free_try_collect_mt`
@ -227,7 +228,7 @@ mi_heap_t* _mi_heap_create(int heap_tag, bool allow_destroy, mi_arena_id_t arena
else {
// heaps associated with a specific arena are allocated in that arena
// note: takes up at least one slice which is quite wasteful...
heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, &memid);
heap = (mi_heap_t*)_mi_arenas_alloc(_mi_subproc(), _mi_align_up(sizeof(mi_heap_t),MI_ARENA_MIN_OBJ_SIZE), true, true, _mi_arena_from_id(arena_id), tld->thread_seq, tld->numa_node, &memid);
}
if (heap==NULL) {
_mi_error_message(ENOMEM, "unable to allocate heap meta-data\n");
@ -261,6 +262,11 @@ uintptr_t _mi_heap_random_next(mi_heap_t* heap) {
return _mi_random_next(&heap->random);
}
void mi_heap_set_numa_affinity(mi_heap_t* heap, int numa_node) {
if (heap == NULL) return;
heap->numa_node = (numa_node < 0 ? -1 : numa_node % _mi_os_numa_node_count());
}
// zero out the page queues
static void mi_heap_reset_pages(mi_heap_t* heap) {
mi_assert_internal(heap != NULL);

View file

@ -104,6 +104,7 @@ static mi_decl_cache_align mi_subproc_t subproc_main
static mi_decl_cache_align mi_tld_t tld_empty = {
0, // thread_id
0, // thread_seq
0, // default numa node
&subproc_main, // subproc
NULL, // heap_backing
NULL, // heaps list
@ -117,6 +118,7 @@ static mi_decl_cache_align mi_tld_t tld_empty = {
mi_decl_cache_align const mi_heap_t _mi_heap_empty = {
&tld_empty, // tld
NULL, // exclusive_arena
0, // preferred numa node
0, // cookie
//{ 0, 0 }, // keys
{ {0}, {0}, 0, true }, // random
@ -141,6 +143,7 @@ extern mi_decl_hidden mi_decl_cache_align mi_heap_t heap_main;
static mi_decl_cache_align mi_tld_t tld_main = {
0, // thread_id
0, // thread_seq
0, // numa node
&subproc_main, // subproc
&heap_main, // heap_backing
&heap_main, // heaps list
@ -154,6 +157,7 @@ static mi_decl_cache_align mi_tld_t tld_main = {
mi_decl_cache_align mi_heap_t heap_main = {
&tld_main, // thread local data
NULL, // exclusive arena
0, // preferred numa node
0, // initial cookie
//{ 0, 0 }, // the key of the main heap can be fixed (unlike page keys that need to be secure!)
{ {0x846ca68b}, {0}, 0, true }, // random
@ -306,6 +310,7 @@ static mi_tld_t* mi_tld_alloc(void) {
tld->heap_backing = NULL;
tld->heaps = NULL;
tld->subproc = &subproc_main;
tld->numa_node = _mi_os_numa_node();
tld->thread_id = _mi_prim_thread_id();
tld->thread_seq = mi_atomic_add_acq_rel(&thread_total_count, 1);
tld->is_in_threadpool = _mi_prim_thread_is_in_threadpool();
@ -647,25 +652,52 @@ void _mi_process_load(void) {
_mi_random_reinit_if_weak(&heap_main.random);
}
#if defined(_WIN32) && (defined(_M_IX86) || defined(_M_X64))
#include <intrin.h>
// CPU features
mi_decl_cache_align bool _mi_cpu_has_fsrm = false;
mi_decl_cache_align bool _mi_cpu_has_erms = false;
mi_decl_cache_align bool _mi_cpu_has_popcnt = false;
#if (MI_ARCH_X64 || MI_ARCH_X86)
#if defined(__GNUC__)
#include <cpuid.h>
static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
return (__get_cpuid(level, &regs4[0], &regs4[1], &regs4[2], &regs4[3]) == 1);
}
#elif defined(_MSC_VER)
static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
__cpuid((int32_t*)regs4, (int32_t)level);
return true;
}
#else
static bool mi_cpuid(uint32_t* regs4, uint32_t level) {
MI_UNUSED(regs4); MI_UNUSED(level);
return false;
}
#endif
static void mi_detect_cpu_features(void) {
// FSRM for fast short rep movsb/stosb support (AMD Zen3+ (~2020) or Intel Ice Lake+ (~2017))
// ERMS for fast enhanced rep movsb/stosb support
int32_t cpu_info[4];
__cpuid(cpu_info, 7);
_mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
_mi_cpu_has_erms = ((cpu_info[2] & (1 << 9)) != 0); // bit 9 of ECX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
uint32_t cpu_info[4];
if (mi_cpuid(cpu_info, 7)) {
_mi_cpu_has_fsrm = ((cpu_info[3] & (1 << 4)) != 0); // bit 4 of EDX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
_mi_cpu_has_erms = ((cpu_info[1] & (1 << 9)) != 0); // bit 9 of EBX : see <https://en.wikipedia.org/wiki/CPUID#EAX=7,_ECX=0:_Extended_Features>
}
if (mi_cpuid(cpu_info, 1)) {
_mi_cpu_has_popcnt = ((cpu_info[2] & (1 << 23)) != 0); // bit 23 of ECX : see <https://en.wikipedia.org/wiki/CPUID#EAX=1:_Processor_Info_and_Feature_Bits>
}
}
#else
static void mi_detect_cpu_features(void) {
// nothing
#if MI_ARCH_ARM64
_mi_cpu_has_popcnt = true;
#endif
}
#endif
// Initialize the process; called by thread_init or the process loader
void mi_process_init(void) mi_attr_noexcept {
// ensure we are called once
@ -685,15 +717,6 @@ void mi_process_init(void) mi_attr_noexcept {
// the following two can potentially allocate (on freeBSD for locks and thread keys)
mi_subproc_main_init();
mi_process_setup_auto_thread_done();
#if MI_DEBUG
_mi_verbose_message("debug level : %d\n", MI_DEBUG);
#endif
_mi_verbose_message("secure level: %d\n", MI_SECURE);
_mi_verbose_message("mem tracking: %s\n", MI_TRACK_TOOL);
#if MI_TSAN
_mi_verbose_message("thread santizer enabled\n");
#endif
mi_thread_init();
#if defined(_WIN32) && defined(MI_WIN_USE_FLS)

View file

@@ -355,7 +355,6 @@ size_t _mi_clz_generic(size_t x) {
#endif // bit scan
#if !MI_HAS_FAST_POPCOUNT
#if MI_SIZE_SIZE == 4
#define mi_mask_even_bits32 (0x55555555)
@@ -383,7 +382,7 @@ static size_t mi_popcount_generic32(uint32_t x) {
return mi_byte_sum32(x);
}
size_t _mi_popcount_generic(size_t x) {
mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic32(x);
}
@@ -407,9 +406,8 @@ static size_t mi_popcount_generic64(uint64_t x) {
return mi_byte_sum64(x);
}
size_t _mi_popcount_generic(size_t x) {
mi_decl_noinline size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic64(x);
}
#endif
#endif // popcount
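
For context, the generic fallback is based on the classic branch-free byte-sum popcount; a minimal 64-bit sketch of that technique (illustrative only, not the mimalloc implementation):

#include <stdint.h>
#include <stdio.h>

static unsigned popcount64(uint64_t x) {
  x = x - ((x >> 1) & 0x5555555555555555ULL);                            // 2-bit partial sums
  x = (x & 0x3333333333333333ULL) + ((x >> 2) & 0x3333333333333333ULL);  // 4-bit partial sums
  x = (x + (x >> 4)) & 0x0F0F0F0F0F0F0F0FULL;                            // 8-bit partial sums
  return (unsigned)((x * 0x0101010101010101ULL) >> 56);                  // sum all bytes into the top byte
}

int main(void) {
  printf("%u\n", popcount64(0xFF00FF00FF00FF00ULL));  // prints 32
  return 0;
}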

View file

@@ -175,7 +175,7 @@ static mi_option_desc_t options[_mi_option_last] =
{ 0, UNINIT, MI_OPTION(max_vabits) }, // max virtual address space bits
{ MI_DEFAULT_PAGEMAP_COMMIT,
UNINIT, MI_OPTION(pagemap_commit) }, // commit the full pagemap upfront?
{ 0, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this only on overcommit systems (like Linux))
{ 1, UNINIT, MI_OPTION(page_commit_on_demand) }, // commit pages on-demand (2 disables this only on overcommit systems (like Linux))
{ 16, UNINIT, MI_OPTION(page_reclaim_max) }, // don't reclaim pages if we already own N pages (in that size class)
};
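
A hedged usage sketch of overriding this new default at startup; the enum name mi_option_page_commit_on_demand is assumed from MI_OPTION(page_commit_on_demand) in the table above (and the matching MIMALLOC_PAGE_COMMIT_ON_DEMAND environment variable is assumed from mimalloc's usual option naming), so verify both against mimalloc.h before relying on them:

#include <mimalloc.h>
#include <stdio.h>

int main(void) {
  // assumed option name, derived from MI_OPTION(page_commit_on_demand) above
  mi_option_set(mi_option_page_commit_on_demand, 0);  // e.g. commit pages eagerly again
  printf("page_commit_on_demand = %ld\n", mi_option_get(mi_option_page_commit_on_demand));
  return 0;
}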

View file

@@ -694,18 +694,19 @@ static void mi_os_free_huge_os_pages(void* p, size_t size) {
Support NUMA aware allocation
-----------------------------------------------------------------------------*/
_Atomic(size_t) _mi_numa_node_count; // = 0 // cache the node count
static _Atomic(int) _mi_numa_node_count; // = 0 // cache the node count
size_t _mi_os_numa_node_count_get(void) {
size_t count = mi_atomic_load_acquire(&_mi_numa_node_count);
if (count <= 0) {
int _mi_os_numa_node_count(void) {
int count = mi_atomic_load_acquire(&_mi_numa_node_count);
if mi_unlikely(count <= 0) {
long ncount = mi_option_get(mi_option_use_numa_nodes); // given explicitly?
if (ncount > 0) {
count = (size_t)ncount;
if (ncount > 0 && ncount < INT_MAX) {
count = (int)ncount;
}
else {
count = _mi_prim_numa_node_count(); // or detect dynamically
if (count == 0) count = 1;
const size_t n = _mi_prim_numa_node_count(); // or detect dynamically
if (n == 0 || n > INT_MAX) { count = 1; }
else { count = (int)n; }
}
mi_atomic_store_release(&_mi_numa_node_count, count); // save it
_mi_verbose_message("using %zd numa regions\n", count);
@@ -713,15 +714,24 @@ size_t _mi_os_numa_node_count_get(void) {
return count;
}
int _mi_os_numa_node_get(void) {
size_t numa_count = _mi_os_numa_node_count();
static int mi_os_numa_node_get(void) {
int numa_count = _mi_os_numa_node_count();
if (numa_count<=1) return 0; // optimize on single numa node systems: always node 0
// never more than the node count and >= 0
size_t numa_node = _mi_prim_numa_node();
const size_t n = _mi_prim_numa_node();
int numa_node = (n < INT_MAX ? (int)n : 0);
if (numa_node >= numa_count) { numa_node = numa_node % numa_count; }
return (int)numa_node;
return numa_node;
}
int _mi_os_numa_node(void) {
if mi_likely(mi_atomic_load_relaxed(&_mi_numa_node_count) == 1) { return 0; }
else return mi_os_numa_node_get();
}
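
The clamping above reduces an OS-reported node id modulo the cached node count so it is always a valid index, and single-node systems short-circuit to node 0; a small illustrative sketch of that idea (names are hypothetical, not mimalloc's):

#include <stdio.h>

// reduce an OS-reported numa node id to a valid index in [0, numa_count)
static int clamp_numa_node(int numa_node, int numa_count) {
  if (numa_count <= 1) return 0;                          // single-node systems: always node 0
  if (numa_node < 0) return 0;
  if (numa_node >= numa_count) numa_node %= numa_count;   // wrap out-of-range ids
  return numa_node;
}

int main(void) {
  printf("%d\n", clamp_numa_node(5, 4));  // prints 1
  printf("%d\n", clamp_numa_node(3, 1));  // prints 0
  return 0;
}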
/* ----------------------------------------------------------------------------
Public API

View file

@@ -62,8 +62,16 @@ terms of the MIT license. A copy of the license can be found in the file
#include <sys/syscall.h>
#endif
#if !defined(MADV_DONTNEED) && defined(POSIX_MADV_DONTNEED) // QNX
#define MADV_DONTNEED POSIX_MADV_DONTNEED
#endif
#if !defined(MADV_FREE) && defined(POSIX_MADV_FREE) // QNX
#define MADV_FREE POSIX_MADV_FREE
#endif
#define MI_UNIX_LARGE_PAGE_SIZE (2*MI_MiB) // TODO: can we query the OS for this?
//------------------------------------------------------------------------------------
// Use syscalls for some primitives to allow for libraries that override open/read/close etc.
// and do allocation themselves; using syscalls prevents recursion when mimalloc is
@@ -191,6 +199,8 @@ int _mi_prim_free(void* addr, size_t size ) {
static int unix_madvise(void* addr, size_t size, int advice) {
#if defined(__sun)
int res = madvise((caddr_t)addr, size, advice); // Solaris needs cast (issue #520)
#elif defined(__QNX__)
int res = posix_madvise(addr, size, advice);
#else
int res = madvise(addr, size, advice);
#endif
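
A minimal standalone sketch of the QNX fallback shown above, assuming a Linux-style anonymous mmap for the test allocation; my_madvise is an illustrative name, not mimalloc's unix_madvise:

#include <stddef.h>
#include <stdio.h>
#include <sys/mman.h>

#if !defined(MADV_DONTNEED) && defined(POSIX_MADV_DONTNEED)  // QNX: only the posix_ names exist
#define MADV_DONTNEED POSIX_MADV_DONTNEED
#endif

static int my_madvise(void* addr, size_t size, int advice) {
#if defined(__QNX__)
  return posix_madvise(addr, size, advice);  // QNX provides posix_madvise only
#else
  return madvise(addr, size, advice);
#endif
}

int main(void) {
  const size_t size = 1 << 20;
  void* p = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (p == MAP_FAILED) return 1;
  int rc = my_madvise(p, size, MADV_DONTNEED);  // hint that the pages may be reclaimed
  printf("madvise returned %d\n", rc);
  munmap(p, size);
  return 0;
}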

View file

@@ -92,23 +92,23 @@ void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) {
// must be thread safe as it is called from stats_merge
static void mi_stat_count_add(mi_stat_count_t* stat, const mi_stat_count_t* src) {
static void mi_stat_count_add_mt(mi_stat_count_t* stat, const mi_stat_count_t* src) {
if (stat==src) return;
if (src->total!=0) { mi_atomic_addi64_relaxed(&stat->total, src->total); }
if (src->current!=0) { mi_atomic_addi64_relaxed(&stat->current, src->current); }
  // peak scores really do not work across threads ... we use a conservative max
if (src->peak > stat->peak) {
mi_atomic_maxi64_relaxed(&stat->peak, src->peak); // or: mi_atomic_addi64_relaxed( &stat->peak, src->peak);
}
mi_atomic_void_addi64_relaxed(&stat->total, &src->total);
mi_atomic_void_addi64_relaxed(&stat->current, &src->current);
  // peak scores really do not work across threads .. we just add them
mi_atomic_void_addi64_relaxed( &stat->peak, &src->peak);
// or, take the max?
// mi_atomic_maxi64_relaxed(&stat->peak, src->peak);
}
static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src) {
static void mi_stat_counter_add_mt(mi_stat_counter_t* stat, const mi_stat_counter_t* src) {
if (stat==src) return;
if (src->total!=0) { mi_atomic_addi64_relaxed(&stat->total, src->total); }
mi_atomic_void_addi64_relaxed(&stat->total, &src->total);
}
#define MI_STAT_COUNT(stat) mi_stat_count_add(&stats->stat, &src->stat);
#define MI_STAT_COUNTER(stat) mi_stat_counter_add(&stats->stat, &src->stat);
#define MI_STAT_COUNT(stat) mi_stat_count_add_mt(&stats->stat, &src->stat);
#define MI_STAT_COUNTER(stat) mi_stat_counter_add_mt(&stats->stat, &src->stat);
// must be thread safe as it is called from stats_merge
static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
@@ -119,11 +119,11 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) {
#if MI_STAT>1
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
mi_stat_count_add(&stats->malloc_bins[i], &src->malloc_bins[i]);
mi_stat_count_add_mt(&stats->malloc_bins[i], &src->malloc_bins[i]);
}
#endif
for (size_t i = 0; i <= MI_BIN_HUGE; i++) {
mi_stat_count_add(&stats->page_bins[i], &src->page_bins[i]);
mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]);
}
}
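
A minimal C11 sketch of the _mt merge idea: per-thread stat counters are folded into shared totals with relaxed atomic adds, so concurrent merges cannot lose updates (types and names here are illustrative, not mimalloc's):

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>

typedef struct { _Atomic int64_t total, current, peak; } stat_count_t;

static void stat_count_add_mt(stat_count_t* dst, const stat_count_t* src) {
  if (dst == src) return;
  atomic_fetch_add_explicit(&dst->total,   atomic_load_explicit(&src->total,   memory_order_relaxed), memory_order_relaxed);
  atomic_fetch_add_explicit(&dst->current, atomic_load_explicit(&src->current, memory_order_relaxed), memory_order_relaxed);
  atomic_fetch_add_explicit(&dst->peak,    atomic_load_explicit(&src->peak,    memory_order_relaxed), memory_order_relaxed);
}

int main(void) {
  stat_count_t a = {0}, b = {0};
  atomic_store(&b.total, 42);
  stat_count_add_mt(&a, &b);
  printf("total = %lld\n", (long long)atomic_load(&a.total));  // prints 42
  return 0;
}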
@@ -318,8 +318,8 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0)
mi_stat_print(&stats->malloc_normal, "normal", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg);
mi_stat_print(&stats->malloc_huge, "huge", (stats->malloc_huge_count.total == 0 ? 1 : -1), out, arg);
mi_stat_count_t total = { 0,0,0 };
mi_stat_count_add(&total, &stats->malloc_normal);
mi_stat_count_add(&total, &stats->malloc_huge);
mi_stat_count_add_mt(&total, &stats->malloc_normal);
mi_stat_count_add_mt(&total, &stats->malloc_huge);
mi_stat_print_ex(&total, "total", 1, out, arg, "");
#endif
#if MI_STAT>1