merge from dev

2025-08-26 01:04:48 +03:00 · 2020-03-16 16:41:21 -07:00 · 2020-03-16 16:41:21 -07:00 · 1f396e64a0
commit 1f396e64a0
parent d221a4b904 980d343f39
34 changed files with 2041 additions and 554 deletions
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@ -10,10 +10,6 @@ terms of the MIT license. A copy of the license can be found in the file

 #include "mimalloc-types.h"

-#if defined(MI_MALLOC_OVERRIDE) && (defined(__APPLE__) || defined(__OpenBSD__) || defined(__DragonFly__))
-#define MI_TLS_RECURSE_GUARD
-#endif
-
 #if (MI_DEBUG>0)
 #define mi_trace_message(...)  _mi_trace_message(__VA_ARGS__)
 #else
@ -33,7 +29,7 @@ terms of the MIT license. A copy of the license can be found in the file
 #else
 #define mi_decl_noinline
 #define mi_decl_thread          __thread        // hope for the best :-)
-#define mi_decl_cache_align     
+#define mi_decl_cache_align
 #endif


@ -51,6 +47,7 @@ void       _mi_random_init(mi_random_ctx_t* ctx);
 void       _mi_random_split(mi_random_ctx_t* ctx, mi_random_ctx_t* new_ctx);
 uintptr_t  _mi_random_next(mi_random_ctx_t* ctx);
 uintptr_t  _mi_heap_random_next(mi_heap_t* heap);
+uintptr_t  _os_random_weak(uintptr_t extra_seed);
 static inline uintptr_t _mi_random_shuffle(uintptr_t x);

 // init.c
@ -71,6 +68,7 @@ bool      _mi_os_commit(void* addr, size_t size, bool* is_zero, mi_stats_t* stat
 bool      _mi_os_decommit(void* p, size_t size, mi_stats_t* stats);
 bool      _mi_os_reset(void* p, size_t size, mi_stats_t* stats);
 bool      _mi_os_unreset(void* p, size_t size, bool* is_zero, mi_stats_t* stats);
+size_t    _mi_os_good_alloc_size(size_t size);

 // arena.c
 void*     _mi_arena_alloc_aligned(size_t size, size_t alignment, bool* commit, bool* large, bool* is_zero, size_t* memid, mi_os_tld_t* tld);
@ -89,6 +87,8 @@ uint8_t*   _mi_segment_page_start(const mi_segment_t* segment, const mi_page_t*
 void       _mi_abandoned_reclaim_all(mi_heap_t* heap, mi_segments_tld_t* tld);
 void       _mi_abandoned_await_readers(void);

+
+
 // "page.c"
 void*      _mi_malloc_generic(mi_heap_t* heap, size_t size)  mi_attr_noexcept mi_attr_malloc;

@ -238,9 +238,13 @@ static inline size_t _mi_wsize_from_size(size_t size) {
  return (size + sizeof(uintptr_t) - 1) / sizeof(uintptr_t);
 }

+// Does malloc satisfy the alignment constraints already?
+static inline bool mi_malloc_satisfies_alignment(size_t alignment, size_t size) {
+  return (alignment == sizeof(void*) || (alignment == MI_MAX_ALIGN_SIZE && size > (MI_MAX_ALIGN_SIZE/2)));
+}

 // Overflow detecting multiply
-static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {  
+static inline bool mi_mul_overflow(size_t count, size_t size, size_t* total) {
 #if __has_builtin(__builtin_umul_overflow) || __GNUC__ >= 5
 #include <limits.h>   // UINT_MAX, ULONG_MAX
 #if (SIZE_MAX == UINT_MAX)
@ -273,26 +277,76 @@ static inline bool mi_count_size_overflow(size_t count, size_t size, size_t* tot
 }


-/* -----------------------------------------------------------
-  The thread local default heap
----------------------------------------------------------- */
+/* ----------------------------------------------------------------------------------------
+The thread local default heap: `_mi_get_default_heap` returns the thread local heap.
+On most platforms (Windows, Linux, FreeBSD, NetBSD, etc), this just returns a
+__thread local variable (`_mi_heap_default`). With the initial-exec TLS model this ensures
+that the storage will always be available (allocated on the thread stacks).
+On some platforms though we cannot use that when overriding `malloc` since the underlying
+TLS implementation (or the loader) will call itself `malloc` on a first access and recurse.
+We try to circumvent this in an efficient way:
+- macOSX : we use an unused TLS slot from the OS allocated slots (MI_TLS_SLOT). On OSX, the
+           loader itself calls `malloc` even before the modules are initialized.
+- OpenBSD: we use an unused slot from the pthread block (MI_TLS_PTHREAD_SLOT_OFS).
+- DragonFly: not yet working.
+------------------------------------------------------------------------------------------- */

 extern const mi_heap_t _mi_heap_empty;  // read-only empty heap, initial value of the thread local default heap
-extern mi_heap_t _mi_heap_main;         // statically allocated main backing heap
 extern bool _mi_process_is_initialized;
+mi_heap_t*  _mi_heap_main_get(void);    // statically allocated main backing heap

+#if defined(MI_MALLOC_OVERRIDE)
+#if defined(__MACH__) // OSX
+#define MI_TLS_SLOT               89  // seems unused? 
+// other possible unused ones are 9, 29, __PTK_FRAMEWORK_JAVASCRIPTCORE_KEY4 (94), __PTK_FRAMEWORK_GC_KEY9 (112) and __PTK_FRAMEWORK_OLDGC_KEY9 (89)
+// see <https://github.com/rweichler/substrate/blob/master/include/pthread_machdep.h>
+#elif defined(__OpenBSD__)
+// use end bytes of a name; goes wrong if anyone uses names > 23 characters (ptrhread specifies 16) 
+// see <https://github.com/openbsd/src/blob/master/lib/libc/include/thread_private.h#L371>
+#define MI_TLS_PTHREAD_SLOT_OFS   (6*sizeof(int) + 4*sizeof(void*) + 24)  
+#elif defined(__DragonFly__)
+#warning "mimalloc is not working correctly on DragonFly yet."
+#define MI_TLS_PTHREAD_SLOT_OFS   (4 + 1*sizeof(void*))  // offset `uniqueid` (also used by gdb?) <https://github.com/DragonFlyBSD/DragonFlyBSD/blob/master/lib/libthread_xu/thread/thr_private.h#L458>
+#endif
+#endif
+
+#if defined(MI_TLS_SLOT)
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept;   // forward declaration
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+#include <pthread.h>
+static inline mi_heap_t** mi_tls_pthread_heap_slot(void) {
+  pthread_t self = pthread_self();
+  #if defined(__DragonFly__)
+  if (self==NULL) {
+    static mi_heap_t* pheap_main = _mi_heap_main_get();
+    return &pheap_main;
+  }
+  #endif
+  return (mi_heap_t**)((uint8_t*)self + MI_TLS_PTHREAD_SLOT_OFS);
+}
+#elif defined(MI_TLS_PTHREAD)
+#include <pthread.h>
+extern pthread_key_t _mi_heap_default_key;
+#else
 extern mi_decl_thread mi_heap_t* _mi_heap_default;  // default heap to allocate from
+#endif

 static inline mi_heap_t* mi_get_default_heap(void) {
-#ifdef MI_TLS_RECURSE_GUARD
-  // on some BSD platforms, like macOS, the dynamic loader calls `malloc`
-  // to initialize thread local data. To avoid recursion, we need to avoid
-  // accessing the thread local `_mi_default_heap` until our module is loaded
-  // and use the statically allocated main heap until that time.
-  // TODO: patch ourselves dynamically to avoid this check every time?
-  if (!_mi_process_is_initialized) return &_mi_heap_main;
-#endif
+#if defined(MI_TLS_SLOT)
+  mi_heap_t* heap = (mi_heap_t*)mi_tls_slot(MI_TLS_SLOT);
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREAD_SLOT_OFS)
+  mi_heap_t* heap = *mi_tls_pthread_heap_slot();
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#elif defined(MI_TLS_PTHREAD)
+  mi_heap_t* heap = (mi_unlikely(_mi_heap_default_key == (pthread_key_t)(-1)) ? _mi_heap_main_get() : (mi_heap_t*)pthread_getspecific(_mi_heap_default_key));
+  return (mi_unlikely(heap == NULL) ? (mi_heap_t*)&_mi_heap_empty : heap);
+#else
+  #if defined(MI_TLS_RECURSE_GUARD)
+  if (mi_unlikely(!_mi_process_is_initialized)) return _mi_heap_main_get();
+  #endif
  return _mi_heap_default;
+#endif
 }

 static inline bool mi_heap_is_default(const mi_heap_t* heap) {
@ -309,6 +363,8 @@ static inline bool mi_heap_is_initialized(mi_heap_t* heap) {
 }

 static inline uintptr_t _mi_ptr_cookie(const void* p) {
+  extern mi_heap_t _mi_heap_main;
+  mi_assert_internal(_mi_heap_main.cookie != 0);
  return ((uintptr_t)p ^ _mi_heap_main.cookie);
 }

@ -317,8 +373,10 @@ static inline uintptr_t _mi_ptr_cookie(const void* p) {
 ----------------------------------------------------------- */

 static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t size) {
-  mi_assert_internal(size <= MI_SMALL_SIZE_MAX);
-  return heap->pages_free_direct[_mi_wsize_from_size(size)];
+  mi_assert_internal(size <= (MI_SMALL_SIZE_MAX + MI_PADDING_SIZE));
+  const size_t idx = _mi_wsize_from_size(size);
+  mi_assert_internal(idx < MI_PAGES_DIRECT);
+  return heap->pages_free_direct[idx];
 }

 // Get the page belonging to a certain size class
@ -394,6 +452,13 @@ static inline size_t mi_page_block_size(const mi_page_t* page) {
  }
 }

+// Get the usable block size of a page without fixed padding.
+// This may still include internal padding due to alignment and rounding up size classes.
+static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
+  return mi_page_block_size(page) - MI_PADDING_SIZE;
+}
+
+
 // Thread free access
 static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
  return (mi_block_t*)(mi_atomic_read_relaxed(&page->xthread_free) & ~3);
@ -430,14 +495,14 @@ static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t*
  return mi_tf_make(block, mi_tf_delayed(tf));
 }

-// are all blocks in a page freed? 
+// are all blocks in a page freed?
 // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.
 static inline bool mi_page_all_free(const mi_page_t* page) {
  mi_assert_internal(page != NULL);
  return (page->used == 0);
 }

-// are there any available blocks? 
+// are there any available blocks?
 static inline bool mi_page_has_any_available(const mi_page_t* page) {
  mi_assert_internal(page != NULL && page->reserved > 0);
  return (page->used < page->reserved || (mi_page_thread_free(page) != NULL));
@ -485,11 +550,11 @@ static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
 /* -------------------------------------------------------------------
 Encoding/Decoding the free list next pointers

-This is to protect against buffer overflow exploits where the 
-free list is mutated. Many hardened allocators xor the next pointer `p` 
+This is to protect against buffer overflow exploits where the
+free list is mutated. Many hardened allocators xor the next pointer `p`
 with a secret key `k1`, as `p^k1`. This prevents overwriting with known
-values but might be still too weak: if the attacker can guess 
-the pointer `p` this  can reveal `k1` (since `p^k1^p == k1`). 
+values but might be still too weak: if the attacker can guess
+the pointer `p` this  can reveal `k1` (since `p^k1^p == k1`).
 Moreover, if multiple blocks can be read as well, the attacker can
 xor both as `(p1^k1) ^ (p2^k1) == p1^p2` which may reveal a lot
 about the pointers (and subsequently `k1`).
@ -497,9 +562,9 @@ about the pointers (and subsequently `k1`).
 Instead mimalloc uses an extra key `k2` and encodes as `((p^k2)<<<k1)+k1`.
 Since these operations are not associative, the above approaches do not
 work so well any more even if the `p` can be guesstimated. For example,
-for the read case we can subtract two entries to discard the `+k1` term, 
+for the read case we can subtract two entries to discard the `+k1` term,
 but that leads to `((p1^k2)<<<k1) - ((p2^k2)<<<k1)` at best.
-We include the left-rotation since xor and addition are otherwise linear 
+We include the left-rotation since xor and addition are otherwise linear
 in the lowest bit. Finally, both keys are unique per page which reduces
 the re-use of keys by a large factor.

@ -526,30 +591,37 @@ static inline uintptr_t mi_rotr(uintptr_t x, uintptr_t shift) {
  return ((x >> shift) | (x << (MI_INTPTR_BITS - shift)));
 }

-static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, uintptr_t key1, uintptr_t key2 ) {
+static inline void* mi_ptr_decode(const void* null, const mi_encoded_t x, const uintptr_t* keys) {
+  void* p = (void*)(mi_rotr(x - keys[0], keys[0]) ^ keys[1]);
+  return (mi_unlikely(p==null) ? NULL : p);
+}
+
+static inline mi_encoded_t mi_ptr_encode(const void* null, const void* p, const uintptr_t* keys) {
+  uintptr_t x = (uintptr_t)(mi_unlikely(p==NULL) ? null : p);
+  return mi_rotl(x ^ keys[1], keys[0]) + keys[0];
+}
+
+static inline mi_block_t* mi_block_nextx( const void* null, const mi_block_t* block, const uintptr_t* keys ) {
  #ifdef MI_ENCODE_FREELIST
-  mi_block_t* b = (mi_block_t*)(mi_rotr(block->next - key1, key1) ^ key2);
-  if (mi_unlikely((void*)b==null)) { b = NULL; }
-  return b;
+  return (mi_block_t*)mi_ptr_decode(null, block->next, keys);
  #else
-  UNUSED(key1); UNUSED(key2); UNUSED(null);
+  UNUSED(keys); UNUSED(null);
  return (mi_block_t*)block->next;
  #endif
 }

-static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, uintptr_t key1, uintptr_t key2) {
+static inline void mi_block_set_nextx(const void* null, mi_block_t* block, const mi_block_t* next, const uintptr_t* keys) {
  #ifdef MI_ENCODE_FREELIST
-  if (mi_unlikely(next==NULL)) { next = (mi_block_t*)null; }
-  block->next = mi_rotl((uintptr_t)next ^ key2, key1) + key1;
+  block->next = mi_ptr_encode(null, next, keys);
  #else
-  UNUSED(key1); UNUSED(key2); UNUSED(null);
+  UNUSED(keys); UNUSED(null);
  block->next = (mi_encoded_t)next;
  #endif
 }

 static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t* block) {
  #ifdef MI_ENCODE_FREELIST
-  mi_block_t* next = mi_block_nextx(page,block,page->key[0],page->key[1]);
+  mi_block_t* next = mi_block_nextx(page,block,page->keys);
  // check for free list corruption: is `next` at least in the same page?
  // TODO: check if `next` is `page->block_size` aligned?
  if (mi_unlikely(next!=NULL && !mi_is_in_same_page(block, next))) {
@ -559,16 +631,16 @@ static inline mi_block_t* mi_block_next(const mi_page_t* page, const mi_block_t*
  return next;
  #else
  UNUSED(page);
-  return mi_block_nextx(page,block,0,0);
+  return mi_block_nextx(page,block,NULL);
  #endif
 }

 static inline void mi_block_set_next(const mi_page_t* page, mi_block_t* block, const mi_block_t* next) {
  #ifdef MI_ENCODE_FREELIST
-  mi_block_set_nextx(page,block,next, page->key[0], page->key[1]);
+  mi_block_set_nextx(page,block,next, page->keys);
  #else
  UNUSED(page);
-  mi_block_set_nextx(page,block, next,0,0);
+  mi_block_set_nextx(page,block,next,NULL);
  #endif
 }

@ -615,9 +687,8 @@ static inline size_t _mi_os_numa_node_count(void) {


 // -------------------------------------------------------------------
-// Getting the thread id should be performant
-// as it is called in the fast path of `_mi_free`,
-// so we specialize for various platforms.
+// Getting the thread id should be performant as it is called in the
+// fast path of `_mi_free` and we specialize for various platforms.
 // -------------------------------------------------------------------
 #if defined(_WIN32)
 #define WIN32_LEAN_AND_MEAN
@ -626,24 +697,55 @@ static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept {
  // Windows: works on Intel and ARM in both 32- and 64-bit
  return (uintptr_t)NtCurrentTeb();
 }
-#elif (defined(__GNUC__) || defined(__clang__)) && \
+
+#elif defined(__GNUC__) && \
      (defined(__x86_64__) || defined(__i386__) || defined(__arm__) || defined(__aarch64__))
-// TLS register on x86 is in the FS or GS register
-// see: https://akkadia.org/drepper/tls.pdf
+
+// TLS register on x86 is in the FS or GS register, see: https://akkadia.org/drepper/tls.pdf
+static inline void* mi_tls_slot(size_t slot) mi_attr_noexcept {
+  void* res;
+  const size_t ofs = (slot*sizeof(void*));
+#if defined(__i386__)
+  __asm__("movl %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // 32-bit always uses GS
+#elif defined(__MACH__) && defined(__x86_64__)
+  __asm__("movq %%gs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 macOSX uses GS
+#elif defined(__x86_64__)
+  __asm__("movq %%fs:%1, %0" : "=r" (res) : "m" (*((void**)ofs)) : );  // x86_64 Linux, BSD uses FS
+#elif defined(__arm__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+  res = tcb[slot];
+#elif defined(__aarch64__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+  res = tcb[slot];
+#endif
+  return res;
+}
+
+// setting is only used on macOSX for now
+static inline void mi_tls_slot_set(size_t slot, void* value) mi_attr_noexcept {
+  const size_t ofs = (slot*sizeof(void*));
+#if defined(__i386__)
+  __asm__("movl %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // 32-bit always uses GS
+#elif defined(__MACH__) && defined(__x86_64__)
+  __asm__("movq %1,%%gs:%0" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 macOSX uses GS
+#elif defined(__x86_64__)
+  __asm__("movq %1,%%fs:%1" : "=m" (*((void**)ofs)) : "rn" (value) : );  // x86_64 Linux, BSD uses FS
+#elif defined(__arm__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrc p15, 0, %0, c13, c0, 3\nbic %0, %0, #3" : "=r" (tcb));
+  tcb[slot] = value;
+#elif defined(__aarch64__)
+  void** tcb; UNUSED(ofs);
+  asm volatile ("mrs %0, tpidr_el0" : "=r" (tcb));
+  tcb[slot] = value;
+#endif
+}
+
 static inline uintptr_t _mi_thread_id(void) mi_attr_noexcept {
-  uintptr_t tid;
-  #if defined(__i386__)
-  __asm__("movl %%gs:0, %0" : "=r" (tid) : : );  // 32-bit always uses GS
-  #elif defined(__MACH__)
-  __asm__("movq %%gs:0, %0" : "=r" (tid) : : );  // x86_64 macOS uses GS
-  #elif defined(__x86_64__)
-  __asm__("movq %%fs:0, %0" : "=r" (tid) : : );  // x86_64 Linux, BSD uses FS
-  #elif defined(__arm__)
-  asm volatile ("mrc p15, 0, %0, c13, c0, 3" : "=r" (tid));
-  #elif defined(__aarch64__)
-  asm volatile ("mrs %0, tpidr_el0" : "=r" (tid));
-  #endif
-  return tid;
+  // in all our targets, slot 0 is the pointer to the thread control block
+  return (uintptr_t)mi_tls_slot(0);
 }
 #else
 // otherwise use standard C