Merge branch 'dev' into dev-arena

daan 2020-01-22 20:39:33 -08:00
commit e226ebcc97
20 changed files with 6123 additions and 216 deletions


@@ -240,9 +240,9 @@ static mi_decl_noinline void _mi_free_block_mt(mi_page_t* page, mi_block_t* bloc
// add to the delayed free list of this heap. (do this atomically as the lock only protects heap memory validity)
mi_block_t* dfree;
do {
- dfree = (mi_block_t*)heap->thread_delayed_free;
+ dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
mi_block_set_nextx(heap,block,dfree, heap->key[0], heap->key[1]);
- } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
+ } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree));
}
// and reset the MI_DELAYED_FREEING flag
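
Throughout this commit, cast-based calls such as mi_atomic_cas_ptr_weak(mi_atomic_cast(void*, &p), x, y) are replaced by typed variants that take the pointed-to type as their first argument, so call sites no longer cast through void*. As a rough, self-contained illustration of the push loop above, here is the same pattern in plain C11 stdatomic rather than mimalloc's macros (node_t and delayed_free_push are made-up names, and the key[0]/key[1] link encoding is elided):

#include <stdatomic.h>

typedef struct node_s { struct node_s* next; } node_t;

// Shared list head, standing in for heap->thread_delayed_free.
static _Atomic(node_t*) delayed_free_head;

// Lock-free push: link the block in front of the current head and retry the
// weak CAS until no other thread has changed the head in between.
static void delayed_free_push(node_t* block) {
  node_t* dfree = atomic_load_explicit(&delayed_free_head, memory_order_relaxed);
  do {
    block->next = dfree;   // mimalloc additionally encodes this link with heap->key[0]/key[1]
  } while (!atomic_compare_exchange_weak_explicit(&delayed_free_head, &dfree, block,
                                                  memory_order_release, memory_order_relaxed));
}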


@@ -62,7 +62,7 @@ typedef uintptr_t mi_block_info_t;
// A memory arena descriptor
typedef struct mi_arena_s {
- uint8_t* start; // the start of the memory area
+ _Atomic(uint8_t*) start; // the start of the memory area
size_t block_count; // size of the area in arena blocks (of `MI_ARENA_BLOCK_SIZE`)
size_t field_count; // number of bitmap fields (where `field_count * MI_BITMAP_FIELD_BITS >= block_count`)
int numa_node; // associated NUMA node
@@ -327,7 +327,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
mi_assert_internal(size <= bcount*MI_ARENA_BLOCK_SIZE);
// try numa affine allocation
for (size_t i = 0; i < MI_MAX_ARENAS; i++) {
- mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[i]));
+ mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
if (arena==NULL) break; // end reached
if ((arena->numa_node<0 || arena->numa_node==numa_node) && // numa local?
(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
@@ -339,7 +339,7 @@ void* _mi_arena_alloc_aligned(size_t size, size_t alignment,
}
// try from another numa node instead..
for (size_t i = 0; i < MI_MAX_ARENAS; i++) {
- mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[i]));
+ mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t, &mi_arenas[i]);
if (arena==NULL) break; // end reached
if ((arena->numa_node>=0 && arena->numa_node!=numa_node) && // not numa local!
(*large || !arena->is_large)) // large OS pages allowed, or arena is not large OS pages
@@ -388,7 +388,7 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool is_committed, bool
size_t bitmap_idx;
mi_arena_id_indices(memid, &arena_idx, &bitmap_idx);
mi_assert_internal(arena_idx < MI_MAX_ARENAS);
- mi_arena_t* arena = (mi_arena_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &mi_arenas[arena_idx]));
+ mi_arena_t* arena = mi_atomic_read_ptr_relaxed(mi_arena_t,&mi_arenas[arena_idx]);
mi_assert_internal(arena != NULL);
if (arena == NULL) {
_mi_error_message(EINVAL, "trying to free from non-existent arena: %p, size %zu, memid: 0x%zx\n", p, size, memid);
@@ -414,15 +414,15 @@ void _mi_arena_free(void* p, size_t size, size_t memid, bool is_committed, bool
static bool mi_arena_add(mi_arena_t* arena) {
mi_assert_internal(arena != NULL);
- mi_assert_internal((uintptr_t)arena->start % MI_SEGMENT_ALIGN == 0);
+ mi_assert_internal((uintptr_t)mi_atomic_read_ptr_relaxed(uint8_t,&arena->start) % MI_SEGMENT_ALIGN == 0);
mi_assert_internal(arena->block_count > 0);
- uintptr_t i = mi_atomic_addu(&mi_arena_count,1);
+ uintptr_t i = mi_atomic_increment(&mi_arena_count);
if (i >= MI_MAX_ARENAS) {
- mi_atomic_subu(&mi_arena_count, 1);
+ mi_atomic_decrement(&mi_arena_count);
return false;
}
- mi_atomic_write_ptr(mi_atomic_cast(void*,&mi_arenas[i]), arena);
+ mi_atomic_write_ptr(mi_arena_t,&mi_arenas[i], arena);
return true;
}
@@ -444,7 +444,7 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
_mi_warning_message("failed to reserve %zu gb huge pages\n", pages);
return ENOMEM;
}
- _mi_verbose_message("reserved %zu gb huge pages (of the %zu gb requested)\n", pages_reserved, pages);
+ _mi_verbose_message("reserved %zu gb huge pages on numa node %i (of the %zu gb requested)\n", pages_reserved, numa_node, pages);
size_t bcount = mi_block_count_of_size(hsize);
size_t fields = _mi_divide_up(bcount, MI_BITMAP_FIELD_BITS);
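
mi_arena_add registers a new arena in a fixed-size global array: claim the next index with an atomic increment, undo the increment when the array turns out to be full, and publish the pointer with an atomic write so the allocation paths above can scan the array with relaxed atomic reads. A minimal C11 sketch of that shape (MAX_ARENAS, arena_t, and arena_add are stand-in names; C11 atomic_fetch_add returns the previous value, which is used directly as the slot index):

#include <stdatomic.h>
#include <stdbool.h>
#include <stddef.h>

#define MAX_ARENAS 64                              // stand-in for MI_MAX_ARENAS

typedef struct arena_s { void* start; } arena_t;   // placeholder for mi_arena_t

static _Atomic(arena_t*) arenas[MAX_ARENAS];
static _Atomic(size_t)   arena_count;

static bool arena_add(arena_t* arena) {
  size_t i = atomic_fetch_add(&arena_count, 1);    // claim a slot index
  if (i >= MAX_ARENAS) {
    atomic_fetch_sub(&arena_count, 1);             // roll back: the array is full
    return false;
  }
  atomic_store(&arenas[i], arena);                 // publish; readers use atomic loads
  return true;
}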


@@ -147,7 +147,7 @@ static void mi_heap_collect_ex(mi_heap_t* heap, mi_collect_t collect)
// collect all pages owned by this thread
mi_heap_visit_pages(heap, &mi_heap_page_collect, &collect, NULL);
- mi_assert_internal( collect != ABANDON || heap->thread_delayed_free == NULL );
+ mi_assert_internal( collect != ABANDON || mi_atomic_read_ptr(mi_block_t,&heap->thread_delayed_free) == NULL );
// collect segment caches
if (collect >= FORCE) {
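
The assertion now goes through the typed mi_atomic_read_ptr instead of reading heap->thread_delayed_free directly, keeping every access to the shared field on the atomic API. The equivalent shape in plain C11, with illustrative types:

#include <assert.h>
#include <stdatomic.h>
#include <stddef.h>

typedef struct block_s { struct block_s* next; } block_t;
typedef struct heap_s  { _Atomic(block_t*) thread_delayed_free; } heap_t;

// Assert over an explicit atomic load of the shared pointer rather than a
// plain field access.
static void assert_no_delayed_free(heap_t* heap) {
  assert(atomic_load(&heap->thread_delayed_free) == NULL);
  (void)heap;   // keep the parameter "used" when NDEBUG strips the assert
}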


@@ -125,7 +125,7 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
if (p==NULL) return false;
size_t count = mi_atomic_read_relaxed(&regions_count);
for (size_t i = 0; i < count; i++) {
- uint8_t* start = (uint8_t*)mi_atomic_read_ptr_relaxed(&regions[i].start);
+ uint8_t* start = mi_atomic_read_ptr_relaxed(uint8_t,&regions[i].start);
if (start != NULL && (uint8_t*)p >= start && (uint8_t*)p < start + MI_REGION_SIZE) return true;
}
return false;
@@ -133,9 +133,9 @@ bool mi_is_in_heap_region(const void* p) mi_attr_noexcept {
static void* mi_region_blocks_start(const mem_region_t* region, mi_bitmap_index_t bit_idx) {
- void* start = mi_atomic_read_ptr(&region->start);
+ uint8_t* start = mi_atomic_read_ptr(uint8_t,&region->start);
mi_assert_internal(start != NULL);
- return ((uint8_t*)start + (bit_idx * MI_SEGMENT_SIZE));
+ return (start + (bit_idx * MI_SEGMENT_SIZE));
}
static size_t mi_memid_create(mem_region_t* region, mi_bitmap_index_t bit_idx) {
@@ -200,7 +200,7 @@ static bool mi_region_try_alloc_os(size_t blocks, bool commit, bool allow_large,
mi_atomic_write(&r->reset, 0);
*bit_idx = 0;
mi_bitmap_claim(&r->in_use, 1, blocks, *bit_idx, NULL);
- mi_atomic_write_ptr(&r->start, start);
+ mi_atomic_write_ptr(uint8_t*,&r->start, start);
// and share it
mi_region_info_t info;
@@ -277,14 +277,14 @@ static void* mi_region_try_alloc(size_t blocks, bool* commit, bool* is_large, bo
mi_region_info_t info;
info.value = mi_atomic_read(&region->info);
- void* start = mi_atomic_read_ptr(&region->start);
+ uint8_t* start = mi_atomic_read_ptr(uint8_t,&region->start);
mi_assert_internal(!(info.x.is_large && !*is_large));
mi_assert_internal(start != NULL);
*is_zero = mi_bitmap_unclaim(&region->dirty, 1, blocks, bit_idx);
*is_large = info.x.is_large;
*memid = mi_memid_create(region, bit_idx);
- void* p = (uint8_t*)start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
+ void* p = start + (mi_bitmap_index_bit_in_field(bit_idx) * MI_SEGMENT_SIZE);
// commit
if (*commit) {
@@ -446,7 +446,7 @@ void _mi_mem_collect(mi_os_tld_t* tld) {
} while(m == 0 && !mi_atomic_cas_weak(&region->in_use, MI_BITMAP_FIELD_FULL, 0 ));
if (m == 0) {
// on success, free the whole region
- void* start = mi_atomic_read_ptr(&regions[i].start);
+ uint8_t* start = mi_atomic_read_ptr(uint8_t,&regions[i].start);
size_t arena_memid = mi_atomic_read_relaxed(&regions[i].arena_memid);
memset(&regions[i], 0, sizeof(mem_region_t));
// and release the whole region
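
With mem_region_t.start now read and written only through the typed pointer macros, the address arithmetic above no longer needs casts: a block address is simply the atomically read base plus bit_idx * MI_SEGMENT_SIZE. A small C11 sketch of that computation (SEGMENT_SIZE and region_blocks_start are illustrative; the bitmaps and commit/reset bookkeeping are elided):

#include <stdatomic.h>
#include <stdint.h>
#include <stddef.h>

#define SEGMENT_SIZE ((size_t)4*1024*1024)   // stand-in for MI_SEGMENT_SIZE

typedef struct mem_region_s {
  _Atomic(uint8_t*) start;                   // base of the region's OS memory, published atomically
} mem_region_t;

// Address of the blocks at `bit_idx` within a region: atomic read of the base,
// then plain pointer arithmetic on the uint8_t* result.
static void* region_blocks_start(mem_region_t* region, size_t bit_idx) {
  uint8_t* start = atomic_load(&region->start);
  return start + (bit_idx * SEGMENT_SIZE);
}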


@@ -171,7 +171,7 @@ static void mi_out_buf(const char* msg, void* arg) {
size_t n = strlen(msg);
if (n==0) return;
// claim space
- uintptr_t start = mi_atomic_addu(&out_len, n);
+ uintptr_t start = mi_atomic_add(&out_len, n);
if (start >= MI_MAX_DELAY_OUTPUT) return;
// check bound
if (start+n >= MI_MAX_DELAY_OUTPUT) {
@@ -183,7 +183,7 @@ static void mi_out_buf(const char* msg, void* arg) {
static void mi_out_buf_flush(mi_output_fun* out, bool no_more_buf, void* arg) {
if (out==NULL) return;
// claim (if `no_more_buf == true`, no more output will be added after this point)
- size_t count = mi_atomic_addu(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
+ size_t count = mi_atomic_add(&out_len, (no_more_buf ? MI_MAX_DELAY_OUTPUT : 1));
// and output the current contents
if (count>MI_MAX_DELAY_OUTPUT) count = MI_MAX_DELAY_OUTPUT;
out_buf[count] = 0;
@@ -214,14 +214,14 @@ static mi_output_fun* volatile mi_out_default; // = NULL
static volatile _Atomic(void*) mi_out_arg; // = NULL
static mi_output_fun* mi_out_get_default(void** parg) {
- if (parg != NULL) { *parg = mi_atomic_read_ptr(&mi_out_arg); }
+ if (parg != NULL) { *parg = mi_atomic_read_ptr(void,&mi_out_arg); }
mi_output_fun* out = mi_out_default;
return (out == NULL ? &mi_out_buf : out);
}
void mi_register_output(mi_output_fun* out, void* arg) mi_attr_noexcept {
mi_out_default = (out == NULL ? &mi_out_stderr : out); // stop using the delayed output buffer
- mi_atomic_write_ptr(&mi_out_arg, arg);
+ mi_atomic_write_ptr(void,&mi_out_arg, arg);
if (out!=NULL) mi_out_buf_flush(out,true,arg); // output all the delayed output now
}
@@ -330,7 +330,7 @@ static void mi_error_default(int err) {
void mi_register_error(mi_error_fun* fun, void* arg) {
mi_error_handler = fun; // can be NULL
- mi_atomic_write_ptr(&mi_error_arg, arg);
+ mi_atomic_write_ptr(void,&mi_error_arg, arg);
}
void _mi_error_message(int err, const char* fmt, ...) {
@@ -341,7 +341,7 @@ void _mi_error_message(int err, const char* fmt, ...) {
va_end(args);
// and call the error handler which may abort (or return normally)
if (mi_error_handler != NULL) {
- mi_error_handler(err, mi_atomic_read_ptr(&mi_error_arg));
+ mi_error_handler(err, mi_atomic_read_ptr(void,&mi_error_arg));
}
else {
mi_error_default(err);
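
mi_out_buf and mi_out_buf_flush share one bump counter: a single atomic add on out_len both claims space for a message and, in the flush path, closes the buffer by pushing the length past MI_MAX_DELAY_OUTPUT. A hedged C11 sketch of the claim-and-copy side (the buffer size and the names buf_out/MAX_DELAY_OUTPUT are made up; the flush path is omitted):

#include <stdatomic.h>
#include <string.h>
#include <stddef.h>

#define MAX_DELAY_OUTPUT ((size_t)(16*1024))   // stand-in for MI_MAX_DELAY_OUTPUT

static char out_buf[MAX_DELAY_OUTPUT + 1];      // +1 leaves room for a terminating 0 on flush
static _Atomic(size_t) out_len;

static void buf_out(const char* msg) {
  size_t n = strlen(msg);
  if (n == 0) return;
  // claim space: the previous value of out_len is the start of our slot
  size_t start = atomic_fetch_add(&out_len, n);
  if (start >= MAX_DELAY_OUTPUT) return;                          // buffer is full (or flushed)
  if (start + n > MAX_DELAY_OUTPUT) n = MAX_DELAY_OUTPUT - start; // copy only what still fits
  memcpy(&out_buf[start], msg, n);
}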


@@ -396,20 +396,20 @@ static void* mi_unix_mmap(void* addr, size_t size, size_t try_alignment, int pro
// On 64-bit systems, we can do efficient aligned allocation by using
// the 4TiB to 30TiB area to allocate them.
#if (MI_INTPTR_SIZE >= 8) && (defined(_WIN32) || (defined(MI_OS_USE_MMAP) && !defined(MAP_ALIGNED)))
- static volatile _Atomic(intptr_t) aligned_base;
+ static volatile _Atomic(uintptr_t) aligned_base;
// Return a 4MiB aligned address that is probably available
static void* mi_os_get_aligned_hint(size_t try_alignment, size_t size) {
if (try_alignment == 0 || try_alignment > MI_SEGMENT_SIZE) return NULL;
if ((size%MI_SEGMENT_SIZE) != 0) return NULL;
- intptr_t hint = mi_atomic_add(&aligned_base, size);
+ uintptr_t hint = mi_atomic_add(&aligned_base, size);
if (hint == 0 || hint > ((intptr_t)30<<40)) { // try to wrap around after 30TiB (area after 32TiB is used for huge OS pages)
- intptr_t init = ((intptr_t)4 << 40); // start at 4TiB area
+ uintptr_t init = ((uintptr_t)4 << 40); // start at 4TiB area
#if (MI_SECURE>0 || MI_DEBUG==0) // security: randomize start of aligned allocations unless in debug mode
uintptr_t r = _mi_heap_random_next(mi_get_default_heap());
init = init + (MI_SEGMENT_SIZE * ((r>>17) & 0xFFFFF)); // (randomly 20 bits)*4MiB == 0 to 4TiB
#endif
- mi_atomic_cas_strong(mi_atomic_cast(uintptr_t, &aligned_base), init, hint + size);
+ mi_atomic_cas_strong(&aligned_base, init, hint + size);
hint = mi_atomic_add(&aligned_base, size); // this may still give 0 or > 30TiB but that is ok, it is a hint after all
}
if (hint%try_alignment != 0) return NULL;
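
The hint generator just bumps a shared counter by the request size, so the previous value is a fresh 4 MiB aligned address; once it drifts past 30 TiB it races (via a strong CAS) to reset the window near 4 TiB. Since the result is only a hint for mmap/VirtualAlloc, losing that race or returning an odd value is harmless. A simplified C11 sketch (the MI_SECURE randomization of init is left out; names are illustrative):

#include <stdatomic.h>
#include <stdint.h>
#include <stddef.h>

#define SEGMENT_SIZE ((uintptr_t)4*1024*1024)   // 4 MiB, stand-in for MI_SEGMENT_SIZE

static _Atomic(uintptr_t) aligned_base;

static void* os_get_aligned_hint(size_t size) {
  if ((size % SEGMENT_SIZE) != 0) return NULL;
  uintptr_t hint = atomic_fetch_add(&aligned_base, size);
  if (hint == 0 || hint > ((uintptr_t)30 << 40)) {       // first use, or past the 30 TiB window
    uintptr_t expected = hint + size;                     // the value we just left in aligned_base
    uintptr_t init = (uintptr_t)4 << 40;                  // restart at the 4 TiB area
    atomic_compare_exchange_strong(&aligned_base, &expected, init);
    hint = atomic_fetch_add(&aligned_base, size);         // may still be out of range; it is only a hint
  }
  if ((hint % SEGMENT_SIZE) != 0) return NULL;
  return (void*)hint;
}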


@@ -131,7 +131,7 @@ void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool overrid
tfreex = mi_tf_set_delayed(tfree, delay);
old_delay = mi_tf_delayed(tfree);
if (mi_unlikely(old_delay == MI_DELAYED_FREEING)) {
- // mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
+ mi_atomic_yield(); // delay until outstanding MI_DELAYED_FREEING are done.
tfree = mi_tf_set_delayed(tfree, MI_NO_DELAYED_FREE); // will cause CAS to busy fail
}
else if (delay == old_delay) {
@@ -281,11 +281,11 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
(put there by other threads if they deallocated in a full page)
----------------------------------------------------------- */
void _mi_heap_delayed_free(mi_heap_t* heap) {
- // take over the list
+ // take over the list (note: no atomic exchange as it is often NULL)
mi_block_t* block;
do {
- block = (mi_block_t*)heap->thread_delayed_free;
- } while (block != NULL && !mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), NULL, block));
+ block = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
+ } while (block != NULL && !mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, NULL, block));
// and free them all
while(block != NULL) {
@@ -296,9 +296,9 @@ void _mi_heap_delayed_free(mi_heap_t* heap) {
// reset the delayed_freeing flag; in that case delay it further by reinserting.
mi_block_t* dfree;
do {
- dfree = (mi_block_t*)heap->thread_delayed_free;
+ dfree = mi_atomic_read_ptr_relaxed(mi_block_t,&heap->thread_delayed_free);
mi_block_set_nextx(heap, block, dfree, heap->key[0], heap->key[1]);
- } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*,&heap->thread_delayed_free), block, dfree));
+ } while (!mi_atomic_cas_ptr_weak(mi_block_t,&heap->thread_delayed_free, block, dfree));
}
block = next;
}
@@ -740,14 +740,14 @@ void _mi_deferred_free(mi_heap_t* heap, bool force) {
heap->tld->heartbeat++;
if (deferred_free != NULL && !heap->tld->recurse) {
heap->tld->recurse = true;
- deferred_free(force, heap->tld->heartbeat, mi_atomic_read_ptr_relaxed(&deferred_arg));
+ deferred_free(force, heap->tld->heartbeat, mi_atomic_read_ptr_relaxed(void,&deferred_arg));
heap->tld->recurse = false;
}
}
void mi_register_deferred_free(mi_deferred_free_fun* fn, void* arg) mi_attr_noexcept {
deferred_free = fn;
- mi_atomic_write_ptr(&deferred_arg, arg);
+ mi_atomic_write_ptr(void,&deferred_arg, arg);
}
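
_mi_heap_delayed_free detaches the whole thread_delayed_free list in one step: a relaxed pre-read followed by a weak CAS to NULL, so the common empty case costs no read-modify-write at all (the point of the updated comment). Roughly, in C11 (names are illustrative; this pairs with the push loop sketched earlier):

#include <stdatomic.h>
#include <stddef.h>

typedef struct node_s { struct node_s* next; } node_t;

static _Atomic(node_t*) delayed_free_head;   // stand-in for heap->thread_delayed_free

// Atomically take ownership of the entire list, or return NULL if it is empty.
static node_t* take_delayed_free_list(void) {
  node_t* block = atomic_load_explicit(&delayed_free_head, memory_order_relaxed);
  while (block != NULL &&
         !atomic_compare_exchange_weak_explicit(&delayed_free_head, &block, NULL,
                                                memory_order_acquire, memory_order_relaxed)) {
    // a failed CAS refreshed `block` with the current head; loop and try again
  }
  return block;
}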


@@ -853,7 +853,7 @@ static void mi_segments_prepend_abandoned(mi_segment_t* first) {
if (first == NULL) return;
// first try if the abandoned list happens to be NULL
- if (mi_atomic_cas_ptr_weak(mi_atomic_cast(void*, &abandoned), first, NULL)) return;
+ if (mi_atomic_cas_ptr_weak(mi_segment_t, &abandoned, first, NULL)) return;
// if not, find the end of the list
mi_segment_t* last = first;
@@ -864,9 +864,9 @@ static void mi_segments_prepend_abandoned(mi_segment_t* first) {
// and atomically prepend
mi_segment_t* next;
do {
- next = (mi_segment_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &abandoned));
+ next = mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned);
last->abandoned_next = next;
- } while (!mi_atomic_cas_ptr_weak(mi_atomic_cast(void*, &abandoned), first, next));
+ } while (!mi_atomic_cas_ptr_weak(mi_segment_t, &abandoned, first, next));
}
static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
@@ -918,9 +918,9 @@ void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segments_tld_t* tld) {
// To avoid the A-B-A problem, grab the entire list atomically
- mi_segment_t* segment = (mi_segment_t*)mi_atomic_read_ptr_relaxed(mi_atomic_cast(void*, &abandoned)); // pre-read to avoid expensive atomic operations
+ mi_segment_t* segment = mi_atomic_read_ptr_relaxed(mi_segment_t,&abandoned); // pre-read to avoid expensive atomic operations
if (segment == NULL) return false;
- segment = (mi_segment_t*)mi_atomic_exchange_ptr(mi_atomic_cast(void*, &abandoned), NULL);
+ segment = mi_atomic_exchange_ptr(mi_segment_t, &abandoned, NULL);
if (segment == NULL) return false;
// we got a non-empty list


@@ -26,13 +26,13 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
if (mi_is_in_main(stat))
{
// add atomically (for abandoned pages)
- mi_atomic_add64(&stat->current,amount);
+ mi_atomic_addi64(&stat->current,amount);
if (stat->current > stat->peak) stat->peak = stat->current; // racing.. it's ok
if (amount > 0) {
- mi_atomic_add64(&stat->allocated,amount);
+ mi_atomic_addi64(&stat->allocated,amount);
}
else {
- mi_atomic_add64(&stat->freed, -amount);
+ mi_atomic_addi64(&stat->freed, -amount);
}
}
else {
@@ -50,8 +50,8 @@ static void mi_stat_update(mi_stat_count_t* stat, int64_t amount) {
void _mi_stat_counter_increase(mi_stat_counter_t* stat, size_t amount) {
if (mi_is_in_main(stat)) {
- mi_atomic_add64( &stat->count, 1 );
- mi_atomic_add64( &stat->total, (int64_t)amount );
+ mi_atomic_addi64( &stat->count, 1 );
+ mi_atomic_addi64( &stat->total, (int64_t)amount );
}
else {
stat->count++;
@@ -70,17 +70,17 @@ void _mi_stat_decrease(mi_stat_count_t* stat, size_t amount) {
// must be thread safe as it is called from stats_merge
static void mi_stat_add(mi_stat_count_t* stat, const mi_stat_count_t* src, int64_t unit) {
if (stat==src) return;
- mi_atomic_add64( &stat->allocated, src->allocated * unit);
- mi_atomic_add64( &stat->current, src->current * unit);
- mi_atomic_add64( &stat->freed, src->freed * unit);
+ mi_atomic_addi64( &stat->allocated, src->allocated * unit);
+ mi_atomic_addi64( &stat->current, src->current * unit);
+ mi_atomic_addi64( &stat->freed, src->freed * unit);
// peak scores do not work across threads..
- mi_atomic_add64( &stat->peak, src->peak * unit);
+ mi_atomic_addi64( &stat->peak, src->peak * unit);
}
static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src, int64_t unit) {
if (stat==src) return;
- mi_atomic_add64( &stat->total, src->total * unit);
- mi_atomic_add64( &stat->count, src->count * unit);
+ mi_atomic_addi64( &stat->total, src->total * unit);
+ mi_atomic_addi64( &stat->count, src->count * unit);
}
// must be thread safe as it is called from stats_merge
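
The statistics switch from mi_atomic_add64 to mi_atomic_addi64, making it explicit that these are signed 64-bit additions (amounts can be negative on free). In C11 terms that is simply a relaxed fetch-add on an _Atomic(int64_t); the peak update stays a plain read-compare-store with the benign race the comment above already accepts. A small illustrative sketch (stat_count_t here mirrors only the fields used):

#include <stdatomic.h>
#include <stdint.h>

typedef struct stat_count_s {
  _Atomic(int64_t) allocated, freed, current, peak;
} stat_count_t;

// Signed 64-bit add; relaxed ordering is enough for statistics.
static void stat_addi64(_Atomic(int64_t)* p, int64_t amount) {
  atomic_fetch_add_explicit(p, amount, memory_order_relaxed);
}

static void stat_update(stat_count_t* stat, int64_t amount) {
  stat_addi64(&stat->current, amount);
  int64_t cur = atomic_load_explicit(&stat->current, memory_order_relaxed);
  if (cur > atomic_load_explicit(&stat->peak, memory_order_relaxed)) {
    atomic_store_explicit(&stat->peak, cur, memory_order_relaxed);   // racy on purpose: peak is approximate
  }
  if (amount > 0) stat_addi64(&stat->allocated, amount);
  else            stat_addi64(&stat->freed, -amount);
}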