From 2f789aae9a1ed271e3feb22e4ead04db809e4e2e Mon Sep 17 00:00:00 2001
From: daanx
Date: Sun, 1 Dec 2024 16:26:59 -0800
Subject: [PATCH] wip: cannot compile

---
 include/mimalloc/internal.h | 84 +++++++++++++++++++------------------
 include/mimalloc/types.h    | 20 +++++----
 src/bitmap.c                | 45 ++++++++++++++++++++
 src/bitmap.h                | 28 ++++++++++++-
 src/free.c                  | 81 +++++++++++++++++++++++------------
 5 files changed, 181 insertions(+), 77 deletions(-)

diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index ec106047..84244c21 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -92,11 +92,13 @@ bool _mi_preloading(void); // true while the C runtime is not in
 void _mi_thread_done(mi_heap_t* heap);
 void _mi_thread_data_collect(void);
 void _mi_tld_init(mi_tld_t* tld, mi_heap_t* bheap);
+
 mi_threadid_t _mi_thread_id(void) mi_attr_noexcept;
-size_t _mi_thread_seq_id(void) mi_attr_noexcept;
+size_t _mi_thread_seq_id(void) mi_attr_noexcept;
+
 mi_heap_t* _mi_heap_main_get(void); // statically allocated main backing heap
 mi_subproc_t* _mi_subproc_from_id(mi_subproc_id_t subproc_id);
-void _mi_heap_guarded_init(mi_heap_t* heap);
+void _mi_heap_guarded_init(mi_heap_t* heap);

 // os.c
 void _mi_os_init(void); // called from process init

@@ -180,8 +182,6 @@ void _mi_heap_delayed_free_all(mi_heap_t* heap);
 bool _mi_heap_delayed_free_partial(mi_heap_t* heap);
 void _mi_heap_collect_retired(mi_heap_t* heap, bool force);

-void _mi_page_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
-bool _mi_page_try_use_delayed_free(mi_page_t* page, mi_delayed_t delay, bool override_never);
 size_t _mi_page_queue_append(mi_heap_t* heap, mi_page_queue_t* pq, mi_page_queue_t* append);
 void _mi_deferred_free(mi_heap_t* heap, bool force);

@@ -426,6 +426,10 @@ static inline uintptr_t _mi_ptr_cookie(const void* p) {
   return ((uintptr_t)p ^ _mi_heap_main.cookie);
 }

+static inline mi_tld_t* _mi_tld(void) {
+  return mi_heap_get_default()->tld;
+}
+
 /* -----------------------------------------------------------
   Pages
 ----------------------------------------------------------- */

@@ -507,53 +511,53 @@ static inline size_t mi_page_usable_block_size(const mi_page_t* page) {
   return mi_page_block_size(page) - MI_PADDING_SIZE;
 }

+//static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
+//  mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
+//  if (heap != NULL) {
+//    mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
+//    page->heap_tag = heap->tag;
+//    mi_atomic_store_release(&page->xthread_id, heap->thread_id);
+//  }
+//  else {
+//    mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc);
+//    mi_atomic_store_release(&page->xthread_id,0);
+//  }
+//}
+
+// Thread free flag helpers
+static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {
+  return (mi_block_t*)(tf & ~1);
+}
+static inline bool mi_tf_is_owned(mi_thread_free_t tf) {
+  return ((tf & 1) == 0);
+}
+static inline mi_thread_free_t mi_tf_create(mi_block_t* block, bool owned) {
+  return (mi_thread_free_t)((uintptr_t)block | (owned ? 0 : 1));
+}
+
+
 // Thread free access
 static inline mi_block_t* mi_page_thread_free(const mi_page_t* page) {
-  return (mi_block_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & ~3);
+  return mi_tf_block(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free));
 }

-static inline mi_delayed_t mi_page_thread_free_flag(const mi_page_t* page) {
-  return (mi_delayed_t)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free) & 3);
-}
-
-// Heap access
-static inline mi_heap_t* mi_page_heap(const mi_page_t* page) {
-  return (mi_heap_t*)(mi_atomic_load_relaxed(&((mi_page_t*)page)->xheap));
+// Owned?
+static inline bool mi_page_is_owned(const mi_page_t* page) {
+  return mi_tf_is_owned(mi_atomic_load_relaxed(&((mi_page_t*)page)->xthread_free));
 }

+// Thread id of the thread that owns this page
 static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) {
   return mi_atomic_load_relaxed(&page->xthread_id);
 }

-static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
-  mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
-  if (heap != NULL) {
-    mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
-    page->heap_tag = heap->tag;
-    mi_atomic_store_release(&page->xthread_id, heap->thread_id);
-  }
-  else {
-    mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc);
-    mi_atomic_store_release(&page->xthread_id,0);
-  }
-}
-
-// Thread free flag helpers
-static inline mi_block_t* mi_tf_block(mi_thread_free_t tf) {
-  return (mi_block_t*)(tf & ~0x03);
-}
-static inline mi_delayed_t mi_tf_delayed(mi_thread_free_t tf) {
-  return (mi_delayed_t)(tf & 0x03);
-}
-static inline mi_thread_free_t mi_tf_make(mi_block_t* block, mi_delayed_t delayed) {
-  return (mi_thread_free_t)((uintptr_t)block | (uintptr_t)delayed);
-}
-static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) {
-  return mi_tf_make(mi_tf_block(tf),delayed);
-}
-static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) {
-  return mi_tf_make(block, mi_tf_delayed(tf));
-}
+//static inline mi_thread_free_t mi_tf_set_delayed(mi_thread_free_t tf, mi_delayed_t delayed) {
+//  return mi_tf_make(mi_tf_block(tf),delayed);
+//}
+//static inline mi_thread_free_t mi_tf_set_block(mi_thread_free_t tf, mi_block_t* block) {
+//  return mi_tf_make(block, mi_tf_delayed(tf));
+//}

 // are all blocks in a page freed?
 // note: needs up-to-date used count, (as the `xthread_free` list may not be empty). see `_mi_page_collect_free`.

diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index 271c7efb..7329cb86 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -216,13 +216,14 @@ typedef struct mi_block_s {
 #endif

-// The delayed flags are used for efficient multi-threaded free-ing
-typedef enum mi_delayed_e {
-  MI_USE_DELAYED_FREE   = 0, // push on the owning heap thread delayed list
-  MI_DELAYED_FREEING    = 1, // temporary: another thread is accessing the owning heap
-  MI_NO_DELAYED_FREE    = 2, // optimize: push on page local thread free queue if another block is already in the heap thread delayed free list
-  MI_NEVER_DELAYED_FREE = 3  // sticky: used for abondoned pages without a owning heap; this only resets on page reclaim
-} mi_delayed_t;
+// The owned flag is used for efficient multi-threaded freeing.
+// When we push on the page thread free queue of an abandoned page,
+// we also atomically get to own it.
+// This is needed to atomically abandon a page (while other threads could concurrently free blocks in it).
+typedef enum mi_owned_e {
+  MI_OWNED     = 0, // some heap owns this page
+  MI_ABANDONED = 1, // the page is abandoned
+} mi_owned_t;


 // The `in_full` and `has_aligned` page flags are put in a union to efficiently
@@ -247,7 +248,7 @@ typedef union mi_page_flags_s {
 #endif

 // Thread free list.
-// We use the bottom 2 bits of the pointer for mi_delayed_t flags
+// We use the bottom bit of the pointer for the `mi_owned_t` flag
 typedef uintptr_t mi_thread_free_t;

 // Sub processes are used to keep memory separate between them (e.g. multiple interpreters in CPython)

@@ -304,10 +305,11 @@ typedef struct mi_page_s {
 #endif

   _Atomic(mi_thread_free_t) xthread_free;  // list of deferred free blocks freed by other threads
-  _Atomic(uintptr_t)        xheap;         // heap this threads belong to.
+  // _Atomic(uintptr_t)     xheap;         // heap this page belongs to.

   struct mi_page_s*      next;    // next page owned by the heap with the same `block_size`
   struct mi_page_s*      prev;    // previous page owned by the heap with the same `block_size`
+  mi_subproc_t*          subproc; // sub-process of this page
   mi_memid_t             memid;   // provenance of the page memory
 } mi_page_t;

diff --git a/src/bitmap.c b/src/bitmap.c
index dd1afe75..5cce6bfa 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -693,3 +693,48 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t
   mi_bitmap_forall_set_chunks_end();
   return false;
 }
+
+
+
+// Pairmap declarations (the implementations are still to be written):
+bool mi_pairmap_chunk_find_and_set_busy(mi_bitmap_chunk_t* chunk, size_t* pidx);
+mi_decl_nodiscard bool mi_pairmap_xset(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx);
+mi_decl_nodiscard bool mi_pairmap_xset_while_not_busy(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx);
+
+mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx) {
+  size_t set_idx;
+  size_t start = tseq % MI_BFIELD_BITS;
+  size_t epoch = mi_atomic_load_acquire(&pairmap->epoch); MI_UNUSED(epoch); // todo: use the epoch to detect concurrently set bits
+  mi_bfield_t any_set = mi_bfield_rotate_right(mi_atomic_load_relaxed(&pairmap->any_set), start);
+  while (mi_bfield_find_least_bit(any_set, &set_idx)) {
+    size_t chunk_idx = 2*((set_idx + start) % MI_BFIELD_BITS);
+    {
+      // look at chunk_idx and chunk_idx+1
+      mi_bitmap_chunk_t* chunk1 = &pairmap->chunks[chunk_idx];
+      mi_bitmap_chunk_t* chunk2 = &pairmap->chunks[chunk_idx+1];
+      size_t cidx;
+      if (mi_pairmap_chunk_find_and_set_busy(chunk1, &cidx)) {
+        *pidx = (chunk_idx * MI_BITMAP_CHUNK_BITS) + cidx;
+        mi_assert_internal(*pidx < MI_PAIRMAP_MAX_BITS);
+        return true;
+      }
+      else {
+        if (mi_pairmap_chunk_find_and_set_busy(chunk2, &cidx)) {
+          *pidx = ((chunk_idx+1) * MI_BITMAP_CHUNK_BITS) + cidx;
+          mi_assert_internal(*pidx < MI_PAIRMAP_MAX_BITS);
+          return true;
+        }
+        else if (mi_bitmap_chunk_all_are_clear(chunk1) && mi_bitmap_chunk_all_are_clear(chunk2)) {
+          // the whole pair of chunks is clear: clear the corresponding `any_set` bit
+          mi_bfield_atomic_xset(MI_BIT_CLEAR, &pairmap->any_set, chunk_idx/2);
+        }
+      }
+    }
+    start += set_idx+1;  /* so chunk_idx stays valid */
+    any_set >>= set_idx; /* skip scanned bits (and avoid UB with (idx+1)) */
+    any_set >>= 1;
+  }
+  return false;
+}

diff --git a/src/bitmap.h b/src/bitmap.h
index 1a180924..2b4bfc25 100644
--- a/src/bitmap.h
+++ b/src/bitmap.h
@@ -41,7 +41,7 @@ typedef mi_decl_align(32) struct mi_bitmap_s {
 #define MI_BITMAP_MAX_BITS  (MI_BFIELD_BITS * MI_BITMAP_CHUNK_BITS)  // 16k bits on 64bit, 8k bits on 32bit

 /* --------------------------------------------------------------------------------
-  Bitmap
+  Atomic bitmap
 -------------------------------------------------------------------------------- */

 typedef bool mi_bit_t;

@@ -89,4 +89,30 @@ mi_decl_nodiscard bool mi_bitmap_try_find_and_clear8(mi_bitmap_t* bitmap, size_t
 // Returns true on success, and in that case sets the index: `0 <= *pidx <= MI_BITMAP_MAX_BITS-n`.
 mi_decl_nodiscard bool mi_bitmap_try_find_and_clearN(mi_bitmap_t* bitmap, size_t n, size_t tseq, size_t* pidx );

+
+/* --------------------------------------------------------------------------------
+  Atomic bitmap for a pair of bits
+-------------------------------------------------------------------------------- */
+
+typedef mi_bfield_t mi_pair_t;
+
+#define MI_PAIR_CLEAR   (0)
+#define MI_PAIR_BUSY    (1)
+#define MI_PAIR_BUSYX   (2)
+#define MI_PAIR_SET     (3)
+
+typedef mi_decl_align(32) struct mi_pairmap_s {
+  mi_bitmap_chunk_t    chunks[2*MI_BFIELD_BITS];
+  _Atomic(mi_bfield_t) any_set;
+  _Atomic(size_t)      epoch;
+} mi_pairmap_t;
+
+#define MI_PAIRMAP_MAX_PAIRS  (MI_BITMAP_MAX_BITS)  // 16k pairs on 64bit, 8k pairs on 32bit
+#define MI_PAIRMAP_MAX_BITS   (2*MI_PAIRMAP_MAX_PAIRS)
+
+mi_decl_nodiscard bool mi_pairmap_xset(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx);
+mi_decl_nodiscard bool mi_pairmap_xset_while_not_busy(mi_pair_t set, mi_pairmap_t* pairmap, size_t idx);
+mi_decl_nodiscard bool mi_pairmap_try_find_and_set_busy(mi_pairmap_t* pairmap, size_t tseq, size_t* pidx);
+
+
 #endif // MI_XBITMAP_H

diff --git a/src/free.c b/src/free.c
index f0ce8c22..42fcd07e 100644
--- a/src/free.c
+++ b/src/free.c
@@ -147,39 +147,66 @@ void mi_free(void* p) mi_attr_noexcept
   }
 }

-// return true if successful
-bool _mi_free_delayed_block(mi_block_t* block) {
-  // get segment and page
-  mi_assert_internal(block!=NULL);
-  mi_page_t* const page = mi_checked_ptr_page(block,"_mi_free_delayed_block");
-  mi_assert_internal(_mi_thread_id() == mi_page_thread_id(page));
-
-  // Clear the no-delayed flag so delayed freeing is used again for this page.
-  // This must be done before collecting the free lists on this page -- otherwise
-  // some blocks may end up in the page `thread_free` list with no blocks in the
-  // heap `thread_delayed_free` list which may cause the page to be never freed!
-  // (it would only be freed if we happen to scan it in `mi_page_queue_find_free_ex`)
-  if (!_mi_page_try_use_delayed_free(page, MI_USE_DELAYED_FREE, false /* dont overwrite never delayed */)) {
-    return false;
-  }
-
-  // collect all other non-local frees (move from `thread_free` to `free`) to ensure up-to-date `used` count
-  _mi_page_free_collect(page, false);
-
-  // and free the block (possibly freeing the page as well since `used` is updated)
-  mi_free_block_local(page, block, false /* stats have already been adjusted */, true /* check for a full page */);
-  return true;
-}

 // ------------------------------------------------------
 // Multi-threaded Free (`_mt`)
 // ------------------------------------------------------

-// Push a block that is owned by another thread on its page-local thread free
-// list or it's heap delayed free list. Such blocks are later collected by
-// the owning thread in `_mi_free_delayed_block`.
-static void mi_decl_noinline mi_free_block_delayed_mt( mi_page_t* page, mi_block_t* block )
+static void mi_decl_noinline mi_free_try_reclaim_mt(mi_page_t* page) {
+  mi_assert_internal(mi_page_is_owned(page));
+  mi_assert_internal(mi_page_thread_id(page)==0);
+
+  // we own the page now..
+  // first remove it from the abandoned pages in the arena
+  mi_heap_t* const heap = mi_heap_get_default();
+  _mi_arena_page_unabandon(page,heap->tld);
+
+  // collect the thread atomic free list
+  _mi_page_free_collect(page, false);  // update `used` count
+  if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_all_free(page)); }
+
+  if (mi_page_all_free(page)) {
+    // we can free the page directly
+    _mi_arena_page_free(page, heap->tld);
+  }
+  else {
+    // the page still has some blocks in use;
+    // reclaim it into our heap if compatible, or otherwise abandon it again
+    if ((_mi_option_get_fast(mi_option_abandoned_reclaim_on_free) != 0) &&  // only if reclaim-on-free is enabled
+        (mi_prim_get_default_heap() != (mi_heap_t*)&_mi_heap_empty) &&      // we did not already terminate our thread (can this happen? yes, due to thread-local destructors for example (issue #944))
+        (page->subproc == heap->tld->subproc) &&                            // don't reclaim across sub-processes
+        mi_arena_page_try_reclaim(page)                                     // and we can reclaim it from the arena
+       )
+    {
+      // make it part of our heap
+      _mi_heap_page_reclaim(heap, page);
+    }
+    else {
+      // abandon it again
+      _mi_arena_page_abandon(page, heap->tld);
+    }
+  }
+}
+
+// Push a block that is owned by another thread on its page-local thread free list.
+static void mi_decl_noinline mi_free_block_delayed_mt(mi_page_t* page, mi_block_t* block)
+{
+  // push atomically on the page thread free list
+  mi_thread_free_t tf_new;
+  mi_thread_free_t tf;
+  do {
+    tf = mi_atomic_load_relaxed(&page->xthread_free);
+    mi_block_set_next(page, block, mi_tf_block(tf));
+    tf_new = mi_tf_create(block, true /* always owned: try to claim it if the page is abandoned */);
+  } while (!mi_atomic_cas_weak_release(&page->xthread_free, &tf, tf_new));
+
+  // and atomically reclaim the page if it was abandoned
+  bool reclaimed = !mi_tf_is_owned(tf);
+  if (reclaimed) { mi_free_try_reclaim_mt(page); }
+}
+
+
+/*
 // Try to put the block on either the page-local thread free list,
 // or the heap delayed free list (if this is the first non-local free in that page)
 mi_thread_free_t tfreex;
@@ -276,7 +303,7 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block
   // thread_delayed free list (or heap delayed free list)
   mi_free_block_delayed_mt(page,block);
 }
-
+*/

 // ------------------------------------------------------
 // Usable size
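
Note (not part of the patch itself): the heart of this change is the new `xthread_free` encoding. The four-state `mi_delayed_t` (two tag bits) is replaced by a single ownership bit in the low bit of the tagged pointer (`MI_OWNED = 0`, `MI_ABANDONED = 1`). Every push of a freed block installs a new list head with the owned bit clear, so the one thread whose CAS observes the abandoned bit becomes the unique claimant of the page and may unabandon, free, or re-abandon it. The sketch below is a minimal, self-contained C11 model of just that protocol; `page_t`, `block_t`, and `free_block_mt` are invented stand-ins (not mimalloc's types), mirroring only the tag encoding of the patch's `mi_tf_create`/`mi_tf_block`/`mi_tf_is_owned` helpers and the CAS loop in `mi_free_block_delayed_mt`.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

typedef struct block_s { struct block_s* next; } block_t;
typedef uintptr_t thread_free_t;   // tagged pointer: block* | ownership bit

typedef struct page_s {
  _Atomic(thread_free_t) xthread_free;
} page_t;

static block_t* tf_block(thread_free_t tf)    { return (block_t*)(tf & ~(uintptr_t)1); }
static bool     tf_is_owned(thread_free_t tf) { return ((tf & 1) == 0); }
static thread_free_t tf_create(block_t* b, bool owned) {
  return (thread_free_t)((uintptr_t)b | (owned ? 0 : 1));
}

// Push `block` on the page-local thread free list; returns true iff this push
// also atomically claimed ownership of a previously abandoned page.
static bool free_block_mt(page_t* page, block_t* block) {
  thread_free_t tf = atomic_load_explicit(&page->xthread_free, memory_order_relaxed);
  thread_free_t tf_new;
  do {
    block->next = tf_block(tf);       // link the block to the current list head
    tf_new = tf_create(block, true);  // the new head is always marked owned
  } while (!atomic_compare_exchange_weak_explicit(&page->xthread_free, &tf, tf_new,
                                                  memory_order_release, memory_order_relaxed));
  return !tf_is_owned(tf);            // old head was abandoned: we claimed the page
}

int main(void) {
  page_t page; block_t b1, b2;
  atomic_init(&page.xthread_free, tf_create(NULL, false));  // start abandoned
  printf("first free claims the page: %d\n", (int)free_block_mt(&page, &b1));   // prints 1
  printf("second free claims the page: %d\n", (int)free_block_mt(&page, &b2));  // prints 0
  return 0;
}

Encoding "owned" as a zero bit makes the head of a live page a plain pointer value, and lets a single compare-and-swap both publish the freed block and claim an abandoned page; this is what allows `mi_free_block_delayed_mt` to call `mi_free_try_reclaim_mt` exactly once per abandonment.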