Mirror of https://github.com/microsoft/mimalloc.git (synced 2025-05-06 07:29:30 +03:00)

Commit 9ebe941ce0 — first version that passes the make test
Parent: 55b70f1588
10 changed files with 155 additions and 91 deletions
@@ -440,16 +440,34 @@ static inline mi_page_t* _mi_heap_get_free_small_page(mi_heap_t* heap, size_t si
extern uint8_t* _mi_page_map;

#define MI_PAGE_PTR_INVALID   ((mi_page_t*)(1))

static inline mi_page_t* _mi_ptr_page_ex(const void* p, bool* valid) {
#if 1
  const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT;
  const size_t ofs = _mi_page_map[idx];
  if (valid != NULL) *valid = (ofs != 0);
  return (mi_page_t*)((idx - ofs + 1) << MI_ARENA_SLICE_SHIFT);
#else
  const uintptr_t idx = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT;
  const uintptr_t up = idx << MI_ARENA_SLICE_SHIFT;
  __builtin_prefetch((void*)up);
  const size_t ofs = _mi_page_map[idx];
  if (valid != NULL) *valid = (ofs != 0);
  return (mi_page_t*)(up - ((ofs - 1) << MI_ARENA_SLICE_SHIFT));
#endif
}

static inline mi_page_t* _mi_checked_ptr_page(const void* p) {
  bool valid;
  mi_page_t* const page = _mi_ptr_page_ex(p,&valid);
  return (valid ? page : NULL);
}

static inline mi_page_t* _mi_ptr_page(const void* p) {
  const uintptr_t up = ((uintptr_t)p) >> MI_ARENA_SLICE_SHIFT;
  // __builtin_prefetch((void*)(up << MI_ARENA_SLICE_SHIFT));
  const ptrdiff_t ofs = _mi_page_map[up];
#if MI_DEBUG
  if mi_unlikely(ofs==0) return MI_PAGE_PTR_INVALID;
  return _mi_checked_ptr_page(p);
#else
  return _mi_ptr_page_ex(p,NULL);
#endif
  return (mi_page_t*)((up - ofs + 1) << MI_ARENA_SLICE_SHIFT);
}
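The page map above stores, for every address slice, a one-byte offset (in slices, plus one) back to the start of the page that covers it; a zero byte marks a slice that belongs to no page. The standalone sketch below only illustrates that decode step; the slice shift of 16 (64 KiB slices) and the tiny map are assumptions for the example, not values taken from this commit.

#include <stdint.h>
#include <stddef.h>
#include <assert.h>

#define SLICE_SHIFT 16                      // assumed: 64 KiB slices
#define SLICE_SIZE  ((uintptr_t)1 << SLICE_SHIFT)

// toy page map: one byte per slice, value = slices-from-page-start + 1, 0 = invalid
static uint8_t toy_page_map[16];

// decode: same arithmetic as _mi_ptr_page_ex above
static uintptr_t toy_ptr_page(uintptr_t p, int* valid) {
  const uintptr_t idx = p >> SLICE_SHIFT;
  const size_t ofs = toy_page_map[idx];
  if (valid) *valid = (ofs != 0);
  return (idx - ofs + 1) << SLICE_SHIFT;    // start of the owning page
}

int main(void) {
  // register a 3-slice page starting at slice 4 (as _mi_page_map_register does)
  for (size_t i = 0; i < 3; i++) { toy_page_map[4 + i] = (uint8_t)(i + 1); }
  int valid;
  assert(toy_ptr_page(5 * SLICE_SIZE + 100, &valid) == 4 * SLICE_SIZE);
  assert(valid);
  toy_ptr_page(9 * SLICE_SIZE, &valid);     // slice 9 was never registered
  assert(!valid);
  return 0;
}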
@@ -509,12 +527,13 @@ static inline mi_threadid_t mi_page_thread_id(const mi_page_t* page) {
static inline void mi_page_set_heap(mi_page_t* page, mi_heap_t* heap) {
  mi_assert_internal(mi_page_thread_free_flag(page) != MI_DELAYED_FREEING);
  mi_atomic_store_release(&page->xheap,(uintptr_t)heap);
  if (heap != NULL) {
    mi_atomic_store_release(&page->xheap, (uintptr_t)heap);
    page->heap_tag = heap->tag;
    mi_atomic_store_release(&page->xthread_id, heap->thread_id);
  }
  else {
    mi_atomic_store_release(&page->xheap, (uintptr_t)mi_page_heap(page)->tld->subproc);
    mi_atomic_store_release(&page->xthread_id,0);
  }
}

@@ -578,11 +597,12 @@ static inline bool mi_page_mostly_used(const mi_page_t* page) {
}

static inline bool mi_page_is_abandoned(const mi_page_t* page) {
  // note: the xheap field of an abandoned heap is set to the subproc (for fast reclaim-on-free)
  return (mi_page_thread_id(page) == 0);
}

static inline bool mi_page_is_huge(const mi_page_t* page) {
  return (page->block_size > MI_LARGE_MAX_OBJ_SIZE);
  return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN));
}
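mi_page_set_heap and mi_page_is_abandoned form a small protocol: abandoning a page stores 0 into its thread id (and the subprocess pointer into xheap), and a page counts as abandoned exactly when its thread id reads as 0. A minimal model of that idea with C11 atomics; the field names and types are toy stand-ins, not the mimalloc definitions.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

typedef struct toy_page_s {
  _Atomic(uintptr_t) xthread_id;  // owning thread, 0 when abandoned
  _Atomic(uintptr_t) xheap;       // owning heap, or subprocess when abandoned
} toy_page_t;

// abandon: publish "no owner" (release so prior writes to the page are visible)
static void toy_page_abandon(toy_page_t* page, uintptr_t subproc) {
  atomic_store_explicit(&page->xheap, subproc, memory_order_release);
  atomic_store_explicit(&page->xthread_id, 0, memory_order_release);
}

// adopt: attach to a heap owned by thread `tid`
static void toy_page_set_heap(toy_page_t* page, uintptr_t heap, uintptr_t tid) {
  atomic_store_explicit(&page->xheap, heap, memory_order_release);
  atomic_store_explicit(&page->xthread_id, tid, memory_order_release);
}

static bool toy_page_is_abandoned(toy_page_t* page) {
  return atomic_load_explicit(&page->xthread_id, memory_order_acquire) == 0;
}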
@@ -123,15 +123,16 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_BITMAP_CHUNK_BITS_SHIFT   8   // optimized for 256 bits per chunk (avx2)
#endif

#define MI_BITMAP_CHUNK_BITS      (1 << MI_BITMAP_CHUNK_BITS_SHIFT)
#define MI_ARENA_SLICE_SIZE       (MI_ZU(1) << MI_ARENA_SLICE_SHIFT)
#define MI_ARENA_SLICE_ALIGN      (MI_ARENA_SLICE_SIZE)
#define MI_BITMAP_CHUNK_BITS      (1 << MI_BITMAP_CHUNK_BITS_SHIFT)

#define MI_ARENA_MIN_OBJ_BLOCKS   (1)
#define MI_ARENA_MAX_OBJ_BLOCKS   (MI_BITMAP_CHUNK_BITS)   // for now, cannot cross chunk boundaries
#define MI_ARENA_MIN_OBJ_SLICES   (1)
#define MI_ARENA_MAX_OBJ_SLICES   (MI_SIZE_BITS)           // for now, cannot cross bit field boundaries.. todo: make it at least MI_BITMAP_CHUNK_BITS ? (16 MiB)
// #define MI_ARENA_MAX_OBJ_BLOCKS  (MI_BITMAP_CHUNK_BITS) // for now, cannot cross chunk boundaries

#define MI_ARENA_MIN_OBJ_SIZE     (MI_ARENA_MIN_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE)
#define MI_ARENA_MAX_OBJ_SIZE     (MI_ARENA_MAX_OBJ_BLOCKS * MI_ARENA_SLICE_SIZE)
#define MI_ARENA_MIN_OBJ_SIZE     (MI_ARENA_MIN_OBJ_SLICES * MI_ARENA_SLICE_SIZE)
#define MI_ARENA_MAX_OBJ_SIZE     (MI_ARENA_MAX_OBJ_SLICES * MI_ARENA_SLICE_SIZE)

#define MI_SMALL_PAGE_SIZE        MI_ARENA_MIN_OBJ_SIZE
#define MI_MEDIUM_PAGE_SIZE       (8*MI_SMALL_PAGE_SIZE)   // 512 KiB  (=byte in the bitmap)
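Spelled out with concrete values, these constants fix the basic page geometry. The checks below assume MI_ARENA_SLICE_SHIFT is 16 (a 64 KiB slice, consistent with the "64 KiB" and "512 KiB" comments in this hunk) and MI_SIZE_BITS is 64; this is only an illustrative reading, not part of the commit.

// Assumed values for illustration only.
#define SLICE_SHIFT       16
#define SLICE_SIZE        (1ULL << SLICE_SHIFT)       // 64 KiB
#define MAX_OBJ_SLICES    64                          // MI_SIZE_BITS on a 64-bit target
#define SMALL_PAGE_SIZE   (1 * SLICE_SIZE)            // MI_ARENA_MIN_OBJ_SIZE
#define MEDIUM_PAGE_SIZE  (8 * SMALL_PAGE_SIZE)       // one byte in the bitmap

_Static_assert(SLICE_SIZE == 64 * 1024, "slice is 64 KiB under these assumptions");
_Static_assert(MEDIUM_PAGE_SIZE == 512 * 1024, "medium page is 512 KiB");
_Static_assert(MAX_OBJ_SLICES * SLICE_SIZE == 4ULL * 1024 * 1024,
               "largest in-arena object is 4 MiB under these assumptions");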
@@ -144,9 +145,6 @@ terms of the MIT license. A copy of the license can be found in the file
#define MI_BIN_COUNT              (MI_BIN_FULL+1)

// Alignments over MI_BLOCK_ALIGNMENT_MAX are allocated in singleton pages
#define MI_BLOCK_ALIGNMENT_MAX    (MI_ARENA_SLICE_ALIGN)

// We never allocate more than PTRDIFF_MAX (see also <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
#define MI_MAX_ALLOC_SIZE         PTRDIFF_MAX

@@ -318,8 +316,10 @@ typedef struct mi_page_s {
// Object sizes
// ------------------------------------------------------

#define MI_PAGE_ALIGN                 (64)
#define MI_PAGE_INFO_SIZE             (2*MI_PAGE_ALIGN)            // should be > sizeof(mi_page_t)
#define MI_PAGE_ALIGN                 MI_ARENA_SLICE_ALIGN         // pages must be aligned on this for the page map.
#define MI_PAGE_MIN_BLOCK_ALIGN       (32)                         // minimal block alignment in a page
#define MI_PAGE_MAX_OVERALLOC_ALIGN   MI_ARENA_SLICE_SIZE          // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation
#define MI_PAGE_INFO_SIZE             ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN)  // >= sizeof(mi_page_t)

// The max object size are checked to not waste more than 12.5% internally over the page sizes.
// (Except for large pages since huge objects are allocated in 4MiB chunks)
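For reference, on a 64-bit target where MI_INTPTR_SHIFT is 3, the new MI_PAGE_INFO_SIZE works out to (3+1)*32 = 128 bytes reserved in front of the block area. The small check below is only an illustration under that assumption.

// Assumed: MI_INTPTR_SHIFT == 3 on a 64-bit target.
#define PAGE_MIN_BLOCK_ALIGN  32
#define PAGE_INFO_SIZE        ((3 + 1) * PAGE_MIN_BLOCK_ALIGN)

_Static_assert(PAGE_INFO_SIZE == 128, "page info header is 128 bytes under these assumptions");
_Static_assert(PAGE_INFO_SIZE % PAGE_MIN_BLOCK_ALIGN == 0,
               "block area stays aligned to the minimal block alignment");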
@@ -16,12 +16,11 @@ terms of the MIT license. A copy of the license can be found in the file
// ------------------------------------------------------

static bool mi_malloc_is_naturally_aligned( size_t size, size_t alignment ) {
  // objects up to `MI_PAGE_ALIGN` are allocated aligned to their size
  // objects up to `MI_PAGE_MIN_BLOCK_ALIGN` are always allocated aligned to their size
  mi_assert_internal(_mi_is_power_of_two(alignment) && (alignment > 0));
  if (alignment > size) return false;
  if (alignment <= MI_MAX_ALIGN_SIZE) return true;
  const size_t bsize = mi_good_size(size);
  return (bsize <= MI_PAGE_ALIGN && (bsize & (alignment-1)) == 0);
  return (bsize <= MI_PAGE_MIN_BLOCK_ALIGN && (bsize & (alignment-1)) == 0);
}

#if MI_GUARDED
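The check above relies on mi_good_size rounding a request up to its bin size: if that bin size is already a multiple of the requested alignment (and no larger than the minimal block alignment of a page), the aligned path can be skipped entirely. A hedged usage sketch; mi_good_size and mi_malloc_aligned are public mimalloc API, the concrete sizes are just examples.

#include <mimalloc.h>
#include <assert.h>
#include <stdint.h>

int main(void) {
  // mi_good_size reports the bin size a request is rounded up to; the natural
  // alignment check above asks whether that bin size is already a multiple of
  // the requested alignment (and small enough to sit at its natural offset).
  size_t bin = mi_good_size(40);
  assert(bin >= 40);

  // Whether or not the fast path applies, mi_malloc_aligned always honors the
  // requested alignment; the fast path merely avoids over-allocation.
  void* p = mi_malloc_aligned(bin, 32);
  assert(((uintptr_t)p % 32) == 0);
  mi_free(p);
  return 0;
}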
@@ -39,9 +38,9 @@ static mi_decl_restrict void* mi_heap_malloc_guarded_aligned(mi_heap_t* heap, si

static void* mi_heap_malloc_zero_no_guarded(mi_heap_t* heap, size_t size, bool zero) {
  const size_t rate = heap->guarded_sample_rate;
  heap->guarded_sample_rate = 0;
  if (rate != 0) { heap->guarded_sample_rate = 0; }   // don't write to constant heap_empty
  void* p = _mi_heap_malloc_zero(heap, size, zero);
  heap->guarded_sample_rate = rate;
  if (rate != 0) { heap->guarded_sample_rate = rate; }
  return p;
}
#else
@@ -58,21 +57,20 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t

  void* p;
  size_t oversize;
  if mi_unlikely(alignment > MI_BLOCK_ALIGNMENT_MAX) {
    // use OS allocation for very large alignment and allocate inside a huge page (not in an arena)
    // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned at a point in the
    // first (and single) page such that the page info is `MI_ARENA_SLICE_SIZE` bytes before it (and can be found in the _mi_page_map).
  if mi_unlikely(alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
    // use OS allocation for large alignments and allocate inside a singleton page (not in an arena)
    // This can support alignments >= MI_PAGE_ALIGN by ensuring the object can be aligned
    // in the first (and single) page such that the page info is `MI_PAGE_ALIGN` bytes before it (and can be found in the _mi_page_map).
    if mi_unlikely(offset != 0) {
      // todo: cannot support offset alignment for very large alignments yet
      #if MI_DEBUG > 0
      _mi_error_message(EOVERFLOW, "aligned allocation with a very large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
      _mi_error_message(EOVERFLOW, "aligned allocation with a large alignment cannot be used with an alignment offset (size %zu, alignment %zu, offset %zu)\n", size, alignment, offset);
      #endif
      return NULL;
    }
    oversize = (size <= MI_SMALL_SIZE_MAX ? MI_SMALL_SIZE_MAX + 1 /* ensure we use generic malloc path */ : size);
    // note: no guarded as alignment > 0
    p = _mi_heap_malloc_zero_ex(heap, oversize, false, alignment);  // the page block size should be large enough to align in the single huge page block
    // zero afterwards as only the area from the aligned_p may be committed!
    p = _mi_heap_malloc_zero_ex(heap, oversize, zero, alignment);   // the page block size should be large enough to align in the single huge page block
    if (p == NULL) return NULL;
  }
  else {
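The over-aligned path over-allocates and then rounds the returned pointer up to the requested alignment inside the oversized block. A small, self-contained sketch of that "align up inside an oversized block" step; it is a generic simplification, not the mimalloc implementation, and the sizes are arbitrary.

#include <stdint.h>
#include <stdlib.h>
#include <assert.h>

// Round `p` up to the next multiple of `align` (align must be a power of two).
static void* align_up(void* p, size_t align) {
  const uintptr_t mask = (uintptr_t)align - 1;
  return (void*)(((uintptr_t)p + mask) & ~mask);
}

int main(void) {
  const size_t size = 1000, align = 4096;
  // Over-allocate by `align` so an aligned spot of `size` bytes always fits.
  uint8_t* raw = malloc(size + align);
  assert(raw != NULL);
  uint8_t* aligned = align_up(raw, align);
  assert(((uintptr_t)aligned % align) == 0);
  assert(aligned + size <= raw + size + align);   // still inside the block
  free(raw);
  return 0;
}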
@@ -113,13 +111,13 @@ static mi_decl_noinline void* mi_heap_malloc_zero_aligned_at_overalloc(mi_heap_t
  #endif

  // now zero the block if needed
  if (alignment > MI_BLOCK_ALIGNMENT_MAX) {
    // for the tracker, on huge aligned allocations only from the start of the large block is defined
    mi_track_mem_undefined(aligned_p, size);
    if (zero) {
      _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
    }
  }
  //if (alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
  //  // for the tracker, on huge aligned allocations only from the start of the large block is defined
  //  mi_track_mem_undefined(aligned_p, size);
  //  if (zero) {
  //    _mi_memzero_aligned(aligned_p, mi_usable_size(aligned_p));
  //  }
  //}

  if (p != aligned_p) {
    mi_track_align(p,aligned_p,adjust,mi_usable_size(aligned_p));
src/arena.c (74 changes)
@@ -354,7 +354,7 @@ static mi_decl_noinline void* mi_arena_try_alloc(
  bool commit, bool allow_large,
  mi_arena_id_t req_arena_id, mi_memid_t* memid, mi_tld_t* tld)
{
  mi_assert(slice_count <= MI_ARENA_MAX_OBJ_BLOCKS);
  mi_assert(slice_count <= MI_ARENA_MAX_OBJ_SLICES);
  mi_assert(alignment <= MI_ARENA_SLICE_ALIGN);

  // try to find free slices in the arena's
@@ -469,33 +469,48 @@ static mi_page_t* mi_arena_page_try_find_abandoned(size_t slice_count, size_t bl
  return NULL;
}

static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, mi_arena_id_t req_arena_id, mi_tld_t* tld)
static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_size, size_t block_alignment,
                                            mi_arena_id_t req_arena_id, mi_tld_t* tld)
{
  const bool allow_large = true;
  const bool commit = true;
  const size_t alignment = 1;
  const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN);
  const size_t page_alignment = MI_ARENA_SLICE_ALIGN;

  // try to allocate from free space in arena's
  mi_memid_t memid = _mi_memid_none();
  mi_page_t* page = NULL;
  if (_mi_option_get_fast(mi_option_disallow_arena_alloc)==0 && req_arena_id == _mi_arena_id_none()) {
    page = (mi_page_t*)mi_arena_try_alloc(slice_count, alignment, commit, allow_large, req_arena_id, &memid, tld);
  if (!_mi_option_get_fast(mi_option_disallow_arena_alloc) &&  // allowed to allocate from arena's?
      !os_align &&                                             // not large alignment
      slice_count <= MI_ARENA_MAX_OBJ_SLICES)                  // and not too large
  {
    page = (mi_page_t*)mi_arena_try_alloc(slice_count, page_alignment, commit, allow_large, req_arena_id, &memid, tld);
  }

  // otherwise fall back to the OS
  if (page == NULL) {
    page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld);
    if (os_align) {
      // note: slice_count already includes the page
      mi_assert_internal(slice_count >= mi_slice_count_of_size(block_size) + mi_slice_count_of_size(page_alignment));
      page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), block_alignment, page_alignment /* align offset */, commit, allow_large, req_arena_id, &memid, tld);
    }
    else {
      page = (mi_page_t*)mi_arena_os_alloc_aligned(mi_size_of_slices(slice_count), page_alignment, 0 /* align offset */, commit, allow_large, req_arena_id, &memid, tld);
    }
  }

  if (page == NULL) return NULL;
  mi_assert_internal(_mi_is_aligned(page, MI_PAGE_ALIGN));
  mi_assert_internal(!os_align || _mi_is_aligned((uint8_t*)page + page_alignment, block_alignment));

  // claimed free slices: initialize the page partly
  _mi_memzero_aligned(page, sizeof(*page));
  mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_ALIGN));
  const size_t reserved = (mi_size_of_slices(slice_count) - MI_PAGE_INFO_SIZE) / block_size;
  if (!memid.initially_zero) { _mi_memzero_aligned(page, sizeof(*page)); }
  mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN));
  const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE);
  const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size);
  mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX);
  page->reserved = (uint16_t)reserved;
  page->page_start = (uint8_t*)page + MI_PAGE_INFO_SIZE;
  page->page_start = (uint8_t*)page + block_start;
  page->block_size = block_size;
  page->memid = memid;
  page->free_is_zero = memid.initially_zero;
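As a concrete reading of the reserved-count computation: for a normal fresh page the usable area is the slice span minus the info header, divided by the block size, while an os_align (singleton) page reserves exactly one block. The arithmetic below assumes a 64 KiB slice and a 128-byte info header purely for illustration.

#include <assert.h>
#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

int main(void) {
  // Assumed example geometry (not taken from the commit itself).
  const size_t slice_size = 64 * 1024;   // one arena slice
  const size_t info_size  = 128;         // page header area
  const size_t block_size = 64;

  // Normal small page: one slice, header first, then as many blocks as fit.
  const bool   os_align    = false;
  const size_t block_start = (os_align ? slice_size : info_size);
  const size_t reserved    = (os_align ? 1 : (slice_size - block_start) / block_size);
  assert(reserved == (64 * 1024 - 128) / 64);   // 1022 blocks
  assert(reserved <= UINT16_MAX);               // fits page->reserved (uint16_t)
  return 0;
}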
@@ -523,7 +538,7 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size
  }

  // 2. find a free block, potentially allocating a new arena
  page = mi_arena_page_alloc_fresh(slice_count, block_size, req_arena_id, tld);
  page = mi_arena_page_alloc_fresh(slice_count, block_size, 1, req_arena_id, tld);
  if (page != NULL) {
    mi_assert_internal(page->memid.memkind != MI_MEM_ARENA || page->memid.mem.arena.slice_count == slice_count);
    _mi_page_init(heap, page);
@@ -534,18 +549,27 @@ static mi_page_t* mi_arena_page_allocN(mi_heap_t* heap, size_t slice_count, size
}

static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) {
  MI_UNUSED(heap); MI_UNUSED(block_size); MI_UNUSED(page_alignment);
  _mi_error_message(EINVAL, "singleton page is not yet implemented\n");
  return NULL;
static mi_page_t* mi_singleton_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) {
  const mi_arena_id_t req_arena_id = heap->arena_id;
  mi_tld_t* const tld = heap->tld;
  const bool os_align = (block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN);
  const size_t info_size = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE);
  const size_t slice_count = mi_slice_count_of_size(info_size + block_size);

  mi_page_t* page = mi_arena_page_alloc_fresh(slice_count, block_size, block_alignment, req_arena_id, tld);
  if (page == NULL) return NULL;

  mi_assert(page != NULL);
  mi_assert(page->reserved == 1);
  return page;
}

mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_alignment) {
mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t block_alignment) {
  mi_page_t* page;
  if mi_unlikely(page_alignment > MI_BLOCK_ALIGNMENT_MAX) {
    mi_assert_internal(_mi_is_power_of_two(page_alignment));
    page = mi_singleton_page_alloc(heap, block_size, page_alignment);
  if mi_unlikely(block_alignment > MI_PAGE_MAX_OVERALLOC_ALIGN) {
    mi_assert_internal(_mi_is_power_of_two(block_alignment));
    page = mi_singleton_page_alloc(heap, block_size, block_alignment);
  }
  else if (block_size <= MI_SMALL_MAX_OBJ_SIZE) {
    page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_SMALL_PAGE_SIZE), block_size);

@@ -557,7 +581,7 @@ mi_page_t* _mi_arena_page_alloc(mi_heap_t* heap, size_t block_size, size_t page_
    page = mi_arena_page_allocN(heap, mi_slice_count_of_size(MI_LARGE_PAGE_SIZE), block_size);
  }
  else {
    page = mi_singleton_page_alloc(heap, block_size, page_alignment);
    page = mi_singleton_page_alloc(heap, block_size, block_alignment);
  }
  // mi_assert_internal(page == NULL || _mi_page_segment(page)->subproc == tld->subproc);
  return page;
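For a singleton page the slice count covers the page info (or a full MI_PAGE_ALIGN prefix for OS-aligned blocks) plus the single block, rounded up to whole slices. A worked example of that rounding under the same assumed 64 KiB slice; the helper below is an illustrative stand-in for mi_slice_count_of_size, not the real one.

#include <assert.h>
#include <stddef.h>

// ceil-divide a byte size into 64 KiB slices (assumed MI_ARENA_SLICE_SHIFT of 16)
static size_t slice_count_of_size(size_t size) {
  const size_t slice = 64 * 1024;
  return (size + slice - 1) / slice;
}

int main(void) {
  const size_t block_size = 5 * 1024 * 1024;  // a 5 MiB huge block
  const size_t info_size  = 128;              // assumed page info size
  // 5 MiB fills 80 slices exactly; the 128-byte header spills into an 81st.
  assert(slice_count_of_size(info_size + block_size) == 81);
  return 0;
}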
@@ -598,7 +622,10 @@ void _mi_arena_page_abandon(mi_page_t* page, mi_tld_t* tld) {

bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) {
  if (mi_page_is_singleton(page)) { mi_assert_internal(mi_page_is_abandoned(page)); }
  if (!mi_page_is_abandoned(page)) return false;  // it is not abandoned
  // if (!mi_page_is_abandoned(page)) return false; // it is not abandoned (anymore)

  // note: we can access the page even it is in the meantime reclaimed by another thread since
  // we only call this when on free (and thus there is still an object alive in the page)
  mi_memid_t memid = page->memid;
  if (!_mi_arena_memid_is_suitable(memid, heap->arena_id)) return false;  // don't reclaim between exclusive and non-exclusive arena's

@@ -623,11 +650,12 @@ bool _mi_arena_try_reclaim(mi_heap_t* heap, mi_page_t* page) {
  }
  else {
    // A page in OS or external memory
    if (mi_atomic_load_acquire(&page->xheap) != (uintptr_t)heap->tld->subproc) return false;

    // we use the thread_id to atomically grab ownership
    // TODO: respect the subproc -- do we need to add this to the page?
    mi_threadid_t abandoned_thread_id = 0;
    if (mi_atomic_cas_strong_acq_rel(&page->xthread_id, &abandoned_thread_id, heap->thread_id)) {
      // we unabandoned partly
      // we got it atomically
      _mi_page_reclaim(heap, page);
      mi_assert_internal(!mi_page_is_abandoned(page));
      return true;
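The reclaim above is a race: whichever thread swings the page's thread id from 0 to its own id first gets the page. A minimal model of that claim step with C11 atomics; the toy type stands in for the mimalloc page and is not its real layout.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <assert.h>

typedef struct toy_page_s {
  _Atomic(uintptr_t) xthread_id;   // 0 means abandoned
} toy_page_t;

// Try to take ownership of an abandoned page for thread `tid`.
static bool toy_try_reclaim(toy_page_t* page, uintptr_t tid) {
  uintptr_t expected = 0;          // only succeed if it is still abandoned
  return atomic_compare_exchange_strong_explicit(
      &page->xthread_id, &expected, tid,
      memory_order_acq_rel, memory_order_acquire);
}

int main(void) {
  toy_page_t page;
  atomic_init(&page.xthread_id, 0);         // abandoned
  assert(toy_try_reclaim(&page, 42));       // first claimer wins
  assert(!toy_try_reclaim(&page, 7));       // second claimer loses
  return 0;
}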
@@ -263,7 +263,7 @@ restore:
// set `*pidx` to the bit index (0 <= *pidx < MI_BITMAP_CHUNK_BITS) on success.
// todo: try neon version
static inline bool mi_bitmap_chunk_find_and_try_xset(mi_bit_t set, mi_bitmap_chunk_t* chunk, size_t* pidx) {
#if 0 && defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
#if defined(__AVX2__) && (MI_BITMAP_CHUNK_BITS==256)
  while (true) {
    const __m256i vec = _mm256_load_si256((const __m256i*)chunk->bfields);
    const __m256i vcmp = _mm256_cmpeq_epi64(vec, (set ? _mm256_set1_epi64x(~0) : _mm256_setzero_si256()));  // (elem64 == ~0 / 0 ? 0xFF : 0)
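The AVX2 path compares all 64-bit words of a chunk at once to find one that still has a claimable bit. A portable scalar sketch of the same "find a free bit and atomically claim it" idea over a single 64-bit word; this is illustrative only, not the repository's fallback code.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>

// Find a 0 bit in *field and try to set it atomically; on success store its
// index in *pidx. Retries when another thread claims the same bit first.
static bool find_and_try_set(_Atomic(uint64_t)* field, size_t* pidx) {
  uint64_t old = atomic_load_explicit(field, memory_order_relaxed);
  while (old != UINT64_MAX) {                                // some bit is still clear
    const size_t idx = (size_t)__builtin_ctzll(~old);        // lowest clear bit
    const uint64_t desired = old | ((uint64_t)1 << idx);
    if (atomic_compare_exchange_weak_explicit(field, &old, desired,
                                              memory_order_acq_rel,
                                              memory_order_relaxed)) {
      *pidx = idx;
      return true;
    }
    // CAS failure reloaded `old`; loop and retry with the fresh value.
  }
  return false;                                              // word is full
}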
@@ -115,7 +115,7 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg)
  #endif
  mi_page_t* const page = _mi_ptr_page(p);
  #if MI_DEBUG
  if (page == MI_PAGE_PTR_INVALID) {
  if (page == NULL && p != NULL) {
    _mi_error_message(EINVAL, "%s: invalid pointer: %p\n", msg, p);
  }
  #endif

@@ -126,10 +126,8 @@ static inline mi_page_t* mi_checked_ptr_page(const void* p, const char* msg)
// Fast path written carefully to prevent register spilling on the stack
void mi_free(void* p) mi_attr_noexcept
{
  if (p==NULL) return;
  mi_page_t* const page = mi_checked_ptr_page(p,"mi_free");
  // if mi_unlikely(page==NULL) return;

  if mi_unlikely(page==NULL) return;

  const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page));
  if mi_likely(is_local) {  // thread-local free?
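The shape of the fast path above is: map the pointer to its page in O(1), then branch on whether the freeing thread owns that page. A much-simplified model of that split (plain local free list vs. an atomic cross-thread list); the types and fields are toy stand-ins, not mimalloc's.

#include <stdatomic.h>
#include <stdint.h>

typedef struct toy_block_s { struct toy_block_s* next; } toy_block_t;

typedef struct toy_page_s {
  uintptr_t thread_id;                 // owning thread (0 = abandoned)
  toy_block_t* local_free;             // only touched by the owning thread
  _Atomic(toy_block_t*) thread_free;   // lock-free list for other threads
} toy_page_t;

static void toy_free(toy_page_t* page, toy_block_t* block, uintptr_t my_tid) {
  if (page->thread_id == my_tid) {
    // thread-local free: plain push, no atomics needed
    block->next = page->local_free;
    page->local_free = block;
  }
  else {
    // cross-thread free: push onto the atomic thread-free list
    toy_block_t* head = atomic_load_explicit(&page->thread_free, memory_order_relaxed);
    do {
      block->next = head;
    } while (!atomic_compare_exchange_weak_explicit(&page->thread_free, &head, block,
                                                    memory_order_release,
                                                    memory_order_relaxed));
  }
}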
@@ -240,6 +238,9 @@ static void mi_decl_noinline mi_free_block_mt(mi_page_t* page, mi_block_t* block
      mi_free(block);  // recursively free as now it will be a local free in our heap
      return;
    }
    else {
      mi_assert_internal(!mi_page_is_singleton(page));  // we should have succeeded on singleton pages
    }
  }

  // The padding check may access the non-thread-owned page for the key values.
@@ -37,7 +37,8 @@ static bool mi_page_map_init(void) {
  // commit the first part so NULL pointers get resolved without an access violation
  if (!mi_page_map_all_committed) {
    _mi_os_commit(_mi_page_map, _mi_os_page_size(), NULL, NULL);
    _mi_page_map[0] = -1;  // so _mi_ptr_page(NULL) == NULL
    _mi_page_map[0] = 1;   // so _mi_ptr_page(NULL) == NULL
    mi_assert_internal(_mi_ptr_page(NULL)==NULL);
  }
  return true;
}
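Why _mi_page_map[0] = 1 makes _mi_ptr_page(NULL) come out as NULL: with index 0 and offset 1, the decode (idx - ofs + 1) << MI_ARENA_SLICE_SHIFT is exactly 0, so NULL maps back to NULL. A tiny check of that arithmetic (the slice shift of 16 is assumed for the example).

#include <assert.h>
#include <stdint.h>

int main(void) {
  const uint8_t   map0  = 1;    // value written for the first slice
  const uintptr_t idx   = 0;    // slice index of the NULL pointer
  const int       shift = 16;   // assumed MI_ARENA_SLICE_SHIFT
  assert(((idx - map0 + 1) << shift) == 0);   // decodes back to NULL
  return 0;
}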
@@ -60,9 +61,9 @@ static void mi_page_map_ensure_committed(size_t idx, size_t slice_count) {
static size_t mi_page_map_get_idx(mi_page_t* page, uint8_t** page_start, size_t* slice_count) {
  size_t page_size;
  *page_start = mi_page_area(page, &page_size);
  if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE; }  // furthest interior pointer
  *slice_count = mi_slice_count_of_size(page_size);
  return ((uintptr_t)*page_start >> MI_ARENA_SLICE_SHIFT);
  if (page_size > MI_LARGE_PAGE_SIZE) { page_size = MI_LARGE_PAGE_SIZE - MI_ARENA_SLICE_SIZE; }  // furthest interior pointer
  *slice_count = mi_slice_count_of_size(page_size) + (((uint8_t*)*page_start - (uint8_t*)page)/MI_ARENA_SLICE_SIZE);  // add for large aligned blocks
  return ((uintptr_t)page >> MI_ARENA_SLICE_SHIFT);
}

@@ -79,9 +80,9 @@ void _mi_page_map_register(mi_page_t* page) {
  mi_page_map_ensure_committed(idx, slice_count);

  // set the offsets
  for (int i = 0; i < (int)slice_count; i++) {
  for (size_t i = 0; i < slice_count; i++) {
    mi_assert_internal(i < 128);
    _mi_page_map[idx + i] = (i+1);
    _mi_page_map[idx + i] = (uint8_t)(i+1);
  }
}
src/page.c (26 changes)
@@ -41,9 +41,10 @@ static void mi_page_extend_free(mi_heap_t* heap, mi_page_t* page);

#if (MI_DEBUG>=3)
static size_t mi_page_list_count(mi_page_t* page, mi_block_t* head) {
  mi_assert_internal(_mi_ptr_page(page) == page);
  size_t count = 0;
  while (head != NULL) {
    mi_assert_internal(page == _mi_ptr_page(head));
    mi_assert_internal((uint8_t*)head - (uint8_t*)page > MI_LARGE_PAGE_SIZE || page == _mi_ptr_page(head));
    count++;
    head = mi_block_next(page, head);
  }

@@ -123,7 +124,7 @@ bool _mi_page_is_valid(mi_page_t* page) {
  {
    mi_page_queue_t* pq = mi_page_queue_of(page);
    mi_assert_internal(mi_page_queue_contains(pq, page));
    mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_block_size(page) > MI_LARGE_MAX_OBJ_SIZE || mi_page_is_in_full(page));
    mi_assert_internal(pq->block_size==mi_page_block_size(page) || mi_page_is_huge(page) || mi_page_is_in_full(page));
    mi_assert_internal(mi_heap_contains_queue(mi_page_heap(page),pq));
  }
}

@@ -258,7 +259,7 @@ void _mi_page_reclaim(mi_heap_t* heap, mi_page_t* page) {
  mi_assert_internal(mi_page_thread_free_flag(page) != MI_NEVER_DELAYED_FREE);

  // TODO: push on full queue immediately if it is full?
  mi_page_queue_t* pq = mi_page_queue(heap, mi_page_block_size(page));
  mi_page_queue_t* pq = mi_heap_page_queue_of(heap, page);
  mi_page_queue_push(heap, pq, page);
  mi_assert_expensive(_mi_page_is_valid(page));
}
@@ -279,6 +280,15 @@ static mi_page_t* mi_page_fresh_alloc(mi_heap_t* heap, mi_page_queue_t* pq, size
  }
  if (mi_page_is_abandoned(page)) {
    _mi_page_reclaim(heap, page);
    if (!mi_page_immediate_available(page)) {
      if (mi_page_is_expandable(page)) {
        mi_page_extend_free(heap, page);
      }
      else {
        mi_assert(false);  // should not happen?
        return NULL;
      }
    }
  }
  else if (pq != NULL) {
    mi_page_queue_push(heap, pq, page);

@@ -295,7 +305,7 @@ static mi_page_t* mi_page_fresh(mi_heap_t* heap, mi_page_queue_t* pq) {
  mi_page_t* page = mi_page_fresh_alloc(heap, pq, pq->block_size, 0);
  if (page==NULL) return NULL;
  mi_assert_internal(pq->block_size==mi_page_block_size(page));
  mi_assert_internal(pq==mi_page_queue(heap, mi_page_block_size(page)));
  mi_assert_internal(pq==mi_heap_page_queue_of(heap, page));
  return page;
}
@@ -713,7 +723,7 @@ void _mi_page_init(mi_heap_t* heap, mi_page_t* page) {
-------------------------------------------------------------*/

// search for a best next page to use for at most N pages (often cut short if immediate blocks are available)
#define MI_MAX_CANDIDATE_SEARCH  (0)
#define MI_MAX_CANDIDATE_SEARCH  (8)

// Find a page with free blocks of `page->block_size`.

@@ -723,7 +733,9 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
  #if MI_STAT
  size_t count = 0;
  #endif
  #if MI_MAX_CANDIDATE_SEARCH > 1
  size_t candidate_count = 0;  // we reset this on the first candidate to limit the search
  #endif
  mi_page_t* page_candidate = NULL;  // a page with free space
  mi_page_t* page = pq->first;

@@ -793,17 +805,21 @@ static mi_page_t* mi_page_queue_find_free_ex(mi_heap_t* heap, mi_page_queue_t* p
      mi_assert_internal(mi_page_is_expandable(page));
      mi_page_extend_free(heap, page);
    }
    mi_assert_internal(mi_page_immediate_available(page));
  }

  if (page == NULL) {
    _mi_heap_collect_retired(heap, false);  // perhaps make a page available
    page = mi_page_fresh(heap, pq);
    mi_assert_internal(page == NULL || mi_page_immediate_available(page));
    if (page == NULL && first_try) {
      // out-of-memory _or_ an abandoned page with free blocks was reclaimed, try once again
      page = mi_page_queue_find_free_ex(heap, pq, false);
      mi_assert_internal(page == NULL || mi_page_immediate_available(page));
    }
  }
  else {
    mi_assert_internal(page == NULL || mi_page_immediate_available(page));
    // move the page to the front of the queue
    mi_page_queue_move_to_front(heap, pq, page);
    page->retire_expire = 0;
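Raising MI_MAX_CANDIDATE_SEARCH from 0 to 8 turns the queue walk into a bounded best-candidate scan instead of taking the first usable page. A generic sketch of that pattern over a toy page list; the selection criterion here ("most used page that still has a free block") is an illustrative stand-in, not mimalloc's exact heuristic.

#include <stddef.h>
#include <stdbool.h>

#define MAX_CANDIDATE_SEARCH 8

typedef struct toy_page_s {
  struct toy_page_s* next;
  size_t used;       // blocks in use
  size_t capacity;   // total blocks
} toy_page_t;

static bool toy_has_free(const toy_page_t* p) { return p->used < p->capacity; }

// Walk at most MAX_CANDIDATE_SEARCH pages past the first candidate and keep
// the fullest page that still has a free block (to reduce fragmentation).
static toy_page_t* toy_find_candidate(toy_page_t* head) {
  toy_page_t* best = NULL;
  size_t searched = 0;
  for (toy_page_t* p = head; p != NULL; p = p->next) {
    if (toy_has_free(p)) {
      if (best == NULL) { best = p; searched = 0; }   // reset at the first candidate
      else if (p->used > best->used) { best = p; }
    }
    if (best != NULL && ++searched >= MAX_CANDIDATE_SEARCH) break;
  }
  return best;
}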
@@ -34,7 +34,7 @@ we therefore test the API over various inputs. Please add more tests :-)

#include "mimalloc.h"
// #include "mimalloc/internal.h"
#include "mimalloc/types.h"  // for MI_DEBUG and MI_BLOCK_ALIGNMENT_MAX
#include "mimalloc/types.h"  // for MI_DEBUG and MI_PAGE_MAX_OVERALLOC_ALIGN

#include "testhelper.h"

@@ -169,7 +169,7 @@ int main(void) {
  /*
  CHECK_BODY("malloc-aligned6") {
    bool ok = true;
    for (size_t align = 1; align <= MI_BLOCK_ALIGNMENT_MAX && ok; align *= 2) {
    for (size_t align = 1; align <= MI_PAGE_MAX_OVERALLOC_ALIGN && ok; align *= 2) {
      void* ps[8];
      for (int i = 0; i < 8 && ok; i++) {
        ps[i] = mi_malloc_aligned(align*13 // size

@@ -186,16 +186,16 @@ int main(void) {
  };
  */
  CHECK_BODY("malloc-aligned7") {
    void* p = mi_malloc_aligned(1024,MI_BLOCK_ALIGNMENT_MAX);
    void* p = mi_malloc_aligned(1024,MI_PAGE_MAX_OVERALLOC_ALIGN);
    mi_free(p);
    result = ((uintptr_t)p % MI_BLOCK_ALIGNMENT_MAX) == 0;
    result = ((uintptr_t)p % MI_PAGE_MAX_OVERALLOC_ALIGN) == 0;
  };
  CHECK_BODY("malloc-aligned8") {
    bool ok = true;
    for (int i = 0; i < 5 && ok; i++) {
      int n = (1 << i);
      void* p = mi_malloc_aligned(1024, n * MI_BLOCK_ALIGNMENT_MAX);
      ok = ((uintptr_t)p % (n*MI_BLOCK_ALIGNMENT_MAX)) == 0;
      void* p = mi_malloc_aligned(1024, n * MI_PAGE_MAX_OVERALLOC_ALIGN);
      ok = ((uintptr_t)p % (n*MI_PAGE_MAX_OVERALLOC_ALIGN)) == 0;
      mi_free(p);
    }
    result = ok;

@@ -203,7 +203,7 @@ int main(void) {
  CHECK_BODY("malloc-aligned9") {  // test large alignments
    bool ok = true;
    void* p[8];
    size_t sizes[8] = { 8, 512, 1024 * 1024, MI_BLOCK_ALIGNMENT_MAX, MI_BLOCK_ALIGNMENT_MAX + 1, 2 * MI_BLOCK_ALIGNMENT_MAX, 8 * MI_BLOCK_ALIGNMENT_MAX, 0 };
    size_t sizes[8] = { 8, 512, 1024 * 1024, MI_PAGE_MAX_OVERALLOC_ALIGN, MI_PAGE_MAX_OVERALLOC_ALIGN + 1, 2 * MI_PAGE_MAX_OVERALLOC_ALIGN, 8 * MI_PAGE_MAX_OVERALLOC_ALIGN, 0 };
    for (int i = 0; i < 28 && ok; i++) {
      int align = (1 << i);
      for (int j = 0; j < 8 && ok; j++) {
@@ -42,7 +42,7 @@ static int SCALE = 10;
static int ITER = 10;
#elif 0
static int THREADS = 4;
static int SCALE = 20;
static int SCALE = 100;
static int ITER = 20;
#else
static int THREADS = 32;  // more repeatable if THREADS <= #processors

@@ -54,7 +54,7 @@ static int ITER = 50;  // N full iterations destructing and re-creating a

#define STRESS  // undefine for leak test

static bool allow_large_objects = false;  // allow very large objects? (set to `true` if SCALE>100)
static bool allow_large_objects = true;   // allow very large objects? (set to `true` if SCALE>100)
static size_t use_one_size = 0;           // use single object size of `N * sizeof(uintptr_t)`?

static bool main_participates = false;    // main thread participates as a worker too