From 659a9dd51d1d02b620ea569d62fdda76dcb60c38 Mon Sep 17 00:00:00 2001
From: daanx
Date: Fri, 6 Dec 2024 22:37:59 -0800
Subject: [PATCH] fix page info size and order; atomic page flags

---
 CMakeLists.txt              |  2 +-
 include/mimalloc/atomic.h   |  4 +-
 include/mimalloc/internal.h | 24 ++++++++---
 include/mimalloc/types.h    | 81 +++++++++++++++----------------------
 src/arena.c                 | 36 ++++++++---------
 src/bitmap.c                |  6 +--
 src/free.c                  |  5 ++-
 src/init.c                  | 10 ++---
 src/os.c                    |  4 +-
 test/test-stress.c          |  4 +-
 10 files changed, 87 insertions(+), 89 deletions(-)

diff --git a/CMakeLists.txt b/CMakeLists.txt
index 2c04aea8..1a4cc1f1 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -360,7 +360,7 @@ if(CMAKE_C_COMPILER_ID MATCHES "AppleClang|Clang|GNU|Intel" AND NOT CMAKE_SYSTEM
     list(APPEND mi_cflags_dynamic -ftls-model=initial-exec)
     message(STATUS "Use local dynamic TLS for the static build (since MI_LIBC_MUSL=ON)")
   else()
-    list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2)
+    list(APPEND mi_cflags -ftls-model=initial-exec -march=haswell -mavx2 -O2)
   endif()
 endif()
 if(MI_OVERRIDE)
diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h
index 3a0d4892..caa90cf8 100644
--- a/include/mimalloc/atomic.h
+++ b/include/mimalloc/atomic.h
@@ -80,10 +80,12 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_atomic_cas_strong_acq_rel(p,exp,des)  mi_atomic_cas_strong(p,exp,des,mi_memory_order(acq_rel),mi_memory_order(acquire))
 
 #define mi_atomic_add_relaxed(p,x)      mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(relaxed))
-#define mi_atomic_sub_relaxed(p,x)      mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_add_acq_rel(p,x)      mi_atomic(fetch_add_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_sub_relaxed(p,x)      mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_sub_acq_rel(p,x)      mi_atomic(fetch_sub_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_and_relaxed(p,x)      mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_and_acq_rel(p,x)      mi_atomic(fetch_and_explicit)(p,x,mi_memory_order(acq_rel))
+#define mi_atomic_or_relaxed(p,x)       mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(relaxed))
 #define mi_atomic_or_acq_rel(p,x)       mi_atomic(fetch_or_explicit)(p,x,mi_memory_order(acq_rel))
 
 #define mi_atomic_increment_relaxed(p)  mi_atomic_add_relaxed(p,(uintptr_t)1)
diff --git a/include/mimalloc/internal.h b/include/mimalloc/internal.h
index d9c2cd6e..ad7c41c6 100644
--- a/include/mimalloc/internal.h
+++ b/include/mimalloc/internal.h
@@ -667,7 +667,8 @@ static inline void mi_page_clear_abandoned_mapped(mi_page_t* page) {
 
 
 static inline bool mi_page_is_huge(const mi_page_t* page) {
-  return (page->block_size > MI_LARGE_MAX_OBJ_SIZE || (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.alignment > MI_PAGE_MAX_OVERALLOC_ALIGN));
+  return (page->block_size > MI_LARGE_MAX_OBJ_SIZE ||
+          (mi_memkind_is_os(page->memid.memkind) && page->memid.mem.os.base < (void*)page));
 }
 
 
@@ -727,20 +728,33 @@ static inline bool _mi_page_unown(mi_page_t* page) {
 //-----------------------------------------------------------
 // Page flags
 //-----------------------------------------------------------
+static inline mi_page_flags_t mi_page_flags(const mi_page_t* page) {
+  return mi_atomic_load_acquire(&page->xflags);
+}
+
+static inline void mi_page_flags_set(mi_page_t* page, bool set, mi_page_flags_t newflag) {
+  if (set) {
+    mi_atomic_or_acq_rel(&page->xflags, newflag);
+  }
+  else {
+    mi_atomic_and_acq_rel(&page->xflags, ~newflag);
+  }
+}
+
 static inline bool mi_page_is_in_full(const mi_page_t* page) {
-  return page->flags.x.in_full;
+  return ((mi_page_flags(page) & MI_PAGE_IN_FULL_QUEUE) != 0);
 }
 
 static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
-  page->flags.x.in_full = in_full;
+  mi_page_flags_set(page, in_full, MI_PAGE_IN_FULL_QUEUE);
 }
 
 static inline bool mi_page_has_aligned(const mi_page_t* page) {
-  return page->flags.x.has_aligned;
+  return ((mi_page_flags(page) & MI_PAGE_HAS_ALIGNED) != 0);
 }
 
 static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
-  page->flags.x.has_aligned = has_aligned;
+  mi_page_flags_set(page, has_aligned, MI_PAGE_HAS_ALIGNED);
 }
 
 /* -------------------------------------------------------------------
diff --git a/include/mimalloc/types.h b/include/mimalloc/types.h
index d78dbc59..5dfbb808 100644
--- a/include/mimalloc/types.h
+++ b/include/mimalloc/types.h
@@ -111,17 +111,17 @@ terms of the MIT license. A copy of the license can be found in the file
 
 // Sizes are for 64-bit
 #ifndef MI_ARENA_SLICE_SHIFT
-#ifdef MI_SMALL_PAGE_SHIFT  // compatibility
+#ifdef MI_SMALL_PAGE_SHIFT  // compatibility
 #define MI_ARENA_SLICE_SHIFT  MI_SMALL_PAGE_SHIFT
 #else
 #define MI_ARENA_SLICE_SHIFT  (13 + MI_SIZE_SHIFT)  // 64 KiB (32 KiB on 32-bit)
 #endif
 #endif
 #ifndef MI_BCHUNK_BITS_SHIFT
-#define MI_BCHUNK_BITS_SHIFT  (6 + MI_SIZE_SHIFT)   // optimized for 512 bits per chunk (avx512)
+#define MI_BCHUNK_BITS_SHIFT  (6 + MI_SIZE_SHIFT)   // optimized for 512 bits per chunk (avx512)
 #endif
 
-#define MI_BCHUNK_BITS        (1 << MI_BCHUNK_BITS_SHIFT)
+#define MI_BCHUNK_BITS        (1 << MI_BCHUNK_BITS_SHIFT)
 #define MI_ARENA_SLICE_SIZE   (MI_ZU(1) << MI_ARENA_SLICE_SHIFT)
 #define MI_ARENA_SLICE_ALIGN  (MI_ARENA_SLICE_SIZE)
@@ -167,8 +167,8 @@ static inline bool mi_memkind_is_os(mi_memkind_t memkind) {
 
 typedef struct mi_memid_os_info {
   void* base;       // actual base address of the block (used for offset aligned allocations)
-  size_t alignment; // alignment at allocation
   size_t size;      // allocated full size
+  // size_t alignment; // alignment at allocation
 } mi_memid_os_info_t;
 
 typedef struct mi_memid_arena_info {
@@ -224,26 +224,11 @@ typedef enum mi_owned_e {
 } mi_owned_t;
 
-// The `in_full` and `has_aligned` page flags are put in a union to efficiently
-// test if both are false (`full_aligned == 0`) in the `mi_free` routine.
-#if !MI_TSAN
-typedef union mi_page_flags_s {
-  uint8_t full_aligned;
-  struct {
-    uint8_t in_full : 1;
-    uint8_t has_aligned : 1;
-  } x;
-} mi_page_flags_t;
-#else
-// under thread sanitizer, use a byte for each flag to suppress warning, issue #130
-typedef union mi_page_flags_s {
-  uint32_t full_aligned;
-  struct {
-    uint8_t in_full;
-    uint8_t has_aligned;
-  } x;
-} mi_page_flags_t;
-#endif
+// The `in_full` and `has_aligned` page flags are put in the same field
+// to efficiently test if both are false (`full_aligned == 0`) in the `mi_free` routine.
+#define MI_PAGE_IN_FULL_QUEUE  MI_ZU(0x01)
+#define MI_PAGE_HAS_ALIGNED    MI_ZU(0x02)
+typedef size_t mi_page_flags_t;
 
 // Thread free list.
 // We use the bottom bit of the pointer for `mi_owned_t` flags
@@ -280,35 +265,33 @@ typedef struct mi_subproc_s mi_subproc_t;
 // the owning heap `thread_delayed_free` list. This guarantees that pages
 // will be freed correctly even if only other threads free blocks.
 typedef struct mi_page_s {
-  _Atomic(mi_threadid_t)xthread_id;         // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned)
+  _Atomic(mi_threadid_t) xthread_id;        // thread this page belongs to. (= xheap->thread_id, or 0 if abandoned)
 
-  mi_block_t* free;                         // list of available free blocks (`malloc` allocates from this list)
-  uint16_t used;                            // number of blocks in use (including blocks in `thread_free`)
-  uint16_t capacity;                        // number of blocks committed (must be the first field for proper zero-initialisation)
-  uint16_t reserved;                        // number of blocks reserved in memory
-  uint8_t block_size_shift;                 // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
-  uint8_t heap_tag;                         // tag of the owning heap, used to separate heaps by object type
+  mi_block_t* free;                         // list of available free blocks (`malloc` allocates from this list)
+  uint16_t used;                            // number of blocks in use (including blocks in `thread_free`)
+  uint16_t capacity;                        // number of blocks committed (must be the first field for proper zero-initialisation)
+  uint16_t reserved;                        // number of blocks reserved in memory
+  uint8_t block_size_shift;                 // if not zero, then `(1 << block_size_shift) == block_size` (only used for fast path in `free.c:_mi_page_ptr_unalign`)
+  uint8_t retire_expire;                    // expiration count for retired blocks
 
-  mi_page_flags_t flags;                    // `in_full` and `has_aligned` flags (8 bits)
-  uint8_t free_is_zero:1;                   // `true` if the blocks in the free list are zero initialized
-  uint8_t retire_expire:7;                  // expiration count for retired blocks
-  // padding
-
-  mi_block_t* local_free;                   // list of deferred free blocks by this thread (migrates to `free`)
-  size_t block_size;                        // size available in each block (always `>0`)
-  uint8_t* page_start;                      // start of the blocks
+  mi_block_t* local_free;                   // list of deferred free blocks by this thread (migrates to `free`)
+  _Atomic(mi_thread_free_t) xthread_free;   // list of deferred free blocks freed by other threads
+  _Atomic(mi_page_flags_t) xflags;          // `in_full` and `has_aligned` flags
+  size_t block_size;                        // size available in each block (always `>0`)
+  uint8_t* page_start;                      // start of the blocks
+  uint8_t heap_tag;                         // tag of the owning heap, used to separate heaps by object type
+  bool free_is_zero;                        // `true` if the blocks in the free list are zero initialized
+  // padding
 #if (MI_ENCODE_FREELIST || MI_PADDING)
-  uintptr_t keys[2];                        // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
+  uintptr_t keys[2];                        // two random keys to encode the free lists (see `_mi_block_next`) or padding canary
 #endif
-  _Atomic(mi_thread_free_t) xthread_free;   // list of deferred free blocks freed by other threads
-
-  mi_heap_t* heap;                          // heap this threads belong to.
-  struct mi_page_s* next;                   // next page owned by the heap with the same `block_size`
-  struct mi_page_s* prev;                   // previous page owned by the heap with the same `block_size`
-  mi_subproc_t* subproc;                    // sub-process of this heap
-  mi_memid_t memid;                         // provenance of the page memory
+  mi_heap_t* heap;                          // heap this threads belong to.
+  struct mi_page_s* next;                   // next page owned by the heap with the same `block_size`
+  struct mi_page_s* prev;                   // previous page owned by the heap with the same `block_size`
+  mi_subproc_t* subproc;                    // sub-process of this heap
+  mi_memid_t memid;                         // provenance of the page memory
 } mi_page_t;
 
 
@@ -317,10 +300,10 @@ typedef struct mi_page_s {
 // ------------------------------------------------------
 
 #define MI_PAGE_ALIGN                MI_ARENA_SLICE_ALIGN // pages must be aligned on this for the page map.
-#define MI_PAGE_MIN_BLOCK_ALIGN      (32)                 // minimal block alignment in a page
+#define MI_PAGE_MIN_BLOCK_ALIGN      (64)                 // minimal block alignment in a page
 #define MI_PAGE_MAX_OVERALLOC_ALIGN  MI_ARENA_SLICE_SIZE  // (64 KiB) limit for which we overallocate in arena pages, beyond this use OS allocation
 
-#if MI_DEBUG && MI_SIZE_SIZE == 8
+#if (MI_ENCODE_FREELIST || MI_PADDING) && MI_SIZE_SIZE == 8
 #define MI_PAGE_INFO_SIZE  ((MI_INTPTR_SHIFT+2)*MI_PAGE_MIN_BLOCK_ALIGN)  // >= sizeof(mi_page_t)
 #else
 #define MI_PAGE_INFO_SIZE  ((MI_INTPTR_SHIFT+1)*MI_PAGE_MIN_BLOCK_ALIGN)  // >= sizeof(mi_page_t)
diff --git a/src/arena.c b/src/arena.c
index 2c215264..45697081 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -483,7 +483,7 @@ static bool mi_arena_try_claim_abandoned(size_t slice_index, void* arg1, void* a
   mi_page_t* const page = (mi_page_t*)mi_arena_slice_start(arena, slice_index);
   // can we claim ownership?
   if (!mi_page_try_claim_ownership(page)) {
-    // there was a concurrent free ..
+    // there was a concurrent free ..
     // we need to keep it in the abandoned map as the free will call `mi_arena_page_unabandon`,
     // and wait for readers (us!) to finish. This is why it is very important to set the abandoned
     // bit again (or otherwise the unabandon will never stop waiting).
@@ -596,7 +596,9 @@ static mi_page_t* mi_arena_page_alloc_fresh(size_t slice_count, size_t block_siz
     }
   }
   #endif
-  mi_assert(MI_PAGE_INFO_SIZE >= _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN));
+  if (MI_PAGE_INFO_SIZE < _mi_align_up(sizeof(*page), MI_PAGE_MIN_BLOCK_ALIGN)) {
+    _mi_error_message(EFAULT, "fatal internal error: MI_PAGE_INFO_SIZE is too small\n");
+  };
   const size_t block_start = (os_align ? MI_PAGE_ALIGN : MI_PAGE_INFO_SIZE);
   const size_t reserved = (os_align ? 1 : (mi_size_of_slices(slice_count) - block_start) / block_size);
   mi_assert_internal(reserved > 0 && reserved <= UINT16_MAX);
@@ -1126,28 +1128,22 @@ static size_t mi_debug_show_bfield(mi_bfield_t field, char* buf) {
   return bit_set_count;
 }
 
-static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) {
-  _mi_output_message("%s%s:\n", prefix, header);
+static size_t mi_debug_show_bitmap(const char* header, size_t slice_count, mi_bitmap_t* bitmap, bool invert) {
+  _mi_output_message("%s:\n", header);
   size_t bit_count = 0;
   size_t bit_set_count = 0;
   for (size_t i = 0; i < mi_bitmap_chunk_count(bitmap) && bit_count < slice_count; i++) {
     char buf[MI_BCHUNK_BITS + 64]; _mi_memzero(buf, sizeof(buf));
     size_t k = 0;
     mi_bchunk_t* chunk = &bitmap->chunks[i];
-
-    if (i<10)  { buf[k++] = ' '; }
-    if (i<100) { itoa((int)i, buf+k, 10); k += (i < 10 ? 1 : 2); }
-    buf[k++] = ' ';
+    if (i<10)        { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; }
+    else if (i<100)  { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; }
+    else if (i<1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); }
+
     for (size_t j = 0; j < MI_BCHUNK_FIELDS; j++) {
       if (j > 0 && (j % 4) == 0) {
-        buf[k++] = '\n';
-        _mi_memcpy(buf+k, prefix, strlen(prefix)); k += strlen(prefix);
-        buf[k++] = ' ';
-        buf[k++] = ' ';
-        buf[k++] = ' ';
-        buf[k++] = ' ';
-        buf[k++] = ' ';
+        buf[k++] = '\n'; _mi_memset(buf+k,' ',5); k += 5;
       }
       if (bit_count < slice_count) {
         mi_bfield_t bfield = chunk->bfields[j];
@@ -1164,9 +1160,9 @@ static size_t mi_debug_show_bitmap(const char* prefix, const char* header, size_
       }
       bit_count += MI_BFIELD_BITS;
     }
-    _mi_output_message("%s %s\n", prefix, buf);
+    _mi_output_message(" %s\n", buf);
   }
-  _mi_output_message("%s total ('x'): %zu\n", prefix, bit_set_count);
+  _mi_output_message(" total ('x'): %zu\n", bit_set_count);
   return bit_set_count;
 }
 
@@ -1183,12 +1179,12 @@ void mi_debug_show_arenas(bool show_inuse, bool show_abandoned, bool show_purge)
     slice_total += arena->slice_count;
     _mi_output_message("arena %zu: %zu slices (%zu MiB)%s\n", i, arena->slice_count, mi_size_of_slices(arena->slice_count)/MI_MiB, (arena->memid.is_pinned ? ", pinned" : ""));
     if (show_inuse) {
-      free_total += mi_debug_show_bitmap(" ", "in-use slices", arena->slice_count, arena->slices_free, true);
+      free_total += mi_debug_show_bitmap("in-use slices", arena->slice_count, arena->slices_free, true);
     }
-    mi_debug_show_bitmap(" ", "committed slices", arena->slice_count, arena->slices_committed, false);
+    mi_debug_show_bitmap("committed slices", arena->slice_count, arena->slices_committed, false);
     // todo: abandoned slices
     if (show_purge) {
-      purge_total += mi_debug_show_bitmap(" ", "purgeable slices", arena->slice_count, arena->slices_purge, false);
+      purge_total += mi_debug_show_bitmap("purgeable slices", arena->slice_count, arena->slices_purge, false);
     }
   }
   if (show_inuse) _mi_output_message("total inuse slices : %zu\n", slice_total - free_total);
diff --git a/src/bitmap.c b/src/bitmap.c
index 15401d8d..2ef692cb 100644
--- a/src/bitmap.c
+++ b/src/bitmap.c
@@ -805,10 +805,10 @@ static bool mi_bitmap_chunkmap_try_clear(mi_bitmap_t* bitmap, size_t chunk_idx)
     return false;
   }
   // record the max clear
-  size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
+  /*size_t oldmax = mi_atomic_load_relaxed(&bitmap->chunk_max_clear);
   do {
     if mi_likely(chunk_idx <= oldmax) break;
-  } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));
+  } while (!mi_atomic_cas_weak_acq_rel(&bitmap->chunk_max_clear, &oldmax, chunk_idx));*/
   return true;
 }
 
@@ -1046,7 +1046,7 @@ bool mi_bitmap_is_xsetN(mi_xset_t set, mi_bitmap_t* bitmap, size_t idx, size_t n
 { \
   /* start chunk index -- todo: can depend on the tseq to decrease contention between threads */ \
   MI_UNUSED(tseq); \
-  const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); /* mi_bitmap_chunk_count(bitmap) */ \
+  /* const size_t chunk_max = mi_atomic_load_acquire(&bitmap->chunk_max_clear); */ /* mi_bitmap_chunk_count(bitmap) */ \
   const size_t chunk_start = 0; /* (chunk_max <= 1 ? 0 : (tseq % chunk_max)); */ /* space out threads */ \
   const size_t chunkmap_max_bfield = _mi_divide_up( mi_bitmap_chunk_count(bitmap), MI_BCHUNK_BITS ); \
   const size_t chunkmap_start = chunk_start / MI_BFIELD_BITS; \
diff --git a/src/free.c b/src/free.c
index 0ff4bf60..afb23838 100644
--- a/src/free.c
+++ b/src/free.c
@@ -163,8 +163,9 @@ void mi_free(void* p) mi_attr_noexcept
   if mi_unlikely(page==NULL) return;
 
   const bool is_local = (_mi_prim_thread_id() == mi_page_thread_id(page));
+  const mi_page_flags_t flags = mi_page_flags(page);
   if mi_likely(is_local) {     // thread-local free?
-    if mi_likely(page->flags.full_aligned == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
+    if mi_likely(flags == 0) { // and it is not a full page (full pages need to move from the full bin), nor has aligned blocks (aligned blocks need to be unaligned)
       // thread-local, aligned, and not a full page
       mi_block_t* const block = (mi_block_t*)p;
       mi_free_block_local(page, block, true /* track stats */, false /* no need to check if the page is full */);
@@ -176,7 +177,7 @@ void mi_free(void* p) mi_attr_noexcept
   }
   else {
     // free-ing in a page owned by a heap in another thread, or on abandoned page (not belonging to a heap)
-    if mi_likely(page->flags.full_aligned == 0) {
+    if mi_likely(flags == 0) {
       // blocks are aligned (and not a full page)
       mi_block_t* const block = (mi_block_t*)p;
       mi_free_block_mt(page,block);
diff --git a/src/init.c b/src/init.c
index 5d4a775a..4fbd50ed 100644
--- a/src/init.c
+++ b/src/init.c
@@ -20,21 +20,21 @@ const mi_page_t _mi_page_empty = {
   0,                     // capacity
   0,                     // reserved capacity
   0,                     // block size shift
-  0,                     // heap tag
-  { 0 },                 // flags
-  false,                 // is_zero
   0,                     // retire_expire
   NULL,                  // local_free
+  MI_ATOMIC_VAR_INIT(0), // xthread_free
+  MI_ATOMIC_VAR_INIT(0), // xflags
   0,                     // block_size
   NULL,                  // page_start
+  0,                     // heap tag
+  false,                 // is_zero
   #if (MI_PADDING || MI_ENCODE_FREELIST)
   { 0, 0 },
   #endif
-  MI_ATOMIC_VAR_INIT(0), // xthread_free
   NULL,                  // xheap
   NULL, NULL,            // next, prev
   NULL,                  // subproc
-  { {{ NULL, 0, 0}}, false, false, false, MI_MEM_NONE } // memid
+  { {{ NULL, 0}}, false, false, false, MI_MEM_NONE }    // memid
 };
 
 #define MI_PAGE_EMPTY() ((mi_page_t*)&_mi_page_empty)
diff --git a/src/os.c b/src/os.c
index c7f464c0..156a655b 100644
--- a/src/os.c
+++ b/src/os.c
@@ -128,7 +128,7 @@ void _mi_os_free_ex(void* addr, size_t size, bool still_committed, mi_memid_t me
     // different base? (due to alignment)
     if (memid.mem.os.base != base) {
       mi_assert(memid.mem.os.base <= addr);
-      mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr);
+      // mi_assert((uint8_t*)memid.mem.os.base + memid.mem.os.alignment >= (uint8_t*)addr);
       base = memid.mem.os.base;
       if (memid.mem.os.size==0) { csize += ((uint8_t*)addr - (uint8_t*)memid.mem.os.base); }
     }
@@ -305,7 +305,7 @@ void* _mi_os_alloc_aligned(size_t size, size_t alignment, bool commit, bool allo
   if (p != NULL) {
     *memid = _mi_memid_create_os(p, size, commit, os_is_zero, os_is_large);
     memid->mem.os.base = os_base;
-    memid->mem.os.alignment = alignment;
+    // memid->mem.os.alignment = alignment;
     memid->mem.os.size += ((uint8_t*)p - (uint8_t*)os_base);  // todo: return from prim_alloc_aligned
   }
   return p;
diff --git a/test/test-stress.c b/test/test-stress.c
index d5f106d5..d46c2484 100644
--- a/test/test-stress.c
+++ b/test/test-stress.c
@@ -40,7 +40,7 @@ static int ITER = 20;
 static int THREADS = 8;
 static int SCALE = 10;
 static int ITER = 10;
-#elif 0
+#elif 1
 static int THREADS = 4;
 static int SCALE = 100;
 static int ITER = 10;
@@ -347,6 +347,8 @@ int main(int argc, char** argv) {
   mi_collect(true);
   mi_debug_show_arenas(true,true,false);
   #endif
+  mi_collect(true);
+  mi_debug_show_arenas(true, true, false);
   // mi_stats_print(NULL);
 #else
   mi_stats_print(NULL); // so we see rss/commit/elapsed
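
The core of this patch is the switch from the `mi_page_flags_s` bit-field union to a single atomic `xflags` word. Below is a minimal, self-contained C11 sketch of that scheme for illustration only: it is not part of the commit, it uses `<stdatomic.h>` directly instead of mimalloc's `mi_atomic_*` wrappers, and the `page_t`, `page_flags`, `page_flags_set`, and `page_is_fast_freeable` names are stand-ins for the real `mi_page_t` helpers.

  #include <stdatomic.h>
  #include <stdbool.h>
  #include <stddef.h>

  #define PAGE_IN_FULL_QUEUE ((size_t)0x01)   // mirrors MI_PAGE_IN_FULL_QUEUE
  #define PAGE_HAS_ALIGNED   ((size_t)0x02)   // mirrors MI_PAGE_HAS_ALIGNED

  typedef struct page_s {
    _Atomic(size_t) xflags;                   // both flags share one word
  } page_t;

  static inline size_t page_flags(page_t* page) {
    // one acquire load reads both flags (as in mi_page_flags)
    return atomic_load_explicit(&page->xflags, memory_order_acquire);
  }

  static inline void page_flags_set(page_t* page, bool set, size_t flag) {
    // set with an atomic OR, clear with an atomic AND of the complement
    if (set) { atomic_fetch_or_explicit(&page->xflags, flag, memory_order_acq_rel); }
    else     { atomic_fetch_and_explicit(&page->xflags, ~flag, memory_order_acq_rel); }
  }

  static inline bool page_is_fast_freeable(page_t* page) {
    // fast-path test: a single `flags == 0` check covers "not in the full
    // queue" and "has no aligned blocks" at once
    return (page_flags(page) == 0);
  }

Because both flags live in one word, the `mi_free` hunk above can load the flags once into a local `flags` and branch on `flags == 0`, where it previously read `page->flags.full_aligned`.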
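
The `mi_debug_show_bitmap` hunk in src/arena.c also replaces the non-standard `itoa` call with manual emission of up to three decimal digits for the chunk index. The following standalone sketch shows that digit logic in isolation; `fmt_idx` and the `main` driver are illustrative names only and do not appear in the patch.

  #include <stdio.h>

  // writes `i` (for i < 1000) into buf as exactly three characters,
  // left-aligned and space padded, like the new index code in mi_debug_show_bitmap
  static size_t fmt_idx(size_t i, char* buf) {
    size_t k = 0;
    if (i < 10)        { buf[k++] = ('0' + (char)i); buf[k++] = ' '; buf[k++] = ' '; }
    else if (i < 100)  { buf[k++] = ('0' + (char)(i/10)); buf[k++] = ('0' + (char)(i%10)); buf[k++] = ' '; }
    else if (i < 1000) { buf[k++] = ('0' + (char)(i/100)); buf[k++] = ('0' + (char)((i%100)/10)); buf[k++] = ('0' + (char)(i%10)); }
    return k;
  }

  int main(void) {
    char buf[4];
    size_t n = fmt_idx(42, buf);
    buf[n] = '\0';
    printf("[%s]\n", buf);   // prints "[42 ]"
    return 0;
  }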