diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index 03888b89..7566baa0 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -348,39 +348,24 @@ static inline mi_page_queue_t* mi_page_queue(const mi_heap_t* heap, size_t size)
 }
 
 
+
 //-----------------------------------------------------------
 // Page flags
 //-----------------------------------------------------------
-static inline uintptr_t mi_page_thread_id(const mi_page_t* page) {
-  return (page->flags & ~MI_PAGE_FLAGS_MASK);
-}
-
-static inline void mi_page_init_flags(mi_page_t* page, uintptr_t thread_id) {
-  mi_assert_internal((thread_id & MI_PAGE_FLAGS_MASK) == 0);
-  page->flags = thread_id;
-}
-
-static inline void mi_page_set_thread_id(mi_page_t* page, uintptr_t thread_id) {
-  mi_assert_internal((thread_id & MI_PAGE_FLAGS_MASK) == 0);
-  page->flags = thread_id | (page->flags & MI_PAGE_FLAGS_MASK);
-}
-
 static inline bool mi_page_is_in_full(const mi_page_t* page) {
-  return ((page->flags & 0x01) != 0);
+  return page->flags.in_full;  // one of the two `bool` members overlaid by `flags.value`
 }
 
 static inline void mi_page_set_in_full(mi_page_t* page, bool in_full) {
-  if (in_full) page->flags |= 0x01;
-          else page->flags &= ~0x01;
+  page->flags.in_full = in_full;  // member store only; `has_aligned` is a separate member and is left as is
 }
 
 static inline bool mi_page_has_aligned(const mi_page_t* page) {
-  return ((page->flags & 0x02) != 0);
+  return page->flags.has_aligned;  // one of the two `bool` members overlaid by `flags.value`
 }
 
 static inline void mi_page_set_has_aligned(mi_page_t* page, bool has_aligned) {
-  if (has_aligned) page->flags |= 0x02;
-              else page->flags &= ~0x02;
+  page->flags.has_aligned = has_aligned;  // member store only; `in_full` is a separate member and is left as is
 }
 
 
diff --git a/include/mimalloc-types.h b/include/mimalloc-types.h
index ee489623..9a482aff 100644
--- a/include/mimalloc-types.h
+++ b/include/mimalloc-types.h
@@ -125,12 +125,15 @@ typedef enum mi_delayed_e {
 } mi_delayed_t;
 
 
-// Use the bottom 2 bits for the `in_full` and `has_aligned` flags
-// and the rest for the threadid (we assume tid's never use those lower 2 bits).
-// This allows a single test in `mi_free` to check for unlikely cases
-// (namely, non-local free, aligned free, or freeing in a full page)
-#define MI_PAGE_FLAGS_MASK  ((uintptr_t)0x03)
-typedef uintptr_t mi_page_flags_t;
+// The `in_full` and `has_aligned` page flags are put in a union to efficiently
+// test if both are false (`value == 0`) in the `mi_free` routine.
+typedef union mi_page_flags_u {
+  uint16_t value;      // both flags read as one integer; NOTE(review): assumes `sizeof(bool)==1` so the two flags exactly fill it -- confirm for all targets
+  struct {
+    bool in_full;      // accessed through `mi_page_is_in_full` / `mi_page_set_in_full`
+    bool has_aligned;  // accessed through `mi_page_has_aligned` / `mi_page_set_has_aligned`
+  };
+} mi_page_flags_t;
 
 // Thread free list.
 // We use the bottom 2 bits of the pointer for mi_delayed_t flags
@@ -164,12 +167,12 @@ typedef struct mi_page_s {
   // layout like this to optimize access in `mi_malloc` and `mi_free`
   uint16_t              capacity;          // number of blocks committed
   uint16_t              reserved;          // number of blocks reserved in memory
+  mi_page_flags_t       flags;             // `in_full` and `has_aligned` flags (16 bits)
 
   mi_block_t*           free;              // list of available free blocks (`malloc` allocates from this list)
   #if MI_SECURE
   uintptr_t             cookie;            // random cookie to encode the free lists
   #endif
-  mi_page_flags_t       flags;
   size_t                used;              // number of blocks in use (including blocks in `local_free` and `thread_free`)
 
   mi_block_t*           local_free;        // list of deferred free blocks by this thread (migrates to `free`)
@@ -182,12 +185,11 @@ typedef struct mi_page_s {
   struct mi_page_s*     next;              // next page owned by this thread with the same `block_size`
   struct mi_page_s*     prev;              // previous page owned by this thread with the same `block_size`
 
-// improve page index calculation
-#if (MI_INTPTR_SIZE==8 && MI_SECURE==0)
-  // void*                 padding[1];        // 12 words on 64-bit
-#elif MI_INTPTR_SIZE==4
-  // void*                 padding[1];         // 12 words on 32-bit
-#endif
+  // improve page index calculation
+  // without padding: 10 words on 64-bit, 11 on 32-bit. Secure adds one word
+  #if (MI_INTPTR_SIZE==8 && MI_SECURE>0) || (MI_INTPTR_SIZE==4 && MI_SECURE==0)
+  void*                 padding[1];        // 12 words on 64-bit in secure mode, 12 words on 32-bit plain
+  #endif
 } mi_page_t;
 
 
@@ -212,7 +214,7 @@ typedef mi_page_t mi_slice_t;
 typedef struct mi_segment_s {
   struct mi_segment_s* next;
   struct mi_segment_s* prev;
-  struct mi_segment_s* abandoned_next;  // abandoned segment stack: `used == abandoned`
+  volatile struct mi_segment_s* abandoned_next;  // next segment in the abandoned list (written with atomic operations); NOTE(review): `volatile` here qualifies the pointed-to segment, not this pointer field -- confirm that is intended
   size_t          abandoned;   // abandoned pages (i.e. the original owning thread stopped) (`abandoned <= used`)
   size_t          used;        // count of pages in use
   size_t          segment_size;// for huge pages this may be different from `MI_SEGMENT_SIZE`
diff --git a/include/mimalloc.h b/include/mimalloc.h
index fb41b037..7000cd42 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -53,8 +53,8 @@ terms of the MIT license. A copy of the license can be found in the file
   #else
   #define mi_attr_alloc_size(s)       __attribute__((alloc_size(s)))
   #define mi_attr_alloc_size2(s1,s2)  __attribute__((alloc_size(s1,s2)))
-  #define mi_cdecl                    // leads to warnings... __attribute__((cdecl))
   #endif
+  #define mi_cdecl                    // leads to warnings... __attribute__((cdecl))
 #else
   #define mi_decl_thread              __thread
   #define mi_decl_export
diff --git a/src/alloc.c b/src/alloc.c
index 8fd8c4b3..91c13c31 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -225,19 +225,19 @@ void mi_free(void* p) mi_attr_noexcept
   }
 #endif
 
+  const uintptr_t tid = _mi_thread_id();
   mi_page_t* const page = _mi_segment_page_of(segment, p);
   
 #if (MI_STAT>1)
   mi_heap_t* heap = mi_heap_get_default();
-  mi_heap_stat_decrease( heap, malloc, mi_usable_size(p));
+  mi_heap_stat_decrease(heap, malloc, mi_usable_size(p));
   if (page->block_size <= MI_LARGE_OBJ_SIZE_MAX) {
-    mi_heap_stat_decrease( heap, normal[_mi_bin(page->block_size)], 1);
+    mi_heap_stat_decrease(heap, normal[_mi_bin(page->block_size)], 1);
   }
   // huge page stat is accounted for in `_mi_page_retire`
 #endif
-  
-  uintptr_t tid = _mi_thread_id();
-  if (mi_likely(page->flags == tid)) {  
+
+  if (mi_likely(tid == segment->thread_id && page->flags.value == 0)) {  // the thread id matches and it is not a full page, nor has aligned blocks
     // local, and not full or aligned
     mi_block_t* block = (mi_block_t*)p;
     mi_block_set_next(page, block, page->local_free);
@@ -247,7 +247,7 @@ void mi_free(void* p) mi_attr_noexcept
   }
   else {
     // non-local, aligned blocks, or a full page; use the more generic path
-    mi_free_generic(segment, page, tid == mi_page_thread_id(page), p);
+    mi_free_generic(segment, page, tid == segment->thread_id, p);
   }
 }
 
diff --git a/src/init.c b/src/init.c
index ab9b1aa9..b0fa60cc 100644
--- a/src/init.c
+++ b/src/init.c
@@ -13,15 +13,16 @@ terms of the MIT license. A copy of the license can be found in the file
 // Empty page used to initialize the small free pages array
 const mi_page_t _mi_page_empty = {
   0, false, false, false, 0, 0,
+  { 0 },
   NULL,    // free
   #if MI_SECURE
   0,
   #endif
-  0, 0, // flags, used
+  0,       // used
   NULL, 0, 0,
   0, NULL, NULL, NULL
-  #if (MI_INTPTR_SIZE==8 && MI_SECURE==0)
-  // , { NULL }
+  #if (MI_INTPTR_SIZE==8 && MI_SECURE>0) || (MI_INTPTR_SIZE==4 && MI_SECURE==0)
+  , { NULL } // padding
   #endif
 };
 
@@ -360,7 +361,7 @@ void mi_thread_init(void) mi_attr_noexcept
     pthread_setspecific(mi_pthread_key, (void*)(_mi_thread_id()|1)); // set to a dummy value so that `mi_pthread_done` is called
   #endif
 
-  #if (MI_DEBUG>0) // not in release mode as that leads to crashes on Windows dynamic override
+  #if (MI_DEBUG>0) && !defined(NDEBUG) // not in release mode as that leads to crashes on Windows dynamic override
   _mi_verbose_message("thread init: 0x%zx\n", _mi_thread_id());
   #endif
 }
diff --git a/src/os.c b/src/os.c
index bcce5d7d..a1b6cdf3 100644
--- a/src/os.c
+++ b/src/os.c
@@ -217,10 +217,23 @@ static void* mi_win_virtual_allocx(void* addr, size_t size, size_t try_alignment
     }
     else {
       // else fall back to regular large OS pages
-      _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) page instead (error %lx)\n", err);
+      _mi_warning_message("unable to allocate huge (1GiB) page, trying large (2MiB) pages instead (error %lx)\n", err);
     }
   }
-  
+#endif
+#if (MI_INTPTR_SIZE >= 8)
+  // on 64-bit systems, try the virtual address area after 4TiB for 4MiB aligned allocations;
+  // the address is only a hint: if that range cannot be allocated we fall through to the paths below
+  static volatile intptr_t aligned_base = ((intptr_t)4 << 40); // starting at 4TiB
+  if (addr == NULL && try_alignment > 0 &&
+      try_alignment <= MI_SEGMENT_SIZE && (size%MI_SEGMENT_SIZE) == 0)
+  {
+    intptr_t hint = mi_atomic_add(&aligned_base, size) - size;
+    void* hinted = (hint%try_alignment == 0 ? VirtualAlloc((void*)hint, size, flags, PAGE_READWRITE) : NULL);
+    if (hinted != NULL) return hinted;  // NULL (misaligned hint, or the range is unavailable) must not fail the whole allocation
+  }
+#endif
+#if defined(MEM_EXTENDED_PARAMETER_TYPE_BITS)  
   // on modern Windows try use VirtualAlloc2 for aligned allocation
   if (try_alignment > 0 && (try_alignment % _mi_os_page_size()) == 0 && pVirtualAlloc2 != NULL) {
     MEM_ADDRESS_REQUIREMENTS reqs = { 0 };
@@ -539,7 +552,7 @@ static bool mi_os_commitx(void* addr, size_t size, bool commit, bool conservativ
   // page align in the range, commit liberally, decommit conservative
   size_t csize;
   void* start = mi_os_page_align_areax(conservative, addr, size, &csize);
-  if (csize == 0) return true;
+  if (csize == 0 || mi_os_is_huge_reserved(addr)) return true;
   int err = 0;
   if (commit) {
     _mi_stat_increase(&stats->committed, csize);
@@ -591,7 +604,7 @@ static bool mi_os_resetx(void* addr, size_t size, bool reset, mi_stats_t* stats)
   // page align conservatively within the range
   size_t csize;
   void* start = mi_os_page_align_area_conservative(addr, size, &csize);
-  if (csize == 0) return true;
+  if (csize == 0 || mi_os_is_huge_reserved(addr)) return true;
   if (reset) _mi_stat_increase(&stats->reset, csize);
         else _mi_stat_decrease(&stats->reset, csize);
   if (!reset) return true; // nothing to do on unreset!
@@ -659,7 +672,9 @@ static  bool mi_os_protectx(void* addr, size_t size, bool protect) {
   size_t csize = 0;
   void* start = mi_os_page_align_area_conservative(addr, size, &csize);
   if (csize == 0) return false;
-
+  if (mi_os_is_huge_reserved(addr)) {
+	_mi_warning_message("cannot mprotect memory allocated in huge OS pages\n");
+  }
   int err = 0;
 #ifdef _WIN32
   DWORD oldprotect = 0;
@@ -779,7 +794,7 @@ int mi_reserve_huge_os_pages( size_t pages, double max_secs ) mi_attr_noexcept
   // Allocate one page at the time but try to place them contiguously
   // We allocate one page at the time to be able to abort if it takes too long
   double start_t = _mi_clock_start();
-  uint8_t* start = (uint8_t*)((uintptr_t)8 << 40); // 8TiB virtual start address
+  uint8_t* start = (uint8_t*)((uintptr_t)16 << 40); // 16TiB virtual start address
   uint8_t* addr = start;  // current top of the allocations
   for (size_t page = 0; page < pages; page++, addr += MI_HUGE_OS_PAGE_SIZE ) {
     // allocate lorgu pages
diff --git a/src/page-queue.c b/src/page-queue.c
index c53edf82..0fb46ffa 100644
--- a/src/page-queue.c
+++ b/src/page-queue.c
@@ -130,6 +130,7 @@ extern inline uint8_t _mi_bin(size_t size) {
     // - adjust with 3 because we use do not round the first 8 sizes
     //   which each get an exact bin
     bin = ((b << 2) + (uint8_t)((wsize >> (b - 2)) & 0x03)) - 3;
+    mi_assert_internal(bin < MI_BIN_HUGE);
   }
   mi_assert_internal(bin > 0 && bin <= MI_BIN_HUGE);
   return bin;
diff --git a/src/page.c b/src/page.c
index 7ed90232..7e7bd5aa 100644
--- a/src/page.c
+++ b/src/page.c
@@ -75,9 +75,7 @@ static bool mi_page_is_valid_init(mi_page_t* page) {
   mi_segment_t* segment = _mi_page_segment(page);
   uint8_t* start = _mi_page_start(segment,page,NULL);
   mi_assert_internal(start == _mi_segment_page_start(segment,page,NULL));
-  mi_assert_internal(segment->thread_id==0 || segment->thread_id == mi_page_thread_id(page));
-  //mi_assert_internal(start + page->capacity*page->block_size == page->top);
-
+  
   mi_assert_internal(mi_page_list_is_valid(page,page->free));
   mi_assert_internal(mi_page_list_is_valid(page,page->local_free));
 
diff --git a/src/segment.c b/src/segment.c
index 6a3fe87b..64b9f4ac 100644
--- a/src/segment.c
+++ b/src/segment.c
@@ -594,7 +594,6 @@ static mi_page_t* mi_segment_page_alloc(mi_page_kind_t page_kind, size_t require
   // initialize the page and return
   mi_assert_internal(segment->thread_id == _mi_thread_id());
   segment->used++;
-  mi_page_init_flags(page, segment->thread_id);
   return page;
 }
 
@@ -729,21 +728,23 @@ static void mi_segment_abandon(mi_segment_t* segment, mi_segments_tld_t* tld) {
   }
 
   // add it to the abandoned list
-  segment->thread_id = 0;
-  do {
-    segment->abandoned_next = (mi_segment_t*)abandoned;
-  } while (!mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, segment, segment->abandoned_next));
-  mi_atomic_increment(&abandoned_count);
-  _mi_stat_increase(&tld->stats->segments_abandoned,1);
+  _mi_stat_increase(&tld->stats->segments_abandoned, 1);
   mi_segments_track_size(-((long)segment->segment_size), tld);
+
+  segment->thread_id = 0;
+  mi_segment_t* next;
+  do {
+    next = (mi_segment_t*)abandoned;
+    mi_atomic_write_ptr((volatile void**)&segment->abandoned_next, next);
+  } while (!mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, segment, next));
+  mi_atomic_increment(&abandoned_count);
 }
 
 void _mi_segment_page_abandon(mi_page_t* page, mi_segments_tld_t* tld) {
-  mi_assert(page != NULL && mi_page_thread_id(page) != 0);
+  mi_assert(page != NULL);
   mi_segment_t* segment = _mi_page_segment(page);
   mi_assert_expensive(mi_segment_is_valid(segment,tld));
-  segment->abandoned++;
-  mi_page_set_thread_id(page, 0);
+  segment->abandoned++;  
   _mi_stat_increase(&tld->stats->pages_abandoned, 1);
   mi_assert_internal(segment->abandoned <= segment->used);
   if (segment->used == segment->abandoned) {
@@ -769,7 +770,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen
     mi_segment_t* segment;
     do {
       segment = (mi_segment_t*)abandoned;
-    } while(segment != NULL && !mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, segment->abandoned_next, segment));
+    } while(segment != NULL && !mi_atomic_compare_exchange_ptr((volatile void**)&abandoned, (mi_segment_t*)segment->abandoned_next, segment));
     if (segment==NULL) break; // stop early if no more segments available
 
     // got it.
@@ -811,7 +812,6 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen
         }
         else {
           // otherwise reclaim it
-          mi_page_set_thread_id(page,segment->thread_id);
           _mi_page_reclaim(heap,page);
         }
       }
@@ -832,7 +832,7 @@ bool _mi_segment_try_reclaim_abandoned( mi_heap_t* heap, bool try_all, mi_segmen
 
 
 /* -----------------------------------------------------------
-   Small page allocation
+   Huge page allocation
 ----------------------------------------------------------- */
 
 static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld, mi_os_tld_t* os_tld)
@@ -841,6 +841,7 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld
   if (segment == NULL) return NULL;
   mi_assert_internal(segment->segment_size - segment->segment_info_size >= size);
   segment->used = 1;
+
   mi_page_t* page = mi_slice_to_page(&segment->slices[0]);
   mi_assert_internal(page->block_size > 0 && page->slice_count > 0);
   size_t initial_count = page->slice_count;
@@ -857,7 +858,6 @@ static mi_page_t* mi_segment_huge_page_alloc(size_t size, mi_segments_tld_t* tld
     slice->block_size = 1;
     slice->slice_count = 0;
   }
-  mi_page_init_flags(page,segment->thread_id);
   return page;
 }