diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h
index 971b374a..fe79fbca 100644
--- a/include/mimalloc/atomic.h
+++ b/include/mimalloc/atomic.h
@@ -275,6 +275,15 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub) {
   return (intptr_t)mi_atomic_addi(p, -sub);
 }
 
+typedef _Atomic(uintptr_t) mi_atomic_once_t;  // one-shot flag: 0 = not yet claimed, non-zero = claimed
+
+// Returns true only on the first invocation for a given `once` (which must start out as 0).
+static inline bool mi_atomic_once( mi_atomic_once_t* once ) {
+  if (mi_atomic_load_relaxed(once) != 0) return false;     // quick test: already claimed, so skip the CAS
+  uintptr_t expected = 0;
+  return mi_atomic_cas_strong_acq_rel(once, &expected, 1); // try to set to 1; presumably fails if another caller set it first
+}
+
 // Yield
 #if defined(__cplusplus)
 #include <thread>
diff --git a/include/mimalloc/prim.h b/include/mimalloc/prim.h
index 97d8b45d..68f0871e 100644
--- a/include/mimalloc/prim.h
+++ b/include/mimalloc/prim.h
@@ -104,12 +104,13 @@ void _mi_prim_thread_associate_default_heap(mi_heap_t* heap);
 
 
 //-------------------------------------------------------------------
-// Thread id
+// Thread id: `_mi_prim_thread_id()`
 // 
 // Getting the thread id should be performant as it is called in the
 // fast path of `_mi_free` and we specialize for various platforms as
 // inlined definitions. Regular code should call `init.c:_mi_thread_id()`.
-// We only require _mi_prim_thread_id() to return a unique id for each thread.
+// We only require _mi_prim_thread_id() to return a unique,
+// non-zero id for each thread.
 //-------------------------------------------------------------------
 
 static inline mi_threadid_t _mi_prim_thread_id(void) mi_attr_noexcept;
diff --git a/src/init.c b/src/init.c
index f82aa534..fcfcb659 100644
--- a/src/init.c
+++ b/src/init.c
@@ -409,15 +409,24 @@ void mi_thread_done(void) mi_attr_noexcept {
   _mi_thread_done(NULL);
 }
 
+#include <unistd.h>
+
 void _mi_thread_done(mi_heap_t* heap) 
 {
-  mi_atomic_decrement_relaxed(&thread_count);
-  _mi_stat_decrease(&_mi_stats_main.threads, 1);
-
+  // calling with NULL implies using the default heap
   if (heap == NULL) { 
     heap = mi_prim_get_default_heap(); 
     if (heap == NULL) return;
   }
+
+  // prevent re-entrancy through heap_done/heap_set_default_direct (issue #699)
+  if (!mi_heap_is_initialized(heap)) {
+    return; 
+  }
+
+  // adjust stats
+  mi_atomic_decrement_relaxed(&thread_count);
+  _mi_stat_decrease(&_mi_stats_main.threads, 1);
   
   // check thread-id as on Windows shutdown with FLS the main (exit) thread may call this on thread-local heaps...
   if (heap->thread_id != _mi_thread_id()) return;
@@ -540,7 +549,8 @@ static void mi_detect_cpu_features(void) {
 // Initialize the process; called by thread_init or the process loader
 void mi_process_init(void) mi_attr_noexcept {
   // ensure we are called once
-  if (_mi_process_is_initialized) return;
+  static mi_atomic_once_t process_init;
+  if (!mi_atomic_once(&process_init)) return;
   _mi_verbose_message("process init: 0x%zx\n", _mi_thread_id());
   _mi_process_is_initialized = true;
   mi_process_setup_auto_thread_done();
diff --git a/src/os.c b/src/os.c
index cd460f68..5a0035b9 100644
--- a/src/os.c
+++ b/src/os.c
@@ -519,14 +519,15 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
   // We allocate one page at the time to be able to abort if it takes too long
   // or to at least allocate as many as available on the system.
   mi_msecs_t start_t = _mi_clock_start();
-  size_t page;
-  for (page = 0; page < pages; page++) {
+  size_t page = 0;
+  while (page < pages) {
     // allocate a page
     void* addr = start + (page * MI_HUGE_OS_PAGE_SIZE);
     void* p = NULL;
     int err = _mi_prim_alloc_huge_os_pages(addr, MI_HUGE_OS_PAGE_SIZE, numa_node, &p);
     if (err != 0) {
-      _mi_warning_message("unable to allocate huge OS page (error: %d (0x%d), address: %p, size: %zx bytes)", err, err, addr, MI_HUGE_OS_PAGE_SIZE);
+      _mi_warning_message("unable to allocate huge OS page (error: %d (0x%d), address: %p, size: %zx bytes)\n", err, err, addr, MI_HUGE_OS_PAGE_SIZE);
+      break;
     }
 
     // Did we succeed at a contiguous address?
@@ -540,6 +541,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
     }
 
     // success, record it
+    page++;  // increase before timeout check (see issue #711)
     _mi_stat_increase(&_mi_stats_main.committed, MI_HUGE_OS_PAGE_SIZE);
     _mi_stat_increase(&_mi_stats_main.reserved, MI_HUGE_OS_PAGE_SIZE);
 
@@ -553,7 +555,7 @@ void* _mi_os_alloc_huge_os_pages(size_t pages, int numa_node, mi_msecs_t max_mse
         }
       }
       if (elapsed > max_msecs) {
-        _mi_warning_message("huge page allocation timed out\n");
+        _mi_warning_message("huge OS page allocation timed out (after allocating %zu page(s))\n", page);
         break;
       }
     }
diff --git a/src/prim/osx/alloc-override-zone.c b/src/prim/osx/alloc-override-zone.c
index a517ddea..80bcfa93 100644
--- a/src/prim/osx/alloc-override-zone.c
+++ b/src/prim/osx/alloc-override-zone.c
@@ -420,7 +420,7 @@ __attribute__((constructor(0)))
 #else
 __attribute__((constructor))      // seems not supported by g++-11 on the M1
 #endif
-static void _mi_macos_override_malloc() {
+static void _mi_macos_override_malloc(void) {
   malloc_zone_t* purgeable_zone = NULL;
 
   #if defined(MAC_OS_X_VERSION_10_6) && (MAC_OS_X_VERSION_MAX_ALLOWED >= MAC_OS_X_VERSION_10_6)
diff --git a/src/prim/unix/prim.c b/src/prim/unix/prim.c
index 0ac69f1a..0ca9bc64 100644
--- a/src/prim/unix/prim.c
+++ b/src/prim/unix/prim.c
@@ -629,8 +629,8 @@ bool _mi_prim_getenv(const char* name, char* result, size_t result_size) {
   if (len == 0) return false;
   char** env = mi_get_environ();
   if (env == NULL) return false;
-  // compare up to 256 entries
-  for (int i = 0; i < 256 && env[i] != NULL; i++) {
+  // compare up to 10000 entries
+  for (int i = 0; i < 10000 && env[i] != NULL; i++) {
     const char* s = env[i];
     if (_mi_strnicmp(name, s, len) == 0 && s[len] == '=') { // case insensitive
       // found it
diff --git a/test/main-override.cpp b/test/main-override.cpp
index 4f4799f8..d2814b68 100644
--- a/test/main-override.cpp
+++ b/test/main-override.cpp
@@ -47,7 +47,7 @@ static void test_stl_allocators();
 int main() {
   // mi_stats_reset();  // ignore earlier allocations
   
-  test_std_string();
+  // test_std_string();
   // heap_thread_free_huge();
   /*
    heap_thread_free_huge();
@@ -65,6 +65,7 @@ int main() {
   // test_large_migrate();
   
   //fail_aslr();
+  // bench_alloc_large();
   // mi_stats_print(NULL);
   return 0;
 }