diff --git a/include/mimalloc/atomic.h b/include/mimalloc/atomic.h index 1b306181..55ea3781 100644 --- a/include/mimalloc/atomic.h +++ b/include/mimalloc/atomic.h @@ -134,7 +134,8 @@ static inline intptr_t mi_atomic_subi(_Atomic(intptr_t)*p, intptr_t sub); static inline int64_t mi_atomic_addi64_relaxed(volatile int64_t* p, int64_t add) { return mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed)); } -static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, int64_t add) { +static inline void mi_atomic_void_addi64_relaxed(volatile int64_t* p, const volatile int64_t* padd) { + const int64_t add = mi_atomic_load_relaxed((_Atomic(int64_t)*)padd); if (add != 0) { mi_atomic(fetch_add_explicit)((_Atomic(int64_t)*)p, add, mi_memory_order(relaxed)); } diff --git a/readme.md b/readme.md index 1ea10883..66d0755e 100644 --- a/readme.md +++ b/readme.md @@ -12,7 +12,7 @@ is a general purpose allocator with excellent [performance](#performance) charac Initially developed by Daan Leijen for the runtime systems of the [Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages. -Latest release : `v3.0.2` (beta) (2025-03-06) +Latest release : `v3.0.2` (beta) (2025-03-06). Latest v2 release: `v2.2.2` (2025-03-06). -Latest v1 release: `v1.9.2` (2024-03-06). +Latest v1 release: `v1.9.2` (2025-03-06). @@ -87,12 +87,13 @@ Enjoy! * 2025-03-06, `v1.9.2`, `v2.2.2`, `v3.0.2-beta`: Various small bug and build fixes. Add `mi_options_print`, `mi_arenas_print`, and the experimental `mi_stat_get` and `mi_stat_get_json`. Add `mi_thread_set_in_threadpool` and `mi_heap_set_numa_affinity` (v3 only). Add vcpkg portfile. - On Windows, use `mimalloc.lib` for the static library, and `mimalloc.dll` for the dynamic override (which used to be `mimalloc-override.dll`) -- and use `mimalloc-dll.lib` for the export library of `mimalloc.dll`. Upgrade redirect to v1.3.2. + Upgrade mimalloc-redirect to v1.3.2. 
`MI_OPT_ARCH` is off by default now but still assumes armv8.1-a on arm64 + for fast atomic operations. * 2025-01-03, `v1.8.9`, `v2.1.9`, `v3.0.1-alpha`: Interim release. Support Windows arm64. New [guarded](#guarded) build that can place OS guard pages behind objects to catch buffer overflows as they occur. Many small fixes: build on Windows arm64, cygwin, riscV, and dragonfly; fix Windows static library initialization to account for thread local destructors (in Rust/C++); macOS tag change; macOS TLS slot fix; improve stats; - consistent mimalloc.dll on Windows (instead of mimalloc-override.dll); fix mimalloc-redirect on Win11 H2; + consistent `mimalloc.dll` on Windows (instead of `mimalloc-override.dll`); fix mimalloc-redirect on Win11 H2; add 0-byte to canary; upstream CPython fixes; reduce .bss size; allow fixed TLS slot on Windows for improved performance. * 2024-05-21, `v1.8.7`, `v2.1.7`: Fix build issues on less common platforms. Started upstreaming patches from the CPython [integration](https://github.com/python/cpython/issues/113141#issuecomment-2119255217). Upstream `vcpkg` patches. diff --git a/src/stats.c b/src/stats.c index b40fa474..27dc69d0 100644 --- a/src/stats.c +++ b/src/stats.c @@ -92,23 +92,23 @@ void __mi_stat_adjust_decrease(mi_stat_count_t* stat, size_t amount) { // must be thread safe as it is called from stats_merge -static void mi_stat_count_add(mi_stat_count_t* stat, const mi_stat_count_t* src) { +static void mi_stat_count_add_mt(mi_stat_count_t* stat, const mi_stat_count_t* src) { if (stat==src) return; - mi_atomic_void_addi64_relaxed(&stat->total, src->total); - mi_atomic_void_addi64_relaxed(&stat->current, src->current); + mi_atomic_void_addi64_relaxed(&stat->total, &src->total); + mi_atomic_void_addi64_relaxed(&stat->current, &src->current); // peak scores do really not work across threads .. 
we just add them - mi_atomic_void_addi64_relaxed( &stat->peak, src->peak); + mi_atomic_void_addi64_relaxed( &stat->peak, &src->peak); // or, take the max? // mi_atomic_maxi64_relaxed(&stat->peak, src->peak); } -static void mi_stat_counter_add(mi_stat_counter_t* stat, const mi_stat_counter_t* src) { +static void mi_stat_counter_add_mt(mi_stat_counter_t* stat, const mi_stat_counter_t* src) { if (stat==src) return; - if (src->total!=0) { mi_atomic_addi64_relaxed(&stat->total, src->total); } + mi_atomic_void_addi64_relaxed(&stat->total, &src->total); } -#define MI_STAT_COUNT(stat) mi_stat_count_add(&stats->stat, &src->stat); -#define MI_STAT_COUNTER(stat) mi_stat_counter_add(&stats->stat, &src->stat); +#define MI_STAT_COUNT(stat) mi_stat_count_add_mt(&stats->stat, &src->stat); +#define MI_STAT_COUNTER(stat) mi_stat_counter_add_mt(&stats->stat, &src->stat); // must be thread safe as it is called from stats_merge static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { @@ -119,11 +119,11 @@ static void mi_stats_add(mi_stats_t* stats, const mi_stats_t* src) { #if MI_STAT>1 for (size_t i = 0; i <= MI_BIN_HUGE; i++) { - mi_stat_count_add(&stats->malloc_bins[i], &src->malloc_bins[i]); + mi_stat_count_add_mt(&stats->malloc_bins[i], &src->malloc_bins[i]); } #endif for (size_t i = 0; i <= MI_BIN_HUGE; i++) { - mi_stat_count_add(&stats->page_bins[i], &src->page_bins[i]); + mi_stat_count_add_mt(&stats->page_bins[i], &src->page_bins[i]); } } @@ -318,8 +318,8 @@ static void _mi_stats_print(mi_stats_t* stats, mi_output_fun* out0, void* arg0) mi_stat_print(&stats->malloc_normal, "normal", (stats->malloc_normal_count.total == 0 ? 1 : -1), out, arg); mi_stat_print(&stats->malloc_huge, "huge", (stats->malloc_huge_count.total == 0 ? 
1 : -1), out, arg); mi_stat_count_t total = { 0,0,0 }; - mi_stat_count_add(&total, &stats->malloc_normal); - mi_stat_count_add(&total, &stats->malloc_huge); + mi_stat_count_add_mt(&total, &stats->malloc_normal); + mi_stat_count_add_mt(&total, &stats->malloc_huge); mi_stat_print_ex(&total, "total", 1, out, arg, ""); #endif #if MI_STAT>1