diff --git a/src/arena.c b/src/arena.c
index c0231e4d..b16d5679 100644
--- a/src/arena.c
+++ b/src/arena.c
@@ -1155,15 +1155,20 @@ static bool mi_manage_os_memory_ex2(mi_subproc_t* subproc, void* start, size_t s
 {
   mi_assert(_mi_is_aligned(start,MI_ARENA_SLICE_SIZE));
   mi_assert(start!=NULL);
+  if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); }
   if (start==NULL) return false;
   if (!_mi_is_aligned(start,MI_ARENA_SLICE_SIZE)) {
-    // todo: use alignment in memid to align to slice size first?
-    _mi_warning_message("cannot use OS memory since it is not aligned to %zu KiB (address %p)", MI_ARENA_SLICE_SIZE/MI_KiB, start);
-    return false;
+    // we can align the start since the memid tracks the real base of the memory.
+    void* const aligned_start = _mi_align_up_ptr(start, MI_ARENA_SLICE_SIZE);
+    const size_t diff = (uint8_t*)aligned_start - (uint8_t*)start;
+    if (diff >= size || (size - diff) < MI_ARENA_SLICE_SIZE) {
+      _mi_warning_message("after alignment, the size of the arena becomes too small (memory at %p with size %zu)\n", start, size);
+      return false;
+    }
+    start = aligned_start;
+    size = size - diff;
   }
-
-  if (arena_id != NULL) { *arena_id = _mi_arena_id_none(); }
-
+
   const size_t slice_count = _mi_align_down(size / MI_ARENA_SLICE_SIZE, MI_BCHUNK_BITS);
   if (slice_count > MI_BITMAP_MAX_BIT_COUNT) {  // 16 GiB for now
     // todo: allow larger areas (either by splitting it up in arena's or having larger arena's)
diff --git a/test/main-override-static.c b/test/main-override-static.c
index b16864db..1e0df3ee 100644
--- a/test/main-override-static.c
+++ b/test/main-override-static.c
@@ -1,3 +1,6 @@
+#if _WIN32
+#include <windows.h>
+#endif
 #include <stdio.h>
 #include <stdlib.h>
 #include <assert.h>
@@ -22,12 +25,14 @@ static void negative_stat(void);
 static void alloc_huge(void);
 static void test_heap_walk(void);
 static void test_canary_leak(void);
+static void test_manage_os_memory(void);
 // static void test_large_pages(void);
 
 int main() {
   mi_version();
   mi_stats_reset();
+  test_manage_os_memory();
   // test_large_pages();
 
   // detect double frees and heap corruption
   // double_free1();
@@ -241,6 +246,35 @@ static void test_canary_leak(void) {
   free(p);
 }
 
+#if _WIN32
+static void test_manage_os_memory(void) {
+  size_t size = 256 * 1024 * 1024;
+  void* ptr = VirtualAlloc(NULL, size, MEM_RESERVE | MEM_COMMIT, PAGE_READWRITE);
+  mi_arena_id_t arena_id;
+  mi_manage_os_memory_ex(ptr, size, true /* committed */, true /* pinned */, false /* is zero */, -1 /* numa node */, true /* exclusive */, &arena_id);
+  mi_heap_t* cuda_heap = mi_heap_new_in_arena(arena_id);  // you can do this in any thread
+
+  // now allocate only in the cuda arena
+  void* p1 = mi_heap_malloc(cuda_heap, 8);
+  int* p2 = mi_heap_malloc_tp(cuda_heap, int);
+  *p2 = 42;
+
+  // and maybe set the cuda heap as the default heap? (but be careful: `malloc` will now allocate in the cuda heap as well)
+  {
+    mi_heap_t* prev_default_heap = mi_heap_set_default(cuda_heap);
+    void* p3 = mi_malloc(8);  // allocate in the cuda heap
+    mi_free(p3);
+    mi_heap_set_default(prev_default_heap);  // restore the previous default heap
+  }
+  mi_free(p1);
+  mi_free(p2);
+}
+#else
+static void test_manage_os_memory(void) {
+  // empty
+}
+#endif
+
 
 // Experiment with huge OS pages
 #if 0
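
Not part of the diff: a minimal sketch of the round-up arithmetic the new arena.c code relies on, assuming `_mi_align_up_ptr` behaves like the usual power-of-two align-up (the helper name `align_up_ptr` below is hypothetical, not the mimalloc-internal function).

```c
#include <stdint.h>

// Hypothetical stand-in for _mi_align_up_ptr, assuming `alignment` is a
// power of two: round the address up to the next multiple of `alignment`.
static inline void* align_up_ptr(void* p, size_t alignment) {
  const uintptr_t u = (uintptr_t)p;
  return (void*)((u + alignment - 1) & ~(uintptr_t)(alignment - 1));
}
```

Under that assumption, `diff = aligned_start - start` is at most `MI_ARENA_SLICE_SIZE - 1`, so the new check only rejects regions that would shrink below a single slice after alignment.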
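
The new test only exercises this path on Windows (via `VirtualAlloc`). A POSIX counterpart could look like the sketch below; it is not part of this PR, `test_manage_os_memory_posix` is a hypothetical name, and the `mi_*` calls simply mirror the Windows variant. Since `mmap` returns page-aligned (not necessarily slice-aligned) memory, this would also tend to exercise the new alignment path in `mi_manage_os_memory_ex2`.

```c
#if !defined(_WIN32)
#include <sys/mman.h>
#include <mimalloc.h>

// Hypothetical POSIX variant of test_manage_os_memory (not part of this PR):
// reserve a committed region with mmap and hand it to mimalloc as an
// exclusive arena, then allocate from a heap bound to that arena.
static void test_manage_os_memory_posix(void) {
  const size_t size = 256 * 1024 * 1024;
  void* ptr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ptr == MAP_FAILED) return;
  mi_arena_id_t arena_id;
  if (mi_manage_os_memory_ex(ptr, size, true /* committed */, false /* pinned */, false /* is zero */, -1 /* numa node */, true /* exclusive */, &arena_id)) {
    mi_heap_t* heap = mi_heap_new_in_arena(arena_id);
    void* p = mi_heap_malloc(heap, 8);
    mi_free(p);
    // note: mimalloc manages but does not unmap this memory;
    // the caller still owns the underlying mapping.
  }
}
#endif
```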