diff --git a/include/mimalloc-internal.h b/include/mimalloc-internal.h
index 97b96885..90bf6f82 100644
--- a/include/mimalloc-internal.h
+++ b/include/mimalloc-internal.h
@@ -32,6 +32,26 @@ terms of the MIT license. A copy of the license can be found in the file
 #define mi_decl_cache_align
 #endif
 
+/* -----------------------------------------------------------
+  Padding
+----------------------------------------------------------- */
+#if (MI_PADDING)
+#define MI_EXTRA_PADDING_XPARAM  , size_t __extra_padding
+#define MI_EXTRA_PADDING_XARG    , __extra_padding
+#define MI_EXTRA_PADDING_ARG     __extra_padding
+static inline size_t mi_extra_padding(void) {
+  return MI_PADDING_SIZE + mi_option_get(mi_option_debug_extra_padding);
+}
+#else
+#define MI_EXTRA_PADDING_XPARAM
+#define MI_EXTRA_PADDING_XARG
+#define MI_EXTRA_PADDING_ARG     0
+static inline size_t mi_extra_padding(void) {
+  return 0;
+}
+#endif
+
+
 // "options.c"
 void       _mi_fputs(mi_output_fun* out, void* arg, const char* prefix, const char* message);
@@ -90,7 +110,7 @@ void       _mi_abandoned_await_readers(void);
 
 // "page.c"
-void*      _mi_malloc_generic(mi_heap_t* heap, size_t size MI_SOURCE_XPARAM) mi_attr_noexcept mi_attr_malloc;
+void*      _mi_malloc_generic(mi_heap_t* heap, size_t size MI_EXTRA_PADDING_XPARAM MI_SOURCE_XPARAM) mi_attr_noexcept mi_attr_malloc;
 
 void       _mi_page_retire(mi_page_t* page);  // free the page if there are no other pages with many free blocks
 void       _mi_page_unfull(mi_page_t* page);
@@ -123,7 +143,7 @@ mi_msecs_t  _mi_clock_end(mi_msecs_t start);
 mi_msecs_t  _mi_clock_start(void);
 
 // "alloc.c"
-void*       _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size MI_SOURCE_XPARAM) mi_attr_noexcept;  // called from `_mi_malloc_generic`
+void*       _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size MI_EXTRA_PADDING_XPARAM MI_SOURCE_XPARAM) mi_attr_noexcept;  // called from `_mi_malloc_generic`
 mi_block_t* _mi_page_ptr_unalign(const mi_segment_t* segment, const mi_page_t* page, const void* p);
 bool        _mi_free_delayed_block(mi_block_t* block);
 void        _mi_block_zero_init(const mi_page_t* page, void* p, size_t size);
diff --git a/include/mimalloc.h b/include/mimalloc.h
index b7a388c2..f79a4ff6 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -338,6 +338,7 @@ typedef enum mi_option_e {
   mi_option_eager_commit_delay,
   mi_option_reset_delay,
   mi_option_use_numa_nodes,
+  mi_option_debug_extra_padding,
   mi_option_os_tag,
   mi_option_max_errors,
   _mi_option_last,
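The `MI_EXTRA_PADDING_XPARAM`/`MI_EXTRA_PADDING_XARG`/`MI_EXTRA_PADDING_ARG` trio follows the same convention as the existing `MI_SOURCE_XPARAM`/`MI_SOURCE_XARG` macros: the leading comma is part of the macro, so when `MI_PADDING` is disabled the extra parameter vanishes from every signature and call site without leaving a stray comma behind. A minimal standalone sketch of the pattern (hypothetical names, not mimalloc code):

#include <stdio.h>

// Stand-in for the mimalloc macro pattern: the comma lives inside the
// macro, so the parameter list stays well-formed in both configurations.
#if defined(WITH_PADDING)
  #define EXTRA_XPARAM  , size_t extra
  #define EXTRA_XARG    , extra
#else
  #define EXTRA_XPARAM
  #define EXTRA_XARG
#endif

static void* alloc_impl(size_t size EXTRA_XPARAM) {
#if defined(WITH_PADDING)
  printf("allocating %zu + %zu padding bytes\n", size, extra);
#else
  printf("allocating %zu bytes\n", size);
#endif
  return NULL;
}

int main(void) {
#if defined(WITH_PADDING)
  size_t extra = 16;
#endif
  alloc_impl(42 EXTRA_XARG);  // expands to alloc_impl(42, extra) or alloc_impl(42)
  return 0;
}

Compiled with -DWITH_PADDING the extra argument is threaded through; without it, the call compiles down to the unpadded signature at zero cost.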
diff --git a/src/alloc-aligned.c b/src/alloc-aligned.c
index 44d73f6b..14ed76c6 100644
--- a/src/alloc-aligned.c
+++ b/src/alloc-aligned.c
@@ -24,7 +24,8 @@ static mi_decl_restrict void* mi_base_malloc_zero_aligned_at(mi_heap_t* const he
   const uintptr_t align_mask = alignment-1;  // for any x, `(x & align_mask) == (x % alignment)`
 
   // try if there is a small block available with just the right alignment
-  const size_t padsize = size + MI_PADDING_SIZE;
+  const size_t __extra_padding = mi_extra_padding();
+  const size_t padsize = size + __extra_padding;  // cannot overflow since size <= PTRDIFF_MAX
   if (mi_likely(padsize <= MI_SMALL_SIZE_MAX)) {
     mi_page_t* page = _mi_heap_get_free_small_page(heap,padsize);
     const bool is_aligned = (((uintptr_t)page->free+offset) & align_mask)==0;
@@ -33,7 +34,7 @@ static mi_decl_restrict void* mi_base_malloc_zero_aligned_at(mi_heap_t* const he
       #if MI_STAT>1
       mi_heap_stat_increase( heap, malloc, size);
       #endif
-      void* p = _mi_page_malloc(heap,page,padsize MI_SOURCE_XARG); // TODO: inline _mi_page_malloc
+      void* p = _mi_page_malloc(heap,page,padsize MI_EXTRA_PADDING_XARG MI_SOURCE_XARG); // TODO: inline _mi_page_malloc
       mi_assert_internal(p != NULL);
       mi_assert_internal(((uintptr_t)p + offset) % alignment == 0);
       if (zero) _mi_block_zero_init(page,p,size);
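As an aside, the `align_mask` identity quoted in the context above only holds when `alignment` is a power of two (mimalloc rejects other alignments before reaching this point). A small standalone illustration, with a hypothetical `is_aligned_at` helper:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

// For a power-of-two `alignment`, masking with (alignment-1) equals the
// modulo operation the code comment refers to.
static int is_aligned_at(uintptr_t p, size_t offset, size_t alignment) {
  const uintptr_t align_mask = alignment - 1;
  assert(alignment != 0 && (alignment & align_mask) == 0);  // power-of-two check
  return ((p + offset) & align_mask) == 0;  // same as (p + offset) % alignment == 0
}

int main(void) {
  assert(is_aligned_at(64, 0, 16));
  assert(!is_aligned_at(65, 0, 16));
  assert(is_aligned_at(60, 4, 16));  // 60 + 4 = 64 is 16-aligned at the offset
  return 0;
}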
diff --git a/src/alloc.c b/src/alloc.c
index fbd88166..ba0a5b5d 100644
--- a/src/alloc.c
+++ b/src/alloc.c
@@ -19,17 +19,19 @@ terms of the MIT license. A copy of the license can be found in the file
 #undef MI_IN_ALLOC_C
 
-// ------------------------------------------------------
+// ----------------------------------------------------------------------------------------
 // Allocation
-// ------------------------------------------------------
+// Eventually all allocations pass through `mi_heap_malloc` and `mi_heap_malloc_small`,
+// and those end up at `_mi_page_malloc` (sometimes via `_mi_malloc_generic`).
+// ----------------------------------------------------------------------------------------
 
 // Fast allocation in a page: just pop from the free list.
 // Fall back to generic allocation only if the list is empty.
-extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size MI_SOURCE_XPARAM) mi_attr_noexcept {
+extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t size MI_EXTRA_PADDING_XPARAM MI_SOURCE_XPARAM) mi_attr_noexcept {
   mi_assert_internal(page->xblock_size==0||mi_page_block_size(page) >= size);
   mi_block_t* block = page->free;
   if (mi_unlikely(block == NULL)) {
-    return _mi_malloc_generic(heap, size MI_SOURCE_XARG); // slow path
+    return _mi_malloc_generic(heap, size MI_EXTRA_PADDING_XARG MI_SOURCE_XARG); // slow path
   }
   mi_assert_internal(block != NULL && _mi_ptr_page(block) == page);
   // pop from the free list
@@ -50,13 +52,13 @@ extern inline void* _mi_page_malloc(mi_heap_t* heap, mi_page_t* page, size_t siz
   #endif
   #if defined(MI_PADDING) && defined(MI_ENCODE_FREELIST)
   mi_padding_t* const padding = (mi_padding_t*)((uint8_t*)block + mi_page_usable_block_size(page));
-  ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - MI_PADDING_SIZE));
-  mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - MI_PADDING_SIZE + delta));
+  ptrdiff_t delta = ((uint8_t*)padding - (uint8_t*)block - (size - __extra_padding));
+  mi_assert_internal(delta >= 0 && mi_page_usable_block_size(page) >= (size - __extra_padding + delta));
   padding->canary = (uint32_t)(mi_ptr_encode(page,block,page->keys));
   padding->delta  = (uint32_t)(delta);
   padding->source = __mi_source;
   uint8_t* fill = (uint8_t*)padding - delta;
-  const size_t maxpad = (delta > MI_MAX_ALIGN_SIZE ? MI_MAX_ALIGN_SIZE : delta); // set at most N initial padding bytes
+  const size_t maxpad = (delta > 4096 ? 4096 : delta); // initialize at most 4096 of the padding bytes
   for (size_t i = 0; i < maxpad; i++) { fill[i] = MI_DEBUG_PADDING; }
   #endif
   return block;
@@ -68,13 +70,17 @@ MI_ALLOC_API1(inline mi_decl_restrict void*, malloc_small, mi_heap_t*, heap, siz
   mi_assert(heap!=NULL);
   mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
   mi_assert(size <= MI_SMALL_SIZE_MAX);
+  const size_t __extra_padding = mi_extra_padding();
   #if (MI_PADDING)
   if (size == 0) { size = sizeof(void*); }
+  if ((size + __extra_padding) > MI_SMALL_SIZE_MAX) {
+    return MI_SOURCE_ARG(mi_heap_malloc, heap, size); // padded size is no longer small: use the full malloc (in case we were called directly)
+  }
   #endif
-  mi_page_t* page = _mi_heap_get_free_small_page(heap,size + MI_PADDING_SIZE);
-  void* p = _mi_page_malloc(heap, page, size + MI_PADDING_SIZE MI_SOURCE_XARG);
+  mi_page_t* page = _mi_heap_get_free_small_page(heap,size + __extra_padding);
+  void* p = _mi_page_malloc(heap, page, size + __extra_padding MI_EXTRA_PADDING_XARG MI_SOURCE_XARG);
   mi_assert_internal(p==NULL || mi_usable_size(p) >= size);
   #if MI_STAT>1
   if (p != NULL) {
@@ -89,13 +95,14 @@ MI_ALLOC_API1(inline mi_decl_restrict void*, malloc_small, mi_heap_t*, heap, siz
 
 // The main allocation function
 MI_ALLOC_API1(inline mi_decl_restrict void*, malloc, mi_heap_t*, heap, size_t, size)
 {
-  if (mi_likely(size <= MI_SMALL_SIZE_MAX)) {
+  const size_t __extra_padding = mi_extra_padding();
+  if (mi_likely(__extra_padding < MI_SMALL_SIZE_MAX && size <= MI_SMALL_SIZE_MAX - __extra_padding)) { // the first test guards the subtraction against underflow
     return mi_base_malloc_small(heap, size MI_SOURCE_XARG);
   }
   else {
     mi_assert(heap!=NULL);
-    mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
-    void* const p = _mi_malloc_generic(heap, size + MI_PADDING_SIZE MI_SOURCE_XARG); // note: size can overflow but it is detected in malloc_generic
+    mi_assert(heap->thread_id == 0 || heap->thread_id == _mi_thread_id()); // heaps are thread local
+    void* const p = _mi_malloc_generic(heap, size + __extra_padding MI_EXTRA_PADDING_XARG MI_SOURCE_XARG); // note: size + __extra_padding can overflow but it is detected in malloc_generic
     mi_assert_internal(p == NULL || mi_usable_size(p) >= size);
     #if MI_STAT>1
     if (p != NULL) {
diff --git a/src/options.c b/src/options.c
index 3017aa9c..45e619f4 100644
--- a/src/options.c
+++ b/src/options.c
@@ -77,6 +77,7 @@ static mi_option_desc_t options[_mi_option_last] =
 #endif
   { 100, UNINIT, MI_OPTION(reset_delay) },         // reset delay in milli-seconds
   { 0,   UNINIT, MI_OPTION(use_numa_nodes) },      // 0 = use available numa nodes, otherwise use at most N nodes.
+  { 160, UNINIT, MI_OPTION(debug_extra_padding) }, // amount of extra padding per allocated block, in bytes
   { 100, UNINIT, MI_OPTION(os_tag) },              // only apple specific for now but might serve more or less related purpose
   { 16,  UNINIT, MI_OPTION(max_errors) }           // maximum errors that are output
 };
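With the options-table entry in place, the new padding amount can be controlled like any other mimalloc option: programmatically via `mi_option_set`, or through the environment variable mimalloc derives from the option name (here `MIMALLOC_DEBUG_EXTRA_PADDING`). A usage sketch, assuming a debug build with `MI_PADDING` enabled so that the padding check actually runs on free:

#include <mimalloc.h>
#include <string.h>

int main(void) {
  // Set before the first allocation so every block gets the extra padding;
  // equivalently: MIMALLOC_DEBUG_EXTRA_PADDING=512 ./app
  mi_option_set(mi_option_debug_extra_padding, 512);

  char* p = (char*)mi_malloc(100);
  memset(p, 0, 100);  // writing past byte 100 would land in the padding
  mi_free(p);         // a corrupted padding canary is reported here (debug build)
  return 0;
}

A larger padding budget makes small heap overflows more likely to land inside tracked padding bytes instead of silently corrupting a neighboring block, at the cost of extra memory per allocation.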
diff --git a/src/page.c b/src/page.c
index ff812e42..010769b7 100644
--- a/src/page.c
+++ b/src/page.c
@@ -792,7 +792,7 @@ static mi_page_t* mi_huge_page_alloc(mi_heap_t* heap, size_t size) {
 
 // Generic allocation routine if the fast path (`alloc.c:mi_page_malloc`) does not succeed.
 // Note: in debug mode the size includes MI_PADDING_SIZE and might have overflowed.
-void* _mi_malloc_generic(mi_heap_t* heap, size_t size MI_SOURCE_XPARAM) mi_attr_noexcept
+void* _mi_malloc_generic(mi_heap_t* heap, size_t size MI_EXTRA_PADDING_XPARAM MI_SOURCE_XPARAM) mi_attr_noexcept
 {
   mi_assert_internal(heap != NULL);
@@ -811,8 +811,8 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size MI_SOURCE_XPARAM) mi_attr_
 
   // huge allocation?
   mi_page_t* page;
-  const size_t req_size = size - MI_PADDING_SIZE;  // correct for padding_size in case of an overflow on `size`
-  if (mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_PADDING_SIZE) )) {
+  const size_t req_size = size - MI_EXTRA_PADDING_ARG;  // correct for the padding in case of an overflow on `size`
+  if (mi_unlikely(req_size > (MI_LARGE_OBJ_SIZE_MAX - MI_EXTRA_PADDING_ARG) )) {
    if (mi_unlikely(req_size > PTRDIFF_MAX)) {  // we don't allocate more than PTRDIFF_MAX (see <https://sourceware.org/ml/libc-announce/2019/msg00001.html>)
      _mi_error_message(EOVERFLOW, "allocation request is too large (%zu b requested)\n", req_size);
      return NULL;
@@ -823,7 +823,7 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size MI_SOURCE_XPARAM) mi_attr_
   }
   else {
     // otherwise find a page with free blocks in our size segregated queues
-    mi_assert_internal(size >= MI_PADDING_SIZE);
+    mi_assert_internal(size >= MI_EXTRA_PADDING_ARG);
     page = mi_find_free_page(heap,size);
   }
   if (mi_unlikely(page == NULL)) { // out of memory
@@ -835,5 +835,5 @@ void* _mi_malloc_generic(mi_heap_t* heap, size_t size MI_SOURCE_XPARAM) mi_attr_
   mi_assert_internal(mi_page_block_size(page) >= size);
 
   // and try again, this time succeeding! (i.e. this should never recurse)
-  return _mi_page_malloc(heap, page, size MI_SOURCE_XARG);
+  return _mi_page_malloc(heap, page, size MI_EXTRA_PADDING_XARG MI_SOURCE_XARG);
 }
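A note on the `req_size = size - MI_EXTRA_PADDING_ARG` line: because unsigned arithmetic is modular, the subtraction recovers the originally requested size even when the caller's `size + __extra_padding` wrapped around, and the subsequent `req_size > PTRDIFF_MAX` test then rejects the request. A standalone illustration of that arithmetic:

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

int main(void) {
  const size_t padding  = 160;
  const size_t request  = SIZE_MAX - 10;      // absurd request from the caller
  const size_t padded   = request + padding;  // wraps around (well-defined for unsigned)
  const size_t req_size = padded - padding;   // modular arithmetic recovers the request
  assert(req_size == request);
  assert(req_size > PTRDIFF_MAX);             // ...so the overflow check still fires
  return 0;
}

This is why the slow path never needs a separate "did the caller overflow?" flag: undoing the padding addition is always exact, and the single PTRDIFF_MAX comparison covers both genuine huge requests and wrapped ones.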