Merge branch 'dev' into dev-slice

2025-08-24 00:04:48 +03:00 · 2020-09-24 16:33:22 -07:00 · 2020-09-24 16:33:22 -07:00 · 2822e5c1f3
commit 2822e5c1f3
parent b59abce8ea ed8cc1fc19
11 changed files with 75 additions and 59 deletions
--- a/doc/mimalloc-doc.h
+++ b/doc/mimalloc-doc.h
@ -26,17 +26,25 @@ without code changes, for example, on Unix you can use it as:

 Notable aspects of the design include:

- __small and consistent__: the library is less than 6k LOC using simple and
+- __small and consistent__: the library is about 8k LOC using simple and
  consistent data structures. This makes it very suitable
  to integrate and adapt in other projects. For runtime systems it
  provides hooks for a monotonic _heartbeat_ and deferred freeing (for
  bounded worst-case times with reference counting).
- __free list sharding__: the big idea: instead of one big free list (per size class) we have
-  many smaller lists per memory "page" which both reduces fragmentation
-  and increases locality --
+- __free list sharding__: instead of one big free list (per size class) we have
+  many smaller lists per "mimalloc page" which reduces fragmentation and
+  increases locality --
  things that are allocated close in time get allocated close in memory.
-  (A memory "page" in _mimalloc_ contains blocks of one size class and is
-  usually 64KiB on a 64-bit system).
+  (A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system).
+- __free list multi-sharding__: the big idea! Not only do we shard the free list
+  per mimalloc page, but for each page we have multiple free lists. In particular, there
+  is one list for thread-local `free` operations, and another one for concurrent `free`
+  operations. Freeing from another thread can now be a single CAS without needing
+  sophisticated coordination between threads. Since there will be
+  thousands of separate free lists, contention is naturally distributed over the heap,
+  and the chance of contending on a single location will be low -- this is quite
+  similar to randomized algorithms like skip lists where adding
+  a random oracle removes the need for a more complex algorithm.
 - __eager page reset__: when a "page" becomes empty (with increased chance
  due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged")
  reducing (real) memory pressure and fragmentation, especially in long running
@ -51,7 +59,7 @@ Notable aspects of the design include:
  times (_wcat_), bounded space overhead (~0.2% meta-data, with at most 12.5% waste in allocation sizes),
  and has no internal points of contention using only atomic operations.
 - __fast__: In our benchmarks (see [below](#performance)),
-  _mimalloc_ always outperforms all other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc),
+  _mimalloc_ outperforms all other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc),
  and usually uses less memory (up to 25% more in the worst case). A nice property
  is that it does consistently well over a wide range of benchmarks.

@ -442,7 +450,8 @@ int mi_reserve_huge_os_pages_at(size_t pages, int numa_node, size_t timeout_msec
 bool mi_is_redirected();

 /// Return process information (time and memory usage).
-/// @param user_msecs      Optional. User time in milli-seconds.
+/// @param elapsed_msecs   Optional. Elapsed wall-clock time of the process in milli-seconds.
+/// @param user_msecs      Optional. User time in milli-seconds (as the sum over all threads).
 /// @param system_msecs    Optional. System time in milli-seconds.
 /// @param current_rss     Optional. Current working set size (touched pages).
 /// @param peak_rss        Optional. Peak working set size (touched pages).
@ -453,7 +462,7 @@ bool mi_is_redirected();
 /// The \a current_rss is precise on Windows and MacOSX; other systems estimate
 /// this using \a current_commit. The \a commit is precise on Windows but estimated
 /// on other systems as the amount of read/write accessible memory reserved by mimalloc.
-void mi_process_info(size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults);
+void mi_process_info(size_t* elapsed_msecs, size_t* user_msecs, size_t* system_msecs, size_t* current_rss, size_t* peak_rss, size_t* current_commit, size_t* peak_commit, size_t* page_faults);

 /// \}

--- a/docs/group__extended.html
+++ b/docs/group__extended.html
@ -187,9 +187,9 @@ Functions</h2></td></tr>
 <tr class="memitem:gaad25050b19f30cd79397b227e0157a3f"><td class="memItemLeft" align="right" valign="top">bool&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__extended.html#gaad25050b19f30cd79397b227e0157a3f">mi_is_redirected</a> ()</td></tr>
 <tr class="memdesc:gaad25050b19f30cd79397b227e0157a3f"><td class="mdescLeft">&#160;</td><td class="mdescRight">Is the C runtime <em>malloc</em> API redirected?  <a href="#gaad25050b19f30cd79397b227e0157a3f">More...</a><br /></td></tr>
 <tr class="separator:gaad25050b19f30cd79397b227e0157a3f"><td class="memSeparator" colspan="2">&#160;</td></tr>
-<tr class="memitem:ga9144506d5ffa8cc03547867bd15e1032"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__extended.html#ga9144506d5ffa8cc03547867bd15e1032">mi_process_info</a> (size_t *user_msecs, size_t *system_msecs, size_t *current_rss, size_t *peak_rss, size_t *current_commit, size_t *peak_commit, size_t *page_faults)</td></tr>
-<tr class="memdesc:ga9144506d5ffa8cc03547867bd15e1032"><td class="mdescLeft">&#160;</td><td class="mdescRight">Return process information (time and memory usage).  <a href="#ga9144506d5ffa8cc03547867bd15e1032">More...</a><br /></td></tr>
-<tr class="separator:ga9144506d5ffa8cc03547867bd15e1032"><td class="memSeparator" colspan="2">&#160;</td></tr>
+<tr class="memitem:ga7d862c2affd5790381da14eb102a364d"><td class="memItemLeft" align="right" valign="top">void&#160;</td><td class="memItemRight" valign="bottom"><a class="el" href="group__extended.html#ga7d862c2affd5790381da14eb102a364d">mi_process_info</a> (size_t *elapsed_msecs, size_t *user_msecs, size_t *system_msecs, size_t *current_rss, size_t *peak_rss, size_t *current_commit, size_t *peak_commit, size_t *page_faults)</td></tr>
+<tr class="memdesc:ga7d862c2affd5790381da14eb102a364d"><td class="mdescLeft">&#160;</td><td class="mdescRight">Return process information (time and memory usage).  <a href="#ga7d862c2affd5790381da14eb102a364d">More...</a><br /></td></tr>
+<tr class="separator:ga7d862c2affd5790381da14eb102a364d"><td class="memSeparator" colspan="2">&#160;</td></tr>
 </table>
 <a name="details" id="details"></a><h2 class="groupheader">Detailed Description</h2>
 <p>Extended functionality. </p>
@ -416,8 +416,8 @@ Functions</h2></td></tr>

 </div>
 </div>
-<a id="ga9144506d5ffa8cc03547867bd15e1032"></a>
-<h2 class="memtitle"><span class="permalink"><a href="#ga9144506d5ffa8cc03547867bd15e1032">&#9670;&nbsp;</a></span>mi_process_info()</h2>
+<a id="ga7d862c2affd5790381da14eb102a364d"></a>
+<h2 class="memtitle"><span class="permalink"><a href="#ga7d862c2affd5790381da14eb102a364d">&#9670;&nbsp;</a></span>mi_process_info()</h2>

 <div class="memitem">
 <div class="memproto">
@ -426,6 +426,12 @@ Functions</h2></td></tr>
          <td class="memname">void mi_process_info </td>
          <td>(</td>
          <td class="paramtype">size_t *&#160;</td>
+          <td class="paramname"><em>elapsed_msecs</em>, </td>
+        </tr>
+        <tr>
+          <td class="paramkey"></td>
+          <td></td>
+          <td class="paramtype">size_t *&#160;</td>
          <td class="paramname"><em>user_msecs</em>, </td>
        </tr>
        <tr>
@ -475,7 +481,8 @@ Functions</h2></td></tr>
 <p>Return process information (time and memory usage). </p>
 <dl class="params"><dt>Parameters</dt><dd>
  <table class="params">
-    <tr><td class="paramname">user_msecs</td><td>Optional. User time in milli-seconds. </td></tr>
+    <tr><td class="paramname">elapsed_msecs</td><td>Optional. Elapsed wall-clock time of the process in milli-seconds. </td></tr>
+    <tr><td class="paramname">user_msecs</td><td>Optional. User time in milli-seconds (as the sum over all threads). </td></tr>
    <tr><td class="paramname">system_msecs</td><td>Optional. System time in milli-seconds. </td></tr>
    <tr><td class="paramname">current_rss</td><td>Optional. Current working set size (touched pages). </td></tr>
    <tr><td class="paramname">peak_rss</td><td>Optional. Peak working set size (touched pages). </td></tr>
--- a/docs/group__extended.js
+++ b/docs/group__extended.js
@ -9,7 +9,7 @@ var group__extended =
    [ "mi_is_in_heap_region", "group__extended.html#ga5f071b10d4df1c3658e04e7fd67a94e6", null ],
    [ "mi_is_redirected", "group__extended.html#gaad25050b19f30cd79397b227e0157a3f", null ],
    [ "mi_malloc_small", "group__extended.html#ga7136c2e55cb22c98ecf95d08d6debb99", null ],
-    [ "mi_process_info", "group__extended.html#ga9144506d5ffa8cc03547867bd15e1032", null ],
+    [ "mi_process_info", "group__extended.html#ga7d862c2affd5790381da14eb102a364d", null ],
    [ "mi_register_deferred_free", "group__extended.html#ga3460a6ca91af97be4058f523d3cb8ece", null ],
    [ "mi_register_error", "group__extended.html#gaa1d55e0e894be240827e5d87ec3a1f45", null ],
    [ "mi_register_output", "group__extended.html#gae5b17ff027cd2150b43a33040250cf3f", null ],
--- a/docs/index.html
+++ b/docs/index.html
@ -105,13 +105,14 @@ $(document).ready(function(){initNavTree('index.html','');});
 <div class="textblock"><p>This is the API documentation of the <a href="https://github.com/microsoft/mimalloc">mimalloc</a> allocator (pronounced "me-malloc") &ndash; a general purpose allocator with excellent <a href="bench.html">performance</a> characteristics. Initially developed by Daan Leijen for the run-time systems of the <a href="https://github.com/koka-lang/koka">Koka</a> and <a href="https://github.com/leanprover/lean">Lean</a> languages.</p>
 <p>It is a drop-in replacement for <code>malloc</code> and can be used in other programs without code changes, for example, on Unix you can use it as: </p><div class="fragment"><div class="line">&gt; LD_PRELOAD=/usr/bin/libmimalloc.so  myprogram</div></div><!-- fragment --><p>Notable aspects of the design include:</p>
 <ul>
-<li><b>small and consistent</b>: the library is less than 6k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic <em>heartbeat</em> and deferred freeing (for bounded worst-case times with reference counting).</li>
-<li><b>free list sharding</b>: the big idea: instead of one big free list (per size class) we have many smaller lists per memory "page" which both reduces fragmentation and increases locality &ndash; things that are allocated close in time get allocated close in memory. (A memory "page" in <em>mimalloc</em> contains blocks of one size class and is usually 64KiB on a 64-bit system).</li>
+<li><b>small and consistent</b>: the library is about 8k LOC using simple and consistent data structures. This makes it very suitable to integrate and adapt in other projects. For runtime systems it provides hooks for a monotonic <em>heartbeat</em> and deferred freeing (for bounded worst-case times with reference counting).</li>
+<li><b>free list sharding</b>: instead of one big free list (per size class) we have many smaller lists per "mimalloc page" which reduces fragmentation and increases locality &ndash; things that are allocated close in time get allocated close in memory. (A mimalloc page contains blocks of one size class and is usually 64KiB on a 64-bit system).</li>
+<li><b>free list multi-sharding</b>: the big idea! Not only do we shard the free list per mimalloc page, but for each page we have multiple free lists. In particular, there is one list for thread-local <code>free</code> operations, and another one for concurrent <code>free</code> operations. Free-ing from another thread can now be a single CAS without needing sophisticated coordination between threads. Since there will be thousands of separate free lists, contention is naturally distributed over the heap, and the chance of contending on a single location will be low &ndash; this is quite similar to randomized algorithms like skip lists where adding a random oracle removes the need for a more complex algorithm.</li>
 <li><b>eager page reset</b>: when a "page" becomes empty (with increased chance due to free list sharding) the memory is marked to the OS as unused ("reset" or "purged") reducing (real) memory pressure and fragmentation, especially in long running programs.</li>
 <li><b>secure</b>: <em>mimalloc</em> can be built in secure mode, adding guard pages, randomized allocation, encrypted free lists, etc. to protect against various heap vulnerabilities. The performance penalty is only around 3% on average over our benchmarks.</li>
 <li><b>first-class heaps</b>: efficiently create and use multiple heaps to allocate across different regions. A heap can be destroyed at once instead of deallocating each object separately.</li>
 <li><b>bounded</b>: it does not suffer from <em>blowup</em> [1], has bounded worst-case allocation times (<em>wcat</em>), bounded space overhead (~0.2% meta-data, with at most 12.5% waste in allocation sizes), and has no internal points of contention using only atomic operations.</li>
-<li><b>fast</b>: In our benchmarks (see <a href="#performance">below</a>), <em>mimalloc</em> always outperforms all other leading allocators (<em>jemalloc</em>, <em>tcmalloc</em>, <em>Hoard</em>, etc), and usually uses less memory (up to 25% more in the worst case). A nice property is that it does consistently well over a wide range of benchmarks.</li>
+<li><b>fast</b>: In our benchmarks (see <a href="#performance">below</a>), <em>mimalloc</em> outperforms all other leading allocators (<em>jemalloc</em>, <em>tcmalloc</em>, <em>Hoard</em>, etc), and usually uses less memory (up to 25% more in the worst case). A nice property is that it does consistently well over a wide range of benchmarks.</li>
 </ul>
 <p>You can read more on the design of <em>mimalloc</em> in the <a href="https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action">technical report</a> which also has detailed benchmark results.</p>
 <p>Further information:</p>
--- a/docs/mimalloc-doc_8h_source.html
+++ b/docs/mimalloc-doc_8h_source.html
--- a/docs/navtreeindex0.js
+++ b/docs/navtreeindex0.js
@ -53,8 +53,8 @@ var NAVTREEINDEX0 =
 "group__extended.html#ga5f071b10d4df1c3658e04e7fd67a94e6":[5,1,6],
 "group__extended.html#ga7136c2e55cb22c98ecf95d08d6debb99":[5,1,8],
 "group__extended.html#ga7795a13d20087447281858d2c771cca1":[5,1,13],
+"group__extended.html#ga7d862c2affd5790381da14eb102a364d":[5,1,9],
 "group__extended.html#ga854b1de8cb067c7316286c28b2fcd3d1":[5,1,15],
-"group__extended.html#ga9144506d5ffa8cc03547867bd15e1032":[5,1,9],
 "group__extended.html#gaa1d55e0e894be240827e5d87ec3a1f45":[5,1,11],
 "group__extended.html#gaad25050b19f30cd79397b227e0157a3f":[5,1,7],
 "group__extended.html#gab1dac8476c46cb9eecab767eb40c1525":[5,1,21],
--- a/docs/search/all_6.js
+++ b/docs/search/all_6.js
@ -106,7 +106,7 @@ var searchData=
  ['mi_5foption_5fverbose',['mi_option_verbose',['../group__options.html#ggafebf7ed116adb38ae5218bc3ce06884ca7c8b7bf5281c581bad64f5daa6442777',1,'mimalloc-doc.h']]],
  ['mi_5foutput_5ffun',['mi_output_fun',['../group__extended.html#gad823d23444a4b77a40f66bf075a98a0c',1,'mimalloc-doc.h']]],
  ['mi_5fposix_5fmemalign',['mi_posix_memalign',['../group__posix.html#gacff84f226ba9feb2031b8992e5579447',1,'mimalloc-doc.h']]],
-  ['mi_5fprocess_5finfo',['mi_process_info',['../group__extended.html#ga9144506d5ffa8cc03547867bd15e1032',1,'mimalloc-doc.h']]],
+  ['mi_5fprocess_5finfo',['mi_process_info',['../group__extended.html#ga7d862c2affd5790381da14eb102a364d',1,'mimalloc-doc.h']]],
  ['mi_5fpvalloc',['mi_pvalloc',['../group__posix.html#gaeb325c39b887d3b90d85d1eb1712fb1e',1,'mimalloc-doc.h']]],
  ['mi_5frealloc',['mi_realloc',['../group__malloc.html#gaf11eb497da57bdfb2de65eb191c69db6',1,'mimalloc-doc.h']]],
  ['mi_5frealloc_5faligned',['mi_realloc_aligned',['../group__aligned.html#ga4028d1cf4aa4c87c880747044a8322ae',1,'mimalloc-doc.h']]],
--- a/docs/search/functions_0.js
+++ b/docs/search/functions_0.js
@ -75,7 +75,7 @@ var searchData=
  ['mi_5foption_5fset_5fenabled',['mi_option_set_enabled',['../group__options.html#ga9a13d05fcb77489cb06d4d017ebd8bed',1,'mimalloc-doc.h']]],
  ['mi_5foption_5fset_5fenabled_5fdefault',['mi_option_set_enabled_default',['../group__options.html#ga65518b69ec5d32336b50e07f74b3f629',1,'mimalloc-doc.h']]],
  ['mi_5fposix_5fmemalign',['mi_posix_memalign',['../group__posix.html#gacff84f226ba9feb2031b8992e5579447',1,'mimalloc-doc.h']]],
-  ['mi_5fprocess_5finfo',['mi_process_info',['../group__extended.html#ga9144506d5ffa8cc03547867bd15e1032',1,'mimalloc-doc.h']]],
+  ['mi_5fprocess_5finfo',['mi_process_info',['../group__extended.html#ga7d862c2affd5790381da14eb102a364d',1,'mimalloc-doc.h']]],
  ['mi_5fpvalloc',['mi_pvalloc',['../group__posix.html#gaeb325c39b887d3b90d85d1eb1712fb1e',1,'mimalloc-doc.h']]],
  ['mi_5frealloc',['mi_realloc',['../group__malloc.html#gaf11eb497da57bdfb2de65eb191c69db6',1,'mimalloc-doc.h']]],
  ['mi_5frealloc_5faligned',['mi_realloc_aligned',['../group__aligned.html#ga4028d1cf4aa4c87c880747044a8322ae',1,'mimalloc-doc.h']]],
--- a/readme.md
+++ b/readme.md
@ -11,7 +11,7 @@ mimalloc (pronounced "me-malloc")
 is a general purpose allocator with excellent [performance](#performance) characteristics.
 Initially developed by Daan Leijen for the run-time systems of the
 [Koka](https://github.com/koka-lang/koka) and [Lean](https://github.com/leanprover/lean) languages.
-Latest release:`v1.6.5` (2020-09-24).
+Latest release: `v1.6.7` (2020-09-24).

 It is a drop-in replacement for `malloc` and can be used in other programs
 without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
@ -73,8 +73,8 @@ Enjoy!

 ### Releases

-* 2020-09-24, `v1.6.6`: stable release 1.6: using standard C atomics, passing tsan testing, improved
-  handling of failing to commit on Windows, add `mi_process_info` api call.
+* 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved
+  handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call.
 * 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations,
  support for IllumOS and Haiku, NUMA support for Vista/XP, improved NUMA detection for AMD Ryzen, ubsan support.
 * 2020-05-05, `v1.6.3`: stable release 1.6: improved behavior in out-of-memory situations, improved malloc zones on macOS,
--- a/src/alloc.c
+++ b/src/alloc.c
@ -448,8 +448,7 @@ void mi_free(void* p) mi_attr_noexcept
    #endif
    mi_block_set_next(page, block, page->local_free);
    page->local_free = block;
-    page->used--;
-    if (mi_unlikely(mi_page_all_free(page))) {
+    if (mi_unlikely(--page->used == 0)) {   // using this expression generates better code than: page->used--; if (mi_page_all_free(page))    
      _mi_page_retire(page);
    }
  }
--- a/src/region.c
+++ b/src/region.c
@ -243,7 +243,7 @@ static bool mi_region_is_suitable(const mem_region_t* region, int numa_node, boo
 static bool mi_region_try_claim(int numa_node, size_t blocks, bool allow_large, mem_region_t** region, mi_bitmap_index_t* bit_idx, mi_os_tld_t* tld)
 {
  // try all regions for a free slot  
-  const size_t count = mi_atomic_load_acquire(&regions_count);
+  const size_t count = mi_atomic_load_relaxed(&regions_count); // monotonic, so ok to be relaxed
  size_t idx = tld->region_idx; // Or start at 0 to reuse low addresses? Starting at 0 seems to increase latency though
  for (size_t visited = 0; visited < count; visited++, idx++) {
    if (idx >= count) idx = 0;  // wrap around