diff --git a/cmake/mimalloc-config-version.cmake b/cmake/mimalloc-config-version.cmake
index edffeea1..ed95c19e 100644
--- a/cmake/mimalloc-config-version.cmake
+++ b/cmake/mimalloc-config-version.cmake
@@ -1,5 +1,5 @@
set(mi_version_major 1)
-set(mi_version_minor 6)
+set(mi_version_minor 7)
set(mi_version ${mi_version_major}.${mi_version_minor})
set(PACKAGE_VERSION ${mi_version})
diff --git a/doc/bench-c5-18xlarge-2020-01-20-a.svg b/doc/bench-2020/bench-c5-18xlarge-2020-01-20-a.svg
similarity index 100%
rename from doc/bench-c5-18xlarge-2020-01-20-a.svg
rename to doc/bench-2020/bench-c5-18xlarge-2020-01-20-a.svg
diff --git a/doc/bench-c5-18xlarge-2020-01-20-b.svg b/doc/bench-2020/bench-c5-18xlarge-2020-01-20-b.svg
similarity index 100%
rename from doc/bench-c5-18xlarge-2020-01-20-b.svg
rename to doc/bench-2020/bench-c5-18xlarge-2020-01-20-b.svg
diff --git a/doc/bench-c5-18xlarge-2020-01-20-rss-a.svg b/doc/bench-2020/bench-c5-18xlarge-2020-01-20-rss-a.svg
similarity index 100%
rename from doc/bench-c5-18xlarge-2020-01-20-rss-a.svg
rename to doc/bench-2020/bench-c5-18xlarge-2020-01-20-rss-a.svg
diff --git a/doc/bench-c5-18xlarge-2020-01-20-rss-b.svg b/doc/bench-2020/bench-c5-18xlarge-2020-01-20-rss-b.svg
similarity index 100%
rename from doc/bench-c5-18xlarge-2020-01-20-rss-b.svg
rename to doc/bench-2020/bench-c5-18xlarge-2020-01-20-rss-b.svg
diff --git a/doc/bench-r5a-1.svg b/doc/bench-2020/bench-r5a-1.svg
similarity index 100%
rename from doc/bench-r5a-1.svg
rename to doc/bench-2020/bench-r5a-1.svg
diff --git a/doc/bench-r5a-12xlarge-2020-01-16-a.svg b/doc/bench-2020/bench-r5a-12xlarge-2020-01-16-a.svg
similarity index 100%
rename from doc/bench-r5a-12xlarge-2020-01-16-a.svg
rename to doc/bench-2020/bench-r5a-12xlarge-2020-01-16-a.svg
diff --git a/doc/bench-r5a-12xlarge-2020-01-16-b.svg b/doc/bench-2020/bench-r5a-12xlarge-2020-01-16-b.svg
similarity index 100%
rename from doc/bench-r5a-12xlarge-2020-01-16-b.svg
rename to doc/bench-2020/bench-r5a-12xlarge-2020-01-16-b.svg
diff --git a/doc/bench-r5a-2.svg b/doc/bench-2020/bench-r5a-2.svg
similarity index 100%
rename from doc/bench-r5a-2.svg
rename to doc/bench-2020/bench-r5a-2.svg
diff --git a/doc/bench-r5a-rss-1.svg b/doc/bench-2020/bench-r5a-rss-1.svg
similarity index 100%
rename from doc/bench-r5a-rss-1.svg
rename to doc/bench-2020/bench-r5a-rss-1.svg
diff --git a/doc/bench-r5a-rss-2.svg b/doc/bench-2020/bench-r5a-rss-2.svg
similarity index 100%
rename from doc/bench-r5a-rss-2.svg
rename to doc/bench-2020/bench-r5a-rss-2.svg
diff --git a/doc/bench-spec-rss.svg b/doc/bench-2020/bench-spec-rss.svg
similarity index 100%
rename from doc/bench-spec-rss.svg
rename to doc/bench-2020/bench-spec-rss.svg
diff --git a/doc/bench-spec.svg b/doc/bench-2020/bench-spec.svg
similarity index 100%
rename from doc/bench-spec.svg
rename to doc/bench-2020/bench-spec.svg
diff --git a/doc/bench-z4-1.svg b/doc/bench-2020/bench-z4-1.svg
similarity index 100%
rename from doc/bench-z4-1.svg
rename to doc/bench-2020/bench-z4-1.svg
diff --git a/doc/bench-z4-2.svg b/doc/bench-2020/bench-z4-2.svg
similarity index 100%
rename from doc/bench-z4-2.svg
rename to doc/bench-2020/bench-z4-2.svg
diff --git a/doc/bench-z4-rss-1.svg b/doc/bench-2020/bench-z4-rss-1.svg
similarity index 100%
rename from doc/bench-z4-rss-1.svg
rename to doc/bench-2020/bench-z4-rss-1.svg
diff --git a/doc/bench-z4-rss-2.svg b/doc/bench-2020/bench-z4-rss-2.svg
similarity index 100%
rename from doc/bench-z4-rss-2.svg
rename to doc/bench-2020/bench-z4-rss-2.svg
diff --git a/doc/bench-2021/bench-amd5950x-2021-01-30-a.svg b/doc/bench-2021/bench-amd5950x-2021-01-30-a.svg
new file mode 100644
index 00000000..86a97bfd
--- /dev/null
+++ b/doc/bench-2021/bench-amd5950x-2021-01-30-a.svg
@@ -0,0 +1,952 @@
+
+
+
\ No newline at end of file
diff --git a/doc/bench-2021/bench-amd5950x-2021-01-30-b.svg b/doc/bench-2021/bench-amd5950x-2021-01-30-b.svg
new file mode 100644
index 00000000..c7488770
--- /dev/null
+++ b/doc/bench-2021/bench-amd5950x-2021-01-30-b.svg
@@ -0,0 +1,1255 @@
+
+
+
\ No newline at end of file
diff --git a/doc/bench-2021/bench-c5-18xlarge-2021-01-30-a.svg b/doc/bench-2021/bench-c5-18xlarge-2021-01-30-a.svg
new file mode 100644
index 00000000..bc91c218
--- /dev/null
+++ b/doc/bench-2021/bench-c5-18xlarge-2021-01-30-a.svg
@@ -0,0 +1,955 @@
+
+
+
\ No newline at end of file
diff --git a/doc/bench-2021/bench-c5-18xlarge-2021-01-30-b.svg b/doc/bench-2021/bench-c5-18xlarge-2021-01-30-b.svg
new file mode 100644
index 00000000..e8b04a0d
--- /dev/null
+++ b/doc/bench-2021/bench-c5-18xlarge-2021-01-30-b.svg
@@ -0,0 +1,1269 @@
+
+
+
\ No newline at end of file
diff --git a/doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-a.svg b/doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-a.svg
new file mode 100644
index 00000000..6cd36aaa
--- /dev/null
+++ b/doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-a.svg
@@ -0,0 +1,836 @@
+
+
+
\ No newline at end of file
diff --git a/doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-b.svg b/doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-b.svg
new file mode 100644
index 00000000..c81072e9
--- /dev/null
+++ b/doc/bench-2021/bench-c5-18xlarge-2021-01-30-rss-b.svg
@@ -0,0 +1,1131 @@
+
+
+
\ No newline at end of file
diff --git a/doc/bench-2021/bench-macmini-2021-01-30.svg b/doc/bench-2021/bench-macmini-2021-01-30.svg
new file mode 100644
index 00000000..ece64185
--- /dev/null
+++ b/doc/bench-2021/bench-macmini-2021-01-30.svg
@@ -0,0 +1,766 @@
+
+
+
\ No newline at end of file
diff --git a/doc/mimalloc-doc.h b/doc/mimalloc-doc.h
index 7c238d29..59113402 100644
--- a/doc/mimalloc-doc.h
+++ b/doc/mimalloc-doc.h
@@ -1209,7 +1209,7 @@ synthetic benchmarks that see how the allocator behaves under more
extreme circumstances.
In our benchmarks, _mimalloc_ always outperforms all other leading
-allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc) (Apr 2019),
+allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc) (Jan 2021),
and usually uses less memory (up to 25% more in the worst case).
A nice property is that it does *consistently* well over the wide
range of benchmarks.
diff --git a/include/mimalloc.h b/include/mimalloc.h
index 4ecae58d..8d1108a6 100644
--- a/include/mimalloc.h
+++ b/include/mimalloc.h
@@ -8,7 +8,7 @@ terms of the MIT license. A copy of the license can be found in the file
#ifndef MIMALLOC_H
#define MIMALLOC_H
-#define MI_MALLOC_VERSION 167 // major + 2 digits minor
+#define MI_MALLOC_VERSION 170 // major + 2 digits minor
// ------------------------------------------------------
// Compiler specific attributes
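The bumped `MI_MALLOC_VERSION` packs the release into three digits (167 for v1.6.7, 170 for v1.7.0). Below is a minimal sketch of how a consumer could decode it and compare it against the library actually loaded via `mi_version()`; the minor/patch split of the last two digits is an assumption based on the tags in this diff:

```c
#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  // Header version: e.g. 170 -> v1.7.0, 167 -> v1.6.7 (assumed three-digit packing).
  printf("compiled against mimalloc v%d.%d.%d\n",
         MI_MALLOC_VERSION / 100, (MI_MALLOC_VERSION / 10) % 10, MI_MALLOC_VERSION % 10);

  // mi_version() returns the same encoding for the library actually linked/loaded;
  // a mismatch with the header value hints at a stale installation.
  int v = mi_version();
  printf("linked against   mimalloc v%d.%d.%d\n", v / 100, (v / 10) % 10, v % 10);
  return 0;
}
```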
diff --git a/readme.md b/readme.md
index 18d50636..2ccccea2 100644
--- a/readme.md
+++ b/readme.md
@@ -10,10 +10,11 @@
mimalloc (pronounced "me-malloc")
is a general purpose allocator with excellent [performance](#performance) characteristics.
Initially developed by Daan Leijen for the run-time systems of the
-[Koka](https://github.com/koka-lang/koka) and [Lean](https://github.com/leanprover/lean) languages.
-Latest release:`v1.6.7` (2020-09-24).
-It is a drop-in replacement for `malloc` and can be used in other programs
+[Koka](https://koka-lang.github.io) and [Lean](https://github.com/leanprover/lean) languages.
+Latest release tag: `v1.7.0` (2021-01-31).
+
+mimalloc is a drop-in replacement for `malloc` and can be used in other programs
without code changes, for example, on dynamically linked ELF-based systems (Linux, BSD, etc.) you can use it as:
```
> LD_PRELOAD=/usr/bin/libmimalloc.so myprogram
@@ -54,7 +55,7 @@ It also has an easy way to override the default allocator in [Windows](#override
and has no internal points of contention using only atomic operations.
- __fast__: In our benchmarks (see [below](#performance)),
_mimalloc_ outperforms other leading allocators (_jemalloc_, _tcmalloc_, _Hoard_, etc),
- and usually uses less memory (up to 25% more in the worst case). A nice property
+ and often uses less memory. A nice property
is that it does consistently well over a wide range of benchmarks. There is also good huge OS page
support for larger server programs.
@@ -73,6 +74,8 @@ Enjoy!
### Releases
+* 2021-01-31, `v1.7.0`: stable release 1.7: support explicit user provided memory regions, more precise statistics,
+ improved macOS overriding, initial support for Apple M1, improved DragonFly support, faster memcpy on Windows, various small fixes.
* 2020-09-24, `v1.6.7`: stable release 1.6: using standard C atomics, passing tsan testing, improved
handling of failing to commit on Windows, add [`mi_process_info`](https://github.com/microsoft/mimalloc/blob/master/include/mimalloc.h#L156) api call.
* 2020-08-06, `v1.6.4`: stable release 1.6: improved error recovery in low-memory situations,
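For reference, the `mi_process_info` call mentioned in the v1.6.7 note above reports process-wide counters through out-parameters. A minimal sketch, assuming the parameter order below matches the declaration in `include/mimalloc.h` (times in milliseconds, sizes in bytes); verify against the header before relying on it:

```c
#include <stdio.h>
#include <mimalloc.h>

int main(void) {
  size_t elapsed, user, sys, current_rss, peak_rss, current_commit, peak_commit, faults;
  // All arguments are out-parameters filled in by mimalloc.
  mi_process_info(&elapsed, &user, &sys, &current_rss, &peak_rss,
                  &current_commit, &peak_commit, &faults);
  printf("elapsed: %zu ms, peak rss: %zu bytes, page faults: %zu\n",
         elapsed, peak_rss, faults);
  return 0;
}
```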
@@ -413,7 +416,7 @@ under your control or otherwise mixing of pointers from different heaps may occu
# Performance
-Last update: 2020-01-20
+Last update: 2021-01-30
We tested _mimalloc_ against many other top allocators over a wide
range of benchmarks, ranging from various real world programs to
@@ -430,10 +433,15 @@ suffering from (too much) underperformance in less common situations.
As always, interpret these results with care since some benchmarks test synthetic
or uncommon situations that may never apply to your workloads. For example, most
-allocators do not do well on `xmalloc-testN` but that includes the best
+allocators do not do well on `xmalloc-testN` but that includes even the best
industrial allocators like _jemalloc_ and _tcmalloc_ that are used in some of
the world's largest systems (like Chrome or FreeBSD).
+Also, the benchmarks here do not measure the behaviour on very large and long-running server workloads,
+or worst-case latencies of allocation. Much work has gone into `mimalloc` to make it perform well on such
+workloads (for example, to reduce virtual memory fragmentation on long-running services),
+but such optimizations are not always reflected in the current benchmark suite.
+
We show here only an overview -- for
more specific details and further benchmarks we refer to the
[technical report](https://www.microsoft.com/en-us/research/publication/mimalloc-free-list-sharding-in-action).
@@ -441,27 +449,28 @@ The benchmark suite is automated and available separately
as [mimalloc-bench](https://github.com/daanx/mimalloc-bench).
-## Benchmark Results on 36-core Intel
+## Benchmark Results on a 16-core AMD 5950x (Zen3)
-Testing on a big Amazon EC2 compute instance
-([c5.18xlarge](https://aws.amazon.com/ec2/instance-types/#Compute_Optimized))
-consisting of a 72 processor Intel Xeon at 3GHz
-with 144GiB ECC memory, running Ubuntu 18.04.1 with glibc 2.27 and GCC 7.4.0.
-The measured allocators are _mimalloc_ (xmi, tag:v1.4.0, page reset enabled)
-and its secure build as _smi_,
-Google's [_tcmalloc_](https://github.com/gperftools/gperftools) (tc, tag:gperftools-2.7) used in Chrome,
-Facebook's [_jemalloc_](https://github.com/jemalloc/jemalloc) (je, tag:5.2.1) by Jason Evans used in Firefox and FreeBSD,
-the Intel thread building blocks [allocator](https://github.com/intel/tbb) (tbb, tag:2020),
-[rpmalloc](https://github.com/mjansson/rpmalloc) (rp,tag:1.4.0) by Mattias Jansson,
-the original scalable [_Hoard_](https://github.com/emeryberger/Hoard) (tag:3.13) allocator by Emery Berger \[1],
-the memory compacting [_Mesh_](https://github.com/plasma-umass/Mesh) (git:51222e7) allocator by
+Testing on the 16-core AMD 5950x processor at 3.4GHz (4.9GHz boost), with
+32GiB memory at 3600MHz, running Ubuntu 20.04 with glibc 2.31 and GCC 9.3.0.
+
+We measure three versions of _mimalloc_: the main version `mi` (tag:v1.7.0),
+the new v2.0 beta version as `xmi` (tag:v2.0.0), and the main version in secure mode as `smi` (tag:v1.7.0).
+
+The other allocators are
+Google's [_tcmalloc_](https://github.com/gperftools/gperftools) (`tc`, tag:gperftools-2.8.1) used in Chrome,
+Facebook's [_jemalloc_](https://github.com/jemalloc/jemalloc) (`je`, tag:5.2.1) by Jason Evans used in Firefox and FreeBSD,
+the Intel thread building blocks [allocator](https://github.com/intel/tbb) (`tbb`, tag:v2020.3),
+[rpmalloc](https://github.com/mjansson/rpmalloc) (`rp`,tag:1.4.1) by Mattias Jansson,
+the original scalable [_Hoard_](https://github.com/emeryberger/Hoard) (git:d880f72) allocator by Emery Berger \[1],
+the memory compacting [_Mesh_](https://github.com/plasma-umass/Mesh) (git:67ff31a) allocator by
Bobby Powers _et al_ \[8],
-and finally the default system allocator (glibc, 2.27) (based on _PtMalloc2_).
+and finally the default system allocator (`glibc`, 2.31) (based on _PtMalloc2_).
-
-
+
+
-Any benchmarks ending in `N` run on all processors in parallel.
+Any benchmarks ending in `N` run on all 32 logical cores in parallel.
Results are averaged over 10 runs and reported relative
to mimalloc (where 1.2 means it took 1.2× longer to run).
The legend also contains the _overall relative score_ between the
@@ -476,18 +485,17 @@ _jemalloc_.
The _leanN_ program is interesting as a large realistic and
concurrent workload of the [Lean](https://github.com/leanprover/lean)
-theorem prover compiling its own standard library, and there is a 7%
+theorem prover compiling its own standard library, and there is a 13%
speedup over _tcmalloc_. This is
quite significant: if Lean spends 20% of its time in the
-allocator that means that _mimalloc_ is 1.3× faster than _tcmalloc_
+allocator that means that _mimalloc_ is 1.6× faster than _tcmalloc_
here. (This is surprising as that is not measured in a pure
allocation benchmark like _alloc-test_. We conjecture that we see this
outsized improvement here because _mimalloc_ has better locality in
the allocation which improves performance for the *other* computations
in a program as well).
-The single threaded _redis_ benchmark again show that most allocators do well on such workloads where _tcmalloc_
-did best this time.
+The single threaded _redis_ benchmark again shows that most allocators do well on such workloads.
The _larsonN_ server benchmark by Larson and Krishnan \[2] allocates and frees between threads. They observed this
behavior (which they call _bleeding_) in actual server applications, and the benchmark simulates this.
@@ -511,14 +519,12 @@ The _alloc-test_, by
[OLogN Technologies AG](http://ithare.com/testing-memory-allocators-ptmalloc2-tcmalloc-hoard-jemalloc-while-trying-to-simulate-real-world-loads/), is a very allocation intensive benchmark doing millions of
allocations in various size classes. The test is scaled such that when an
allocator performs almost identically on _alloc-test1_ as _alloc-testN_ it
-means that it scales linearly. Here, _tcmalloc_, and
-_Hoard_ seem to scale less well and do more than 10% worse on the multi-core version. Even the best industrial
-allocators (_tcmalloc_, _jemalloc_, and _tbb_) are more than 10% slower as _mimalloc_ here.
+means that it scales linearly.
The _sh6bench_ and _sh8bench_ benchmarks are
developed by [MicroQuill](http://www.microquill.com/) as part of SmartHeap.
In _sh6bench_ _mimalloc_ does much
-better than the others (more than 1.5× faster than _jemalloc_).
+better than the others (more than 2.5× faster than _jemalloc_).
We cannot explain this well but believe it is
caused in part by the "reverse" free-ing pattern in _sh6bench_.
The _sh8bench_ is a variation with object migration
@@ -528,7 +534,7 @@ The _xmalloc-testN_ benchmark by Lever and Boreham \[5] and Christian Eder, simu
some threads only allocate, and others only free -- they observed this pattern in
larger server applications. Here we see that
the _mimalloc_ technique of having non-contended sharded thread free
-lists pays off as it outperforms others by a very large margin. Only _rpmalloc_ and _tbb_ also scale well on this benchmark.
+lists pays off as it outperforms others by a very large margin. Only _rpmalloc_, _tbb_, and _glibc_ also scale well on this benchmark.
The _cache-scratch_ benchmark by Emery Berger \[1], and introduced with
the Hoard allocator to test for _passive-false_ sharing of cache lines.
@@ -542,16 +548,20 @@ cache line sharing completely, while _Hoard_ and _glibc_ seem to mitigate
the effects. Kukanov and Voss \[7] describe in detail
how the design of _tbb_ avoids the false cache line sharing.
-## On 24-core AMD Epyc
-For completeness, here are the results on a
-[r5a.12xlarge](https://aws.amazon.com/ec2/instance-types/#Memory_Optimized) instance
-having a 48 processor AMD Epyc 7000 at 2.5GHz with 384GiB of memory.
-The results are similar to the Intel results but it is interesting to
+## On a 36-core Intel Xeon
+
+For completeness, here are the results on a big Amazon
+[c5.18xlarge](https://aws.amazon.com/ec2/instance-types/#Compute_Optimized) instance
+consisting of a 2×18-core Intel Xeon (Cascade Lake) at 3.4GHz (boost 3.5GHz)
+with 144GiB ECC memory, running Ubuntu 20.04 with glibc 2.31, GCC 9.3.0, and
+Clang 10.0.0. This time, the mimalloc allocators (mi, xmi, and smi) were
+compiled with the Clang compiler instead of GCC.
+The results are similar to the AMD results but it is interesting to
see the differences in the _larsonN_, _mstressN_, and _xmalloc-testN_ benchmarks.
-
-
+
+
## Peak Working Set
@@ -559,15 +569,59 @@ see the differences in the _larsonN_, _mstressN_, and _xmalloc-testN_ benchmarks
The following figure shows the peak working set (rss) of the allocators
on the benchmarks (on the c5.18xlarge instance).
-
-
+
+
Note that the _xmalloc-testN_ memory usage should be disregarded as it
allocates more the faster the program runs. Similarly, memory usage of
-_mstressN_, _rptestN_ and _sh8bench_ can vary depending on scheduling and
-speed. Nevertheless, even though _mimalloc_ is fast on these benchmarks we
-believe the memory usage is too high and hope to improve.
+_larsonN_, _mstressN_, _rptestN_ and _sh8bench_ can vary depending on scheduling and
+speed. Nevertheless, we hope to improve the memory usage on _mstressN_
+and _rptestN_ (note that _cfrac_, _larsonN_ and _sh8bench_ have a small working set which skews the results).
+
# References
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index 4152f99d..7392d20e 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -13,7 +13,7 @@ if (NOT CMAKE_BUILD_TYPE)
endif()
# Import mimalloc (if installed)
-find_package(mimalloc 1.6 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH)
+find_package(mimalloc 1.7 REQUIRED NO_SYSTEM_ENVIRONMENT_PATH)
message(STATUS "Found mimalloc installed at: ${MIMALLOC_TARGET_DIR}")
# overriding with a dynamic library
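Once `find_package(mimalloc 1.7 ...)` succeeds, the test targets link against the imported `mimalloc` library and build small C programs along these lines; this is a hypothetical minimal consumer for illustration, not one of the actual tests in this directory:

```c
#include <assert.h>
#include <string.h>
#include <mimalloc.h>

int main(void) {
  // Allocate through the mimalloc API directly (the override builds also
  // route plain malloc/free through mimalloc).
  char* p = (char*)mi_malloc(64);
  assert(p != NULL);
  strcpy(p, "hello mimalloc");
  mi_free(p);
  return 0;
}
```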