update bit primitives

daanx 2024-12-08 09:03:25 -08:00
parent c33de86da3
commit 5a06d2aeba
3 changed files with 122 additions and 155 deletions

@ -36,6 +36,12 @@ terms of the MIT license. A copy of the license can be found in the file
#error platform pointers must be 32, 64, or 128 bits
#endif
#if (INTPTR_MAX) > LONG_MAX
# define MI_PU(x) x##ULL
#else
# define MI_PU(x) x##UL
#endif
#if SIZE_MAX == UINT64_MAX
# define MI_SIZE_SHIFT (3)
typedef int64_t mi_ssize_t;
@ -43,15 +49,13 @@ typedef int64_t mi_ssize_t;
# define MI_SIZE_SHIFT (2)
typedef int32_t mi_ssize_t;
#else
#error platform objects must be 32 or 64 bits
#error platform objects must be 32 or 64 bits in size
#endif
#if (SIZE_MAX/2) > LONG_MAX
# define MI_ZU(x) x##ULL
# define MI_ZI(x) x##LL
#else
# define MI_ZU(x) x##UL
# define MI_ZI(x) x##L
#endif
#define MI_INTPTR_SIZE (1<<MI_INTPTR_SHIFT)
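For orientation (an added illustration, not from the diff): the literal-suffix macros above select `ULL` versus `UL` depending on whether `long` is narrower than a pointer, which differs between the LP64 and LLP64 ABIs:
// On LP64 targets (e.g. Linux/macOS x64: long is 64-bit), INTPTR_MAX == LONG_MAX,
// so MI_PU(1) expands to 1UL (already pointer-wide).
// On LLP64 targets (64-bit Windows: long is 32-bit), INTPTR_MAX > LONG_MAX,
// so MI_PU(1) expands to 1ULL.
// Either way a literal such as MI_PU(1) << 63 is computed in a 64-bit unsigned type.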
@ -131,11 +135,13 @@ typedef int32_t mi_ssize_t;
#endif
#if (MI_SIZE_BITS == 32)
#define mi_builtin_size(name) mi_builtin32(name)
#define mi_has_builtin_size(name) mi_has_builtin32(name)
#define mi_builtinz(name) mi_builtin32(name)
#define mi_has_builtinz(name) mi_has_builtin32(name)
#define mi_msc_builtinz(name) name
#elif (MI_SIZE_BITS == 64)
#define mi_builtin_size(name) mi_builtin64(name)
#define mi_has_builtin_size(name) mi_has_builtin64(name)
#define mi_builtinz(name) mi_builtin64(name)
#define mi_has_builtinz(name) mi_has_builtin64(name)
#define mi_msc_builtinz(name) name##64
#endif
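How the size_t-parameterized helpers above expand (an added illustration; only what follows from the definitions shown in this hunk):
// With MI_SIZE_BITS == 64:
//   mi_builtinz(ctz)                  -> mi_builtin64(ctz)
//   mi_has_builtinz(ctz)              -> mi_has_builtin64(ctz)
//   mi_msc_builtinz(_BitScanForward)  -> _BitScanForward64       (name##64)
// With MI_SIZE_BITS == 32:
//   mi_builtinz(ctz)                  -> mi_builtin32(ctz)
//   mi_msc_builtinz(_BitScanForward)  -> _BitScanForward         (name unchanged)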
@ -145,91 +151,40 @@ typedef int32_t mi_ssize_t;
size_t _mi_clz_generic(size_t x);
size_t _mi_ctz_generic(size_t x);
uint32_t _mi_ctz_generic32(uint32_t x);
static inline size_t mi_ctz(size_t x) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0
uint64_t r;
__asm volatile ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc");
__asm ("tzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc");
return r;
#elif MI_ARCH_X64 && defined(__BMI1__)
return (size_t)_tzcnt_u64(x);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long idx;
#if MI_SIZE_BITS==32
return (_BitScanForward(&idx, x) ? (size_t)idx : 32);
#else
return (_BitScanForward64(&idx, x) ? (size_t)idx : 64);
#endif
/*
// for arm64 and riscv, the builtin_ctz is defined for 0 as well
#elif defined(__GNUC__) && MI_ARCH_ARM64
uint64_t r;
__asm volatile ("rbit\t%0, %1\n\tclz\t%0, %0" : "=&r"(r) : "r"(x) : "cc");
return r;
#elif defined(__GNUC__) && MI_ARCH_RISCV
size_t r;
__asm volatile ("ctz\t%0, %1" : "=&r"(r) : "r"(x) : );
return r;
*/
#elif mi_has_builtin_size(ctz)
return (x!=0 ? (size_t)mi_builtin_size(ctz)(x) : MI_SIZE_BITS);
return (mi_msc_builtinz(_BitScanForward)(&idx, x) ? (size_t)idx : MI_SIZE_BITS);
#elif mi_has_builtinz(ctz)
return (x!=0 ? (size_t)mi_builtinz(ctz)(x) : MI_SIZE_BITS);
#else
#define MI_HAS_FAST_BITSCAN 0
return _mi_ctz_generic(x);
return (x!=0 ? _mi_ctz_generic(x) : MI_SIZE_BITS);
#endif
}
static inline size_t mi_clz(size_t x) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 lzcnt is defined for 0
uint64_t r;
__asm volatile ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc");
__asm ("lzcnt\t%1, %0" : "=&r"(r) : "r"(x) : "cc");
return r;
#elif MI_ARCH_X64 && defined(__BMI1__)
return (size_t)_lzcnt_u64(x);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long idx;
#if MI_SIZE_BITS==32
return (_BitScanReverse(&idx, x) ? 31 - (size_t)idx : 32);
#else
return (_BitScanReverse64(&idx, x) ? 63 - (size_t)idx : 64);
#endif
/*
// for arm64 and riscv, the builtin_clz is defined for 0 as well
#elif defined(__GNUC__) && MI_ARCH_ARM64
uint64_t r;
__asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : "cc");
return r;
#elif defined(__GNUC__) && MI_ARCH_RISCV
size_t r;
__asm volatile ("clz\t%0, %1" : "=&r"(r) : "r"(x) : );
return r;
*/
#elif mi_has_builtin_size(clz)
return (x!=0 ? (size_t)mi_builtin_size(clz)(x) : MI_SIZE_BITS);
return (mi_msc_builtinz(_BitScanReverse)(&idx, x) ? MI_SIZE_BITS - 1 - (size_t)idx : MI_SIZE_BITS);
#elif mi_has_builtinz(clz)
return (x!=0 ? (size_t)mi_builtinz(clz)(x) : MI_SIZE_BITS);
#else
#define MI_HAS_FAST_BITSCAN 0
return _mi_clz_generic(x);
#endif
}
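A note added for illustration on the zero-input convention these branches share: each path yields the full bit width for a zero argument, either because the hardware instruction defines it (tzcnt/lzcnt) or because of the explicit `x!=0` / `_BitScan*` guards. Assuming MI_SIZE_BITS == 64:
//   mi_ctz(0)    == 64      mi_clz(0)    == 64
//   mi_ctz(1)    == 0       mi_clz(1)    == 63
//   mi_ctz(0x80) == 7       mi_clz(0x80) == 56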
static inline uint32_t mi_ctz32(uint32_t x) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__) // on x64 tzcnt is defined for 0
uint32_t r;
__asm volatile ("tzcntl\t%1, %0" : "=&r"(r) : "r"(x) : "cc");
return r;
#elif MI_ARCH_X64 && defined(__BMI1__)
return (uint32_t)_tzcnt_u32(x);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long idx;
return (_BitScanForward(&idx, x) ? (uint32_t)idx : 32);
#elif mi_has_builtin(ctz) && (INT_MAX == INT32_MAX)
return (x!=0 ? (uint32_t)mi_builtin(ctz)(x) : 32);
#elif mi_has_builtin(ctzl) && (LONG_MAX == INT32_MAX)
return (x!=0 ? (uint32_t)mi_builtin(ctzl)(x) : 32);
#else
#define MI_HAS_FAST_BITSCAN 0
return _mi_ctz_generic32(x);
return (x!=0 ? _mi_clz_generic(x) : MI_SIZE_BITS);
#endif
}
@ -237,23 +192,19 @@ static inline uint32_t mi_ctz32(uint32_t x) {
#define MI_HAS_FAST_BITSCAN 1
#endif
size_t _mi_popcount_generic(size_t x);
static inline size_t mi_popcount(size_t x) {
#if mi_has_builtin_size(popcount)
return mi_builtin_size(popcount)(x);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
#if MI_SIZE_BITS==32
return __popcnt(x);
#else
return __popcnt64(x);
#endif
#elif MI_ARCH_X64 && defined(__BMI1__)
#if mi_has_builtinz(popcount)
return mi_builtinz(popcount)(x);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
return mi_msc_builtinz(__popcnt)(x);
#elif MI_ARCH_X64 && defined(__BMI1__)
return (size_t)_mm_popcnt_u64(x);
#else
#define MI_HAS_FAST_POPCOUNT 0
error define generic popcount
#endif
#else
#define MI_HAS_FAST_POPCOUNT 0
return (x<=1 ? x : _mi_popcount_generic(x));
#endif
}
#ifndef MI_HAS_FAST_POPCOUNT
@ -274,60 +225,31 @@ static inline bool mi_bsf(size_t x, size_t* idx) {
bool is_zero;
__asm ( "tzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc" );
return !is_zero;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long i;
return (mi_msc_builtinz(_BitScanForward)(&i, x) ? (*idx = (size_t)i, true) : false);
#else
*idx = mi_ctz(x);
return (x!=0);
return (x!=0 ? (*idx = mi_ctz(x), true) : false);
#endif
}
// Bit scan forward: find the least significant bit that is set (i.e. count trailing zeros).
// return false if `x==0` (with `*idx` undefined), and true otherwise,
// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bsf32(uint32_t x, uint32_t* idx) {
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__)
// on x64 the carry flag is set on zero, which gives better codegen
bool is_zero;
__asm ("tzcntl\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc");
return !is_zero;
#else
*idx = mi_ctz32(x);
return (x!=0);
#endif
}
// Bit scan reverse: find the most significant bit that is set
// return false if `x==0` (with `*idx` undefined), and true otherwise,
// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bsr(size_t x, size_t* idx) {
#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
#if defined(__GNUC__) && MI_ARCH_X64 && defined(__BMI1__)
// on x64 the carry flag is set on zero, which gives better codegen
bool is_zero;
__asm ("lzcnt\t%2, %1" : "=@ccc"(is_zero), "=r"(*idx) : "r"(x) : "cc");
return !is_zero;
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long i;
#if MI_SIZE_BITS==32
return (_BitScanReverse(&i, x) ? (*idx = i, true) : false);
#else
return (_BitScanReverse64(&i, x) ? (*idx = i, true) : false);
#endif
return (mi_msc_builtinz(_BitScanReverse)(&i, x) ? (*idx = (size_t)i, true) : false);
#else
const size_t r = mi_clz(x);
*idx = (~r & (MI_SIZE_BITS - 1));
return (x!=0);
return (x!=0 ? (*idx = MI_SIZE_BITS - 1 - mi_clz(x), true) : false);
#endif
}
// Bit scan reverse: find the most significant bit that is set
// return false if `x==0` (with `*idx` undefined), and true otherwise,
// with `*idx` set to the bit index (`0 <= *idx < MI_BFIELD_BITS`).
static inline bool mi_bsr32(uint32_t x, uint32_t* idx) {
#if defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
unsigned long i;
return (_BitScanReverse(&i, x) ? (*idx = i, true) : false);
#else
const size_t r = mi_clz((size_t)x);
*idx = (~r & (MI_SIZE_BITS - 1)) - (MI_SIZE_SIZE - sizeof(uint32_t));
return (x!=0);
#endif
}
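A minimal usage sketch of the bit-scan-forward primitive documented above (added for illustration; `visit_set_bits` and its callback are hypothetical names, not part of the header):
// Call `visit` with the index of every set bit in `mask`, from least to most
// significant; mi_bsf returns false once mask becomes 0.
static void visit_set_bits(size_t mask, void (*visit)(size_t bit_idx)) {
  size_t idx;
  while (mi_bsf(mask, &idx)) {
    visit(idx);
    mask &= (mask - 1);   // clear the least significant set bit
  }
}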
/* --------------------------------------------------------------------------------
rotate
@ -338,12 +260,10 @@ static inline size_t mi_rotr(size_t x, size_t r) {
return mi_builtin(rotateright64)(x,r);
#elif (mi_has_builtin(rotateright32) && MI_SIZE_BITS==32)
return mi_builtin(rotateright32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
#if MI_SIZE_BITS==32
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
return _rotr64(x, (int)r);
#elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
return _lrotr(x,(int)r);
#else
return _rotr64(x,(int)r);
#endif
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
@ -352,30 +272,15 @@ static inline size_t mi_rotr(size_t x, size_t r) {
#endif
}
static inline uint32_t mi_rotr32(uint32_t x, uint32_t r) {
#if mi_has_builtin(rotateright32)
return mi_builtin(rotateright32)(x, r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
return _lrotr(x, (int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
const unsigned int rshift = (unsigned int)(r) & 31;
return ((x >> rshift) | (x << ((-rshift) & 31)));
#endif
}
static inline size_t mi_rotl(size_t x, size_t r) {
#if (mi_has_builtin(rotateleft64) && MI_SIZE_BITS==64)
return mi_builtin(rotateleft64)(x,r);
#elif (mi_has_builtin(rotateleft32) && MI_SIZE_BITS==32)
return mi_builtin(rotateleft32)(x,r);
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_X86 || MI_ARCH_ARM64 || MI_ARCH_ARM32)
#if MI_SIZE_BITS==32
return _lrotl(x,(int)r);
#else
return _rotl64(x,(int)r);
#endif
#elif defined(_MSC_VER) && (MI_ARCH_X64 || MI_ARCH_ARM64)
return _rotl64(x, (int)r);
#elif defined(_MSC_VER) && (MI_ARCH_X86 || MI_ARCH_ARM32)
return _lrotl(x, (int)r);
#else
// The term `(-rshift)&(BITS-1)` is written instead of `BITS - rshift` to
// avoid UB when `rshift==0`. See <https://blog.regehr.org/archives/1063>
@ -385,5 +290,4 @@ static inline size_t mi_rotl(size_t x, size_t r) {
}
#endif // MI_BITS_H
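To make the UB note in the rotate fallbacks concrete, a standalone sketch (added for illustration, assuming a 64-bit word): `x << (64 - r)` would shift by 64 when `r == 0`, which is undefined behavior, while masking with `(-r) & 63` maps `r == 0` to a shift of 0 and `1 <= r <= 63` to `64 - r`.
#include <stdint.h>
static inline uint64_t rotr64_portable(uint64_t x, unsigned r) {
  const unsigned s = r & 63;                  // reduce the rotate count
  return (x >> s) | (x << ((0u - s) & 63));   // both shift amounts stay in [0,63]
}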

@ -1,5 +1,5 @@
/* ----------------------------------------------------------------------------
Copyright (c) 2018-2023, Microsoft Research, Daan Leijen
Copyright (c) 2018-2024, Microsoft Research, Daan Leijen
This is free software; you can redistribute it and/or modify it under the
terms of the MIT license. A copy of the license can be found in the file
"LICENSE" at the root of this distribution.
@ -277,10 +277,12 @@ void _mi_snprintf(char* buf, size_t buflen, const char* fmt, ...) {
// --------------------------------------------------------
// generic trailing and leading zero count
// generic trailing and leading zero count, and popcount
// --------------------------------------------------------
uint32_t _mi_ctz_generic32(uint32_t x) {
#if !MI_HAS_FAST_BITSCAN
static size_t mi_ctz_generic32(uint32_t x) {
// de Bruijn multiplication, see <http://supertech.csail.mit.edu/papers/debruijn.pdf>
static const uint8_t debruijn[32] = {
0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
@ -319,10 +321,71 @@ size_t _mi_clz_generic(size_t x) {
size_t _mi_ctz_generic(size_t x) {
if (x==0) return MI_SIZE_BITS;
#if (MI_SIZE_BITS <= 32)
return _mi_ctz_generic32((uint32_t)x);
return mi_ctz_generic32((uint32_t)x);
#else
const size_t count = _mi_ctz_generic32((uint32_t)x);
const size_t count = mi_ctz_generic32((uint32_t)x);
if (count < 32) return count;
return (32 + _mi_ctz_generic32((uint32_t)(x>>32)));
return (32 + mi_ctz_generic32((uint32_t)(x>>32)));
#endif
}
#endif // bit scan
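For reference, the de Bruijn lookup that `mi_ctz_generic32` uses, written out as a self-contained sketch (added for illustration; the table is truncated in this view, so its second half and the multiplier below are the standard de Bruijn values and assumed to match the source):
#include <stdint.h>
// `x & -x` isolates the least significant set bit; multiplying by the de Bruijn
// constant 0x077CB531 puts a unique 5-bit pattern in the top bits, which indexes the table.
static int ctz32_debruijn(uint32_t x) {   // caller must ensure x != 0
  static const uint8_t tab[32] = {
    0, 1, 28, 2, 29, 14, 24, 3, 30, 22, 20, 15, 25, 17, 4, 8,
    31, 27, 13, 23, 21, 19, 16, 7, 26, 12, 18, 6, 11, 5, 10, 9
  };
  return tab[((x & (0u - x)) * 0x077CB531u) >> 27];
}
// Worked example: x = 20 (0b10100) -> x & -x = 4; 4 * 0x077CB531 = 0x1DF2D4C4;
// top 5 bits = 3; tab[3] = 2, the two trailing zeros of 20.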
#if !MI_HAS_FAST_POPCOUNT
#if MI_SIZE_SIZE == 4
#define mi_mask_even_bits32 (0x55555555)
#define mi_mask_even_pairs32 (0x33333333)
#define mi_mask_even_nibbles32 (0x0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum is < 256!
static size_t mi_byte_sum32(uint32_t x) {
// perform `x * 0x01010101`: the highest byte contains the sum of all bytes.
x += (x << 8);
x += (x << 16);
return (size_t)(x >> 24);
}
static size_t mi_popcount_generic32(uint32_t x) {
// first count each 2-bit group `a`, where: a==0b00 -> 00, a==0b01 -> 01, a==0b10 -> 01, a==0b11 -> 10
// in other words, `a - (a>>1)`; to do this in parallel we mask the shifted value so a bit
// from the pair above cannot spill into the pair below:
x = x - ((x >> 1) & mi_mask_even_bits32);
// add the 2-bit pair results
x = (x & mi_mask_even_pairs32) + ((x >> 2) & mi_mask_even_pairs32);
// add the 4-bit nibble results
x = (x + (x >> 4)) & mi_mask_even_nibbles32;
// each byte now holds the count of its original bits; sum them:
return mi_byte_sum32(x);
}
size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic32(x);
}
#else
#define mi_mask_even_bits64 (0x5555555555555555)
#define mi_mask_even_pairs64 (0x3333333333333333)
#define mi_mask_even_nibbles64 (0x0F0F0F0F0F0F0F0F)
// sum of all the bytes in `x` if it is guaranteed that the sum is < 256!
static size_t mi_byte_sum64(uint64_t x) {
x += (x << 8);
x += (x << 16);
x += (x << 32);
return (size_t)(x >> 56);
}
static size_t mi_popcount_generic64(uint64_t x) {
x = x - ((x >> 1) & mi_mask_even_bits64);
x = (x & mi_mask_even_pairs64) + ((x >> 2) & mi_mask_even_pairs64);
x = (x + (x >> 4)) & mi_mask_even_nibbles64;
return mi_byte_sum64(x);
}
size_t _mi_popcount_generic(size_t x) {
return mi_popcount_generic64(x);
}
#endif
#endif // popcount
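A worked trace of the SWAR steps above plus a tiny self-check (added for illustration; `mi_popcount_selfcheck` is a hypothetical helper, relevant only when the generic path is compiled):
#include <assert.h>
// Tracing x = 0xFF (popcount 8) through the 32-bit variant:
//   x - ((x>>1) & 0x55555555)                : 0xFF - 0x55 = 0xAA          (each 2-bit pair holds 2)
//   (x & 0x33333333) + ((x>>2) & 0x33333333) : 0x22 + 0x22 = 0x44          (each nibble holds 4)
//   (x + (x>>4)) & 0x0F0F0F0F                : (0x44 + 0x04) & mask = 0x08 (low byte holds 8)
//   byte sum via * 0x01010101, then >> 24    : 8
static void mi_popcount_selfcheck(void) {
  assert(_mi_popcount_generic(0xFF) == 8);
  assert(_mi_popcount_generic(0xF0F0F0F0u) == 16);
}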

@ -175,7 +175,7 @@ static void* mi_os_prim_alloc_at(void* hint_addr, size_t size, size_t try_alignm
MI_UNUSED(tld_stats);
mi_stats_t* stats = &_mi_stats_main;
mi_stat_counter_increase(stats->mmap_calls, 1);
_mi_stat_counter_increase(&stats->mmap_calls, 1);
if (p != NULL) {
_mi_stat_increase(&stats->reserved, size);
if (commit) {