Use much faster getcpu() via vDSO

vDSO (virtual dynamic shared object) is exported by Linux kernel into
every userspace program, designed to speed up this process for certain
system calls. For Linux/x86_64, getcpu() can be called via vDSO, which
makes getcpu() much faster. The faster getcpu() invocation is beneficial
when retrieving NUMA node information.

Benchmarking[1] on AMD Ryzen Threadripper 2990WX 32-Core Processor:
  getcpu: syscall: 103 nsec/call
  getcpu:    vdso: 18 nsec/call

We can not use dlsym to resolve the vDSO symbol "__vdso_getcpu" directly
becase it would cause recursive malloc calls when MI_DEBUG_FULL is enabled.

[1] https://github.com/nathanlynch/vdsotest

Co-authored-by: Chin-Hao Lo <hankluo6@gmail.com>
Signed-off-by: Jim Huang <jserv@biilabs.io>
This commit is contained in:
Jim Huang 2021-06-25 22:41:44 +08:00
parent 076f815cec
commit 6cd59aa50c

View file

@ -214,6 +214,81 @@ void _mi_os_init() {
os_alloc_granularity = 16;
}
#else
#if defined(__linux__) && defined(__x86_64__)
#include <elf.h>
#include <sys/auxv.h>
typedef int (*getcpu_vdso_t)(unsigned*, unsigned*, void*);
static getcpu_vdso_t mi_os_getcpu_vdso = NULL;
static struct vdso_info {
uintptr_t load_addr, load_offset;
Elf64_Sym *symtab; // ELF symbol table
const char *symstrings;
void *bucket, *chain;
Elf64_Word nbucket;
} vdso_info;
static void mi_os_vdso_init(void) {
unsigned long base = getauxval(AT_SYSINFO_EHDR);
if (!base)
return;
bool found_vaddr = false;
vdso_info.load_addr = base;
Elf64_Ehdr* hdr = (Elf64_Ehdr*)base;
if (hdr->e_ident[EI_CLASS] != ELFCLASS64)
return;
Elf64_Phdr* pt = (Elf64_Phdr*)(vdso_info.load_addr + hdr->e_phoff);
Elf64_Dyn* dyn = 0;
// we need to load offset and the dynamic table from the segment table
for (size_t i = 0; i < hdr->e_phnum; i++) {
if (pt[i].p_type == PT_LOAD && !found_vaddr) {
found_vaddr = true;
vdso_info.load_offset = base + (uintptr_t)pt[i].p_offset - (uintptr_t)pt[i].p_vaddr;
} else if (pt[i].p_type == PT_DYNAMIC) {
dyn = (Elf64_Dyn*)(base + pt[i].p_offset);
}
}
if (!found_vaddr || !dyn)
return;
Elf64_Word* hash = 0;
vdso_info.symstrings = NULL;
vdso_info.symtab = NULL;
for (size_t i = 0; dyn[i].d_tag != DT_NULL; i++) {
switch (dyn[i].d_tag) {
case DT_STRTAB:
vdso_info.symstrings = (const char*)((uintptr_t)dyn[i].d_un.d_ptr + vdso_info.load_offset);
break;
case DT_SYMTAB:
vdso_info.symtab = (Elf64_Sym*)((uintptr_t)dyn[i].d_un.d_ptr + vdso_info.load_offset);
break;
case DT_HASH:
hash = (Elf64_Word*)((uintptr_t)dyn[i].d_un.d_ptr + vdso_info.load_offset);
break;
}
}
if (!vdso_info.symstrings || !vdso_info.symtab || !hash)
return;
vdso_info.nbucket = hash[0];
vdso_info.bucket = &hash[2];
vdso_info.chain = &hash[vdso_info.nbucket + 2];
}
static void* mi_os_vdso_get_sym(void) {
const char *name = "__vdso_getcpu";
Elf64_Word chain = ((Elf64_Word*)vdso_info.bucket)[11538501 % vdso_info.nbucket];
for (; chain != STN_UNDEF; chain = ((Elf64_Word*)vdso_info.chain)[chain]) {
Elf64_Sym* sym = &vdso_info.symtab[chain];
// Check for a defined global or weak function with right name
if (ELF64_ST_TYPE(sym->st_info) != STT_FUNC)
continue;
if (ELF64_ST_BIND(sym->st_info) != STB_GLOBAL && ELF64_ST_BIND(sym->st_info) != STB_WEAK)
continue;
if (sym->st_shndx == SHN_UNDEF)
continue;
if (strcmp(name, vdso_info.symstrings + sym->st_name))
continue;
return (void*)(vdso_info.load_offset + sym->st_value);
}
return 0;
}
#endif
void _mi_os_init() {
// get the page size
long result = sysconf(_SC_PAGESIZE);
@ -222,6 +297,11 @@ void _mi_os_init() {
os_alloc_granularity = os_page_size;
}
large_os_page_size = 2*MiB; // TODO: can we query the OS for this?
#if defined(__linux__) && defined(__x86_64__)
// set up symbols exported by vDSO (virtual dynamic shared object)
mi_os_vdso_init();
mi_os_getcpu_vdso = (getcpu_vdso_t)mi_os_vdso_get_sym();
#endif
}
#endif
@ -1173,9 +1253,13 @@ static size_t mi_os_numa_node_countx(void) {
#include <stdio.h> // access
static size_t mi_os_numa_nodex(void) {
#ifdef SYS_getcpu
unsigned long node = 0;
unsigned long ncpu = 0;
#if defined(SYS_getcpu)
unsigned int node = 0, ncpu = 0;
#if defined(__x86_64__)
if (mi_likely(mi_os_getcpu_vdso != NULL)) {
return (mi_os_getcpu_vdso(&ncpu, &node, NULL) != -1) ? node : 0;
}
#endif
long err = syscall(SYS_getcpu, &ncpu, &node, NULL);
if (err != 0) return 0;
return node;