diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c4802295a7..45ee1dfbb41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -317,6 +317,7 @@ include (cmake/find_hdfs3.cmake) # uses protobuf include (cmake/find_consistent-hashing.cmake) include (cmake/find_base64.cmake) include (cmake/find_hyperscan.cmake) +include (cmake/find_lfalloc.cmake) find_contrib_lib(cityhash) find_contrib_lib(farmhash) find_contrib_lib(metrohash) diff --git a/cmake/find_lfalloc.cmake b/cmake/find_lfalloc.cmake new file mode 100644 index 00000000000..9383bd01f30 --- /dev/null +++ b/cmake/find_lfalloc.cmake @@ -0,0 +1,9 @@ +if (NOT SANITIZE AND NOT ARCH_ARM AND NOT ARCH_32 AND NOT ARCH_PPC64LE) + if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/lfalloc/src/lf_allocX64.h") + message (FATAL_ERROR "submodule contrib/lfalloc is missing. to fix try run: \n git submodule update --init --recursive") + endif() + set (USE_LFALLOC 1) + set (USE_LFALLOC_RANDOM_HINT 1) + set (LFALLOC_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/lfalloc/src) + message (STATUS "Using lfalloc=${USE_LFALLOC}: ${LFALLOC_INCLUDE_DIR}") +endif () diff --git a/contrib/lfalloc/src/lf_allocX64.h b/contrib/lfalloc/src/lf_allocX64.h new file mode 100644 index 00000000000..2c4cf3f1021 --- /dev/null +++ b/contrib/lfalloc/src/lf_allocX64.h @@ -0,0 +1,1813 @@ +#pragma once + +#include +#include +#include + +#include "lfmalloc.h" + +#include "util/system/compiler.h" +#include "util/system/types.h" +#include + +#ifdef _MSC_VER +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#ifdef _M_X64 +#define _64_ +#endif +#include +#define WIN32_LEAN_AND_MEAN +#include +#pragma intrinsic(_InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd) + +#include +#include +#include + +#define PERTHREAD __declspec(thread) +#define _win_ +#define Y_FORCE_INLINE __forceinline + +using TAtomic = volatile long; + +static inline long AtomicAdd(TAtomic& a, long b) { + return _InterlockedExchangeAdd(&a, b) + b; +} + +static inline long AtomicSub(TAtomic& a, long b) { + return AtomicAdd(a, -b); +} + +#define Y_ASSERT_NOBT(x) ((void)0) + +#else + +#include "util/system/defaults.h" +#include "util/system/atomic.h" +#include + +#if !defined(NDEBUG) && !defined(__GCCXML__) +#define Y_ASSERT_NOBT(a) \ + do { \ + if (Y_UNLIKELY(!(a))) { \ + assert(false && (a)); \ + } \ + } while (0) +#else +#define Y_ASSERT_NOBT(a) \ + do { \ + if (false) { \ + bool __xxx = static_cast(a); \ + Y_UNUSED(__xxx); \ + } \ + } while (0) +#endif + +#include +#include +#include +#include +#include +#include + +#if defined(_linux_) +#if !defined(MADV_HUGEPAGE) +#define MADV_HUGEPAGE 14 +#endif +#if !defined(MAP_HUGETLB) +#define MAP_HUGETLB 0x40000 +#endif +#endif + +#define PERTHREAD __thread + +#endif + +#ifndef _darwin_ + +#ifndef Y_ARRAY_SIZE +#define Y_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) +#endif + +#ifndef NDEBUG +#define DBG_FILL_MEMORY +static bool FillMemoryOnAllocation = true; +#endif + +static bool TransparentHugePages = false; // force MADV_HUGEPAGE for large allocs +static bool MapHugeTLB = false; // force MAP_HUGETLB for small allocs +static bool EnableDefrag = true; + +// Buffers that are larger than this size will not be filled with 0xcf +#ifndef DBG_FILL_MAX_SIZE +#define DBG_FILL_MAX_SIZE 0x01000000000000ULL +#endif + +template +inline T* DoCas(T* volatile* target, T* exchange, T* compare) { +#if defined(_linux_) + return __sync_val_compare_and_swap(target, compare, exchange); +#elif defined(_WIN32) +#ifdef _64_ + return 
(T*)_InterlockedCompareExchange64((__int64*)target, (__int64)exchange, (__int64)compare); +#else + //return (T*)InterlockedCompareExchangePointer(targetVoidP, exchange, compare); + return (T*)_InterlockedCompareExchange((LONG*)target, (LONG)exchange, (LONG)compare); +#endif +#elif defined(__i386) || defined(__x86_64__) + union { + T* volatile* NP; + void* volatile* VoidP; + } gccSucks; + gccSucks.NP = target; + void* volatile* targetVoidP = gccSucks.VoidP; + + __asm__ __volatile__( + "lock\n\t" + "cmpxchg %2,%0\n\t" + : "+m"(*(targetVoidP)), "+a"(compare) + : "r"(exchange) + : "cc", "memory"); + return compare; +#else +#error inline_cas not defined for this platform +#endif +} + +#ifdef _64_ +const uintptr_t N_MAX_WORKSET_SIZE = 0x100000000ll * 200; +const uintptr_t N_HUGE_AREA_FINISH = 0x700000000000ll; +#ifndef _freebsd_ +const uintptr_t LINUX_MMAP_AREA_START = 0x100000000ll; +static uintptr_t volatile linuxAllocPointer = LINUX_MMAP_AREA_START; +static uintptr_t volatile linuxAllocPointerHuge = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE; +#endif +#else +const uintptr_t N_MAX_WORKSET_SIZE = 0xffffffff; +#endif +#define ALLOC_START ((char*)0) + +const size_t N_CHUNK_SIZE = 1024 * 1024; +const size_t N_CHUNKS = N_MAX_WORKSET_SIZE / N_CHUNK_SIZE; +const size_t N_LARGE_ALLOC_SIZE = N_CHUNK_SIZE * 128; + +// map size idx to size in bytes +#ifdef LFALLOC_YT +const int N_SIZES = 27; +#else +const int N_SIZES = 25; +#endif +const int nSizeIdxToSize[N_SIZES] = { + -1, +#if defined(_64_) + 16, 16, 32, 32, 48, 64, 96, 128, +#else + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, +#endif + 192, 256, 384, 512, 768, 1024, 1536, 2048, + 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, +#ifdef LFALLOC_YT + 49152, 65536 +#endif +}; +#ifdef LFALLOC_YT +const size_t N_MAX_FAST_SIZE = 65536; +#else +const size_t N_MAX_FAST_SIZE = 32768; +#endif +const unsigned char size2idxArr1[64 + 1] = { + 1, +#if defined(_64_) + 2, 2, 4, 4, // 16, 16, 32, 32 +#else + 1, 2, 3, 4, // 8, 16, 24, 32 +#endif + 5, 5, 6, 6, // 48, 64 + 7, 7, 7, 7, 8, 8, 8, 8, // 96, 128 + 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, // 192, 256 + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // 384 + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12 // 512 +}; +#ifdef LFALLOC_YT +const unsigned char size2idxArr2[256] = { +#else +const unsigned char size2idxArr2[128] = { +#endif + 12, 12, 13, 14, // 512, 512, 768, 1024 + 15, 15, 16, 16, // 1536, 2048 + 17, 17, 17, 17, 18, 18, 18, 18, // 3072, 4096 + 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, // 6144, 8192 + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, // 12288 + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, // 16384 + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, // 24576 + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, // 32768 +#ifdef LFALLOC_YT + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, // 49152 + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, // 65536 +#endif +}; + +// map entry number to size idx +// special size idx's: 0 = not used, -1 = mem locked, but not allocated +static volatile char chunkSizeIdx[N_CHUNKS]; +const int FREE_CHUNK_ARR_BUF = 0x20000; // this is effectively 128G of free memory (with 1M chunks), should not be exhausted actually +static volatile uintptr_t freeChunkArr[FREE_CHUNK_ARR_BUF]; +static volatile int freeChunkCount; + +static void AddFreeChunk(uintptr_t chunkId) { + chunkSizeIdx[chunkId] = -1; + if (Y_UNLIKELY(freeChunkCount == FREE_CHUNK_ARR_BUF)) + NMalloc::AbortFromCorruptedAllocator(); // free chunks arrray overflowed + freeChunkArr[freeChunkCount++] = chunkId; +} + +static bool GetFreeChunk(uintptr_t* res) { + if (freeChunkCount == 0) { + *res = 0; + return false; + } + *res = freeChunkArr[--freeChunkCount]; + return true; +} + +////////////////////////////////////////////////////////////////////////// +enum ELFAllocCounter { + CT_USER_ALLOC, // accumulated size requested by user code + CT_MMAP, // accumulated mmapped size + CT_MMAP_CNT, // number of mmapped regions + CT_MUNMAP, // accumulated unmmapped size + CT_MUNMAP_CNT, // number of munmaped regions + CT_SYSTEM_ALLOC, // accumulated allocated size for internal lfalloc needs + CT_SYSTEM_FREE, // accumulated deallocated size for internal lfalloc needs + CT_SMALL_ALLOC, // accumulated allocated size for fixed-size blocks + CT_SMALL_FREE, // accumulated deallocated size for fixed-size blocks + CT_LARGE_ALLOC, // accumulated allocated size for large blocks + CT_LARGE_FREE, // accumulated deallocated size for large blocks + CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations + CT_DEGRAGMENT_CNT, // number of memory defragmentations + CT_MAX +}; + +static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value); + +////////////////////////////////////////////////////////////////////////// +enum EMMapMode { + MM_NORMAL, // memory for small allocs + MM_HUGE // memory for large allocs +}; + +#ifndef _MSC_VER +inline void VerifyMmapResult(void* result) { + if (Y_UNLIKELY(result == MAP_FAILED)) + NMalloc::AbortFromCorruptedAllocator(); // negative size requested? 
or just out of mem +} +#endif + +#if !defined(_MSC_VER) && !defined(_freebsd_) && defined(_64_) +static char* AllocWithMMapLinuxImpl(uintptr_t sz, EMMapMode mode) { + char* volatile* areaPtr; + char* areaStart; + uintptr_t areaFinish; + + int mapProt = PROT_READ | PROT_WRITE; + int mapFlags = MAP_PRIVATE | MAP_ANON; + + if (mode == MM_HUGE) { + areaPtr = reinterpret_cast(&linuxAllocPointerHuge); + areaStart = reinterpret_cast(LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE); + areaFinish = N_HUGE_AREA_FINISH; + } else { + areaPtr = reinterpret_cast(&linuxAllocPointer); + areaStart = reinterpret_cast(LINUX_MMAP_AREA_START); + areaFinish = N_MAX_WORKSET_SIZE; + + if (MapHugeTLB) { + mapFlags |= MAP_HUGETLB; + } + } + + bool wrapped = false; + for (;;) { + char* prevAllocPtr = *areaPtr; + char* nextAllocPtr = prevAllocPtr + sz; + if (uintptr_t(nextAllocPtr - (char*)nullptr) >= areaFinish) { + if (Y_UNLIKELY(wrapped)) { + // virtual memory is over fragmented + NMalloc::AbortFromCorruptedAllocator(); + } + // wrap after all area is used + DoCas(areaPtr, areaStart, prevAllocPtr); + wrapped = true; + continue; + } + + if (DoCas(areaPtr, nextAllocPtr, prevAllocPtr) != prevAllocPtr) + continue; + + char* largeBlock = (char*)mmap(prevAllocPtr, sz, mapProt, mapFlags, -1, 0); + VerifyMmapResult(largeBlock); + if (largeBlock == prevAllocPtr) + return largeBlock; + if (largeBlock) + munmap(largeBlock, sz); + + if (sz < 0x80000) { + // skip utilized area with big steps + DoCas(areaPtr, nextAllocPtr + 0x10 * 0x10000, nextAllocPtr); + } + } +} +#endif + +static char* AllocWithMMap(uintptr_t sz, EMMapMode mode) { + (void)mode; +#ifdef _MSC_VER + char* largeBlock = (char*)VirtualAlloc(0, sz, MEM_RESERVE, PAGE_READWRITE); + if (Y_UNLIKELY(largeBlock == nullptr)) + NMalloc::AbortFromCorruptedAllocator(); // out of memory + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE)) + NMalloc::AbortFromCorruptedAllocator(); // out of working set, something has broken +#else +#if defined(_freebsd_) || !defined(_64_) || defined(USE_LFALLOC_RANDOM_HINT) + uintptr_t areaStart; + uintptr_t areaFinish; + if (mode == MM_HUGE) { + areaStart = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE; + areaFinish = N_HUGE_AREA_FINISH; + } else { + areaStart = LINUX_MMAP_AREA_START; + areaFinish = N_MAX_WORKSET_SIZE; + } +#if defined(USE_LFALLOC_RANDOM_HINT) + static thread_local std::mt19937_64 generator(std::random_device{}()); + std::uniform_int_distribution distr(areaStart, areaFinish / 2); + char* largeBlock = (char*)mmap(reinterpret_cast(distr(generator)), sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +#else + char* largeBlock = (char*)mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +#endif + VerifyMmapResult(largeBlock); + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= areaFinish)) + NMalloc::AbortFromCorruptedAllocator(); // out of working set, something has broken +#else + char* largeBlock = AllocWithMMapLinuxImpl(sz, mode); + if (TransparentHugePages) { + madvise(largeBlock, sz, MADV_HUGEPAGE); + } +#endif +#endif + Y_ASSERT_NOBT(largeBlock); + IncrementCounter(CT_MMAP, sz); + IncrementCounter(CT_MMAP_CNT, 1); + return largeBlock; +} + +enum class ELarge : ui8 { + Free = 0, // block in free cache + Alloc = 1, // block is allocated + Gone = 2, // block was unmapped +}; + +struct TLargeBlk { + + static TLargeBlk* As(void *raw) { + return reinterpret_cast((char*)raw - 4096ll); + } + + static const TLargeBlk* As(const void *raw) { + return 
reinterpret_cast((const char*)raw - 4096ll); + } + + void SetSize(size_t bytes, size_t pages) { + Pages = pages; + Bytes = bytes; + } + + void Mark(ELarge state) { + const ui64 marks[] = { + 0x8b38aa5ca4953c98, // ELarge::Free + 0xf916d33584eb5087, // ELarge::Alloc + 0xd33b0eca7651bc3f // ELarge::Gone + }; + + Token = size_t(marks[ui8(state)]); + } + + size_t Pages; // Total pages allocated with mmap like call + size_t Bytes; // Actually requested bytes by user + size_t Token; // Block state token, see ELarge enum. +}; + + +static void LargeBlockUnmap(void* p, size_t pages) { + const auto bytes = (pages + 1) * uintptr_t(4096); + + IncrementCounter(CT_MUNMAP, bytes); + IncrementCounter(CT_MUNMAP_CNT, 1); +#ifdef _MSC_VER + Y_ASSERT_NOBT(0); +#else + TLargeBlk::As(p)->Mark(ELarge::Gone); + munmap((char*)p - 4096ll, bytes); +#endif +} + +////////////////////////////////////////////////////////////////////////// +const size_t LB_BUF_SIZE = 250; +const size_t LB_BUF_HASH = 977; +static int LB_LIMIT_TOTAL_SIZE = 500 * 1024 * 1024 / 4096; // do not keep more then this mem total in lbFreePtrs[] +static void* volatile lbFreePtrs[LB_BUF_HASH][LB_BUF_SIZE]; +static TAtomic lbFreePageCount; + + +static void* LargeBlockAlloc(size_t _nSize, ELFAllocCounter counter) { + size_t pgCount = (_nSize + 4095) / 4096; +#ifdef _MSC_VER + char* pRes = (char*)VirtualAlloc(0, (pgCount + 1) * 4096ll, MEM_COMMIT, PAGE_READWRITE); + if (Y_UNLIKELY(pRes == 0)) { + NMalloc::AbortFromCorruptedAllocator(); // out of memory + } +#else + + IncrementCounter(counter, pgCount * 4096ll); + IncrementCounter(CT_SYSTEM_ALLOC, 4096ll); + + int lbHash = pgCount % LB_BUF_HASH; + for (int i = 0; i < LB_BUF_SIZE; ++i) { + void* p = lbFreePtrs[lbHash][i]; + if (p == nullptr) + continue; + if (DoCas(&lbFreePtrs[lbHash][i], (void*)nullptr, p) == p) { + size_t realPageCount = TLargeBlk::As(p)->Pages; + if (realPageCount == pgCount) { + AtomicAdd(lbFreePageCount, -pgCount); + TLargeBlk::As(p)->Mark(ELarge::Alloc); + return p; + } else { + if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) != (void*)nullptr) { + // block was freed while we were busy + AtomicAdd(lbFreePageCount, -realPageCount); + LargeBlockUnmap(p, realPageCount); + --i; + } + } + } + } + char* pRes = AllocWithMMap((pgCount + 1) * 4096ll, MM_HUGE); +#endif + pRes += 4096ll; + TLargeBlk::As(pRes)->SetSize(_nSize, pgCount); + TLargeBlk::As(pRes)->Mark(ELarge::Alloc); + + return pRes; +} + +#ifndef _MSC_VER +static void FreeAllLargeBlockMem() { + for (auto& lbFreePtr : lbFreePtrs) { + for (int i = 0; i < LB_BUF_SIZE; ++i) { + void* p = lbFreePtr[i]; + if (p == nullptr) + continue; + if (DoCas(&lbFreePtr[i], (void*)nullptr, p) == p) { + int pgCount = TLargeBlk::As(p)->Pages; + AtomicAdd(lbFreePageCount, -pgCount); + LargeBlockUnmap(p, pgCount); + } + } + } +} +#endif + +static void LargeBlockFree(void* p, ELFAllocCounter counter) { + if (p == nullptr) + return; +#ifdef _MSC_VER + VirtualFree((char*)p - 4096ll, 0, MEM_RELEASE); +#else + size_t pgCount = TLargeBlk::As(p)->Pages; + + TLargeBlk::As(p)->Mark(ELarge::Free); + IncrementCounter(counter, pgCount * 4096ll); + IncrementCounter(CT_SYSTEM_FREE, 4096ll); + + if (lbFreePageCount > LB_LIMIT_TOTAL_SIZE) + FreeAllLargeBlockMem(); + int lbHash = pgCount % LB_BUF_HASH; + for (int i = 0; i < LB_BUF_SIZE; ++i) { + if (lbFreePtrs[lbHash][i] == nullptr) { + if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) == nullptr) { + AtomicAdd(lbFreePageCount, pgCount); + return; + } + } + } + + LargeBlockUnmap(p, pgCount); +#endif +} + 
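+// Note on the helpers below: SystemAlloc()/SystemFree() cover lfalloc's own bookkeeping +// allocations (per-thread TThreadAllocInfo records, defragmentation scratch buffers, the +// block-utilization dump buffer). They simply reuse the large-block path above and are +// accounted under CT_SYSTEM_ALLOC / CT_SYSTEM_FREE rather than the user-facing counters.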
+static void* SystemAlloc(size_t _nSize) { + //HeapAlloc(GetProcessHeap(), HEAP_GENERATE_EXCEPTIONS, _nSize); + return LargeBlockAlloc(_nSize, CT_SYSTEM_ALLOC); +} +static void SystemFree(void* p) { + //HeapFree(GetProcessHeap(), 0, p); + LargeBlockFree(p, CT_SYSTEM_FREE); +} + +////////////////////////////////////////////////////////////////////////// +static int* volatile nLock = nullptr; +static int nLockVar; +inline void RealEnterCriticalDefault(int* volatile* lockPtr) { + while (DoCas(lockPtr, &nLockVar, (int*)nullptr) != nullptr) + ; //pthread_yield(); +} +inline void RealLeaveCriticalDefault(int* volatile* lockPtr) { + *lockPtr = nullptr; +} +static void (*RealEnterCritical)(int* volatile* lockPtr) = RealEnterCriticalDefault; +static void (*RealLeaveCritical)(int* volatile* lockPtr) = RealLeaveCriticalDefault; +static void (*BeforeLFAllocGlobalLockAcquired)() = nullptr; +static void (*AfterLFAllocGlobalLockReleased)() = nullptr; +class CCriticalSectionLockMMgr { +public: + CCriticalSectionLockMMgr() { + if (BeforeLFAllocGlobalLockAcquired) { + BeforeLFAllocGlobalLockAcquired(); + } + RealEnterCritical(&nLock); + } + ~CCriticalSectionLockMMgr() { + RealLeaveCritical(&nLock); + if (AfterLFAllocGlobalLockReleased) { + AfterLFAllocGlobalLockReleased(); + } + } +}; + +////////////////////////////////////////////////////////////////////////// +class TLFAllocFreeList { + struct TNode { + TNode* Next; + }; + + TNode* volatile Head; + TNode* volatile Pending; + TAtomic PendingToFreeListCounter; + TAtomic AllocCount; + + static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) { + for (;;) { + TNode* volatile prevHead = *headPtr; + n->Next = prevHead; + if (DoCas(headPtr, n, prevHead) == prevHead) + break; + } + } + Y_FORCE_INLINE void* DoAlloc() { + TNode* res; + for (res = Head; res; res = Head) { + TNode* keepNext = res->Next; + if (DoCas(&Head, keepNext, res) == res) { + //Y_VERIFY(keepNext == res->Next); + break; + } + } + return res; + } + void FreeList(TNode* fl) { + if (!fl) + return; + TNode* flTail = fl; + while (flTail->Next) + flTail = flTail->Next; + for (;;) { + TNode* volatile prevHead = Head; + flTail->Next = prevHead; + if (DoCas(&Head, fl, prevHead) == prevHead) + break; + } + } + +public: + Y_FORCE_INLINE void Free(void* ptr) { + TNode* newFree = (TNode*)ptr; + if (AtomicAdd(AllocCount, 0) == 0) + Enqueue(&Head, newFree); + else + Enqueue(&Pending, newFree); + } + Y_FORCE_INLINE void* Alloc() { + TAtomic keepCounter = AtomicAdd(PendingToFreeListCounter, 0); + TNode* fl = Pending; + if (AtomicAdd(AllocCount, 1) == 1) { + // No other allocs in progress. + // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads. 
+ // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList + if (fl && keepCounter == AtomicAdd(PendingToFreeListCounter, 0) && DoCas(&Pending, (TNode*)nullptr, fl) == fl) { + // pick first element from Pending and return it + void* res = fl; + fl = fl->Next; + // if there are other elements in Pending list, add them to main free list + FreeList(fl); + AtomicAdd(PendingToFreeListCounter, 1); + AtomicAdd(AllocCount, -1); + return res; + } + } + void* res = DoAlloc(); + AtomicAdd(AllocCount, -1); + return res; + } + void* GetWholeList() { + TNode* res; + for (res = Head; res; res = Head) { + if (DoCas(&Head, (TNode*)nullptr, res) == res) + break; + } + return res; + } + void ReturnWholeList(void* ptr) { + while (AtomicAdd(AllocCount, 0) != 0) // theoretically can run into problems with parallel DoAlloc() + ; //ThreadYield(); + for (;;) { + TNode* prevHead = Head; + if (DoCas(&Head, (TNode*)ptr, prevHead) == prevHead) { + FreeList(prevHead); + break; + } + } + } +}; + +///////////////////////////////////////////////////////////////////////// +static TLFAllocFreeList globalFreeLists[N_SIZES]; +static char* volatile globalCurrentPtr[N_SIZES]; +static TLFAllocFreeList blockFreeList; + +// globalFreeLists[] contains TFreeListGroup, each of them points up to 15 free blocks +const int FL_GROUP_SIZE = 15; +struct TFreeListGroup { + TFreeListGroup* Next; + char* Ptrs[FL_GROUP_SIZE]; +}; +#ifdef _64_ +const int FREE_LIST_GROUP_SIZEIDX = 8; +#else +const int FREE_LIST_GROUP_SIZEIDX = 6; +#endif + +////////////////////////////////////////////////////////////////////////// +// find free chunks and reset chunk size so they can be reused by different sized allocations +// do not look at blockFreeList (TFreeListGroup has same size for any allocations) +static bool DefragmentMem() { + if (!EnableDefrag) { + return false; + } + + IncrementCounter(CT_DEGRAGMENT_CNT, 1); + + int* nFreeCount = (int*)SystemAlloc(N_CHUNKS * sizeof(int)); + if (Y_UNLIKELY(!nFreeCount)) { + //__debugbreak(); + NMalloc::AbortFromCorruptedAllocator(); + } + memset(nFreeCount, 0, N_CHUNKS * sizeof(int)); + + TFreeListGroup* wholeLists[N_SIZES]; + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) { + wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList(); + for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) { + for (auto pData : g->Ptrs) { + if (pData) { + uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE; + ++nFreeCount[nChunk]; + Y_ASSERT_NOBT(chunkSizeIdx[nChunk] == nSizeIdx); + } + } + } + } + + bool bRes = false; + for (size_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) { + int fc = nFreeCount[nChunk]; + nFreeCount[nChunk] = 0; + if (chunkSizeIdx[nChunk] <= 0) + continue; + int nEntries = N_CHUNK_SIZE / nSizeIdxToSize[static_cast(chunkSizeIdx[nChunk])]; + Y_ASSERT_NOBT(fc <= nEntries); // can not have more free blocks then total count + if (fc == nEntries) { + bRes = true; + nFreeCount[nChunk] = 1; + } + } + if (bRes) { + for (auto& wholeList : wholeLists) { + TFreeListGroup** ppPtr = &wholeList; + while (*ppPtr) { + TFreeListGroup* g = *ppPtr; + int dst = 0; + for (auto pData : g->Ptrs) { + if (pData) { + uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE; + if (nFreeCount[nChunk] == 0) + g->Ptrs[dst++] = pData; // block is not freed, keep pointer + } + } + if (dst == 0) { + // no valid pointers in group, free it + *ppPtr = g->Next; + blockFreeList.Free(g); + } else { + // reset invalid pointers to 0 + for (int i = dst; i < FL_GROUP_SIZE; 
++i) + g->Ptrs[i] = nullptr; + ppPtr = &g->Next; + } + } + } + for (uintptr_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) { + if (!nFreeCount[nChunk]) + continue; + char* pStart = ALLOC_START + nChunk * N_CHUNK_SIZE; +#ifdef _win_ + VirtualFree(pStart, N_CHUNK_SIZE, MEM_DECOMMIT); +#elif defined(_freebsd_) + madvise(pStart, N_CHUNK_SIZE, MADV_FREE); +#else + madvise(pStart, N_CHUNK_SIZE, MADV_DONTNEED); +#endif + AddFreeChunk(nChunk); + } + } + + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) + globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]); + + SystemFree(nFreeCount); + return bRes; +} + +static Y_FORCE_INLINE void* LFAllocFromCurrentChunk(int nSizeIdx, int blockSize, int count) { + char* volatile* pFreeArray = &globalCurrentPtr[nSizeIdx]; + while (char* newBlock = *pFreeArray) { + char* nextFree = newBlock + blockSize * count; + + // check if there is space in chunk + char* globalEndPtr = ALLOC_START + ((newBlock - ALLOC_START) & ~((uintptr_t)N_CHUNK_SIZE - 1)) + N_CHUNK_SIZE; + if (nextFree >= globalEndPtr) { + if (nextFree > globalEndPtr) + break; + nextFree = nullptr; // it was last block in chunk + } + if (DoCas(pFreeArray, nextFree, newBlock) == newBlock) + return newBlock; + } + return nullptr; +} + +enum EDefrag { + MEM_DEFRAG, + NO_MEM_DEFRAG, +}; + +static void* SlowLFAlloc(int nSizeIdx, int blockSize, EDefrag defrag) { + IncrementCounter(CT_SLOW_ALLOC_CNT, 1); + + CCriticalSectionLockMMgr ls; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1); + if (res) + return res; // might happen when other thread allocated new current chunk + + for (;;) { + uintptr_t nChunk; + if (GetFreeChunk(&nChunk)) { + char* newPlace = ALLOC_START + nChunk * N_CHUNK_SIZE; +#ifdef _MSC_VER + void* pTest = VirtualAlloc(newPlace, N_CHUNK_SIZE, MEM_COMMIT, PAGE_READWRITE); + Y_ASSERT_NOBT(pTest == newPlace); +#endif + chunkSizeIdx[nChunk] = (char)nSizeIdx; + globalCurrentPtr[nSizeIdx] = newPlace + blockSize; + return newPlace; + } + + // out of luck, try to defrag + if (defrag == MEM_DEFRAG && DefragmentMem()) { + continue; + } + + char* largeBlock = AllocWithMMap(N_LARGE_ALLOC_SIZE, MM_NORMAL); + uintptr_t addr = ((largeBlock - ALLOC_START) + N_CHUNK_SIZE - 1) & (~(N_CHUNK_SIZE - 1)); + uintptr_t endAddr = ((largeBlock - ALLOC_START) + N_LARGE_ALLOC_SIZE) & (~(N_CHUNK_SIZE - 1)); + for (uintptr_t p = addr; p < endAddr; p += N_CHUNK_SIZE) { + uintptr_t chunk = p / N_CHUNK_SIZE; + Y_ASSERT_NOBT(chunk * N_CHUNK_SIZE == p); + Y_ASSERT_NOBT(chunkSizeIdx[chunk] == 0); + AddFreeChunk(chunk); + } + } + return nullptr; +} + +// allocate single block +static Y_FORCE_INLINE void* LFAllocNoCache(int nSizeIdx, EDefrag defrag) { + int blockSize = nSizeIdxToSize[nSizeIdx]; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1); + if (res) + return res; + + return SlowLFAlloc(nSizeIdx, blockSize, defrag); +} + +// allocate multiple blocks, returns number of blocks allocated (max FL_GROUP_SIZE) +// buf should have space for at least FL_GROUP_SIZE elems +static Y_FORCE_INLINE int LFAllocNoCacheMultiple(int nSizeIdx, char** buf) { + int blockSize = nSizeIdxToSize[nSizeIdx]; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, FL_GROUP_SIZE); + if (res) { + char* resPtr = (char*)res; + for (int k = 0; k < FL_GROUP_SIZE; ++k) { + buf[k] = resPtr; + resPtr += blockSize; + } + return FL_GROUP_SIZE; + } + buf[0] = (char*)SlowLFAlloc(nSizeIdx, blockSize, MEM_DEFRAG); + return 1; +} + +// take several blocks from global free list (max FL_GROUP_SIZE blocks), returns number of blocks 
taken +// buf should have space for at least FL_GROUP_SIZE elems +static Y_FORCE_INLINE int TakeBlocksFromGlobalFreeList(int nSizeIdx, char** buf) { + TLFAllocFreeList& fl = globalFreeLists[nSizeIdx]; + TFreeListGroup* g = (TFreeListGroup*)fl.Alloc(); + if (g) { + int resCount = 0; + for (auto& ptr : g->Ptrs) { + if (ptr) + buf[resCount++] = ptr; + else + break; + } + blockFreeList.Free(g); + return resCount; + } + return 0; +} + +// add several blocks to global free list +static Y_FORCE_INLINE void PutBlocksToGlobalFreeList(ptrdiff_t nSizeIdx, char** buf, int count) { + for (int startIdx = 0; startIdx < count;) { + TFreeListGroup* g = (TFreeListGroup*)blockFreeList.Alloc(); + Y_ASSERT_NOBT(sizeof(TFreeListGroup) == nSizeIdxToSize[FREE_LIST_GROUP_SIZEIDX]); + if (!g) { + g = (TFreeListGroup*)LFAllocNoCache(FREE_LIST_GROUP_SIZEIDX, NO_MEM_DEFRAG); + } + + int groupSize = count - startIdx; + if (groupSize > FL_GROUP_SIZE) + groupSize = FL_GROUP_SIZE; + for (int i = 0; i < groupSize; ++i) + g->Ptrs[i] = buf[startIdx + i]; + for (int i = groupSize; i < FL_GROUP_SIZE; ++i) + g->Ptrs[i] = nullptr; + + // add free group to the global list + TLFAllocFreeList& fl = globalFreeLists[nSizeIdx]; + fl.Free(g); + + startIdx += groupSize; + } +} + +////////////////////////////////////////////////////////////////////////// +static TAtomic GlobalCounters[CT_MAX]; +const int MAX_LOCAL_UPDATES = 100; + +struct TLocalCounter { + intptr_t Value; + int Updates; + TAtomic* Parent; + + Y_FORCE_INLINE void Init(TAtomic* parent) { + Parent = parent; + Value = 0; + Updates = 0; + } + + Y_FORCE_INLINE void Increment(size_t value) { + Value += value; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(); + } + } + + Y_FORCE_INLINE void Flush() { + AtomicAdd(*Parent, Value); + Value = 0; + Updates = 0; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// DBG stuff +//////////////////////////////////////////////////////////////////////////////// + +#if defined(LFALLOC_DBG) + +struct TPerTagAllocCounter { + TAtomic Size; + TAtomic Count; + + Y_FORCE_INLINE void Alloc(size_t size) { + AtomicAdd(Size, size); + AtomicAdd(Count, 1); + } + + Y_FORCE_INLINE void Free(size_t size) { + AtomicSub(Size, size); + AtomicSub(Count, 1); + } +}; + +struct TLocalPerTagAllocCounter { + intptr_t Size; + int Count; + int Updates; + + Y_FORCE_INLINE void Init() { + Size = 0; + Count = 0; + Updates = 0; + } + + Y_FORCE_INLINE void Alloc(TPerTagAllocCounter& parent, size_t size) { + Size += size; + ++Count; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(parent); + } + } + + Y_FORCE_INLINE void Free(TPerTagAllocCounter& parent, size_t size) { + Size -= size; + --Count; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(parent); + } + } + + Y_FORCE_INLINE void Flush(TPerTagAllocCounter& parent) { + AtomicAdd(parent.Size, Size); + Size = 0; + AtomicAdd(parent.Count, Count); + Count = 0; + Updates = 0; + } +}; + +static const int DBG_ALLOC_MAX_TAG = 1000; +static const int DBG_ALLOC_NUM_SIZES = 30; +static TPerTagAllocCounter GlobalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES]; + +#endif // LFALLOC_DBG + +////////////////////////////////////////////////////////////////////////// +const int THREAD_BUF = 256; +static int borderSizes[N_SIZES]; +const int MAX_MEM_PER_SIZE_PER_THREAD = 512 * 1024; +struct TThreadAllocInfo { + // FreePtrs - pointers to first free blocks in per thread block list + // LastFreePtrs - pointers to last blocks in lists, may be invalid if FreePtr is zero + char* 
FreePtrs[N_SIZES][THREAD_BUF]; + int FreePtrIndex[N_SIZES]; + TThreadAllocInfo* pNextInfo; + TLocalCounter LocalCounters[CT_MAX]; + +#if defined(LFALLOC_DBG) + TLocalPerTagAllocCounter LocalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES]; +#endif +#ifdef _win_ + HANDLE hThread; +#endif + + void Init(TThreadAllocInfo** pHead) { + memset(this, 0, sizeof(*this)); + for (auto& i : FreePtrIndex) + i = THREAD_BUF; +#ifdef _win_ + BOOL b = DuplicateHandle( + GetCurrentProcess(), GetCurrentThread(), + GetCurrentProcess(), &hThread, + 0, FALSE, DUPLICATE_SAME_ACCESS); + Y_ASSERT_NOBT(b); +#endif + pNextInfo = *pHead; + *pHead = this; + for (int k = 0; k < N_SIZES; ++k) { + int maxCount = MAX_MEM_PER_SIZE_PER_THREAD / nSizeIdxToSize[k]; + if (maxCount > THREAD_BUF) + maxCount = THREAD_BUF; + borderSizes[k] = THREAD_BUF - maxCount; + } + for (int i = 0; i < CT_MAX; ++i) { + LocalCounters[i].Init(&GlobalCounters[i]); + } +#if defined(LFALLOC_DBG) + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = LocalPerTagAllocCounters[tag][sizeIdx]; + local.Init(); + } + } +#endif + } + void Done() { + for (auto sizeIdx : FreePtrIndex) { + Y_ASSERT_NOBT(sizeIdx == THREAD_BUF); + } + for (auto& localCounter : LocalCounters) { + localCounter.Flush(); + } +#if defined(LFALLOC_DBG) + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = LocalPerTagAllocCounters[tag][sizeIdx]; + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + local.Flush(global); + } + } +#endif +#ifdef _win_ + if (hThread) + CloseHandle(hThread); +#endif + } +}; +PERTHREAD TThreadAllocInfo* pThreadInfo; +static TThreadAllocInfo* pThreadInfoList; + +static int* volatile nLockThreadInfo = nullptr; +class TLockThreadListMMgr { +public: + TLockThreadListMMgr() { + RealEnterCritical(&nLockThreadInfo); + } + ~TLockThreadListMMgr() { + RealLeaveCritical(&nLockThreadInfo); + } +}; + +static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value) { +#ifdef LFALLOC_YT + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + thr->LocalCounters[counter].Increment(value); + } else { + AtomicAdd(GlobalCounters[counter], value); + } +#endif +} + +extern "C" i64 GetLFAllocCounterFast(int counter) { +#ifdef LFALLOC_YT + return GlobalCounters[counter]; +#else + return 0; +#endif +} + +extern "C" i64 GetLFAllocCounterFull(int counter) { +#ifdef LFALLOC_YT + i64 ret = GlobalCounters[counter]; + { + TLockThreadListMMgr ll; + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + ret += pInfo->LocalCounters[counter].Value; + p = &pInfo->pNextInfo; + } + } + return ret; +#else + return 0; +#endif +} + +static void MoveSingleThreadFreeToGlobal(TThreadAllocInfo* pInfo) { + for (int sizeIdx = 0; sizeIdx < N_SIZES; ++sizeIdx) { + int& freePtrIdx = pInfo->FreePtrIndex[sizeIdx]; + char** freePtrs = pInfo->FreePtrs[sizeIdx]; + PutBlocksToGlobalFreeList(sizeIdx, freePtrs + freePtrIdx, THREAD_BUF - freePtrIdx); + freePtrIdx = THREAD_BUF; + } +} + +#ifdef _win_ +static bool IsDeadThread(TThreadAllocInfo* pInfo) { + DWORD dwExit; + bool isDead = !GetExitCodeThread(pInfo->hThread, &dwExit) || dwExit != STILL_ACTIVE; + return isDead; +} + +static void CleanupAfterDeadThreads() { + TLockThreadListMMgr ls; + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + if (IsDeadThread(pInfo)) { + 
MoveSingleThreadFreeToGlobal(pInfo); + pInfo->Done(); + *p = pInfo->pNextInfo; + SystemFree(pInfo); + } else + p = &pInfo->pNextInfo; + } +} +#endif + +#ifndef _win_ +static pthread_key_t ThreadCacheCleaner; +static void* volatile ThreadCacheCleanerStarted; // 0 = not started, -1 = started, -2 = is starting +static PERTHREAD bool IsStoppingThread; + +static void FreeThreadCache(void*) { + TThreadAllocInfo* pToDelete = nullptr; + { + TLockThreadListMMgr ls; + pToDelete = pThreadInfo; + if (pToDelete == nullptr) + return; + + // remove from the list + for (TThreadAllocInfo** p = &pThreadInfoList; *p; p = &(*p)->pNextInfo) { + if (*p == pToDelete) { + *p = pToDelete->pNextInfo; + break; + } + } + IsStoppingThread = true; + pThreadInfo = nullptr; + } + + // free per thread buf + MoveSingleThreadFreeToGlobal(pToDelete); + pToDelete->Done(); + SystemFree(pToDelete); +} +#endif + +static void AllocThreadInfo() { +#ifndef _win_ + if (DoCas(&ThreadCacheCleanerStarted, (void*)-2, (void*)nullptr) == (void*)nullptr) { + pthread_key_create(&ThreadCacheCleaner, FreeThreadCache); + ThreadCacheCleanerStarted = (void*)-1; + } + if (ThreadCacheCleanerStarted != (void*)-1) + return; // do not use ThreadCacheCleaner until it is constructed + + { + if (IsStoppingThread) + return; + TLockThreadListMMgr ls; + if (IsStoppingThread) // better safe than sorry + return; + + pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo)); + pThreadInfo->Init(&pThreadInfoList); + } + pthread_setspecific(ThreadCacheCleaner, (void*)-1); // without value destructor will not be called +#else + CleanupAfterDeadThreads(); + { + TLockThreadListMMgr ls; + pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo)); + pThreadInfo->Init(&pThreadInfoList); + } +#endif +} + + ////////////////////////////////////////////////////////////////////////// + // DBG stuff + ////////////////////////////////////////////////////////////////////////// + +#if defined(LFALLOC_DBG) + +struct TAllocHeader { + size_t Size; + int Tag; + int Cookie; +}; + +static inline void* GetAllocPtr(TAllocHeader* p) { + return p + 1; +} + +static inline TAllocHeader* GetAllocHeader(void* p) { + return ((TAllocHeader*)p) - 1; +} + +PERTHREAD int AllocationTag; +extern "C" int SetThreadAllocTag(int tag) { + int prevTag = AllocationTag; + if (tag < DBG_ALLOC_MAX_TAG && tag >= 0) { + AllocationTag = tag; + } + return prevTag; +} + +PERTHREAD bool ProfileCurrentThread; +extern "C" bool SetProfileCurrentThread(bool newVal) { + bool prevVal = ProfileCurrentThread; + ProfileCurrentThread = newVal; + return prevVal; +} + +static volatile bool ProfileAllThreads; +extern "C" bool SetProfileAllThreads(bool newVal) { + bool prevVal = ProfileAllThreads; + ProfileAllThreads = newVal; + return prevVal; +} + +static volatile bool AllocationSamplingEnabled; +extern "C" bool SetAllocationSamplingEnabled(bool newVal) { + bool prevVal = AllocationSamplingEnabled; + AllocationSamplingEnabled = newVal; + return prevVal; +} + +static size_t AllocationSampleRate = 1000; +extern "C" size_t SetAllocationSampleRate(size_t newVal) { + size_t prevVal = AllocationSampleRate; + AllocationSampleRate = newVal; + return prevVal; +} + +static size_t AllocationSampleMaxSize = N_MAX_FAST_SIZE; +extern "C" size_t SetAllocationSampleMaxSize(size_t newVal) { + size_t prevVal = AllocationSampleMaxSize; + AllocationSampleMaxSize = newVal; + return prevVal; +} + +using TAllocationCallback = int(int tag, size_t size, int sizeIdx); +static TAllocationCallback* AllocationCallback; +extern 
"C" TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) { + TAllocationCallback* prevVal = AllocationCallback; + AllocationCallback = newVal; + return prevVal; +} + +using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx); +static TDeallocationCallback* DeallocationCallback; +extern "C" TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) { + TDeallocationCallback* prevVal = DeallocationCallback; + DeallocationCallback = newVal; + return prevVal; +} + +PERTHREAD TAtomic AllocationsCount; +PERTHREAD bool InAllocationCallback; + +static const int DBG_ALLOC_INVALID_COOKIE = -1; +static inline int SampleAllocation(TAllocHeader* p, int sizeIdx) { + int cookie = DBG_ALLOC_INVALID_COOKIE; + if (AllocationSamplingEnabled && (ProfileCurrentThread || ProfileAllThreads) && !InAllocationCallback) { + if (p->Size > AllocationSampleMaxSize || ++AllocationsCount % AllocationSampleRate == 0) { + if (AllocationCallback) { + InAllocationCallback = true; + cookie = AllocationCallback(p->Tag, p->Size, sizeIdx); + InAllocationCallback = false; + } + } + } + return cookie; +} + +static inline void SampleDeallocation(TAllocHeader* p, int sizeIdx) { + if (p->Cookie != DBG_ALLOC_INVALID_COOKIE && !InAllocationCallback) { + if (DeallocationCallback) { + InAllocationCallback = true; + DeallocationCallback(p->Cookie, p->Tag, p->Size, sizeIdx); + InAllocationCallback = false; + } + } +} + +static inline void TrackPerTagAllocation(TAllocHeader* p, int sizeIdx) { + if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) { + Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES); + auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx]; + + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx]; + local.Alloc(global, p->Size); + } else { + global.Alloc(p->Size); + } + } +} + +static inline void TrackPerTagDeallocation(TAllocHeader* p, int sizeIdx) { + if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) { + Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES); + auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx]; + + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx]; + local.Free(global, p->Size); + } else { + global.Free(p->Size); + } + } +} + +static void* TrackAllocation(void* ptr, size_t size, int sizeIdx) { + TAllocHeader* p = (TAllocHeader*)ptr; + p->Size = size; + p->Tag = AllocationTag; + p->Cookie = SampleAllocation(p, sizeIdx); + TrackPerTagAllocation(p, sizeIdx); + return GetAllocPtr(p); +} + +static void TrackDeallocation(void* ptr, int sizeIdx) { + TAllocHeader* p = (TAllocHeader*)ptr; + SampleDeallocation(p, sizeIdx); + TrackPerTagDeallocation(p, sizeIdx); +} + +struct TPerTagAllocInfo { + ssize_t Count; + ssize_t Size; +}; + +extern "C" void GetPerTagAllocInfo( + bool flushPerThreadCounters, + TPerTagAllocInfo* info, + int& maxTag, + int& numSizes) { + maxTag = DBG_ALLOC_MAX_TAG; + numSizes = DBG_ALLOC_NUM_SIZES; + + if (info) { + if (flushPerThreadCounters) { + TLockThreadListMMgr ll; + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = pInfo->LocalPerTagAllocCounters[tag][sizeIdx]; + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + local.Flush(global); + } + } + p = &pInfo->pNextInfo; + } + } + + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int 
sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + auto& res = info[tag * DBG_ALLOC_NUM_SIZES + sizeIdx]; + res.Count = global.Count; + res.Size = global.Size; + } + } + } +} + +#endif // LFALLOC_DBG + +////////////////////////////////////////////////////////////////////////// +static Y_FORCE_INLINE void* LFAllocImpl(size_t _nSize) { +#if defined(LFALLOC_DBG) + size_t size = _nSize; + _nSize += sizeof(TAllocHeader); +#endif + + IncrementCounter(CT_USER_ALLOC, _nSize); + + int nSizeIdx; + if (_nSize > 512) { + if (_nSize > N_MAX_FAST_SIZE) { + void* ptr = LargeBlockAlloc(_nSize, CT_LARGE_ALLOC); +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, N_SIZES); +#endif + return ptr; + } + nSizeIdx = size2idxArr2[(_nSize - 1) >> 8]; + } else + nSizeIdx = size2idxArr1[1 + (((int)_nSize - 1) >> 3)]; + + IncrementCounter(CT_SMALL_ALLOC, nSizeIdxToSize[nSizeIdx]); + + // check per thread buffer + TThreadAllocInfo* thr = pThreadInfo; + if (!thr) { + AllocThreadInfo(); + thr = pThreadInfo; + if (!thr) { + void* ptr = LFAllocNoCache(nSizeIdx, MEM_DEFRAG); +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } + } + { + int& freePtrIdx = thr->FreePtrIndex[nSizeIdx]; + if (freePtrIdx < THREAD_BUF) { + void* ptr = thr->FreePtrs[nSizeIdx][freePtrIdx++]; +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } + + // try to alloc from global free list + char* buf[FL_GROUP_SIZE]; + int count = TakeBlocksFromGlobalFreeList(nSizeIdx, buf); + if (count == 0) { + count = LFAllocNoCacheMultiple(nSizeIdx, buf); + if (count == 0) { + NMalloc::AbortFromCorruptedAllocator(); // no way LFAllocNoCacheMultiple() can fail + } + } + char** dstBuf = thr->FreePtrs[nSizeIdx] + freePtrIdx - 1; + for (int i = 0; i < count - 1; ++i) + dstBuf[-i] = buf[i]; + freePtrIdx -= count - 1; + void* ptr = buf[count - 1]; +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } +} + +static Y_FORCE_INLINE void* LFAlloc(size_t _nSize) { + void* res = LFAllocImpl(_nSize); +#ifdef DBG_FILL_MEMORY + if (FillMemoryOnAllocation && res && (_nSize <= DBG_FILL_MAX_SIZE)) { + memset(res, 0xcf, _nSize); + } +#endif + return res; +} + +static Y_FORCE_INLINE void LFFree(void* p) { +#if defined(LFALLOC_DBG) + if (p == nullptr) + return; + p = GetAllocHeader(p); +#endif + + uintptr_t chkOffset = ((char*)p - ALLOC_START) - 1ll; + if (chkOffset >= N_MAX_WORKSET_SIZE) { + if (p == nullptr) + return; +#if defined(LFALLOC_DBG) + TrackDeallocation(p, N_SIZES); +#endif + LargeBlockFree(p, CT_LARGE_FREE); + return; + } + + uintptr_t chunk = ((char*)p - ALLOC_START) / N_CHUNK_SIZE; + ptrdiff_t nSizeIdx = chunkSizeIdx[chunk]; + if (nSizeIdx <= 0) { +#if defined(LFALLOC_DBG) + TrackDeallocation(p, N_SIZES); +#endif + LargeBlockFree(p, CT_LARGE_FREE); + return; + } + +#if defined(LFALLOC_DBG) + TrackDeallocation(p, nSizeIdx); +#endif + +#ifdef DBG_FILL_MEMORY + memset(p, 0xfe, nSizeIdxToSize[nSizeIdx]); +#endif + + IncrementCounter(CT_SMALL_FREE, nSizeIdxToSize[nSizeIdx]); + + // try to store info to per thread buf + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + int& freePtrIdx = thr->FreePtrIndex[nSizeIdx]; + if (freePtrIdx > borderSizes[nSizeIdx]) { + thr->FreePtrs[nSizeIdx][--freePtrIdx] = (char*)p; + return; + } + + // move several pointers to global free list + int freeCount = FL_GROUP_SIZE; + if (freeCount > THREAD_BUF - freePtrIdx) + freeCount = THREAD_BUF 
- freePtrIdx; + char** freePtrs = thr->FreePtrs[nSizeIdx]; + PutBlocksToGlobalFreeList(nSizeIdx, freePtrs + freePtrIdx, freeCount); + freePtrIdx += freeCount; + + freePtrs[--freePtrIdx] = (char*)p; + + } else { + AllocThreadInfo(); + PutBlocksToGlobalFreeList(nSizeIdx, (char**)&p, 1); + } +} + +static size_t LFGetSize(const void* p) { +#if defined(LFALLOC_DBG) + if (p == nullptr) + return 0; + return GetAllocHeader(const_cast(p))->Size; +#endif + + uintptr_t chkOffset = ((const char*)p - ALLOC_START); + if (chkOffset >= N_MAX_WORKSET_SIZE) { + if (p == nullptr) + return 0; + return TLargeBlk::As(p)->Pages * 4096ll; + } + uintptr_t chunk = ((const char*)p - ALLOC_START) / N_CHUNK_SIZE; + ptrdiff_t nSizeIdx = chunkSizeIdx[chunk]; + if (nSizeIdx <= 0) + return TLargeBlk::As(p)->Pages * 4096ll; + return nSizeIdxToSize[nSizeIdx]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Output mem alloc stats +const int N_PAGE_SIZE = 4096; +static void DebugTraceMMgr(const char* pszFormat, ...) // __cdecl +{ + static char buff[20000]; + va_list va; + // + va_start(va, pszFormat); + vsprintf(buff, pszFormat, va); + va_end(va); +// +#ifdef _win_ + OutputDebugStringA(buff); +#else + fprintf(stderr, buff); +#endif +} + +struct TChunkStats { + char *Start, *Finish; + i64 Size; + char* Entries; + i64 FreeCount; + + TChunkStats(size_t chunk, i64 size, char* entries) + : Size(size) + , Entries(entries) + , FreeCount(0) + { + Start = ALLOC_START + chunk * N_CHUNK_SIZE; + Finish = Start + N_CHUNK_SIZE; + } + void CheckBlock(char* pBlock) { + if (pBlock && pBlock >= Start && pBlock < Finish) { + ++FreeCount; + i64 nShift = pBlock - Start; + i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1); + Entries[nOffsetInStep / Size] = 1; + } + } + void SetGlobalFree(char* ptr) { + i64 nShift = ptr - Start; + i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1); + while (nOffsetInStep + Size <= N_CHUNK_SIZE) { + ++FreeCount; + Entries[nOffsetInStep / Size] = 1; + nOffsetInStep += Size; + } + } +}; + +static void DumpMemoryBlockUtilizationLocked() { + TFreeListGroup* wholeLists[N_SIZES]; + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) { + wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList(); + } + char* bfList = (char*)blockFreeList.GetWholeList(); + + DebugTraceMMgr("memory blocks utilisation stats:\n"); + i64 nTotalAllocated = 0, nTotalFree = 0, nTotalBadPages = 0, nTotalPages = 0, nTotalUsed = 0, nTotalLocked = 0; + i64 nTotalGroupBlocks = 0; + char* entries; + entries = (char*)SystemAlloc((N_CHUNK_SIZE / 4)); + for (size_t k = 0; k < N_CHUNKS; ++k) { + if (chunkSizeIdx[k] <= 0) { + if (chunkSizeIdx[k] == -1) + nTotalLocked += N_CHUNK_SIZE; + continue; + } + i64 nSizeIdx = chunkSizeIdx[k]; + i64 nSize = nSizeIdxToSize[nSizeIdx]; + TChunkStats cs(k, nSize, entries); + int nEntriesTotal = N_CHUNK_SIZE / nSize; + memset(entries, 0, nEntriesTotal); + for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) { + for (auto& ptr : g->Ptrs) + cs.CheckBlock(ptr); + } + TChunkStats csGB(k, nSize, entries); + if (nSizeIdx == FREE_LIST_GROUP_SIZEIDX) { + for (auto g : wholeLists) { + for (; g; g = g->Next) + csGB.CheckBlock((char*)g); + } + for (char* blk = bfList; blk; blk = *(char**)blk) + csGB.CheckBlock(blk); + nTotalGroupBlocks += csGB.FreeCount * nSize; + } + if (((globalCurrentPtr[nSizeIdx] - ALLOC_START) / N_CHUNK_SIZE) == k) + cs.SetGlobalFree(globalCurrentPtr[nSizeIdx]); + nTotalUsed += (nEntriesTotal - cs.FreeCount - 
csGB.FreeCount) * nSize; + + char pages[N_CHUNK_SIZE / N_PAGE_SIZE]; + memset(pages, 0, sizeof(pages)); + for (int i = 0, nShift = 0; i < nEntriesTotal; ++i, nShift += nSize) { + int nBit = 0; + if (entries[i]) + nBit = 1; // free entry + else + nBit = 2; // used entry + for (i64 nDelta = nSize - 1; nDelta >= 0; nDelta -= N_PAGE_SIZE) + pages[(nShift + nDelta) / N_PAGE_SIZE] |= nBit; + } + i64 nBadPages = 0; + for (auto page : pages) { + nBadPages += page == 3; + nTotalPages += page != 1; + } + DebugTraceMMgr("entry = %lld; size = %lld; free = %lld; system %lld; utilisation = %g%%, fragmentation = %g%%\n", + k, nSize, cs.FreeCount * nSize, csGB.FreeCount * nSize, + (N_CHUNK_SIZE - cs.FreeCount * nSize) * 100.0f / N_CHUNK_SIZE, 100.0f * nBadPages / Y_ARRAY_SIZE(pages)); + nTotalAllocated += N_CHUNK_SIZE; + nTotalFree += cs.FreeCount * nSize; + nTotalBadPages += nBadPages; + } + SystemFree(entries); + DebugTraceMMgr("Total allocated = %llu, free = %lld, system = %lld, locked for future use %lld, utilisation = %g, fragmentation = %g\n", + nTotalAllocated, nTotalFree, nTotalGroupBlocks, nTotalLocked, + 100.0f * (nTotalAllocated - nTotalFree) / nTotalAllocated, 100.0f * nTotalBadPages / nTotalPages); + DebugTraceMMgr("Total %lld bytes used, %lld bytes in used pages\n", nTotalUsed, nTotalPages * N_PAGE_SIZE); + + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) + globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]); + blockFreeList.ReturnWholeList(bfList); +} + +void FlushThreadFreeList() { + if (pThreadInfo) + MoveSingleThreadFreeToGlobal(pThreadInfo); +} + +void DumpMemoryBlockUtilization() { + // move current thread free to global lists to get better statistics + FlushThreadFreeList(); + { + CCriticalSectionLockMMgr ls; + DumpMemoryBlockUtilizationLocked(); + } +} + +////////////////////////////////////////////////////////////////////////// +// malloc api + +static bool LFAlloc_SetParam(const char* param, const char* value) { + if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE")) { + LB_LIMIT_TOTAL_SIZE = atoi(value); + return true; + } + if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE_BYTES")) { + LB_LIMIT_TOTAL_SIZE = (atoi(value) + N_PAGE_SIZE - 1) / N_PAGE_SIZE; + return true; + } +#ifdef DBG_FILL_MEMORY + if (!strcmp(param, "FillMemoryOnAllocation")) { + FillMemoryOnAllocation = !strcmp(value, "true"); + return true; + } +#endif + if (!strcmp(param, "BeforeLFAllocGlobalLockAcquired")) { + BeforeLFAllocGlobalLockAcquired = (decltype(BeforeLFAllocGlobalLockAcquired))(value); + return true; + } + if (!strcmp(param, "AfterLFAllocGlobalLockReleased")) { + AfterLFAllocGlobalLockReleased = (decltype(AfterLFAllocGlobalLockReleased))(value); + return true; + } + if (!strcmp(param, "EnterCritical")) { + assert(value); + RealEnterCritical = (decltype(RealEnterCritical))(value); + return true; + } + if (!strcmp(param, "LeaveCritical")) { + assert(value); + RealLeaveCritical = (decltype(RealLeaveCritical))(value); + return true; + } + if (!strcmp(param, "TransparentHugePages")) { + TransparentHugePages = !strcmp(value, "true"); + return true; + } + if (!strcmp(param, "MapHugeTLB")) { + MapHugeTLB = !strcmp(value, "true"); + return true; + } + if (!strcmp(param, "EnableDefrag")) { + EnableDefrag = !strcmp(value, "true"); + return true; + } + return false; +}; + +static const char* LFAlloc_GetParam(const char* param) { + struct TParam { + const char* Name; + const char* Value; + }; + + static const TParam Params[] = { + {"GetLFAllocCounterFast", (const char*)&GetLFAllocCounterFast}, + 
{"GetLFAllocCounterFull", (const char*)&GetLFAllocCounterFull}, +#if defined(LFALLOC_DBG) + {"SetThreadAllocTag", (const char*)&SetThreadAllocTag}, + {"SetProfileCurrentThread", (const char*)&SetProfileCurrentThread}, + {"SetProfileAllThreads", (const char*)&SetProfileAllThreads}, + {"SetAllocationSamplingEnabled", (const char*)&SetAllocationSamplingEnabled}, + {"SetAllocationSampleRate", (const char*)&SetAllocationSampleRate}, + {"SetAllocationSampleMaxSize", (const char*)&SetAllocationSampleMaxSize}, + {"SetAllocationCallback", (const char*)&SetAllocationCallback}, + {"SetDeallocationCallback", (const char*)&SetDeallocationCallback}, + {"GetPerTagAllocInfo", (const char*)&GetPerTagAllocInfo}, +#endif // LFALLOC_DBG + }; + + for (int i = 0; i < Y_ARRAY_SIZE(Params); ++i) { + if (strcmp(param, Params[i].Name) == 0) { + return Params[i].Value; + } + } + return nullptr; +} + +static Y_FORCE_INLINE void* LFVAlloc(size_t size) { + const size_t pg = N_PAGE_SIZE; + size_t bigsize = (size + pg - 1) & (~(pg - 1)); + void* p = LFAlloc(bigsize); + + Y_ASSERT_NOBT((intptr_t)p % N_PAGE_SIZE == 0); + return p; +} + +static Y_FORCE_INLINE int LFPosixMemalign(void** memptr, size_t alignment, size_t size) { + if (Y_UNLIKELY(alignment > 4096)) { +#ifdef _win_ + OutputDebugStringA("Larger alignment are not guaranteed with this implementation\n"); +#else + fprintf(stderr, "Larger alignment are not guaranteed with this implementation\n"); +#endif + NMalloc::AbortFromCorruptedAllocator(); + } + size_t bigsize = size; + if (bigsize <= alignment) { + bigsize = alignment; + } else if (bigsize < 2 * alignment) { + bigsize = 2 * alignment; + } + *memptr = LFAlloc(bigsize); + return 0; +} +#endif diff --git a/contrib/lfalloc/src/lfmalloc.h b/contrib/lfalloc/src/lfmalloc.h new file mode 100644 index 00000000000..1e6a0d55773 --- /dev/null +++ b/contrib/lfalloc/src/lfmalloc.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include +#include "util/system/compiler.h" + +namespace NMalloc { + volatile inline bool IsAllocatorCorrupted = false; + + static inline void AbortFromCorruptedAllocator() { + IsAllocatorCorrupted = true; + abort(); + } + + struct TAllocHeader { + void* Block; + size_t AllocSize; + void Y_FORCE_INLINE Encode(void* block, size_t size, size_t signature) { + Block = block; + AllocSize = size | signature; + } + }; +} diff --git a/contrib/lfalloc/src/util/README.md b/contrib/lfalloc/src/util/README.md new file mode 100644 index 00000000000..c367cb4b439 --- /dev/null +++ b/contrib/lfalloc/src/util/README.md @@ -0,0 +1,33 @@ +Style guide for the util folder is a stricter version of general style guide (mostly in terms of ambiguity resolution). 
+ + * all {} must be in K&R style + * `&` and `*` bind to the type, not to the variable + * always use `using`, not `typedef` + * even a single-line block must be wrapped in braces {}: + ``` + if (A) { + B(); + } + ``` + * a trailing _ on private data members of a class - `First_`, `Second_` + * every .h file must be accompanied by a corresponding .cpp to avoid leakage and to check that it is self-contained + * `printf`-like functions are prohibited + + +Points from the general style guide that are sometimes missed: + + * `template <`, not `template<` + * `noexcept`, not `throw ()` nor `throw()`; not required for destructors + * indentation inside `namespace` is the same as inside `class` + + +Requirements for new code in util (and for corrections to old code that change behaviour): + + * presence of unit tests + * presence of comments in Doxygen style + * accessors without the Get prefix (`Length()`, not `GetLength()`) + +This guide is not as mandatory as the general style guide. +Nevertheless, if it is not followed, the next `ya style .` run in the util folder will undeservedly change the recorded authors of some lines of code. + +Thus it is recommended to run `ya style .` in the util folder before committing. diff --git a/contrib/lfalloc/src/util/system/atomic.h b/contrib/lfalloc/src/util/system/atomic.h new file mode 100644 index 00000000000..9876515a54d --- /dev/null +++ b/contrib/lfalloc/src/util/system/atomic.h @@ -0,0 +1,51 @@ +#pragma once + +#include "defaults.h" + +using TAtomicBase = intptr_t; +using TAtomic = volatile TAtomicBase; + +#if defined(__GNUC__) +#include "atomic_gcc.h" +#elif defined(_MSC_VER) +#include "atomic_win.h" +#else +#error unsupported platform +#endif + +#if !defined(ATOMIC_COMPILER_BARRIER) +#define ATOMIC_COMPILER_BARRIER() +#endif + +static inline TAtomicBase AtomicSub(TAtomic& a, TAtomicBase v) { + return AtomicAdd(a, -v); +} + +static inline TAtomicBase AtomicGetAndSub(TAtomic& a, TAtomicBase v) { + return AtomicGetAndAdd(a, -v); +} + +#if defined(USE_GENERIC_SETGET) +static inline TAtomicBase AtomicGet(const TAtomic& a) { + return a; +} + +static inline void AtomicSet(TAtomic& a, TAtomicBase v) { + a = v; +} +#endif + +static inline bool AtomicTryLock(TAtomic* a) { + return AtomicCas(a, 1, 0); +} + +static inline bool AtomicTryAndTryLock(TAtomic* a) { + return (AtomicGet(*a) == 0) && AtomicTryLock(a); +} + +static inline void AtomicUnlock(TAtomic* a) { + ATOMIC_COMPILER_BARRIER(); + AtomicSet(*a, 0); +} + +#include "atomic_ops.h" diff --git a/contrib/lfalloc/src/util/system/atomic_gcc.h b/contrib/lfalloc/src/util/system/atomic_gcc.h new file mode 100644 index 00000000000..ed8dc2bdc53 --- /dev/null +++ b/contrib/lfalloc/src/util/system/atomic_gcc.h @@ -0,0 +1,90 @@ +#pragma once + +#define ATOMIC_COMPILER_BARRIER() __asm__ __volatile__("" \ + : \ + : \ + : "memory") + +static inline TAtomicBase AtomicGet(const TAtomic& a) { + TAtomicBase tmp; +#if defined(_arm64_) + __asm__ __volatile__( + "ldar %x[value], %[ptr] \n\t" + : [value] "=r"(tmp) + : [ptr] "Q"(a) + : "memory"); +#else + __atomic_load(&a, &tmp, __ATOMIC_ACQUIRE); +#endif + return tmp; +} + +static inline void AtomicSet(TAtomic& a, TAtomicBase v) { +#if defined(_arm64_) + __asm__ __volatile__( + "stlr %x[value], %[ptr] \n\t" + : [ptr] "=Q"(a) + : [value] "r"(v) + : "memory"); +#else + __atomic_store(&a, &v, __ATOMIC_RELEASE); +#endif +} + +static inline intptr_t AtomicIncrement(TAtomic& p) { + return __atomic_add_fetch(&p, 1, __ATOMIC_SEQ_CST); +} + +static inline intptr_t
AtomicGetAndIncrement(TAtomic& p) { + return __atomic_fetch_add(&p, 1, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicDecrement(TAtomic& p) { + return __atomic_sub_fetch(&p, 1, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicGetAndDecrement(TAtomic& p) { + return __atomic_fetch_sub(&p, 1, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicAdd(TAtomic& p, intptr_t v) { + return __atomic_add_fetch(&p, v, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicGetAndAdd(TAtomic& p, intptr_t v) { + return __atomic_fetch_add(&p, v, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicSwap(TAtomic* p, intptr_t v) { + (void)p; // disable strange 'parameter set but not used' warning on gcc + intptr_t ret; + __atomic_exchange(p, &v, &ret, __ATOMIC_SEQ_CST); + return ret; +} + +static inline bool AtomicCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + (void)a; // disable strange 'parameter set but not used' warning on gcc + return __atomic_compare_exchange(a, &compare, &exchange, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicGetAndCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + (void)a; // disable strange 'parameter set but not used' warning on gcc + __atomic_compare_exchange(a, &compare, &exchange, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + return compare; +} + +static inline intptr_t AtomicOr(TAtomic& a, intptr_t b) { + return __atomic_or_fetch(&a, b, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicXor(TAtomic& a, intptr_t b) { + return __atomic_xor_fetch(&a, b, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicAnd(TAtomic& a, intptr_t b) { + return __atomic_and_fetch(&a, b, __ATOMIC_SEQ_CST); +} + +static inline void AtomicBarrier() { + __sync_synchronize(); +} diff --git a/contrib/lfalloc/src/util/system/atomic_ops.h b/contrib/lfalloc/src/util/system/atomic_ops.h new file mode 100644 index 00000000000..425b643e14d --- /dev/null +++ b/contrib/lfalloc/src/util/system/atomic_ops.h @@ -0,0 +1,189 @@ +#pragma once + +#include + +template +inline TAtomic* AsAtomicPtr(T volatile* target) { + return reinterpret_cast(target); +} + +template +inline const TAtomic* AsAtomicPtr(T const volatile* target) { + return reinterpret_cast(target); +} + +// integral types + +template +struct TAtomicTraits { + enum { + Castable = std::is_integral::value && sizeof(T) == sizeof(TAtomicBase) && !std::is_const::value, + }; +}; + +template +using TEnableIfCastable = std::enable_if_t::Castable, TT>; + +template +inline TEnableIfCastable AtomicGet(T const volatile& target) { + return static_cast(AtomicGet(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicSet(T volatile& target, TAtomicBase value) { + AtomicSet(*AsAtomicPtr(&target), value); +} + +template +inline TEnableIfCastable AtomicIncrement(T volatile& target) { + return static_cast(AtomicIncrement(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicGetAndIncrement(T volatile& target) { + return static_cast(AtomicGetAndIncrement(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicDecrement(T volatile& target) { + return static_cast(AtomicDecrement(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicGetAndDecrement(T volatile& target) { + return static_cast(AtomicGetAndDecrement(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicAdd(T volatile& target, TAtomicBase value) { + return static_cast(AtomicAdd(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicGetAndAdd(T 
volatile& target, TAtomicBase value) { + return static_cast(AtomicGetAndAdd(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicSub(T volatile& target, TAtomicBase value) { + return static_cast(AtomicSub(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicGetAndSub(T volatile& target, TAtomicBase value) { + return static_cast(AtomicGetAndSub(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicSwap(T volatile* target, TAtomicBase exchange) { + return static_cast(AtomicSwap(AsAtomicPtr(target), exchange)); +} + +template +inline TEnableIfCastable AtomicCas(T volatile* target, TAtomicBase exchange, TAtomicBase compare) { + return AtomicCas(AsAtomicPtr(target), exchange, compare); +} + +template +inline TEnableIfCastable AtomicGetAndCas(T volatile* target, TAtomicBase exchange, TAtomicBase compare) { + return static_cast(AtomicGetAndCas(AsAtomicPtr(target), exchange, compare)); +} + +template +inline TEnableIfCastable AtomicTryLock(T volatile* target) { + return AtomicTryLock(AsAtomicPtr(target)); +} + +template +inline TEnableIfCastable AtomicTryAndTryLock(T volatile* target) { + return AtomicTryAndTryLock(AsAtomicPtr(target)); +} + +template +inline TEnableIfCastable AtomicUnlock(T volatile* target) { + AtomicUnlock(AsAtomicPtr(target)); +} + +template +inline TEnableIfCastable AtomicOr(T volatile& target, TAtomicBase value) { + return static_cast(AtomicOr(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicAnd(T volatile& target, TAtomicBase value) { + return static_cast(AtomicAnd(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicXor(T volatile& target, TAtomicBase value) { + return static_cast(AtomicXor(*AsAtomicPtr(&target), value)); +} + +// pointer types + +template +inline T* AtomicGet(T* const volatile& target) { + return reinterpret_cast(AtomicGet(*AsAtomicPtr(&target))); +} + +template +inline void AtomicSet(T* volatile& target, T* value) { + AtomicSet(*AsAtomicPtr(&target), reinterpret_cast(value)); +} + +using TNullPtr = decltype(nullptr); + +template +inline void AtomicSet(T* volatile& target, TNullPtr) { + AtomicSet(*AsAtomicPtr(&target), 0); +} + +template +inline T* AtomicSwap(T* volatile* target, T* exchange) { + return reinterpret_cast(AtomicSwap(AsAtomicPtr(target), reinterpret_cast(exchange))); +} + +template +inline T* AtomicSwap(T* volatile* target, TNullPtr) { + return reinterpret_cast(AtomicSwap(AsAtomicPtr(target), 0)); +} + +template +inline bool AtomicCas(T* volatile* target, T* exchange, T* compare) { + return AtomicCas(AsAtomicPtr(target), reinterpret_cast(exchange), reinterpret_cast(compare)); +} + +template +inline T* AtomicGetAndCas(T* volatile* target, T* exchange, T* compare) { + return reinterpret_cast(AtomicGetAndCas(AsAtomicPtr(target), reinterpret_cast(exchange), reinterpret_cast(compare))); +} + +template +inline bool AtomicCas(T* volatile* target, T* exchange, TNullPtr) { + return AtomicCas(AsAtomicPtr(target), reinterpret_cast(exchange), 0); +} + +template +inline T* AtomicGetAndCas(T* volatile* target, T* exchange, TNullPtr) { + return reinterpret_cast(AtomicGetAndCas(AsAtomicPtr(target), reinterpret_cast(exchange), 0)); +} + +template +inline bool AtomicCas(T* volatile* target, TNullPtr, T* compare) { + return AtomicCas(AsAtomicPtr(target), 0, reinterpret_cast(compare)); +} + +template +inline T* AtomicGetAndCas(T* volatile* target, TNullPtr, T* compare) { + return 
reinterpret_cast(AtomicGetAndCas(AsAtomicPtr(target), 0, reinterpret_cast(compare))); +} + +template +inline bool AtomicCas(T* volatile* target, TNullPtr, TNullPtr) { + return AtomicCas(AsAtomicPtr(target), 0, 0); +} + +template +inline T* AtomicGetAndCas(T* volatile* target, TNullPtr, TNullPtr) { + return reinterpret_cast(AtomicGetAndCas(AsAtomicPtr(target), 0, 0)); +} diff --git a/contrib/lfalloc/src/util/system/atomic_win.h b/contrib/lfalloc/src/util/system/atomic_win.h new file mode 100644 index 00000000000..1abebd87b38 --- /dev/null +++ b/contrib/lfalloc/src/util/system/atomic_win.h @@ -0,0 +1,114 @@ +#pragma once + +#include + +#define USE_GENERIC_SETGET + +#if defined(_i386_) + +#pragma intrinsic(_InterlockedIncrement) +#pragma intrinsic(_InterlockedDecrement) +#pragma intrinsic(_InterlockedExchangeAdd) +#pragma intrinsic(_InterlockedExchange) +#pragma intrinsic(_InterlockedCompareExchange) + +static inline intptr_t AtomicIncrement(TAtomic& a) { + return _InterlockedIncrement((volatile long*)&a); +} + +static inline intptr_t AtomicGetAndIncrement(TAtomic& a) { + return _InterlockedIncrement((volatile long*)&a) - 1; +} + +static inline intptr_t AtomicDecrement(TAtomic& a) { + return _InterlockedDecrement((volatile long*)&a); +} + +static inline intptr_t AtomicGetAndDecrement(TAtomic& a) { + return _InterlockedDecrement((volatile long*)&a) + 1; +} + +static inline intptr_t AtomicAdd(TAtomic& a, intptr_t b) { + return _InterlockedExchangeAdd((volatile long*)&a, b) + b; +} + +static inline intptr_t AtomicGetAndAdd(TAtomic& a, intptr_t b) { + return _InterlockedExchangeAdd((volatile long*)&a, b); +} + +static inline intptr_t AtomicSwap(TAtomic* a, intptr_t b) { + return _InterlockedExchange((volatile long*)a, b); +} + +static inline bool AtomicCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + return _InterlockedCompareExchange((volatile long*)a, exchange, compare) == compare; +} + +static inline intptr_t AtomicGetAndCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + return _InterlockedCompareExchange((volatile long*)a, exchange, compare); +} + +#else // _x86_64_ + +#pragma intrinsic(_InterlockedIncrement64) +#pragma intrinsic(_InterlockedDecrement64) +#pragma intrinsic(_InterlockedExchangeAdd64) +#pragma intrinsic(_InterlockedExchange64) +#pragma intrinsic(_InterlockedCompareExchange64) + +static inline intptr_t AtomicIncrement(TAtomic& a) { + return _InterlockedIncrement64((volatile __int64*)&a); +} + +static inline intptr_t AtomicGetAndIncrement(TAtomic& a) { + return _InterlockedIncrement64((volatile __int64*)&a) - 1; +} + +static inline intptr_t AtomicDecrement(TAtomic& a) { + return _InterlockedDecrement64((volatile __int64*)&a); +} + +static inline intptr_t AtomicGetAndDecrement(TAtomic& a) { + return _InterlockedDecrement64((volatile __int64*)&a) + 1; +} + +static inline intptr_t AtomicAdd(TAtomic& a, intptr_t b) { + return _InterlockedExchangeAdd64((volatile __int64*)&a, b) + b; +} + +static inline intptr_t AtomicGetAndAdd(TAtomic& a, intptr_t b) { + return _InterlockedExchangeAdd64((volatile __int64*)&a, b); +} + +static inline intptr_t AtomicSwap(TAtomic* a, intptr_t b) { + return _InterlockedExchange64((volatile __int64*)a, b); +} + +static inline bool AtomicCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + return _InterlockedCompareExchange64((volatile __int64*)a, exchange, compare) == compare; +} + +static inline intptr_t AtomicGetAndCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + return _InterlockedCompareExchange64((volatile __int64*)a, 
exchange, compare); +} + +static inline intptr_t AtomicOr(TAtomic& a, intptr_t b) { + return _InterlockedOr64(&a, b) | b; +} + +static inline intptr_t AtomicAnd(TAtomic& a, intptr_t b) { + return _InterlockedAnd64(&a, b) & b; +} + +static inline intptr_t AtomicXor(TAtomic& a, intptr_t b) { + return _InterlockedXor64(&a, b) ^ b; +} + +#endif // _x86_ + +//TODO +static inline void AtomicBarrier() { + TAtomic val = 0; + + AtomicSwap(&val, 0); +} diff --git a/contrib/lfalloc/src/util/system/compiler.h b/contrib/lfalloc/src/util/system/compiler.h new file mode 100644 index 00000000000..b5cec600923 --- /dev/null +++ b/contrib/lfalloc/src/util/system/compiler.h @@ -0,0 +1,617 @@ +#pragma once + +// useful cross-platfrom definitions for compilers + +/** + * @def Y_FUNC_SIGNATURE + * + * Use this macro to get pretty function name (see example). + * + * @code + * void Hi() { + * Cout << Y_FUNC_SIGNATURE << Endl; + * } + + * template + * void Do() { + * Cout << Y_FUNC_SIGNATURE << Endl; + * } + + * int main() { + * Hi(); // void Hi() + * Do(); // void Do() [T = int] + * Do(); // void Do() [T = TString] + * } + * @endcode + */ +#if defined(__GNUC__) +#define Y_FUNC_SIGNATURE __PRETTY_FUNCTION__ +#elif defined(_MSC_VER) +#define Y_FUNC_SIGNATURE __FUNCSIG__ +#else +#define Y_FUNC_SIGNATURE "" +#endif + +#ifdef __GNUC__ +#define Y_PRINTF_FORMAT(n, m) __attribute__((__format__(__printf__, n, m))) +#endif + +#ifndef Y_PRINTF_FORMAT +#define Y_PRINTF_FORMAT(n, m) +#endif + +#if defined(__clang__) +#define Y_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__))) +#endif + +#if !defined(Y_NO_SANITIZE) +#define Y_NO_SANITIZE(...) +#endif + +/** + * @def Y_DECLARE_UNUSED + * + * Macro is needed to silence compiler warning about unused entities (e.g. function or argument). + * + * @code + * Y_DECLARE_UNUSED int FunctionUsedSolelyForDebugPurposes(); + * assert(FunctionUsedSolelyForDebugPurposes() == 42); + * + * void Foo(const int argumentUsedOnlyForDebugPurposes Y_DECLARE_UNUSED) { + * assert(argumentUsedOnlyForDebugPurposes == 42); + * // however you may as well omit `Y_DECLARE_UNUSED` and use `UNUSED` macro instead + * Y_UNUSED(argumentUsedOnlyForDebugPurposes); + * } + * @endcode + */ +#ifdef __GNUC__ +#define Y_DECLARE_UNUSED __attribute__((unused)) +#endif + +#ifndef Y_DECLARE_UNUSED +#define Y_DECLARE_UNUSED +#endif + +#if defined(__GNUC__) +#define Y_LIKELY(Cond) __builtin_expect(!!(Cond), 1) +#define Y_UNLIKELY(Cond) __builtin_expect(!!(Cond), 0) +#define Y_PREFETCH_READ(Pointer, Priority) __builtin_prefetch((const void*)(Pointer), 0, Priority) +#define Y_PREFETCH_WRITE(Pointer, Priority) __builtin_prefetch((const void*)(Pointer), 1, Priority) +#endif + +/** + * @def Y_FORCE_INLINE + * + * Macro to use in place of 'inline' in function declaration/definition to force + * it to be inlined. + */ +#if !defined(Y_FORCE_INLINE) +#if defined(CLANG_COVERAGE) +#/* excessive __always_inline__ might significantly slow down compilation of an instrumented unit */ +#define Y_FORCE_INLINE inline +#elif defined(_MSC_VER) +#define Y_FORCE_INLINE __forceinline +#elif defined(__GNUC__) +#/* Clang also defines __GNUC__ (as 4) */ +#define Y_FORCE_INLINE inline __attribute__((__always_inline__)) +#else +#define Y_FORCE_INLINE inline +#endif +#endif + +/** + * @def Y_NO_INLINE + * + * Macro to use in place of 'inline' in function declaration/definition to + * prevent it from being inlined. 
+ */ +#if !defined(Y_NO_INLINE) +#if defined(_MSC_VER) +#define Y_NO_INLINE __declspec(noinline) +#elif defined(__GNUC__) || defined(__INTEL_COMPILER) +#/* Clang also defines __GNUC__ (as 4) */ +#define Y_NO_INLINE __attribute__((__noinline__)) +#else +#define Y_NO_INLINE +#endif +#endif + +//to cheat compiler about strict aliasing or similar problems +#if defined(__GNUC__) +#define Y_FAKE_READ(X) \ + do { \ + __asm__ __volatile__("" \ + : \ + : "m"(X)); \ + } while (0) + +#define Y_FAKE_WRITE(X) \ + do { \ + __asm__ __volatile__("" \ + : "=m"(X)); \ + } while (0) +#endif + +#if !defined(Y_FAKE_READ) +#define Y_FAKE_READ(X) +#endif + +#if !defined(Y_FAKE_WRITE) +#define Y_FAKE_WRITE(X) +#endif + +#ifndef Y_PREFETCH_READ +#define Y_PREFETCH_READ(Pointer, Priority) (void)(const void*)(Pointer), (void)Priority +#endif + +#ifndef Y_PREFETCH_WRITE +#define Y_PREFETCH_WRITE(Pointer, Priority) (void)(const void*)(Pointer), (void)Priority +#endif + +#ifndef Y_LIKELY +#define Y_LIKELY(Cond) (Cond) +#define Y_UNLIKELY(Cond) (Cond) +#endif + +#ifdef __GNUC__ +#define _packed __attribute__((packed)) +#else +#define _packed +#endif + +#if defined(__GNUC__) +#define Y_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#endif + +#ifndef Y_WARN_UNUSED_RESULT +#define Y_WARN_UNUSED_RESULT +#endif + +#if defined(__GNUC__) +#define Y_HIDDEN __attribute__((visibility("hidden"))) +#endif + +#if !defined(Y_HIDDEN) +#define Y_HIDDEN +#endif + +#if defined(__GNUC__) +#define Y_PUBLIC __attribute__((visibility("default"))) +#endif + +#if !defined(Y_PUBLIC) +#define Y_PUBLIC +#endif + +#if !defined(Y_UNUSED) && !defined(__cplusplus) +#define Y_UNUSED(var) (void)(var) +#endif +#if !defined(Y_UNUSED) && defined(__cplusplus) +template +constexpr Y_FORCE_INLINE int Y_UNUSED(Types&&...) { + return 0; +}; +#endif + +/** + * @def Y_ASSUME + * + * Macro that tells the compiler that it can generate optimized code + * as if the given expression will always evaluate true. + * The behavior is undefined if it ever evaluates false. + * + * @code + * // factored into a function so that it's testable + * inline int Avg(int x, int y) { + * if (x >= 0 && y >= 0) { + * return (static_cast(x) + static_cast(y)) >> 1; + * } else { + * // a slower implementation + * } + * } + * + * // we know that xs and ys are non-negative from domain knowledge, + * // but we can't change the types of xs and ys because of API constrains + * int Foo(const TVector& xs, const TVector& ys) { + * TVector avgs; + * avgs.resize(xs.size()); + * for (size_t i = 0; i < xs.size(); ++i) { + * auto x = xs[i]; + * auto y = ys[i]; + * Y_ASSUME(x >= 0); + * Y_ASSUME(y >= 0); + * xs[i] = Avg(x, y); + * } + * } + * @endcode + */ +#if defined(__GNUC__) +#define Y_ASSUME(condition) ((condition) ? (void)0 : __builtin_unreachable()) +#elif defined(_MSC_VER) +#define Y_ASSUME(condition) __assume(condition) +#else +#define Y_ASSUME(condition) Y_UNUSED(condition) +#endif + +#ifdef __cplusplus +[[noreturn]] +#endif +Y_HIDDEN void _YandexAbort(); + +/** + * @def Y_UNREACHABLE + * + * Macro that marks the rest of the code branch unreachable. + * The behavior is undefined if it's ever reached. 
+ * + * @code + * switch (i % 3) { + * case 0: + * return foo; + * case 1: + * return bar; + * case 2: + * return baz; + * default: + * Y_UNREACHABLE(); + * } + * @endcode + */ +#if defined(__GNUC__) || defined(_MSC_VER) +#define Y_UNREACHABLE() Y_ASSUME(0) +#else +#define Y_UNREACHABLE() _YandexAbort() +#endif + +#if defined(undefined_sanitizer_enabled) +#define _ubsan_enabled_ +#endif + +#ifdef __clang__ + +#if __has_feature(thread_sanitizer) +#define _tsan_enabled_ +#endif +#if __has_feature(memory_sanitizer) +#define _msan_enabled_ +#endif +#if __has_feature(address_sanitizer) +#define _asan_enabled_ +#endif + +#else + +#if defined(thread_sanitizer_enabled) || defined(__SANITIZE_THREAD__) +#define _tsan_enabled_ +#endif +#if defined(memory_sanitizer_enabled) +#define _msan_enabled_ +#endif +#if defined(address_sanitizer_enabled) || defined(__SANITIZE_ADDRESS__) +#define _asan_enabled_ +#endif + +#endif + +#if defined(_asan_enabled_) || defined(_msan_enabled_) || defined(_tsan_enabled_) || defined(_ubsan_enabled_) +#define _san_enabled_ +#endif + +#if defined(_MSC_VER) +#define __PRETTY_FUNCTION__ __FUNCSIG__ +#endif + +#if defined(__GNUC__) +#define Y_WEAK __attribute__((weak)) +#else +#define Y_WEAK +#endif + +#if defined(__CUDACC_VER_MAJOR__) +#define Y_CUDA_AT_LEAST(x, y) (__CUDACC_VER_MAJOR__ > x || (__CUDACC_VER_MAJOR__ == x && __CUDACC_VER_MINOR__ >= y)) +#else +#define Y_CUDA_AT_LEAST(x, y) 0 +#endif + +// NVidia CUDA C++ Compiler did not know about noexcept keyword until version 9.0 +#if !Y_CUDA_AT_LEAST(9, 0) +#if defined(__CUDACC__) && !defined(noexcept) +#define noexcept throw () +#endif +#endif + +#if defined(__GNUC__) +#define Y_COLD __attribute__((cold)) +#define Y_LEAF __attribute__((leaf)) +#define Y_WRAPPER __attribute__((artificial)) +#else +#define Y_COLD +#define Y_LEAF +#define Y_WRAPPER +#endif + +/** + * @def Y_PRAGMA + * + * Macro for use in other macros to define compiler pragma + * See below for other usage examples + * + * @code + * #if defined(__clang__) || defined(__GNUC__) + * #define Y_PRAGMA_NO_WSHADOW \ + * Y_PRAGMA("GCC diagnostic ignored \"-Wshadow\"") + * #elif defined(_MSC_VER) + * #define Y_PRAGMA_NO_WSHADOW \ + * Y_PRAGMA("warning(disable:4456 4457") + * #else + * #define Y_PRAGMA_NO_WSHADOW + * #endif + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA(x) _Pragma(x) +#elif defined(_MSC_VER) +#define Y_PRAGMA(x) __pragma(x) +#else +#define Y_PRAGMA(x) +#endif + +/** + * @def Y_PRAGMA_DIAGNOSTIC_PUSH + * + * Cross-compiler pragma to save diagnostic settings + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Diagnostic-Pragmas.html + * MSVC: https://msdn.microsoft.com/en-us/library/2c8f766e.aspx + * Clang: https://clang.llvm.org/docs/UsersManual.html#controlling-diagnostics-via-pragmas + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_DIAGNOSTIC_PUSH \ + Y_PRAGMA("GCC diagnostic push") +#elif defined(_MSC_VER) +#define Y_PRAGMA_DIAGNOSTIC_PUSH \ + Y_PRAGMA(warning(push)) +#else +#define Y_PRAGMA_DIAGNOSTIC_PUSH +#endif + +/** + * @def Y_PRAGMA_DIAGNOSTIC_POP + * + * Cross-compiler pragma to restore diagnostic settings + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Diagnostic-Pragmas.html + * MSVC: https://msdn.microsoft.com/en-us/library/2c8f766e.aspx + * Clang: https://clang.llvm.org/docs/UsersManual.html#controlling-diagnostics-via-pragmas + * + * @code + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || 
defined(__GNUC__) +#define Y_PRAGMA_DIAGNOSTIC_POP \ + Y_PRAGMA("GCC diagnostic pop") +#elif defined(_MSC_VER) +#define Y_PRAGMA_DIAGNOSTIC_POP \ + Y_PRAGMA(warning(pop)) +#else +#define Y_PRAGMA_DIAGNOSTIC_POP +#endif + +/** + * @def Y_PRAGMA_NO_WSHADOW + * + * Cross-compiler pragma to disable warnings about shadowing variables + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * Y_PRAGMA_NO_WSHADOW + * + * // some code which use variable shadowing, e.g.: + * + * for (int i = 0; i < 100; ++i) { + * Use(i); + * + * for (int i = 42; i < 100500; ++i) { // this i is shadowing previous i + * AnotherUse(i); + * } + * } + * + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_NO_WSHADOW \ + Y_PRAGMA("GCC diagnostic ignored \"-Wshadow\"") +#elif defined(_MSC_VER) +#define Y_PRAGMA_NO_WSHADOW \ + Y_PRAGMA(warning(disable : 4456 4457)) +#else +#define Y_PRAGMA_NO_WSHADOW +#endif + +/** + * @ def Y_PRAGMA_NO_UNUSED_FUNCTION + * + * Cross-compiler pragma to disable warnings about unused functions + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html + * Clang: https://clang.llvm.org/docs/DiagnosticsReference.html#wunused-function + * MSVC: there is no such warning + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * Y_PRAGMA_NO_UNUSED_FUNCTION + * + * // some code which introduces a function which later will not be used, e.g.: + * + * void Foo() { + * } + * + * int main() { + * return 0; // Foo() never called + * } + * + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_NO_UNUSED_FUNCTION \ + Y_PRAGMA("GCC diagnostic ignored \"-Wunused-function\"") +#else +#define Y_PRAGMA_NO_UNUSED_FUNCTION +#endif + +/** + * @ def Y_PRAGMA_NO_UNUSED_PARAMETER + * + * Cross-compiler pragma to disable warnings about unused function parameters + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html + * Clang: https://clang.llvm.org/docs/DiagnosticsReference.html#wunused-parameter + * MSVC: https://msdn.microsoft.com/en-us/library/26kb9fy0.aspx + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * Y_PRAGMA_NO_UNUSED_PARAMETER + * + * // some code which introduces a function with unused parameter, e.g.: + * + * void foo(int a) { + * // a is not referenced + * } + * + * int main() { + * foo(1); + * return 0; + * } + * + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_NO_UNUSED_PARAMETER \ + Y_PRAGMA("GCC diagnostic ignored \"-Wunused-parameter\"") +#elif defined(_MSC_VER) +#define Y_PRAGMA_NO_UNUSED_PARAMETER \ + Y_PRAGMA(warning(disable : 4100)) +#else +#define Y_PRAGMA_NO_UNUSED_PARAMETER +#endif + +/** + * @def Y_PRAGMA_NO_DEPRECATED + * + * Cross compiler pragma to disable warnings and errors about deprecated + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html + * Clang: https://clang.llvm.org/docs/DiagnosticsReference.html#wdeprecated + * MSVC: https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4996?view=vs-2017 + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * Y_PRAGMA_NO_DEPRECATED + * + * [deprecated] void foo() { + * // ... 
+ * } + * + * int main() { + * foo(); + * return 0; + * } + * + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_NO_DEPRECATED \ + Y_PRAGMA("GCC diagnostic ignored \"-Wdeprecated\"") +#elif defined(_MSC_VER) +#define Y_PRAGMA_NO_DEPRECATED \ + Y_PRAGMA(warning(disable : 4996)) +#else +#define Y_PRAGMA_NO_DEPRECATED +#endif + +#if defined(__clang__) || defined(__GNUC__) +/** + * @def Y_CONST_FUNCTION + methods and functions, marked with this method are promised to: + 1. do not have side effects + 2. this method do not read global memory + NOTE: this attribute can't be set for methods that depend on data, pointed by this + this allow compilers to do hard optimization of that functions + NOTE: in common case this attribute can't be set if method have pointer-arguments + NOTE: as result there no any reason to discard result of such method +*/ +#define Y_CONST_FUNCTION [[gnu::const]] +#endif + +#if !defined(Y_CONST_FUNCTION) +#define Y_CONST_FUNCTION +#endif + +#if defined(__clang__) || defined(__GNUC__) +/** + * @def Y_PURE_FUNCTION + methods and functions, marked with this method are promised to: + 1. do not have side effects + 2. result will be the same if no global memory changed + this allow compilers to do hard optimization of that functions + NOTE: as result there no any reason to discard result of such method +*/ +#define Y_PURE_FUNCTION [[gnu::pure]] +#endif + +#if !defined(Y_PURE_FUNCTION) +#define Y_PURE_FUNCTION +#endif + +/** + * @ def Y_HAVE_INT128 + * + * Defined when the compiler supports __int128 extension + * + * @code + * + * #if defined(Y_HAVE_INT128) + * __int128 myVeryBigInt = 12345678901234567890; + * #endif + * + * @endcode + */ +#if defined(__SIZEOF_INT128__) +#define Y_HAVE_INT128 1 +#endif + +/** + * XRAY macro must be passed to compiler if XRay is enabled. + * + * Define everything XRay-specific as a macro so that it doesn't cause errors + * for compilers that doesn't support XRay. 
+ */ +#if defined(XRAY) && defined(__cplusplus) +#include +#define Y_XRAY_ALWAYS_INSTRUMENT [[clang::xray_always_instrument]] +#define Y_XRAY_NEVER_INSTRUMENT [[clang::xray_never_instrument]] +#define Y_XRAY_CUSTOM_EVENT(__string, __length) \ + do { \ + __xray_customevent(__string, __length); \ + } while (0) +#else +#define Y_XRAY_ALWAYS_INSTRUMENT +#define Y_XRAY_NEVER_INSTRUMENT +#define Y_XRAY_CUSTOM_EVENT(__string, __length) \ + do { \ + } while (0) +#endif diff --git a/contrib/lfalloc/src/util/system/defaults.h b/contrib/lfalloc/src/util/system/defaults.h new file mode 100644 index 00000000000..19196a28b2b --- /dev/null +++ b/contrib/lfalloc/src/util/system/defaults.h @@ -0,0 +1,168 @@ +#pragma once + +#include "platform.h" + +#if defined _unix_ +#define LOCSLASH_C '/' +#define LOCSLASH_S "/" +#else +#define LOCSLASH_C '\\' +#define LOCSLASH_S "\\" +#endif // _unix_ + +#if defined(__INTEL_COMPILER) && defined(__cplusplus) +#include +#endif + +// low and high parts of integers +#if !defined(_win_) +#include +#endif + +#if defined(BSD) || defined(_android_) + +#if defined(BSD) +#include +#endif + +#if defined(_android_) +#include +#endif + +#if (BYTE_ORDER == LITTLE_ENDIAN) +#define _little_endian_ +#elif (BYTE_ORDER == BIG_ENDIAN) +#define _big_endian_ +#else +#error unknown endian not supported +#endif + +#elif (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(WHATEVER_THAT_HAS_BIG_ENDIAN) +#define _big_endian_ +#else +#define _little_endian_ +#endif + +// alignment +#if (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(__alpha__) || defined(__ia64__) || defined(WHATEVER_THAT_NEEDS_ALIGNING_QUADS) +#define _must_align8_ +#endif + +#if (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(__alpha__) || defined(__ia64__) || defined(WHATEVER_THAT_NEEDS_ALIGNING_LONGS) +#define _must_align4_ +#endif + +#if (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(__alpha__) || defined(__ia64__) || defined(WHATEVER_THAT_NEEDS_ALIGNING_SHORTS) +#define _must_align2_ +#endif + +#if defined(__GNUC__) +#define alias_hack __attribute__((__may_alias__)) +#endif + +#ifndef alias_hack +#define alias_hack +#endif + +#include "types.h" + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) +#define PRAGMA(x) _Pragma(#x) +#define RCSID(idstr) PRAGMA(comment(exestr, idstr)) +#else +#define RCSID(idstr) static const char rcsid[] = idstr +#endif + +#include "compiler.h" + +#ifdef _win_ +#include +#elif defined(_sun_) +#include +#endif + +#ifdef NDEBUG +#define Y_IF_DEBUG(X) +#else +#define Y_IF_DEBUG(X) X +#endif + +/** + * @def Y_ARRAY_SIZE + * + * This macro is needed to get number of elements in a statically allocated fixed size array. The + * expression is a compile-time constant and therefore can be used in compile time computations. + * + * @code + * enum ENumbers { + * EN_ONE, + * EN_TWO, + * EN_SIZE + * } + * + * const char* NAMES[] = { + * "one", + * "two" + * } + * + * static_assert(Y_ARRAY_SIZE(NAMES) == EN_SIZE, "you should define `NAME` for each enumeration"); + * @endcode + * + * This macro also catches type errors. If you see a compiler error like "warning: division by zero + * is undefined" when using `Y_ARRAY_SIZE` then you are probably giving it a pointer. + * + * Since all of our code is expected to work on a 64 bit platform where pointers are 8 bytes we may + * falsefully accept pointers to types of sizes that are divisors of 8 (1, 2, 4 and 8). 
+ */ +#if defined(__cplusplus) +namespace NArraySizePrivate { + template + struct TArraySize; + + template + struct TArraySize { + enum { + Result = N + }; + }; + + template + struct TArraySize { + enum { + Result = N + }; + }; +} + +#define Y_ARRAY_SIZE(arr) ((size_t)::NArraySizePrivate::TArraySize::Result) +#else +#undef Y_ARRAY_SIZE +#define Y_ARRAY_SIZE(arr) \ + ((sizeof(arr) / sizeof((arr)[0])) / static_cast(!(sizeof(arr) % sizeof((arr)[0])))) +#endif + +#undef Y_ARRAY_BEGIN +#define Y_ARRAY_BEGIN(arr) (arr) + +#undef Y_ARRAY_END +#define Y_ARRAY_END(arr) ((arr) + Y_ARRAY_SIZE(arr)) + +/** + * Concatenates two symbols, even if one of them is itself a macro. + */ +#define Y_CAT(X, Y) Y_CAT_I(X, Y) +#define Y_CAT_I(X, Y) Y_CAT_II(X, Y) +#define Y_CAT_II(X, Y) X##Y + +#define Y_STRINGIZE(X) UTIL_PRIVATE_STRINGIZE_AUX(X) +#define UTIL_PRIVATE_STRINGIZE_AUX(X) #X + +#if defined(__COUNTER__) +#define Y_GENERATE_UNIQUE_ID(N) Y_CAT(N, __COUNTER__) +#endif + +#if !defined(Y_GENERATE_UNIQUE_ID) +#define Y_GENERATE_UNIQUE_ID(N) Y_CAT(N, __LINE__) +#endif + +#define NPOS ((size_t)-1) diff --git a/contrib/lfalloc/src/util/system/platform.h b/contrib/lfalloc/src/util/system/platform.h new file mode 100644 index 00000000000..0687f239a2e --- /dev/null +++ b/contrib/lfalloc/src/util/system/platform.h @@ -0,0 +1,242 @@ +#pragma once + +// What OS ? +// our definition has the form _{osname}_ + +#if defined(_WIN64) +#define _win64_ +#define _win32_ +#elif defined(__WIN32__) || defined(_WIN32) // _WIN32 is also defined by the 64-bit compiler for backward compatibility +#define _win32_ +#else +#define _unix_ +#if defined(__sun__) || defined(sun) || defined(sparc) || defined(__sparc) +#define _sun_ +#endif +#if defined(__hpux__) +#define _hpux_ +#endif +#if defined(__linux__) +#define _linux_ +#endif +#if defined(__FreeBSD__) +#define _freebsd_ +#endif +#if defined(__CYGWIN__) +#define _cygwin_ +#endif +#if defined(__APPLE__) +#define _darwin_ +#endif +#if defined(__ANDROID__) +#define _android_ +#endif +#endif + +#if defined(__IOS__) +#define _ios_ +#endif + +#if defined(_linux_) +#if defined(_musl_) +//nothing to do +#elif defined(_android_) +#define _bionic_ +#else +#define _glibc_ +#endif +#endif + +#if defined(_darwin_) +#define unix +#define __unix__ +#endif + +#if defined(_win32_) || defined(_win64_) +#define _win_ +#endif + +#if defined(__arm__) || defined(__ARM__) || defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM) +#if defined(__arm64) || defined(__arm64__) || defined(__aarch64__) +#define _arm64_ +#else +#define _arm32_ +#endif +#endif + +#if defined(_arm64_) || defined(_arm32_) +#define _arm_ +#endif + +/* __ia64__ and __x86_64__ - defined by GNU C. + * _M_IA64, _M_X64, _M_AMD64 - defined by Visual Studio. + * + * Microsoft can define _M_IX86, _M_AMD64 (before Visual Studio 8) + * or _M_X64 (starting in Visual Studio 8). 
+ */ +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) +#define _x86_64_ +#endif + +#if defined(__i386__) || defined(_M_IX86) +#define _i386_ +#endif + +#if defined(__ia64__) || defined(_M_IA64) +#define _ia64_ +#endif + +#if defined(__powerpc__) +#define _ppc_ +#endif + +#if defined(__powerpc64__) +#define _ppc64_ +#endif + +#if !defined(sparc) && !defined(__sparc) && !defined(__hpux__) && !defined(__alpha__) && !defined(_ia64_) && !defined(_x86_64_) && !defined(_arm_) && !defined(_i386_) && !defined(_ppc_) && !defined(_ppc64_) +#error "platform not defined, please, define one" +#endif + +#if defined(_x86_64_) || defined(_i386_) +#define _x86_ +#endif + +#if defined(__MIC__) +#define _mic_ +#define _k1om_ +#endif + +// stdio or MessageBox +#if defined(__CONSOLE__) || defined(_CONSOLE) +#define _console_ +#endif +#if (defined(_win_) && !defined(_console_)) +#define _windows_ +#elif !defined(_console_) +#define _console_ +#endif + +#if defined(__SSE__) || defined(SSE_ENABLED) +#define _sse_ +#endif + +#if defined(__SSE2__) || defined(SSE2_ENABLED) +#define _sse2_ +#endif + +#if defined(__SSE3__) || defined(SSE3_ENABLED) +#define _sse3_ +#endif + +#if defined(__SSSE3__) || defined(SSSE3_ENABLED) +#define _ssse3_ +#endif + +#if defined(POPCNT_ENABLED) +#define _popcnt_ +#endif + +#if defined(__DLL__) || defined(_DLL) +#define _dll_ +#endif + +// 16, 32 or 64 +#if defined(__sparc_v9__) || defined(_x86_64_) || defined(_ia64_) || defined(_arm64_) || defined(_ppc64_) +#define _64_ +#else +#define _32_ +#endif + +/* All modern 64-bit Unix systems use scheme LP64 (long, pointers are 64-bit). + * Microsoft uses a different scheme: LLP64 (long long, pointers are 64-bit). + * + * Scheme LP64 LLP64 + * char 8 8 + * short 16 16 + * int 32 32 + * long 64 32 + * long long 64 64 + * pointer 64 64 + */ + +#if defined(_32_) +#define SIZEOF_PTR 4 +#elif defined(_64_) +#define SIZEOF_PTR 8 +#endif + +#define PLATFORM_DATA_ALIGN SIZEOF_PTR + +#if !defined(SIZEOF_PTR) +#error todo +#endif + +#define SIZEOF_CHAR 1 +#define SIZEOF_UNSIGNED_CHAR 1 +#define SIZEOF_SHORT 2 +#define SIZEOF_UNSIGNED_SHORT 2 +#define SIZEOF_INT 4 +#define SIZEOF_UNSIGNED_INT 4 + +#if defined(_32_) +#define SIZEOF_LONG 4 +#define SIZEOF_UNSIGNED_LONG 4 +#elif defined(_64_) +#if defined(_win_) +#define SIZEOF_LONG 4 +#define SIZEOF_UNSIGNED_LONG 4 +#else +#define SIZEOF_LONG 8 +#define SIZEOF_UNSIGNED_LONG 8 +#endif // _win_ +#endif // _32_ + +#if !defined(SIZEOF_LONG) +#error todo +#endif + +#define SIZEOF_LONG_LONG 8 +#define SIZEOF_UNSIGNED_LONG_LONG 8 + +#undef SIZEOF_SIZE_T // in case we include which defines it, too +#define SIZEOF_SIZE_T SIZEOF_PTR + +#if defined(__INTEL_COMPILER) +#pragma warning(disable 1292) +#pragma warning(disable 1469) +#pragma warning(disable 193) +#pragma warning(disable 271) +#pragma warning(disable 383) +#pragma warning(disable 424) +#pragma warning(disable 444) +#pragma warning(disable 584) +#pragma warning(disable 593) +#pragma warning(disable 981) +#pragma warning(disable 1418) +#pragma warning(disable 304) +#pragma warning(disable 810) +#pragma warning(disable 1029) +#pragma warning(disable 1419) +#pragma warning(disable 177) +#pragma warning(disable 522) +#pragma warning(disable 858) +#pragma warning(disable 111) +#pragma warning(disable 1599) +#pragma warning(disable 411) +#pragma warning(disable 304) +#pragma warning(disable 858) +#pragma warning(disable 444) +#pragma warning(disable 913) +#pragma warning(disable 310) +#pragma warning(disable 167) +#pragma warning(disable 180) 
+#pragma warning(disable 1572) +#endif + +#if defined(_MSC_VER) +#undef _WINSOCKAPI_ +#define _WINSOCKAPI_ +#undef NOMINMAX +#define NOMINMAX +#endif diff --git a/contrib/lfalloc/src/util/system/types.h b/contrib/lfalloc/src/util/system/types.h new file mode 100644 index 00000000000..af4f0adb13d --- /dev/null +++ b/contrib/lfalloc/src/util/system/types.h @@ -0,0 +1,117 @@ +#pragma once + +// DO_NOT_STYLE + +#include "platform.h" + +#include + +typedef int8_t i8; +typedef int16_t i16; +typedef uint8_t ui8; +typedef uint16_t ui16; + +typedef int yssize_t; +#define PRIYSZT "d" + +#if defined(_darwin_) && defined(_32_) +typedef unsigned long ui32; +typedef long i32; +#else +typedef uint32_t ui32; +typedef int32_t i32; +#endif + +#if defined(_darwin_) && defined(_64_) +typedef unsigned long ui64; +typedef long i64; +#else +typedef uint64_t ui64; +typedef int64_t i64; +#endif + +#define LL(number) INT64_C(number) +#define ULL(number) UINT64_C(number) + +// Macro for size_t and ptrdiff_t types +#if defined(_32_) +# if defined(_darwin_) +# define PRISZT "lu" +# undef PRIi32 +# define PRIi32 "li" +# undef SCNi32 +# define SCNi32 "li" +# undef PRId32 +# define PRId32 "li" +# undef SCNd32 +# define SCNd32 "li" +# undef PRIu32 +# define PRIu32 "lu" +# undef SCNu32 +# define SCNu32 "lu" +# undef PRIx32 +# define PRIx32 "lx" +# undef SCNx32 +# define SCNx32 "lx" +# elif !defined(_cygwin_) +# define PRISZT PRIu32 +# else +# define PRISZT "u" +# endif +# define SCNSZT SCNu32 +# define PRIPDT PRIi32 +# define SCNPDT SCNi32 +# define PRITMT PRIi32 +# define SCNTMT SCNi32 +#elif defined(_64_) +# if defined(_darwin_) +# define PRISZT "lu" +# undef PRIu64 +# define PRIu64 PRISZT +# undef PRIx64 +# define PRIx64 "lx" +# undef PRIX64 +# define PRIX64 "lX" +# undef PRId64 +# define PRId64 "ld" +# undef PRIi64 +# define PRIi64 "li" +# undef SCNi64 +# define SCNi64 "li" +# undef SCNu64 +# define SCNu64 "lu" +# undef SCNx64 +# define SCNx64 "lx" +# else +# define PRISZT PRIu64 +# endif +# define SCNSZT SCNu64 +# define PRIPDT PRIi64 +# define SCNPDT SCNi64 +# define PRITMT PRIi64 +# define SCNTMT SCNi64 +#else +# error "Unsupported platform" +#endif + +// SUPERLONG +#if !defined(DONT_USE_SUPERLONG) && !defined(SUPERLONG_MAX) +#define SUPERLONG_MAX ~LL(0) +typedef i64 SUPERLONG; +#endif + +// UNICODE +// UCS-2, native byteorder +typedef ui16 wchar16; +// internal symbol type: UTF-16LE +typedef wchar16 TChar; +typedef ui32 wchar32; + +#if defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +#define HAVE_SSIZE_T 1 +#include +#endif + +#include diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 1306039e9c3..5c1d73b7a74 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -155,7 +155,6 @@ if (USE_EMBEDDED_COMPILER) target_include_directories (dbms SYSTEM BEFORE PUBLIC ${LLVM_INCLUDE_DIRS}) endif () - if (CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" OR CMAKE_BUILD_TYPE_UC STREQUAL "MINSIZEREL") # Won't generate debug info for files with heavy template instantiation to achieve faster linking and lower size. 
set_source_files_properties( @@ -214,6 +213,10 @@ target_link_libraries (clickhouse_common_io target_include_directories(clickhouse_common_io SYSTEM BEFORE PUBLIC ${RE2_INCLUDE_DIR}) +if (USE_LFALLOC) + target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${LFALLOC_INCLUDE_DIR}) +endif () + if(CPUID_LIBRARY) target_link_libraries(clickhouse_common_io PRIVATE ${CPUID_LIBRARY}) endif() diff --git a/dbms/src/Common/LFAllocator.cpp b/dbms/src/Common/LFAllocator.cpp new file mode 100644 index 00000000000..71396d341ab --- /dev/null +++ b/dbms/src/Common/LFAllocator.cpp @@ -0,0 +1,53 @@ +#include + +#if USE_LFALLOC +#include "LFAllocator.h" + +#include +#include + +namespace DB +{ + +void * LFAllocator::alloc(size_t size, size_t alignment) +{ + if (alignment == 0) + return LFAlloc(size); + else + { + void * ptr; + int res = LFPosixMemalign(&ptr, alignment, size); + return res ? nullptr : ptr; + } +} + +void LFAllocator::free(void * buf, size_t) +{ + LFFree(buf); +} + +void * LFAllocator::realloc(void * old_ptr, size_t, size_t new_size, size_t alignment) +{ + if (old_ptr == nullptr) + { + void * result = LFAllocator::alloc(new_size, alignment); + return result; + } + if (new_size == 0) + { + LFFree(old_ptr); + return nullptr; + } + + void * new_ptr = LFAllocator::alloc(new_size, alignment); + if (new_ptr == nullptr) + return nullptr; + size_t old_size = LFGetSize(old_ptr); + memcpy(new_ptr, old_ptr, ((old_size < new_size) ? old_size : new_size)); + LFFree(old_ptr); + return new_ptr; +} + +} + +#endif diff --git a/dbms/src/Common/LFAllocator.h b/dbms/src/Common/LFAllocator.h new file mode 100644 index 00000000000..f2a10cc4508 --- /dev/null +++ b/dbms/src/Common/LFAllocator.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#if !USE_LFALLOC +#error "do not include this file until USE_LFALLOC is set to 1" +#endif + +#include + +namespace DB +{ +struct LFAllocator +{ + static void * alloc(size_t size, size_t alignment = 0); + + static void free(void * buf, size_t); + + static void * realloc(void * buf, size_t, size_t new_size, size_t alignment = 0); +}; + +} diff --git a/dbms/src/Common/config.h.in b/dbms/src/Common/config.h.in index c323afe369e..d6fc6d146f0 100644 --- a/dbms/src/Common/config.h.in +++ b/dbms/src/Common/config.h.in @@ -25,6 +25,8 @@ #cmakedefine01 USE_BROTLI #cmakedefine01 USE_SSL #cmakedefine01 USE_HYPERSCAN +#cmakedefine01 USE_LFALLOC +#cmakedefine01 USE_LFALLOC_RANDOM_HINT #cmakedefine01 CLICKHOUSE_SPLIT_BINARY #cmakedefine01 LLVM_HAS_RTTI diff --git a/dbms/src/DataStreams/MarkInCompressedFile.h b/dbms/src/DataStreams/MarkInCompressedFile.h index 3a1d9aa0f19..b2219f4d55f 100644 --- a/dbms/src/DataStreams/MarkInCompressedFile.h +++ b/dbms/src/DataStreams/MarkInCompressedFile.h @@ -6,6 +6,10 @@ #include #include +#include +#if USE_LFALLOC +#include +#endif namespace DB { @@ -33,7 +37,9 @@ struct MarkInCompressedFile return "(" + DB::toString(offset_in_compressed_file) + "," + DB::toString(offset_in_decompressed_block) + ")"; } }; - +#if USE_LFALLOC +using MarksInCompressedFile = PODArray; +#else using MarksInCompressedFile = PODArray; - +#endif } diff --git a/dbms/src/IO/UncompressedCache.h b/dbms/src/IO/UncompressedCache.h index 86f1530e5b3..2347c6d7a28 100644 --- a/dbms/src/IO/UncompressedCache.h +++ b/dbms/src/IO/UncompressedCache.h @@ -6,6 +6,11 @@ #include #include +#include +#if USE_LFALLOC +#include +#endif + namespace ProfileEvents { @@ -20,7 +25,11 @@ namespace DB struct UncompressedCacheCell { +#if USE_LFALLOC + Memory data; +#else Memory<> data; +#endif 
size_t compressed_size; UInt32 additional_bytes; }; diff --git a/dbms/src/Interpreters/Compiler.cpp b/dbms/src/Interpreters/Compiler.cpp index 83fbf2918dc..abdb0969121 100644 --- a/dbms/src/Interpreters/Compiler.cpp +++ b/dbms/src/Interpreters/Compiler.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -261,6 +262,9 @@ void Compiler::compile( " -I " << compiler_headers << "/dbms/src/" " -isystem " << compiler_headers << "/contrib/cityhash102/include/" " -isystem " << compiler_headers << "/contrib/libpcg-random/include/" + #if USE_LFALLOC + " -isystem " << compiler_headers << "/contrib/lfalloc/src/" + #endif " -isystem " << compiler_headers << INTERNAL_DOUBLE_CONVERSION_INCLUDE_DIR " -isystem " << compiler_headers << INTERNAL_Poco_Foundation_INCLUDE_DIR " -isystem " << compiler_headers << INTERNAL_Boost_INCLUDE_DIRS diff --git a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index 97358ac02c9..6d6d5f32e0c 100644 --- a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -57,6 +57,8 @@ const char * auto_config_build[] "USE_BROTLI", "@USE_BROTLI@", "USE_SSL", "@USE_SSL@", "USE_HYPERSCAN", "@USE_HYPERSCAN@", + "USE_LFALLOC", "@USE_LFALLOC@", + "USE_LFALLOC_RANDOM_HINT", "@USE_LFALLOC_RANDOM_HINT@", nullptr, nullptr }; diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index c0be7e218e1..bb6fbf79946 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -61,7 +61,6 @@ add_library (common ${LINK_MODE} if (USE_JEMALLOC) message (STATUS "Link jemalloc: ${JEMALLOC_LIBRARIES}") set (MALLOC_LIBRARIES ${JEMALLOC_LIBRARIES}) - elseif (USE_TCMALLOC) if (DEBUG_TCMALLOC AND NOT GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG) message (FATAL_ERROR "Requested DEBUG_TCMALLOC but debug library is not found. You should install Google Perftools. Example: sudo apt-get install libgoogle-perftools-dev")
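To close out: a hedged usage sketch, not part of the patch, showing how the new DB::LFAllocator shim from dbms/src/Common/LFAllocator.h is meant to be used. It mirrors the static alloc/realloc/free interface that ClickHouse containers such as PODArray and Memory expect from their allocator template parameter, which is what the MarkInCompressedFile and UncompressedCache changes above rely on; the function name and sizes below are illustrative only.

```
#include <cstring>

#include <Common/config.h>
#if USE_LFALLOC
#include <Common/LFAllocator.h>

void lfallocator_example()
{
    /// Plain allocation, same shape as Allocator's static interface.
    void * buf = DB::LFAllocator::alloc(1024);
    memset(buf, 0, 1024);

    /// Non-zero alignment is routed through LFPosixMemalign; per the shim above,
    /// alignments larger than 4096 are not supported.
    void * aligned = DB::LFAllocator::alloc(1024, 64);

    /// realloc ignores the passed old size and asks lfalloc (LFGetSize) instead,
    /// copying min(old, new) bytes into a freshly allocated block.
    buf = DB::LFAllocator::realloc(buf, 1024, 4096);

    DB::LFAllocator::free(buf, 4096);
    DB::LFAllocator::free(aligned, 1024);
}
#endif
```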