diff --git a/CMakeLists.txt b/CMakeLists.txt index 8c4802295a7..45ee1dfbb41 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -317,6 +317,7 @@ include (cmake/find_hdfs3.cmake) # uses protobuf include (cmake/find_consistent-hashing.cmake) include (cmake/find_base64.cmake) include (cmake/find_hyperscan.cmake) +include (cmake/find_lfalloc.cmake) find_contrib_lib(cityhash) find_contrib_lib(farmhash) find_contrib_lib(metrohash) diff --git a/cmake/find_lfalloc.cmake b/cmake/find_lfalloc.cmake new file mode 100644 index 00000000000..9383bd01f30 --- /dev/null +++ b/cmake/find_lfalloc.cmake @@ -0,0 +1,9 @@ +if (NOT SANITIZE AND NOT ARCH_ARM AND NOT ARCH_32 AND NOT ARCH_PPC64LE) + if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/lfalloc/src/lf_allocX64.h") + message (FATAL_ERROR "submodule contrib/lfalloc is missing. to fix try run: \n git submodule update --init --recursive") + endif() + set (USE_LFALLOC 1) + set (USE_LFALLOC_RANDOM_HINT 1) + set (LFALLOC_INCLUDE_DIR ${ClickHouse_SOURCE_DIR}/contrib/lfalloc/src) + message (STATUS "Using lfalloc=${USE_LFALLOC}: ${LFALLOC_INCLUDE_DIR}") +endif () diff --git a/contrib/lfalloc/src/lf_allocX64.h b/contrib/lfalloc/src/lf_allocX64.h new file mode 100644 index 00000000000..2c4cf3f1021 --- /dev/null +++ b/contrib/lfalloc/src/lf_allocX64.h @@ -0,0 +1,1813 @@ +#pragma once + +#include +#include +#include + +#include "lfmalloc.h" + +#include "util/system/compiler.h" +#include "util/system/types.h" +#include + +#ifdef _MSC_VER +#ifndef _CRT_SECURE_NO_WARNINGS +#define _CRT_SECURE_NO_WARNINGS +#endif +#ifdef _M_X64 +#define _64_ +#endif +#include +#define WIN32_LEAN_AND_MEAN +#include +#pragma intrinsic(_InterlockedCompareExchange) +#pragma intrinsic(_InterlockedExchangeAdd) + +#include +#include +#include + +#define PERTHREAD __declspec(thread) +#define _win_ +#define Y_FORCE_INLINE __forceinline + +using TAtomic = volatile long; + +static inline long AtomicAdd(TAtomic& a, long b) { + return _InterlockedExchangeAdd(&a, b) + b; +} + +static inline long AtomicSub(TAtomic& a, long b) { + return AtomicAdd(a, -b); +} + +#define Y_ASSERT_NOBT(x) ((void)0) + +#else + +#include "util/system/defaults.h" +#include "util/system/atomic.h" +#include + +#if !defined(NDEBUG) && !defined(__GCCXML__) +#define Y_ASSERT_NOBT(a) \ + do { \ + if (Y_UNLIKELY(!(a))) { \ + assert(false && (a)); \ + } \ + } while (0) +#else +#define Y_ASSERT_NOBT(a) \ + do { \ + if (false) { \ + bool __xxx = static_cast(a); \ + Y_UNUSED(__xxx); \ + } \ + } while (0) +#endif + +#include +#include +#include +#include +#include +#include + +#if defined(_linux_) +#if !defined(MADV_HUGEPAGE) +#define MADV_HUGEPAGE 14 +#endif +#if !defined(MAP_HUGETLB) +#define MAP_HUGETLB 0x40000 +#endif +#endif + +#define PERTHREAD __thread + +#endif + +#ifndef _darwin_ + +#ifndef Y_ARRAY_SIZE +#define Y_ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0])) +#endif + +#ifndef NDEBUG +#define DBG_FILL_MEMORY +static bool FillMemoryOnAllocation = true; +#endif + +static bool TransparentHugePages = false; // force MADV_HUGEPAGE for large allocs +static bool MapHugeTLB = false; // force MAP_HUGETLB for small allocs +static bool EnableDefrag = true; + +// Buffers that are larger than this size will not be filled with 0xcf +#ifndef DBG_FILL_MAX_SIZE +#define DBG_FILL_MAX_SIZE 0x01000000000000ULL +#endif + +template +inline T* DoCas(T* volatile* target, T* exchange, T* compare) { +#if defined(_linux_) + return __sync_val_compare_and_swap(target, compare, exchange); +#elif defined(_WIN32) +#ifdef _64_ + return 
(T*)_InterlockedCompareExchange64((__int64*)target, (__int64)exchange, (__int64)compare); +#else + //return (T*)InterlockedCompareExchangePointer(targetVoidP, exchange, compare); + return (T*)_InterlockedCompareExchange((LONG*)target, (LONG)exchange, (LONG)compare); +#endif +#elif defined(__i386) || defined(__x86_64__) + union { + T* volatile* NP; + void* volatile* VoidP; + } gccSucks; + gccSucks.NP = target; + void* volatile* targetVoidP = gccSucks.VoidP; + + __asm__ __volatile__( + "lock\n\t" + "cmpxchg %2,%0\n\t" + : "+m"(*(targetVoidP)), "+a"(compare) + : "r"(exchange) + : "cc", "memory"); + return compare; +#else +#error inline_cas not defined for this platform +#endif +} + +#ifdef _64_ +const uintptr_t N_MAX_WORKSET_SIZE = 0x100000000ll * 200; +const uintptr_t N_HUGE_AREA_FINISH = 0x700000000000ll; +#ifndef _freebsd_ +const uintptr_t LINUX_MMAP_AREA_START = 0x100000000ll; +static uintptr_t volatile linuxAllocPointer = LINUX_MMAP_AREA_START; +static uintptr_t volatile linuxAllocPointerHuge = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE; +#endif +#else +const uintptr_t N_MAX_WORKSET_SIZE = 0xffffffff; +#endif +#define ALLOC_START ((char*)0) + +const size_t N_CHUNK_SIZE = 1024 * 1024; +const size_t N_CHUNKS = N_MAX_WORKSET_SIZE / N_CHUNK_SIZE; +const size_t N_LARGE_ALLOC_SIZE = N_CHUNK_SIZE * 128; + +// map size idx to size in bytes +#ifdef LFALLOC_YT +const int N_SIZES = 27; +#else +const int N_SIZES = 25; +#endif +const int nSizeIdxToSize[N_SIZES] = { + -1, +#if defined(_64_) + 16, 16, 32, 32, 48, 64, 96, 128, +#else + 8, + 16, + 24, + 32, + 48, + 64, + 96, + 128, +#endif + 192, 256, 384, 512, 768, 1024, 1536, 2048, + 3072, 4096, 6144, 8192, 12288, 16384, 24576, 32768, +#ifdef LFALLOC_YT + 49152, 65536 +#endif +}; +#ifdef LFALLOC_YT +const size_t N_MAX_FAST_SIZE = 65536; +#else +const size_t N_MAX_FAST_SIZE = 32768; +#endif +const unsigned char size2idxArr1[64 + 1] = { + 1, +#if defined(_64_) + 2, 2, 4, 4, // 16, 16, 32, 32 +#else + 1, 2, 3, 4, // 8, 16, 24, 32 +#endif + 5, 5, 6, 6, // 48, 64 + 7, 7, 7, 7, 8, 8, 8, 8, // 96, 128 + 9, 9, 9, 9, 9, 9, 9, 9, 10, 10, 10, 10, 10, 10, 10, 10, // 192, 256 + 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, // 384 + 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12, 12 // 512 +}; +#ifdef LFALLOC_YT +const unsigned char size2idxArr2[256] = { +#else +const unsigned char size2idxArr2[128] = { +#endif + 12, 12, 13, 14, // 512, 512, 768, 1024 + 15, 15, 16, 16, // 1536, 2048 + 17, 17, 17, 17, 18, 18, 18, 18, // 3072, 4096 + 19, 19, 19, 19, 19, 19, 19, 19, 20, 20, 20, 20, 20, 20, 20, 20, // 6144, 8192 + 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, 21, // 12288 + 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, 22, // 16384 + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, + 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, 23, // 24576 + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, 24, // 32768 +#ifdef LFALLOC_YT + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, + 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, 25, // 49152 + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, + 26, 26, 26, 26, 26, 
26, 26, 26, 26, 26, 26, 26, 26, 26, 26, 26, // 65536 +#endif +}; + +// map entry number to size idx +// special size idx's: 0 = not used, -1 = mem locked, but not allocated +static volatile char chunkSizeIdx[N_CHUNKS]; +const int FREE_CHUNK_ARR_BUF = 0x20000; // this is effectively 128G of free memory (with 1M chunks), should not be exhausted actually +static volatile uintptr_t freeChunkArr[FREE_CHUNK_ARR_BUF]; +static volatile int freeChunkCount; + +static void AddFreeChunk(uintptr_t chunkId) { + chunkSizeIdx[chunkId] = -1; + if (Y_UNLIKELY(freeChunkCount == FREE_CHUNK_ARR_BUF)) + NMalloc::AbortFromCorruptedAllocator(); // free chunks arrray overflowed + freeChunkArr[freeChunkCount++] = chunkId; +} + +static bool GetFreeChunk(uintptr_t* res) { + if (freeChunkCount == 0) { + *res = 0; + return false; + } + *res = freeChunkArr[--freeChunkCount]; + return true; +} + +////////////////////////////////////////////////////////////////////////// +enum ELFAllocCounter { + CT_USER_ALLOC, // accumulated size requested by user code + CT_MMAP, // accumulated mmapped size + CT_MMAP_CNT, // number of mmapped regions + CT_MUNMAP, // accumulated unmmapped size + CT_MUNMAP_CNT, // number of munmaped regions + CT_SYSTEM_ALLOC, // accumulated allocated size for internal lfalloc needs + CT_SYSTEM_FREE, // accumulated deallocated size for internal lfalloc needs + CT_SMALL_ALLOC, // accumulated allocated size for fixed-size blocks + CT_SMALL_FREE, // accumulated deallocated size for fixed-size blocks + CT_LARGE_ALLOC, // accumulated allocated size for large blocks + CT_LARGE_FREE, // accumulated deallocated size for large blocks + CT_SLOW_ALLOC_CNT, // number of slow (not LF) allocations + CT_DEGRAGMENT_CNT, // number of memory defragmentations + CT_MAX +}; + +static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value); + +////////////////////////////////////////////////////////////////////////// +enum EMMapMode { + MM_NORMAL, // memory for small allocs + MM_HUGE // memory for large allocs +}; + +#ifndef _MSC_VER +inline void VerifyMmapResult(void* result) { + if (Y_UNLIKELY(result == MAP_FAILED)) + NMalloc::AbortFromCorruptedAllocator(); // negative size requested? 
or just out of mem +} +#endif + +#if !defined(_MSC_VER) && !defined(_freebsd_) && defined(_64_) +static char* AllocWithMMapLinuxImpl(uintptr_t sz, EMMapMode mode) { + char* volatile* areaPtr; + char* areaStart; + uintptr_t areaFinish; + + int mapProt = PROT_READ | PROT_WRITE; + int mapFlags = MAP_PRIVATE | MAP_ANON; + + if (mode == MM_HUGE) { + areaPtr = reinterpret_cast(&linuxAllocPointerHuge); + areaStart = reinterpret_cast(LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE); + areaFinish = N_HUGE_AREA_FINISH; + } else { + areaPtr = reinterpret_cast(&linuxAllocPointer); + areaStart = reinterpret_cast(LINUX_MMAP_AREA_START); + areaFinish = N_MAX_WORKSET_SIZE; + + if (MapHugeTLB) { + mapFlags |= MAP_HUGETLB; + } + } + + bool wrapped = false; + for (;;) { + char* prevAllocPtr = *areaPtr; + char* nextAllocPtr = prevAllocPtr + sz; + if (uintptr_t(nextAllocPtr - (char*)nullptr) >= areaFinish) { + if (Y_UNLIKELY(wrapped)) { + // virtual memory is over fragmented + NMalloc::AbortFromCorruptedAllocator(); + } + // wrap after all area is used + DoCas(areaPtr, areaStart, prevAllocPtr); + wrapped = true; + continue; + } + + if (DoCas(areaPtr, nextAllocPtr, prevAllocPtr) != prevAllocPtr) + continue; + + char* largeBlock = (char*)mmap(prevAllocPtr, sz, mapProt, mapFlags, -1, 0); + VerifyMmapResult(largeBlock); + if (largeBlock == prevAllocPtr) + return largeBlock; + if (largeBlock) + munmap(largeBlock, sz); + + if (sz < 0x80000) { + // skip utilized area with big steps + DoCas(areaPtr, nextAllocPtr + 0x10 * 0x10000, nextAllocPtr); + } + } +} +#endif + +static char* AllocWithMMap(uintptr_t sz, EMMapMode mode) { + (void)mode; +#ifdef _MSC_VER + char* largeBlock = (char*)VirtualAlloc(0, sz, MEM_RESERVE, PAGE_READWRITE); + if (Y_UNLIKELY(largeBlock == nullptr)) + NMalloc::AbortFromCorruptedAllocator(); // out of memory + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= N_MAX_WORKSET_SIZE)) + NMalloc::AbortFromCorruptedAllocator(); // out of working set, something has broken +#else +#if defined(_freebsd_) || !defined(_64_) || defined(USE_LFALLOC_RANDOM_HINT) + uintptr_t areaStart; + uintptr_t areaFinish; + if (mode == MM_HUGE) { + areaStart = LINUX_MMAP_AREA_START + N_MAX_WORKSET_SIZE; + areaFinish = N_HUGE_AREA_FINISH; + } else { + areaStart = LINUX_MMAP_AREA_START; + areaFinish = N_MAX_WORKSET_SIZE; + } +#if defined(USE_LFALLOC_RANDOM_HINT) + static thread_local std::mt19937_64 generator(std::random_device{}()); + std::uniform_int_distribution distr(areaStart, areaFinish / 2); + char* largeBlock = (char*)mmap(reinterpret_cast(distr(generator)), sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +#else + char* largeBlock = (char*)mmap(0, sz, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); +#endif + VerifyMmapResult(largeBlock); + if (Y_UNLIKELY(uintptr_t(((char*)largeBlock - ALLOC_START) + sz) >= areaFinish)) + NMalloc::AbortFromCorruptedAllocator(); // out of working set, something has broken +#else + char* largeBlock = AllocWithMMapLinuxImpl(sz, mode); + if (TransparentHugePages) { + madvise(largeBlock, sz, MADV_HUGEPAGE); + } +#endif +#endif + Y_ASSERT_NOBT(largeBlock); + IncrementCounter(CT_MMAP, sz); + IncrementCounter(CT_MMAP_CNT, 1); + return largeBlock; +} + +enum class ELarge : ui8 { + Free = 0, // block in free cache + Alloc = 1, // block is allocated + Gone = 2, // block was unmapped +}; + +struct TLargeBlk { + + static TLargeBlk* As(void *raw) { + return reinterpret_cast((char*)raw - 4096ll); + } + + static const TLargeBlk* As(const void *raw) { + return 
reinterpret_cast((const char*)raw - 4096ll); + } + + void SetSize(size_t bytes, size_t pages) { + Pages = pages; + Bytes = bytes; + } + + void Mark(ELarge state) { + const ui64 marks[] = { + 0x8b38aa5ca4953c98, // ELarge::Free + 0xf916d33584eb5087, // ELarge::Alloc + 0xd33b0eca7651bc3f // ELarge::Gone + }; + + Token = size_t(marks[ui8(state)]); + } + + size_t Pages; // Total pages allocated with mmap like call + size_t Bytes; // Actually requested bytes by user + size_t Token; // Block state token, see ELarge enum. +}; + + +static void LargeBlockUnmap(void* p, size_t pages) { + const auto bytes = (pages + 1) * uintptr_t(4096); + + IncrementCounter(CT_MUNMAP, bytes); + IncrementCounter(CT_MUNMAP_CNT, 1); +#ifdef _MSC_VER + Y_ASSERT_NOBT(0); +#else + TLargeBlk::As(p)->Mark(ELarge::Gone); + munmap((char*)p - 4096ll, bytes); +#endif +} + +////////////////////////////////////////////////////////////////////////// +const size_t LB_BUF_SIZE = 250; +const size_t LB_BUF_HASH = 977; +static int LB_LIMIT_TOTAL_SIZE = 500 * 1024 * 1024 / 4096; // do not keep more then this mem total in lbFreePtrs[] +static void* volatile lbFreePtrs[LB_BUF_HASH][LB_BUF_SIZE]; +static TAtomic lbFreePageCount; + + +static void* LargeBlockAlloc(size_t _nSize, ELFAllocCounter counter) { + size_t pgCount = (_nSize + 4095) / 4096; +#ifdef _MSC_VER + char* pRes = (char*)VirtualAlloc(0, (pgCount + 1) * 4096ll, MEM_COMMIT, PAGE_READWRITE); + if (Y_UNLIKELY(pRes == 0)) { + NMalloc::AbortFromCorruptedAllocator(); // out of memory + } +#else + + IncrementCounter(counter, pgCount * 4096ll); + IncrementCounter(CT_SYSTEM_ALLOC, 4096ll); + + int lbHash = pgCount % LB_BUF_HASH; + for (int i = 0; i < LB_BUF_SIZE; ++i) { + void* p = lbFreePtrs[lbHash][i]; + if (p == nullptr) + continue; + if (DoCas(&lbFreePtrs[lbHash][i], (void*)nullptr, p) == p) { + size_t realPageCount = TLargeBlk::As(p)->Pages; + if (realPageCount == pgCount) { + AtomicAdd(lbFreePageCount, -pgCount); + TLargeBlk::As(p)->Mark(ELarge::Alloc); + return p; + } else { + if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) != (void*)nullptr) { + // block was freed while we were busy + AtomicAdd(lbFreePageCount, -realPageCount); + LargeBlockUnmap(p, realPageCount); + --i; + } + } + } + } + char* pRes = AllocWithMMap((pgCount + 1) * 4096ll, MM_HUGE); +#endif + pRes += 4096ll; + TLargeBlk::As(pRes)->SetSize(_nSize, pgCount); + TLargeBlk::As(pRes)->Mark(ELarge::Alloc); + + return pRes; +} + +#ifndef _MSC_VER +static void FreeAllLargeBlockMem() { + for (auto& lbFreePtr : lbFreePtrs) { + for (int i = 0; i < LB_BUF_SIZE; ++i) { + void* p = lbFreePtr[i]; + if (p == nullptr) + continue; + if (DoCas(&lbFreePtr[i], (void*)nullptr, p) == p) { + int pgCount = TLargeBlk::As(p)->Pages; + AtomicAdd(lbFreePageCount, -pgCount); + LargeBlockUnmap(p, pgCount); + } + } + } +} +#endif + +static void LargeBlockFree(void* p, ELFAllocCounter counter) { + if (p == nullptr) + return; +#ifdef _MSC_VER + VirtualFree((char*)p - 4096ll, 0, MEM_RELEASE); +#else + size_t pgCount = TLargeBlk::As(p)->Pages; + + TLargeBlk::As(p)->Mark(ELarge::Free); + IncrementCounter(counter, pgCount * 4096ll); + IncrementCounter(CT_SYSTEM_FREE, 4096ll); + + if (lbFreePageCount > LB_LIMIT_TOTAL_SIZE) + FreeAllLargeBlockMem(); + int lbHash = pgCount % LB_BUF_HASH; + for (int i = 0; i < LB_BUF_SIZE; ++i) { + if (lbFreePtrs[lbHash][i] == nullptr) { + if (DoCas(&lbFreePtrs[lbHash][i], p, (void*)nullptr) == nullptr) { + AtomicAdd(lbFreePageCount, pgCount); + return; + } + } + } + + LargeBlockUnmap(p, pgCount); +#endif +} + 
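+// Note on the helpers below: SystemAlloc()/SystemFree() cover lfalloc's own bookkeeping +// allocations (per-thread TThreadAllocInfo records, defragmentation scratch buffers, the +// block-utilization dump buffer). They simply reuse the large-block path above and are +// accounted under CT_SYSTEM_ALLOC / CT_SYSTEM_FREE rather than the user-facing counters.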
+static void* SystemAlloc(size_t _nSize) { + //HeapAlloc(GetProcessHeap(), HEAP_GENERATE_EXCEPTIONS, _nSize); + return LargeBlockAlloc(_nSize, CT_SYSTEM_ALLOC); +} +static void SystemFree(void* p) { + //HeapFree(GetProcessHeap(), 0, p); + LargeBlockFree(p, CT_SYSTEM_FREE); +} + +////////////////////////////////////////////////////////////////////////// +static int* volatile nLock = nullptr; +static int nLockVar; +inline void RealEnterCriticalDefault(int* volatile* lockPtr) { + while (DoCas(lockPtr, &nLockVar, (int*)nullptr) != nullptr) + ; //pthread_yield(); +} +inline void RealLeaveCriticalDefault(int* volatile* lockPtr) { + *lockPtr = nullptr; +} +static void (*RealEnterCritical)(int* volatile* lockPtr) = RealEnterCriticalDefault; +static void (*RealLeaveCritical)(int* volatile* lockPtr) = RealLeaveCriticalDefault; +static void (*BeforeLFAllocGlobalLockAcquired)() = nullptr; +static void (*AfterLFAllocGlobalLockReleased)() = nullptr; +class CCriticalSectionLockMMgr { +public: + CCriticalSectionLockMMgr() { + if (BeforeLFAllocGlobalLockAcquired) { + BeforeLFAllocGlobalLockAcquired(); + } + RealEnterCritical(&nLock); + } + ~CCriticalSectionLockMMgr() { + RealLeaveCritical(&nLock); + if (AfterLFAllocGlobalLockReleased) { + AfterLFAllocGlobalLockReleased(); + } + } +}; + +////////////////////////////////////////////////////////////////////////// +class TLFAllocFreeList { + struct TNode { + TNode* Next; + }; + + TNode* volatile Head; + TNode* volatile Pending; + TAtomic PendingToFreeListCounter; + TAtomic AllocCount; + + static Y_FORCE_INLINE void Enqueue(TNode* volatile* headPtr, TNode* n) { + for (;;) { + TNode* volatile prevHead = *headPtr; + n->Next = prevHead; + if (DoCas(headPtr, n, prevHead) == prevHead) + break; + } + } + Y_FORCE_INLINE void* DoAlloc() { + TNode* res; + for (res = Head; res; res = Head) { + TNode* keepNext = res->Next; + if (DoCas(&Head, keepNext, res) == res) { + //Y_VERIFY(keepNext == res->Next); + break; + } + } + return res; + } + void FreeList(TNode* fl) { + if (!fl) + return; + TNode* flTail = fl; + while (flTail->Next) + flTail = flTail->Next; + for (;;) { + TNode* volatile prevHead = Head; + flTail->Next = prevHead; + if (DoCas(&Head, fl, prevHead) == prevHead) + break; + } + } + +public: + Y_FORCE_INLINE void Free(void* ptr) { + TNode* newFree = (TNode*)ptr; + if (AtomicAdd(AllocCount, 0) == 0) + Enqueue(&Head, newFree); + else + Enqueue(&Pending, newFree); + } + Y_FORCE_INLINE void* Alloc() { + TAtomic keepCounter = AtomicAdd(PendingToFreeListCounter, 0); + TNode* fl = Pending; + if (AtomicAdd(AllocCount, 1) == 1) { + // No other allocs in progress. + // If (keepCounter == PendingToFreeListCounter) then Pending was not freed by other threads. 
+ // Hence Pending is not used in any concurrent DoAlloc() atm and can be safely moved to FreeList + if (fl && keepCounter == AtomicAdd(PendingToFreeListCounter, 0) && DoCas(&Pending, (TNode*)nullptr, fl) == fl) { + // pick first element from Pending and return it + void* res = fl; + fl = fl->Next; + // if there are other elements in Pending list, add them to main free list + FreeList(fl); + AtomicAdd(PendingToFreeListCounter, 1); + AtomicAdd(AllocCount, -1); + return res; + } + } + void* res = DoAlloc(); + AtomicAdd(AllocCount, -1); + return res; + } + void* GetWholeList() { + TNode* res; + for (res = Head; res; res = Head) { + if (DoCas(&Head, (TNode*)nullptr, res) == res) + break; + } + return res; + } + void ReturnWholeList(void* ptr) { + while (AtomicAdd(AllocCount, 0) != 0) // theoretically can run into problems with parallel DoAlloc() + ; //ThreadYield(); + for (;;) { + TNode* prevHead = Head; + if (DoCas(&Head, (TNode*)ptr, prevHead) == prevHead) { + FreeList(prevHead); + break; + } + } + } +}; + +///////////////////////////////////////////////////////////////////////// +static TLFAllocFreeList globalFreeLists[N_SIZES]; +static char* volatile globalCurrentPtr[N_SIZES]; +static TLFAllocFreeList blockFreeList; + +// globalFreeLists[] contains TFreeListGroup, each of them points up to 15 free blocks +const int FL_GROUP_SIZE = 15; +struct TFreeListGroup { + TFreeListGroup* Next; + char* Ptrs[FL_GROUP_SIZE]; +}; +#ifdef _64_ +const int FREE_LIST_GROUP_SIZEIDX = 8; +#else +const int FREE_LIST_GROUP_SIZEIDX = 6; +#endif + +////////////////////////////////////////////////////////////////////////// +// find free chunks and reset chunk size so they can be reused by different sized allocations +// do not look at blockFreeList (TFreeListGroup has same size for any allocations) +static bool DefragmentMem() { + if (!EnableDefrag) { + return false; + } + + IncrementCounter(CT_DEGRAGMENT_CNT, 1); + + int* nFreeCount = (int*)SystemAlloc(N_CHUNKS * sizeof(int)); + if (Y_UNLIKELY(!nFreeCount)) { + //__debugbreak(); + NMalloc::AbortFromCorruptedAllocator(); + } + memset(nFreeCount, 0, N_CHUNKS * sizeof(int)); + + TFreeListGroup* wholeLists[N_SIZES]; + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) { + wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList(); + for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) { + for (auto pData : g->Ptrs) { + if (pData) { + uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE; + ++nFreeCount[nChunk]; + Y_ASSERT_NOBT(chunkSizeIdx[nChunk] == nSizeIdx); + } + } + } + } + + bool bRes = false; + for (size_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) { + int fc = nFreeCount[nChunk]; + nFreeCount[nChunk] = 0; + if (chunkSizeIdx[nChunk] <= 0) + continue; + int nEntries = N_CHUNK_SIZE / nSizeIdxToSize[static_cast(chunkSizeIdx[nChunk])]; + Y_ASSERT_NOBT(fc <= nEntries); // can not have more free blocks then total count + if (fc == nEntries) { + bRes = true; + nFreeCount[nChunk] = 1; + } + } + if (bRes) { + for (auto& wholeList : wholeLists) { + TFreeListGroup** ppPtr = &wholeList; + while (*ppPtr) { + TFreeListGroup* g = *ppPtr; + int dst = 0; + for (auto pData : g->Ptrs) { + if (pData) { + uintptr_t nChunk = (pData - ALLOC_START) / N_CHUNK_SIZE; + if (nFreeCount[nChunk] == 0) + g->Ptrs[dst++] = pData; // block is not freed, keep pointer + } + } + if (dst == 0) { + // no valid pointers in group, free it + *ppPtr = g->Next; + blockFreeList.Free(g); + } else { + // reset invalid pointers to 0 + for (int i = dst; i < FL_GROUP_SIZE; 
++i) + g->Ptrs[i] = nullptr; + ppPtr = &g->Next; + } + } + } + for (uintptr_t nChunk = 0; nChunk < N_CHUNKS; ++nChunk) { + if (!nFreeCount[nChunk]) + continue; + char* pStart = ALLOC_START + nChunk * N_CHUNK_SIZE; +#ifdef _win_ + VirtualFree(pStart, N_CHUNK_SIZE, MEM_DECOMMIT); +#elif defined(_freebsd_) + madvise(pStart, N_CHUNK_SIZE, MADV_FREE); +#else + madvise(pStart, N_CHUNK_SIZE, MADV_DONTNEED); +#endif + AddFreeChunk(nChunk); + } + } + + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) + globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]); + + SystemFree(nFreeCount); + return bRes; +} + +static Y_FORCE_INLINE void* LFAllocFromCurrentChunk(int nSizeIdx, int blockSize, int count) { + char* volatile* pFreeArray = &globalCurrentPtr[nSizeIdx]; + while (char* newBlock = *pFreeArray) { + char* nextFree = newBlock + blockSize * count; + + // check if there is space in chunk + char* globalEndPtr = ALLOC_START + ((newBlock - ALLOC_START) & ~((uintptr_t)N_CHUNK_SIZE - 1)) + N_CHUNK_SIZE; + if (nextFree >= globalEndPtr) { + if (nextFree > globalEndPtr) + break; + nextFree = nullptr; // it was last block in chunk + } + if (DoCas(pFreeArray, nextFree, newBlock) == newBlock) + return newBlock; + } + return nullptr; +} + +enum EDefrag { + MEM_DEFRAG, + NO_MEM_DEFRAG, +}; + +static void* SlowLFAlloc(int nSizeIdx, int blockSize, EDefrag defrag) { + IncrementCounter(CT_SLOW_ALLOC_CNT, 1); + + CCriticalSectionLockMMgr ls; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1); + if (res) + return res; // might happen when other thread allocated new current chunk + + for (;;) { + uintptr_t nChunk; + if (GetFreeChunk(&nChunk)) { + char* newPlace = ALLOC_START + nChunk * N_CHUNK_SIZE; +#ifdef _MSC_VER + void* pTest = VirtualAlloc(newPlace, N_CHUNK_SIZE, MEM_COMMIT, PAGE_READWRITE); + Y_ASSERT_NOBT(pTest == newPlace); +#endif + chunkSizeIdx[nChunk] = (char)nSizeIdx; + globalCurrentPtr[nSizeIdx] = newPlace + blockSize; + return newPlace; + } + + // out of luck, try to defrag + if (defrag == MEM_DEFRAG && DefragmentMem()) { + continue; + } + + char* largeBlock = AllocWithMMap(N_LARGE_ALLOC_SIZE, MM_NORMAL); + uintptr_t addr = ((largeBlock - ALLOC_START) + N_CHUNK_SIZE - 1) & (~(N_CHUNK_SIZE - 1)); + uintptr_t endAddr = ((largeBlock - ALLOC_START) + N_LARGE_ALLOC_SIZE) & (~(N_CHUNK_SIZE - 1)); + for (uintptr_t p = addr; p < endAddr; p += N_CHUNK_SIZE) { + uintptr_t chunk = p / N_CHUNK_SIZE; + Y_ASSERT_NOBT(chunk * N_CHUNK_SIZE == p); + Y_ASSERT_NOBT(chunkSizeIdx[chunk] == 0); + AddFreeChunk(chunk); + } + } + return nullptr; +} + +// allocate single block +static Y_FORCE_INLINE void* LFAllocNoCache(int nSizeIdx, EDefrag defrag) { + int blockSize = nSizeIdxToSize[nSizeIdx]; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, 1); + if (res) + return res; + + return SlowLFAlloc(nSizeIdx, blockSize, defrag); +} + +// allocate multiple blocks, returns number of blocks allocated (max FL_GROUP_SIZE) +// buf should have space for at least FL_GROUP_SIZE elems +static Y_FORCE_INLINE int LFAllocNoCacheMultiple(int nSizeIdx, char** buf) { + int blockSize = nSizeIdxToSize[nSizeIdx]; + void* res = LFAllocFromCurrentChunk(nSizeIdx, blockSize, FL_GROUP_SIZE); + if (res) { + char* resPtr = (char*)res; + for (int k = 0; k < FL_GROUP_SIZE; ++k) { + buf[k] = resPtr; + resPtr += blockSize; + } + return FL_GROUP_SIZE; + } + buf[0] = (char*)SlowLFAlloc(nSizeIdx, blockSize, MEM_DEFRAG); + return 1; +} + +// take several blocks from global free list (max FL_GROUP_SIZE blocks), returns number of blocks 
taken +// buf should have space for at least FL_GROUP_SIZE elems +static Y_FORCE_INLINE int TakeBlocksFromGlobalFreeList(int nSizeIdx, char** buf) { + TLFAllocFreeList& fl = globalFreeLists[nSizeIdx]; + TFreeListGroup* g = (TFreeListGroup*)fl.Alloc(); + if (g) { + int resCount = 0; + for (auto& ptr : g->Ptrs) { + if (ptr) + buf[resCount++] = ptr; + else + break; + } + blockFreeList.Free(g); + return resCount; + } + return 0; +} + +// add several blocks to global free list +static Y_FORCE_INLINE void PutBlocksToGlobalFreeList(ptrdiff_t nSizeIdx, char** buf, int count) { + for (int startIdx = 0; startIdx < count;) { + TFreeListGroup* g = (TFreeListGroup*)blockFreeList.Alloc(); + Y_ASSERT_NOBT(sizeof(TFreeListGroup) == nSizeIdxToSize[FREE_LIST_GROUP_SIZEIDX]); + if (!g) { + g = (TFreeListGroup*)LFAllocNoCache(FREE_LIST_GROUP_SIZEIDX, NO_MEM_DEFRAG); + } + + int groupSize = count - startIdx; + if (groupSize > FL_GROUP_SIZE) + groupSize = FL_GROUP_SIZE; + for (int i = 0; i < groupSize; ++i) + g->Ptrs[i] = buf[startIdx + i]; + for (int i = groupSize; i < FL_GROUP_SIZE; ++i) + g->Ptrs[i] = nullptr; + + // add free group to the global list + TLFAllocFreeList& fl = globalFreeLists[nSizeIdx]; + fl.Free(g); + + startIdx += groupSize; + } +} + +////////////////////////////////////////////////////////////////////////// +static TAtomic GlobalCounters[CT_MAX]; +const int MAX_LOCAL_UPDATES = 100; + +struct TLocalCounter { + intptr_t Value; + int Updates; + TAtomic* Parent; + + Y_FORCE_INLINE void Init(TAtomic* parent) { + Parent = parent; + Value = 0; + Updates = 0; + } + + Y_FORCE_INLINE void Increment(size_t value) { + Value += value; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(); + } + } + + Y_FORCE_INLINE void Flush() { + AtomicAdd(*Parent, Value); + Value = 0; + Updates = 0; + } +}; + +//////////////////////////////////////////////////////////////////////////////// +// DBG stuff +//////////////////////////////////////////////////////////////////////////////// + +#if defined(LFALLOC_DBG) + +struct TPerTagAllocCounter { + TAtomic Size; + TAtomic Count; + + Y_FORCE_INLINE void Alloc(size_t size) { + AtomicAdd(Size, size); + AtomicAdd(Count, 1); + } + + Y_FORCE_INLINE void Free(size_t size) { + AtomicSub(Size, size); + AtomicSub(Count, 1); + } +}; + +struct TLocalPerTagAllocCounter { + intptr_t Size; + int Count; + int Updates; + + Y_FORCE_INLINE void Init() { + Size = 0; + Count = 0; + Updates = 0; + } + + Y_FORCE_INLINE void Alloc(TPerTagAllocCounter& parent, size_t size) { + Size += size; + ++Count; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(parent); + } + } + + Y_FORCE_INLINE void Free(TPerTagAllocCounter& parent, size_t size) { + Size -= size; + --Count; + if (++Updates > MAX_LOCAL_UPDATES) { + Flush(parent); + } + } + + Y_FORCE_INLINE void Flush(TPerTagAllocCounter& parent) { + AtomicAdd(parent.Size, Size); + Size = 0; + AtomicAdd(parent.Count, Count); + Count = 0; + Updates = 0; + } +}; + +static const int DBG_ALLOC_MAX_TAG = 1000; +static const int DBG_ALLOC_NUM_SIZES = 30; +static TPerTagAllocCounter GlobalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES]; + +#endif // LFALLOC_DBG + +////////////////////////////////////////////////////////////////////////// +const int THREAD_BUF = 256; +static int borderSizes[N_SIZES]; +const int MAX_MEM_PER_SIZE_PER_THREAD = 512 * 1024; +struct TThreadAllocInfo { + // FreePtrs - pointers to first free blocks in per thread block list + // LastFreePtrs - pointers to last blocks in lists, may be invalid if FreePtr is zero + char* 
FreePtrs[N_SIZES][THREAD_BUF]; + int FreePtrIndex[N_SIZES]; + TThreadAllocInfo* pNextInfo; + TLocalCounter LocalCounters[CT_MAX]; + +#if defined(LFALLOC_DBG) + TLocalPerTagAllocCounter LocalPerTagAllocCounters[DBG_ALLOC_MAX_TAG][DBG_ALLOC_NUM_SIZES]; +#endif +#ifdef _win_ + HANDLE hThread; +#endif + + void Init(TThreadAllocInfo** pHead) { + memset(this, 0, sizeof(*this)); + for (auto& i : FreePtrIndex) + i = THREAD_BUF; +#ifdef _win_ + BOOL b = DuplicateHandle( + GetCurrentProcess(), GetCurrentThread(), + GetCurrentProcess(), &hThread, + 0, FALSE, DUPLICATE_SAME_ACCESS); + Y_ASSERT_NOBT(b); +#endif + pNextInfo = *pHead; + *pHead = this; + for (int k = 0; k < N_SIZES; ++k) { + int maxCount = MAX_MEM_PER_SIZE_PER_THREAD / nSizeIdxToSize[k]; + if (maxCount > THREAD_BUF) + maxCount = THREAD_BUF; + borderSizes[k] = THREAD_BUF - maxCount; + } + for (int i = 0; i < CT_MAX; ++i) { + LocalCounters[i].Init(&GlobalCounters[i]); + } +#if defined(LFALLOC_DBG) + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = LocalPerTagAllocCounters[tag][sizeIdx]; + local.Init(); + } + } +#endif + } + void Done() { + for (auto sizeIdx : FreePtrIndex) { + Y_ASSERT_NOBT(sizeIdx == THREAD_BUF); + } + for (auto& localCounter : LocalCounters) { + localCounter.Flush(); + } +#if defined(LFALLOC_DBG) + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = LocalPerTagAllocCounters[tag][sizeIdx]; + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + local.Flush(global); + } + } +#endif +#ifdef _win_ + if (hThread) + CloseHandle(hThread); +#endif + } +}; +PERTHREAD TThreadAllocInfo* pThreadInfo; +static TThreadAllocInfo* pThreadInfoList; + +static int* volatile nLockThreadInfo = nullptr; +class TLockThreadListMMgr { +public: + TLockThreadListMMgr() { + RealEnterCritical(&nLockThreadInfo); + } + ~TLockThreadListMMgr() { + RealLeaveCritical(&nLockThreadInfo); + } +}; + +static Y_FORCE_INLINE void IncrementCounter(ELFAllocCounter counter, size_t value) { +#ifdef LFALLOC_YT + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + thr->LocalCounters[counter].Increment(value); + } else { + AtomicAdd(GlobalCounters[counter], value); + } +#endif +} + +extern "C" i64 GetLFAllocCounterFast(int counter) { +#ifdef LFALLOC_YT + return GlobalCounters[counter]; +#else + return 0; +#endif +} + +extern "C" i64 GetLFAllocCounterFull(int counter) { +#ifdef LFALLOC_YT + i64 ret = GlobalCounters[counter]; + { + TLockThreadListMMgr ll; + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + ret += pInfo->LocalCounters[counter].Value; + p = &pInfo->pNextInfo; + } + } + return ret; +#else + return 0; +#endif +} + +static void MoveSingleThreadFreeToGlobal(TThreadAllocInfo* pInfo) { + for (int sizeIdx = 0; sizeIdx < N_SIZES; ++sizeIdx) { + int& freePtrIdx = pInfo->FreePtrIndex[sizeIdx]; + char** freePtrs = pInfo->FreePtrs[sizeIdx]; + PutBlocksToGlobalFreeList(sizeIdx, freePtrs + freePtrIdx, THREAD_BUF - freePtrIdx); + freePtrIdx = THREAD_BUF; + } +} + +#ifdef _win_ +static bool IsDeadThread(TThreadAllocInfo* pInfo) { + DWORD dwExit; + bool isDead = !GetExitCodeThread(pInfo->hThread, &dwExit) || dwExit != STILL_ACTIVE; + return isDead; +} + +static void CleanupAfterDeadThreads() { + TLockThreadListMMgr ls; + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + if (IsDeadThread(pInfo)) { + 
MoveSingleThreadFreeToGlobal(pInfo); + pInfo->Done(); + *p = pInfo->pNextInfo; + SystemFree(pInfo); + } else + p = &pInfo->pNextInfo; + } +} +#endif + +#ifndef _win_ +static pthread_key_t ThreadCacheCleaner; +static void* volatile ThreadCacheCleanerStarted; // 0 = not started, -1 = started, -2 = is starting +static PERTHREAD bool IsStoppingThread; + +static void FreeThreadCache(void*) { + TThreadAllocInfo* pToDelete = nullptr; + { + TLockThreadListMMgr ls; + pToDelete = pThreadInfo; + if (pToDelete == nullptr) + return; + + // remove from the list + for (TThreadAllocInfo** p = &pThreadInfoList; *p; p = &(*p)->pNextInfo) { + if (*p == pToDelete) { + *p = pToDelete->pNextInfo; + break; + } + } + IsStoppingThread = true; + pThreadInfo = nullptr; + } + + // free per thread buf + MoveSingleThreadFreeToGlobal(pToDelete); + pToDelete->Done(); + SystemFree(pToDelete); +} +#endif + +static void AllocThreadInfo() { +#ifndef _win_ + if (DoCas(&ThreadCacheCleanerStarted, (void*)-2, (void*)nullptr) == (void*)nullptr) { + pthread_key_create(&ThreadCacheCleaner, FreeThreadCache); + ThreadCacheCleanerStarted = (void*)-1; + } + if (ThreadCacheCleanerStarted != (void*)-1) + return; // do not use ThreadCacheCleaner until it is constructed + + { + if (IsStoppingThread) + return; + TLockThreadListMMgr ls; + if (IsStoppingThread) // better safe than sorry + return; + + pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo)); + pThreadInfo->Init(&pThreadInfoList); + } + pthread_setspecific(ThreadCacheCleaner, (void*)-1); // without value destructor will not be called +#else + CleanupAfterDeadThreads(); + { + TLockThreadListMMgr ls; + pThreadInfo = (TThreadAllocInfo*)SystemAlloc(sizeof(TThreadAllocInfo)); + pThreadInfo->Init(&pThreadInfoList); + } +#endif +} + + ////////////////////////////////////////////////////////////////////////// + // DBG stuff + ////////////////////////////////////////////////////////////////////////// + +#if defined(LFALLOC_DBG) + +struct TAllocHeader { + size_t Size; + int Tag; + int Cookie; +}; + +static inline void* GetAllocPtr(TAllocHeader* p) { + return p + 1; +} + +static inline TAllocHeader* GetAllocHeader(void* p) { + return ((TAllocHeader*)p) - 1; +} + +PERTHREAD int AllocationTag; +extern "C" int SetThreadAllocTag(int tag) { + int prevTag = AllocationTag; + if (tag < DBG_ALLOC_MAX_TAG && tag >= 0) { + AllocationTag = tag; + } + return prevTag; +} + +PERTHREAD bool ProfileCurrentThread; +extern "C" bool SetProfileCurrentThread(bool newVal) { + bool prevVal = ProfileCurrentThread; + ProfileCurrentThread = newVal; + return prevVal; +} + +static volatile bool ProfileAllThreads; +extern "C" bool SetProfileAllThreads(bool newVal) { + bool prevVal = ProfileAllThreads; + ProfileAllThreads = newVal; + return prevVal; +} + +static volatile bool AllocationSamplingEnabled; +extern "C" bool SetAllocationSamplingEnabled(bool newVal) { + bool prevVal = AllocationSamplingEnabled; + AllocationSamplingEnabled = newVal; + return prevVal; +} + +static size_t AllocationSampleRate = 1000; +extern "C" size_t SetAllocationSampleRate(size_t newVal) { + size_t prevVal = AllocationSampleRate; + AllocationSampleRate = newVal; + return prevVal; +} + +static size_t AllocationSampleMaxSize = N_MAX_FAST_SIZE; +extern "C" size_t SetAllocationSampleMaxSize(size_t newVal) { + size_t prevVal = AllocationSampleMaxSize; + AllocationSampleMaxSize = newVal; + return prevVal; +} + +using TAllocationCallback = int(int tag, size_t size, int sizeIdx); +static TAllocationCallback* AllocationCallback; +extern 
"C" TAllocationCallback* SetAllocationCallback(TAllocationCallback* newVal) { + TAllocationCallback* prevVal = AllocationCallback; + AllocationCallback = newVal; + return prevVal; +} + +using TDeallocationCallback = void(int cookie, int tag, size_t size, int sizeIdx); +static TDeallocationCallback* DeallocationCallback; +extern "C" TDeallocationCallback* SetDeallocationCallback(TDeallocationCallback* newVal) { + TDeallocationCallback* prevVal = DeallocationCallback; + DeallocationCallback = newVal; + return prevVal; +} + +PERTHREAD TAtomic AllocationsCount; +PERTHREAD bool InAllocationCallback; + +static const int DBG_ALLOC_INVALID_COOKIE = -1; +static inline int SampleAllocation(TAllocHeader* p, int sizeIdx) { + int cookie = DBG_ALLOC_INVALID_COOKIE; + if (AllocationSamplingEnabled && (ProfileCurrentThread || ProfileAllThreads) && !InAllocationCallback) { + if (p->Size > AllocationSampleMaxSize || ++AllocationsCount % AllocationSampleRate == 0) { + if (AllocationCallback) { + InAllocationCallback = true; + cookie = AllocationCallback(p->Tag, p->Size, sizeIdx); + InAllocationCallback = false; + } + } + } + return cookie; +} + +static inline void SampleDeallocation(TAllocHeader* p, int sizeIdx) { + if (p->Cookie != DBG_ALLOC_INVALID_COOKIE && !InAllocationCallback) { + if (DeallocationCallback) { + InAllocationCallback = true; + DeallocationCallback(p->Cookie, p->Tag, p->Size, sizeIdx); + InAllocationCallback = false; + } + } +} + +static inline void TrackPerTagAllocation(TAllocHeader* p, int sizeIdx) { + if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) { + Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES); + auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx]; + + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx]; + local.Alloc(global, p->Size); + } else { + global.Alloc(p->Size); + } + } +} + +static inline void TrackPerTagDeallocation(TAllocHeader* p, int sizeIdx) { + if (p->Tag < DBG_ALLOC_MAX_TAG && p->Tag >= 0) { + Y_ASSERT_NOBT(sizeIdx < DBG_ALLOC_NUM_SIZES); + auto& global = GlobalPerTagAllocCounters[p->Tag][sizeIdx]; + + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + auto& local = thr->LocalPerTagAllocCounters[p->Tag][sizeIdx]; + local.Free(global, p->Size); + } else { + global.Free(p->Size); + } + } +} + +static void* TrackAllocation(void* ptr, size_t size, int sizeIdx) { + TAllocHeader* p = (TAllocHeader*)ptr; + p->Size = size; + p->Tag = AllocationTag; + p->Cookie = SampleAllocation(p, sizeIdx); + TrackPerTagAllocation(p, sizeIdx); + return GetAllocPtr(p); +} + +static void TrackDeallocation(void* ptr, int sizeIdx) { + TAllocHeader* p = (TAllocHeader*)ptr; + SampleDeallocation(p, sizeIdx); + TrackPerTagDeallocation(p, sizeIdx); +} + +struct TPerTagAllocInfo { + ssize_t Count; + ssize_t Size; +}; + +extern "C" void GetPerTagAllocInfo( + bool flushPerThreadCounters, + TPerTagAllocInfo* info, + int& maxTag, + int& numSizes) { + maxTag = DBG_ALLOC_MAX_TAG; + numSizes = DBG_ALLOC_NUM_SIZES; + + if (info) { + if (flushPerThreadCounters) { + TLockThreadListMMgr ll; + for (TThreadAllocInfo** p = &pThreadInfoList; *p;) { + TThreadAllocInfo* pInfo = *p; + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& local = pInfo->LocalPerTagAllocCounters[tag][sizeIdx]; + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + local.Flush(global); + } + } + p = &pInfo->pNextInfo; + } + } + + for (int tag = 0; tag < DBG_ALLOC_MAX_TAG; ++tag) { + for (int 
sizeIdx = 0; sizeIdx < DBG_ALLOC_NUM_SIZES; ++sizeIdx) { + auto& global = GlobalPerTagAllocCounters[tag][sizeIdx]; + auto& res = info[tag * DBG_ALLOC_NUM_SIZES + sizeIdx]; + res.Count = global.Count; + res.Size = global.Size; + } + } + } +} + +#endif // LFALLOC_DBG + +////////////////////////////////////////////////////////////////////////// +static Y_FORCE_INLINE void* LFAllocImpl(size_t _nSize) { +#if defined(LFALLOC_DBG) + size_t size = _nSize; + _nSize += sizeof(TAllocHeader); +#endif + + IncrementCounter(CT_USER_ALLOC, _nSize); + + int nSizeIdx; + if (_nSize > 512) { + if (_nSize > N_MAX_FAST_SIZE) { + void* ptr = LargeBlockAlloc(_nSize, CT_LARGE_ALLOC); +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, N_SIZES); +#endif + return ptr; + } + nSizeIdx = size2idxArr2[(_nSize - 1) >> 8]; + } else + nSizeIdx = size2idxArr1[1 + (((int)_nSize - 1) >> 3)]; + + IncrementCounter(CT_SMALL_ALLOC, nSizeIdxToSize[nSizeIdx]); + + // check per thread buffer + TThreadAllocInfo* thr = pThreadInfo; + if (!thr) { + AllocThreadInfo(); + thr = pThreadInfo; + if (!thr) { + void* ptr = LFAllocNoCache(nSizeIdx, MEM_DEFRAG); +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } + } + { + int& freePtrIdx = thr->FreePtrIndex[nSizeIdx]; + if (freePtrIdx < THREAD_BUF) { + void* ptr = thr->FreePtrs[nSizeIdx][freePtrIdx++]; +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } + + // try to alloc from global free list + char* buf[FL_GROUP_SIZE]; + int count = TakeBlocksFromGlobalFreeList(nSizeIdx, buf); + if (count == 0) { + count = LFAllocNoCacheMultiple(nSizeIdx, buf); + if (count == 0) { + NMalloc::AbortFromCorruptedAllocator(); // no way LFAllocNoCacheMultiple() can fail + } + } + char** dstBuf = thr->FreePtrs[nSizeIdx] + freePtrIdx - 1; + for (int i = 0; i < count - 1; ++i) + dstBuf[-i] = buf[i]; + freePtrIdx -= count - 1; + void* ptr = buf[count - 1]; +#if defined(LFALLOC_DBG) + ptr = TrackAllocation(ptr, size, nSizeIdx); +#endif + return ptr; + } +} + +static Y_FORCE_INLINE void* LFAlloc(size_t _nSize) { + void* res = LFAllocImpl(_nSize); +#ifdef DBG_FILL_MEMORY + if (FillMemoryOnAllocation && res && (_nSize <= DBG_FILL_MAX_SIZE)) { + memset(res, 0xcf, _nSize); + } +#endif + return res; +} + +static Y_FORCE_INLINE void LFFree(void* p) { +#if defined(LFALLOC_DBG) + if (p == nullptr) + return; + p = GetAllocHeader(p); +#endif + + uintptr_t chkOffset = ((char*)p - ALLOC_START) - 1ll; + if (chkOffset >= N_MAX_WORKSET_SIZE) { + if (p == nullptr) + return; +#if defined(LFALLOC_DBG) + TrackDeallocation(p, N_SIZES); +#endif + LargeBlockFree(p, CT_LARGE_FREE); + return; + } + + uintptr_t chunk = ((char*)p - ALLOC_START) / N_CHUNK_SIZE; + ptrdiff_t nSizeIdx = chunkSizeIdx[chunk]; + if (nSizeIdx <= 0) { +#if defined(LFALLOC_DBG) + TrackDeallocation(p, N_SIZES); +#endif + LargeBlockFree(p, CT_LARGE_FREE); + return; + } + +#if defined(LFALLOC_DBG) + TrackDeallocation(p, nSizeIdx); +#endif + +#ifdef DBG_FILL_MEMORY + memset(p, 0xfe, nSizeIdxToSize[nSizeIdx]); +#endif + + IncrementCounter(CT_SMALL_FREE, nSizeIdxToSize[nSizeIdx]); + + // try to store info to per thread buf + TThreadAllocInfo* thr = pThreadInfo; + if (thr) { + int& freePtrIdx = thr->FreePtrIndex[nSizeIdx]; + if (freePtrIdx > borderSizes[nSizeIdx]) { + thr->FreePtrs[nSizeIdx][--freePtrIdx] = (char*)p; + return; + } + + // move several pointers to global free list + int freeCount = FL_GROUP_SIZE; + if (freeCount > THREAD_BUF - freePtrIdx) + freeCount = THREAD_BUF 
- freePtrIdx; + char** freePtrs = thr->FreePtrs[nSizeIdx]; + PutBlocksToGlobalFreeList(nSizeIdx, freePtrs + freePtrIdx, freeCount); + freePtrIdx += freeCount; + + freePtrs[--freePtrIdx] = (char*)p; + + } else { + AllocThreadInfo(); + PutBlocksToGlobalFreeList(nSizeIdx, (char**)&p, 1); + } +} + +static size_t LFGetSize(const void* p) { +#if defined(LFALLOC_DBG) + if (p == nullptr) + return 0; + return GetAllocHeader(const_cast(p))->Size; +#endif + + uintptr_t chkOffset = ((const char*)p - ALLOC_START); + if (chkOffset >= N_MAX_WORKSET_SIZE) { + if (p == nullptr) + return 0; + return TLargeBlk::As(p)->Pages * 4096ll; + } + uintptr_t chunk = ((const char*)p - ALLOC_START) / N_CHUNK_SIZE; + ptrdiff_t nSizeIdx = chunkSizeIdx[chunk]; + if (nSizeIdx <= 0) + return TLargeBlk::As(p)->Pages * 4096ll; + return nSizeIdxToSize[nSizeIdx]; +} + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// Output mem alloc stats +const int N_PAGE_SIZE = 4096; +static void DebugTraceMMgr(const char* pszFormat, ...) // __cdecl +{ + static char buff[20000]; + va_list va; + // + va_start(va, pszFormat); + vsprintf(buff, pszFormat, va); + va_end(va); +// +#ifdef _win_ + OutputDebugStringA(buff); +#else + fprintf(stderr, buff); +#endif +} + +struct TChunkStats { + char *Start, *Finish; + i64 Size; + char* Entries; + i64 FreeCount; + + TChunkStats(size_t chunk, i64 size, char* entries) + : Size(size) + , Entries(entries) + , FreeCount(0) + { + Start = ALLOC_START + chunk * N_CHUNK_SIZE; + Finish = Start + N_CHUNK_SIZE; + } + void CheckBlock(char* pBlock) { + if (pBlock && pBlock >= Start && pBlock < Finish) { + ++FreeCount; + i64 nShift = pBlock - Start; + i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1); + Entries[nOffsetInStep / Size] = 1; + } + } + void SetGlobalFree(char* ptr) { + i64 nShift = ptr - Start; + i64 nOffsetInStep = nShift & (N_CHUNK_SIZE - 1); + while (nOffsetInStep + Size <= N_CHUNK_SIZE) { + ++FreeCount; + Entries[nOffsetInStep / Size] = 1; + nOffsetInStep += Size; + } + } +}; + +static void DumpMemoryBlockUtilizationLocked() { + TFreeListGroup* wholeLists[N_SIZES]; + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) { + wholeLists[nSizeIdx] = (TFreeListGroup*)globalFreeLists[nSizeIdx].GetWholeList(); + } + char* bfList = (char*)blockFreeList.GetWholeList(); + + DebugTraceMMgr("memory blocks utilisation stats:\n"); + i64 nTotalAllocated = 0, nTotalFree = 0, nTotalBadPages = 0, nTotalPages = 0, nTotalUsed = 0, nTotalLocked = 0; + i64 nTotalGroupBlocks = 0; + char* entries; + entries = (char*)SystemAlloc((N_CHUNK_SIZE / 4)); + for (size_t k = 0; k < N_CHUNKS; ++k) { + if (chunkSizeIdx[k] <= 0) { + if (chunkSizeIdx[k] == -1) + nTotalLocked += N_CHUNK_SIZE; + continue; + } + i64 nSizeIdx = chunkSizeIdx[k]; + i64 nSize = nSizeIdxToSize[nSizeIdx]; + TChunkStats cs(k, nSize, entries); + int nEntriesTotal = N_CHUNK_SIZE / nSize; + memset(entries, 0, nEntriesTotal); + for (TFreeListGroup* g = wholeLists[nSizeIdx]; g; g = g->Next) { + for (auto& ptr : g->Ptrs) + cs.CheckBlock(ptr); + } + TChunkStats csGB(k, nSize, entries); + if (nSizeIdx == FREE_LIST_GROUP_SIZEIDX) { + for (auto g : wholeLists) { + for (; g; g = g->Next) + csGB.CheckBlock((char*)g); + } + for (char* blk = bfList; blk; blk = *(char**)blk) + csGB.CheckBlock(blk); + nTotalGroupBlocks += csGB.FreeCount * nSize; + } + if (((globalCurrentPtr[nSizeIdx] - ALLOC_START) / N_CHUNK_SIZE) == k) + cs.SetGlobalFree(globalCurrentPtr[nSizeIdx]); + nTotalUsed += (nEntriesTotal - cs.FreeCount - 
csGB.FreeCount) * nSize; + + char pages[N_CHUNK_SIZE / N_PAGE_SIZE]; + memset(pages, 0, sizeof(pages)); + for (int i = 0, nShift = 0; i < nEntriesTotal; ++i, nShift += nSize) { + int nBit = 0; + if (entries[i]) + nBit = 1; // free entry + else + nBit = 2; // used entry + for (i64 nDelta = nSize - 1; nDelta >= 0; nDelta -= N_PAGE_SIZE) + pages[(nShift + nDelta) / N_PAGE_SIZE] |= nBit; + } + i64 nBadPages = 0; + for (auto page : pages) { + nBadPages += page == 3; + nTotalPages += page != 1; + } + DebugTraceMMgr("entry = %lld; size = %lld; free = %lld; system %lld; utilisation = %g%%, fragmentation = %g%%\n", + k, nSize, cs.FreeCount * nSize, csGB.FreeCount * nSize, + (N_CHUNK_SIZE - cs.FreeCount * nSize) * 100.0f / N_CHUNK_SIZE, 100.0f * nBadPages / Y_ARRAY_SIZE(pages)); + nTotalAllocated += N_CHUNK_SIZE; + nTotalFree += cs.FreeCount * nSize; + nTotalBadPages += nBadPages; + } + SystemFree(entries); + DebugTraceMMgr("Total allocated = %llu, free = %lld, system = %lld, locked for future use %lld, utilisation = %g, fragmentation = %g\n", + nTotalAllocated, nTotalFree, nTotalGroupBlocks, nTotalLocked, + 100.0f * (nTotalAllocated - nTotalFree) / nTotalAllocated, 100.0f * nTotalBadPages / nTotalPages); + DebugTraceMMgr("Total %lld bytes used, %lld bytes in used pages\n", nTotalUsed, nTotalPages * N_PAGE_SIZE); + + for (int nSizeIdx = 0; nSizeIdx < N_SIZES; ++nSizeIdx) + globalFreeLists[nSizeIdx].ReturnWholeList(wholeLists[nSizeIdx]); + blockFreeList.ReturnWholeList(bfList); +} + +void FlushThreadFreeList() { + if (pThreadInfo) + MoveSingleThreadFreeToGlobal(pThreadInfo); +} + +void DumpMemoryBlockUtilization() { + // move current thread free to global lists to get better statistics + FlushThreadFreeList(); + { + CCriticalSectionLockMMgr ls; + DumpMemoryBlockUtilizationLocked(); + } +} + +////////////////////////////////////////////////////////////////////////// +// malloc api + +static bool LFAlloc_SetParam(const char* param, const char* value) { + if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE")) { + LB_LIMIT_TOTAL_SIZE = atoi(value); + return true; + } + if (!strcmp(param, "LB_LIMIT_TOTAL_SIZE_BYTES")) { + LB_LIMIT_TOTAL_SIZE = (atoi(value) + N_PAGE_SIZE - 1) / N_PAGE_SIZE; + return true; + } +#ifdef DBG_FILL_MEMORY + if (!strcmp(param, "FillMemoryOnAllocation")) { + FillMemoryOnAllocation = !strcmp(value, "true"); + return true; + } +#endif + if (!strcmp(param, "BeforeLFAllocGlobalLockAcquired")) { + BeforeLFAllocGlobalLockAcquired = (decltype(BeforeLFAllocGlobalLockAcquired))(value); + return true; + } + if (!strcmp(param, "AfterLFAllocGlobalLockReleased")) { + AfterLFAllocGlobalLockReleased = (decltype(AfterLFAllocGlobalLockReleased))(value); + return true; + } + if (!strcmp(param, "EnterCritical")) { + assert(value); + RealEnterCritical = (decltype(RealEnterCritical))(value); + return true; + } + if (!strcmp(param, "LeaveCritical")) { + assert(value); + RealLeaveCritical = (decltype(RealLeaveCritical))(value); + return true; + } + if (!strcmp(param, "TransparentHugePages")) { + TransparentHugePages = !strcmp(value, "true"); + return true; + } + if (!strcmp(param, "MapHugeTLB")) { + MapHugeTLB = !strcmp(value, "true"); + return true; + } + if (!strcmp(param, "EnableDefrag")) { + EnableDefrag = !strcmp(value, "true"); + return true; + } + return false; +}; + +static const char* LFAlloc_GetParam(const char* param) { + struct TParam { + const char* Name; + const char* Value; + }; + + static const TParam Params[] = { + {"GetLFAllocCounterFast", (const char*)&GetLFAllocCounterFast}, + 
{"GetLFAllocCounterFull", (const char*)&GetLFAllocCounterFull}, +#if defined(LFALLOC_DBG) + {"SetThreadAllocTag", (const char*)&SetThreadAllocTag}, + {"SetProfileCurrentThread", (const char*)&SetProfileCurrentThread}, + {"SetProfileAllThreads", (const char*)&SetProfileAllThreads}, + {"SetAllocationSamplingEnabled", (const char*)&SetAllocationSamplingEnabled}, + {"SetAllocationSampleRate", (const char*)&SetAllocationSampleRate}, + {"SetAllocationSampleMaxSize", (const char*)&SetAllocationSampleMaxSize}, + {"SetAllocationCallback", (const char*)&SetAllocationCallback}, + {"SetDeallocationCallback", (const char*)&SetDeallocationCallback}, + {"GetPerTagAllocInfo", (const char*)&GetPerTagAllocInfo}, +#endif // LFALLOC_DBG + }; + + for (int i = 0; i < Y_ARRAY_SIZE(Params); ++i) { + if (strcmp(param, Params[i].Name) == 0) { + return Params[i].Value; + } + } + return nullptr; +} + +static Y_FORCE_INLINE void* LFVAlloc(size_t size) { + const size_t pg = N_PAGE_SIZE; + size_t bigsize = (size + pg - 1) & (~(pg - 1)); + void* p = LFAlloc(bigsize); + + Y_ASSERT_NOBT((intptr_t)p % N_PAGE_SIZE == 0); + return p; +} + +static Y_FORCE_INLINE int LFPosixMemalign(void** memptr, size_t alignment, size_t size) { + if (Y_UNLIKELY(alignment > 4096)) { +#ifdef _win_ + OutputDebugStringA("Larger alignment are not guaranteed with this implementation\n"); +#else + fprintf(stderr, "Larger alignment are not guaranteed with this implementation\n"); +#endif + NMalloc::AbortFromCorruptedAllocator(); + } + size_t bigsize = size; + if (bigsize <= alignment) { + bigsize = alignment; + } else if (bigsize < 2 * alignment) { + bigsize = 2 * alignment; + } + *memptr = LFAlloc(bigsize); + return 0; +} +#endif diff --git a/contrib/lfalloc/src/lfmalloc.h b/contrib/lfalloc/src/lfmalloc.h new file mode 100644 index 00000000000..1e6a0d55773 --- /dev/null +++ b/contrib/lfalloc/src/lfmalloc.h @@ -0,0 +1,23 @@ +#pragma once + +#include +#include +#include "util/system/compiler.h" + +namespace NMalloc { + volatile inline bool IsAllocatorCorrupted = false; + + static inline void AbortFromCorruptedAllocator() { + IsAllocatorCorrupted = true; + abort(); + } + + struct TAllocHeader { + void* Block; + size_t AllocSize; + void Y_FORCE_INLINE Encode(void* block, size_t size, size_t signature) { + Block = block; + AllocSize = size | signature; + } + }; +} diff --git a/contrib/lfalloc/src/util/README.md b/contrib/lfalloc/src/util/README.md new file mode 100644 index 00000000000..c367cb4b439 --- /dev/null +++ b/contrib/lfalloc/src/util/README.md @@ -0,0 +1,33 @@ +Style guide for the util folder is a stricter version of general style guide (mostly in terms of ambiguity resolution). 
+ + * all {} must be in K&R style + * `&` and `*` bind to the type, not to the variable + * always use `using`, not `typedef` + * even a single-line block must be wrapped in braces {}: + ``` + if (A) { + B(); + } + ``` + * a trailing _ on private data members of a class - `First_`, `Second_` + * every .h file must be accompanied by a corresponding .cpp to avoid leakage and to check that it is self-contained + * `printf`-like functions are prohibited + + +Points from the general style guide that are sometimes missed: + + * `template <`, not `template<` + * `noexcept`, not `throw ()` nor `throw()`; not required for destructors + * indentation inside `namespace` is the same as inside `class` + + +Requirements for new code in util (and for corrections to old code that change behaviour): + + * presence of unit tests + * presence of comments in Doxygen style + * accessors without the Get prefix (`Length()`, not `GetLength()`) + +This guide is not as mandatory as the general style guide. +Nevertheless, if it is not followed, the next `ya style .` run in the util folder will undeservedly change the recorded authors of some lines of code. + +Thus it is recommended to run `ya style .` in the util folder before committing. diff --git a/contrib/lfalloc/src/util/system/atomic.h b/contrib/lfalloc/src/util/system/atomic.h new file mode 100644 index 00000000000..9876515a54d --- /dev/null +++ b/contrib/lfalloc/src/util/system/atomic.h @@ -0,0 +1,51 @@ +#pragma once + +#include "defaults.h" + +using TAtomicBase = intptr_t; +using TAtomic = volatile TAtomicBase; + +#if defined(__GNUC__) +#include "atomic_gcc.h" +#elif defined(_MSC_VER) +#include "atomic_win.h" +#else +#error unsupported platform +#endif + +#if !defined(ATOMIC_COMPILER_BARRIER) +#define ATOMIC_COMPILER_BARRIER() +#endif + +static inline TAtomicBase AtomicSub(TAtomic& a, TAtomicBase v) { + return AtomicAdd(a, -v); +} + +static inline TAtomicBase AtomicGetAndSub(TAtomic& a, TAtomicBase v) { + return AtomicGetAndAdd(a, -v); +} + +#if defined(USE_GENERIC_SETGET) +static inline TAtomicBase AtomicGet(const TAtomic& a) { + return a; +} + +static inline void AtomicSet(TAtomic& a, TAtomicBase v) { + a = v; +} +#endif + +static inline bool AtomicTryLock(TAtomic* a) { + return AtomicCas(a, 1, 0); +} + +static inline bool AtomicTryAndTryLock(TAtomic* a) { + return (AtomicGet(*a) == 0) && AtomicTryLock(a); +} + +static inline void AtomicUnlock(TAtomic* a) { + ATOMIC_COMPILER_BARRIER(); + AtomicSet(*a, 0); +} + +#include "atomic_ops.h" diff --git a/contrib/lfalloc/src/util/system/atomic_gcc.h b/contrib/lfalloc/src/util/system/atomic_gcc.h new file mode 100644 index 00000000000..ed8dc2bdc53 --- /dev/null +++ b/contrib/lfalloc/src/util/system/atomic_gcc.h @@ -0,0 +1,90 @@ +#pragma once + +#define ATOMIC_COMPILER_BARRIER() __asm__ __volatile__("" \ + : \ + : \ + : "memory") + +static inline TAtomicBase AtomicGet(const TAtomic& a) { + TAtomicBase tmp; +#if defined(_arm64_) + __asm__ __volatile__( + "ldar %x[value], %[ptr] \n\t" + : [value] "=r"(tmp) + : [ptr] "Q"(a) + : "memory"); +#else + __atomic_load(&a, &tmp, __ATOMIC_ACQUIRE); +#endif + return tmp; +} + +static inline void AtomicSet(TAtomic& a, TAtomicBase v) { +#if defined(_arm64_) + __asm__ __volatile__( + "stlr %x[value], %[ptr] \n\t" + : [ptr] "=Q"(a) + : [value] "r"(v) + : "memory"); +#else + __atomic_store(&a, &v, __ATOMIC_RELEASE); +#endif +} + +static inline intptr_t AtomicIncrement(TAtomic& p) { + return __atomic_add_fetch(&p, 1, __ATOMIC_SEQ_CST); +} + +static inline intptr_t
AtomicGetAndIncrement(TAtomic& p) { + return __atomic_fetch_add(&p, 1, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicDecrement(TAtomic& p) { + return __atomic_sub_fetch(&p, 1, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicGetAndDecrement(TAtomic& p) { + return __atomic_fetch_sub(&p, 1, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicAdd(TAtomic& p, intptr_t v) { + return __atomic_add_fetch(&p, v, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicGetAndAdd(TAtomic& p, intptr_t v) { + return __atomic_fetch_add(&p, v, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicSwap(TAtomic* p, intptr_t v) { + (void)p; // disable strange 'parameter set but not used' warning on gcc + intptr_t ret; + __atomic_exchange(p, &v, &ret, __ATOMIC_SEQ_CST); + return ret; +} + +static inline bool AtomicCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + (void)a; // disable strange 'parameter set but not used' warning on gcc + return __atomic_compare_exchange(a, &compare, &exchange, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicGetAndCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + (void)a; // disable strange 'parameter set but not used' warning on gcc + __atomic_compare_exchange(a, &compare, &exchange, false, __ATOMIC_SEQ_CST, __ATOMIC_SEQ_CST); + return compare; +} + +static inline intptr_t AtomicOr(TAtomic& a, intptr_t b) { + return __atomic_or_fetch(&a, b, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicXor(TAtomic& a, intptr_t b) { + return __atomic_xor_fetch(&a, b, __ATOMIC_SEQ_CST); +} + +static inline intptr_t AtomicAnd(TAtomic& a, intptr_t b) { + return __atomic_and_fetch(&a, b, __ATOMIC_SEQ_CST); +} + +static inline void AtomicBarrier() { + __sync_synchronize(); +} diff --git a/contrib/lfalloc/src/util/system/atomic_ops.h b/contrib/lfalloc/src/util/system/atomic_ops.h new file mode 100644 index 00000000000..425b643e14d --- /dev/null +++ b/contrib/lfalloc/src/util/system/atomic_ops.h @@ -0,0 +1,189 @@ +#pragma once + +#include + +template +inline TAtomic* AsAtomicPtr(T volatile* target) { + return reinterpret_cast(target); +} + +template +inline const TAtomic* AsAtomicPtr(T const volatile* target) { + return reinterpret_cast(target); +} + +// integral types + +template +struct TAtomicTraits { + enum { + Castable = std::is_integral::value && sizeof(T) == sizeof(TAtomicBase) && !std::is_const::value, + }; +}; + +template +using TEnableIfCastable = std::enable_if_t::Castable, TT>; + +template +inline TEnableIfCastable AtomicGet(T const volatile& target) { + return static_cast(AtomicGet(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicSet(T volatile& target, TAtomicBase value) { + AtomicSet(*AsAtomicPtr(&target), value); +} + +template +inline TEnableIfCastable AtomicIncrement(T volatile& target) { + return static_cast(AtomicIncrement(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicGetAndIncrement(T volatile& target) { + return static_cast(AtomicGetAndIncrement(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicDecrement(T volatile& target) { + return static_cast(AtomicDecrement(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicGetAndDecrement(T volatile& target) { + return static_cast(AtomicGetAndDecrement(*AsAtomicPtr(&target))); +} + +template +inline TEnableIfCastable AtomicAdd(T volatile& target, TAtomicBase value) { + return static_cast(AtomicAdd(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicGetAndAdd(T 
volatile& target, TAtomicBase value) { + return static_cast(AtomicGetAndAdd(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicSub(T volatile& target, TAtomicBase value) { + return static_cast(AtomicSub(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicGetAndSub(T volatile& target, TAtomicBase value) { + return static_cast(AtomicGetAndSub(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicSwap(T volatile* target, TAtomicBase exchange) { + return static_cast(AtomicSwap(AsAtomicPtr(target), exchange)); +} + +template +inline TEnableIfCastable AtomicCas(T volatile* target, TAtomicBase exchange, TAtomicBase compare) { + return AtomicCas(AsAtomicPtr(target), exchange, compare); +} + +template +inline TEnableIfCastable AtomicGetAndCas(T volatile* target, TAtomicBase exchange, TAtomicBase compare) { + return static_cast(AtomicGetAndCas(AsAtomicPtr(target), exchange, compare)); +} + +template +inline TEnableIfCastable AtomicTryLock(T volatile* target) { + return AtomicTryLock(AsAtomicPtr(target)); +} + +template +inline TEnableIfCastable AtomicTryAndTryLock(T volatile* target) { + return AtomicTryAndTryLock(AsAtomicPtr(target)); +} + +template +inline TEnableIfCastable AtomicUnlock(T volatile* target) { + AtomicUnlock(AsAtomicPtr(target)); +} + +template +inline TEnableIfCastable AtomicOr(T volatile& target, TAtomicBase value) { + return static_cast(AtomicOr(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicAnd(T volatile& target, TAtomicBase value) { + return static_cast(AtomicAnd(*AsAtomicPtr(&target), value)); +} + +template +inline TEnableIfCastable AtomicXor(T volatile& target, TAtomicBase value) { + return static_cast(AtomicXor(*AsAtomicPtr(&target), value)); +} + +// pointer types + +template +inline T* AtomicGet(T* const volatile& target) { + return reinterpret_cast(AtomicGet(*AsAtomicPtr(&target))); +} + +template +inline void AtomicSet(T* volatile& target, T* value) { + AtomicSet(*AsAtomicPtr(&target), reinterpret_cast(value)); +} + +using TNullPtr = decltype(nullptr); + +template +inline void AtomicSet(T* volatile& target, TNullPtr) { + AtomicSet(*AsAtomicPtr(&target), 0); +} + +template +inline T* AtomicSwap(T* volatile* target, T* exchange) { + return reinterpret_cast(AtomicSwap(AsAtomicPtr(target), reinterpret_cast(exchange))); +} + +template +inline T* AtomicSwap(T* volatile* target, TNullPtr) { + return reinterpret_cast(AtomicSwap(AsAtomicPtr(target), 0)); +} + +template +inline bool AtomicCas(T* volatile* target, T* exchange, T* compare) { + return AtomicCas(AsAtomicPtr(target), reinterpret_cast(exchange), reinterpret_cast(compare)); +} + +template +inline T* AtomicGetAndCas(T* volatile* target, T* exchange, T* compare) { + return reinterpret_cast(AtomicGetAndCas(AsAtomicPtr(target), reinterpret_cast(exchange), reinterpret_cast(compare))); +} + +template +inline bool AtomicCas(T* volatile* target, T* exchange, TNullPtr) { + return AtomicCas(AsAtomicPtr(target), reinterpret_cast(exchange), 0); +} + +template +inline T* AtomicGetAndCas(T* volatile* target, T* exchange, TNullPtr) { + return reinterpret_cast(AtomicGetAndCas(AsAtomicPtr(target), reinterpret_cast(exchange), 0)); +} + +template +inline bool AtomicCas(T* volatile* target, TNullPtr, T* compare) { + return AtomicCas(AsAtomicPtr(target), 0, reinterpret_cast(compare)); +} + +template +inline T* AtomicGetAndCas(T* volatile* target, TNullPtr, T* compare) { + return 
reinterpret_cast(AtomicGetAndCas(AsAtomicPtr(target), 0, reinterpret_cast(compare))); +} + +template +inline bool AtomicCas(T* volatile* target, TNullPtr, TNullPtr) { + return AtomicCas(AsAtomicPtr(target), 0, 0); +} + +template +inline T* AtomicGetAndCas(T* volatile* target, TNullPtr, TNullPtr) { + return reinterpret_cast(AtomicGetAndCas(AsAtomicPtr(target), 0, 0)); +} diff --git a/contrib/lfalloc/src/util/system/atomic_win.h b/contrib/lfalloc/src/util/system/atomic_win.h new file mode 100644 index 00000000000..1abebd87b38 --- /dev/null +++ b/contrib/lfalloc/src/util/system/atomic_win.h @@ -0,0 +1,114 @@ +#pragma once + +#include + +#define USE_GENERIC_SETGET + +#if defined(_i386_) + +#pragma intrinsic(_InterlockedIncrement) +#pragma intrinsic(_InterlockedDecrement) +#pragma intrinsic(_InterlockedExchangeAdd) +#pragma intrinsic(_InterlockedExchange) +#pragma intrinsic(_InterlockedCompareExchange) + +static inline intptr_t AtomicIncrement(TAtomic& a) { + return _InterlockedIncrement((volatile long*)&a); +} + +static inline intptr_t AtomicGetAndIncrement(TAtomic& a) { + return _InterlockedIncrement((volatile long*)&a) - 1; +} + +static inline intptr_t AtomicDecrement(TAtomic& a) { + return _InterlockedDecrement((volatile long*)&a); +} + +static inline intptr_t AtomicGetAndDecrement(TAtomic& a) { + return _InterlockedDecrement((volatile long*)&a) + 1; +} + +static inline intptr_t AtomicAdd(TAtomic& a, intptr_t b) { + return _InterlockedExchangeAdd((volatile long*)&a, b) + b; +} + +static inline intptr_t AtomicGetAndAdd(TAtomic& a, intptr_t b) { + return _InterlockedExchangeAdd((volatile long*)&a, b); +} + +static inline intptr_t AtomicSwap(TAtomic* a, intptr_t b) { + return _InterlockedExchange((volatile long*)a, b); +} + +static inline bool AtomicCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + return _InterlockedCompareExchange((volatile long*)a, exchange, compare) == compare; +} + +static inline intptr_t AtomicGetAndCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + return _InterlockedCompareExchange((volatile long*)a, exchange, compare); +} + +#else // _x86_64_ + +#pragma intrinsic(_InterlockedIncrement64) +#pragma intrinsic(_InterlockedDecrement64) +#pragma intrinsic(_InterlockedExchangeAdd64) +#pragma intrinsic(_InterlockedExchange64) +#pragma intrinsic(_InterlockedCompareExchange64) + +static inline intptr_t AtomicIncrement(TAtomic& a) { + return _InterlockedIncrement64((volatile __int64*)&a); +} + +static inline intptr_t AtomicGetAndIncrement(TAtomic& a) { + return _InterlockedIncrement64((volatile __int64*)&a) - 1; +} + +static inline intptr_t AtomicDecrement(TAtomic& a) { + return _InterlockedDecrement64((volatile __int64*)&a); +} + +static inline intptr_t AtomicGetAndDecrement(TAtomic& a) { + return _InterlockedDecrement64((volatile __int64*)&a) + 1; +} + +static inline intptr_t AtomicAdd(TAtomic& a, intptr_t b) { + return _InterlockedExchangeAdd64((volatile __int64*)&a, b) + b; +} + +static inline intptr_t AtomicGetAndAdd(TAtomic& a, intptr_t b) { + return _InterlockedExchangeAdd64((volatile __int64*)&a, b); +} + +static inline intptr_t AtomicSwap(TAtomic* a, intptr_t b) { + return _InterlockedExchange64((volatile __int64*)a, b); +} + +static inline bool AtomicCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + return _InterlockedCompareExchange64((volatile __int64*)a, exchange, compare) == compare; +} + +static inline intptr_t AtomicGetAndCas(TAtomic* a, intptr_t exchange, intptr_t compare) { + return _InterlockedCompareExchange64((volatile __int64*)a, 
exchange, compare); +} + +static inline intptr_t AtomicOr(TAtomic& a, intptr_t b) { + return _InterlockedOr64(&a, b) | b; +} + +static inline intptr_t AtomicAnd(TAtomic& a, intptr_t b) { + return _InterlockedAnd64(&a, b) & b; +} + +static inline intptr_t AtomicXor(TAtomic& a, intptr_t b) { + return _InterlockedXor64(&a, b) ^ b; +} + +#endif // _x86_ + +//TODO +static inline void AtomicBarrier() { + TAtomic val = 0; + + AtomicSwap(&val, 0); +} diff --git a/contrib/lfalloc/src/util/system/compiler.h b/contrib/lfalloc/src/util/system/compiler.h new file mode 100644 index 00000000000..b5cec600923 --- /dev/null +++ b/contrib/lfalloc/src/util/system/compiler.h @@ -0,0 +1,617 @@ +#pragma once + +// useful cross-platfrom definitions for compilers + +/** + * @def Y_FUNC_SIGNATURE + * + * Use this macro to get pretty function name (see example). + * + * @code + * void Hi() { + * Cout << Y_FUNC_SIGNATURE << Endl; + * } + + * template + * void Do() { + * Cout << Y_FUNC_SIGNATURE << Endl; + * } + + * int main() { + * Hi(); // void Hi() + * Do(); // void Do() [T = int] + * Do(); // void Do() [T = TString] + * } + * @endcode + */ +#if defined(__GNUC__) +#define Y_FUNC_SIGNATURE __PRETTY_FUNCTION__ +#elif defined(_MSC_VER) +#define Y_FUNC_SIGNATURE __FUNCSIG__ +#else +#define Y_FUNC_SIGNATURE "" +#endif + +#ifdef __GNUC__ +#define Y_PRINTF_FORMAT(n, m) __attribute__((__format__(__printf__, n, m))) +#endif + +#ifndef Y_PRINTF_FORMAT +#define Y_PRINTF_FORMAT(n, m) +#endif + +#if defined(__clang__) +#define Y_NO_SANITIZE(...) __attribute__((no_sanitize(__VA_ARGS__))) +#endif + +#if !defined(Y_NO_SANITIZE) +#define Y_NO_SANITIZE(...) +#endif + +/** + * @def Y_DECLARE_UNUSED + * + * Macro is needed to silence compiler warning about unused entities (e.g. function or argument). + * + * @code + * Y_DECLARE_UNUSED int FunctionUsedSolelyForDebugPurposes(); + * assert(FunctionUsedSolelyForDebugPurposes() == 42); + * + * void Foo(const int argumentUsedOnlyForDebugPurposes Y_DECLARE_UNUSED) { + * assert(argumentUsedOnlyForDebugPurposes == 42); + * // however you may as well omit `Y_DECLARE_UNUSED` and use `UNUSED` macro instead + * Y_UNUSED(argumentUsedOnlyForDebugPurposes); + * } + * @endcode + */ +#ifdef __GNUC__ +#define Y_DECLARE_UNUSED __attribute__((unused)) +#endif + +#ifndef Y_DECLARE_UNUSED +#define Y_DECLARE_UNUSED +#endif + +#if defined(__GNUC__) +#define Y_LIKELY(Cond) __builtin_expect(!!(Cond), 1) +#define Y_UNLIKELY(Cond) __builtin_expect(!!(Cond), 0) +#define Y_PREFETCH_READ(Pointer, Priority) __builtin_prefetch((const void*)(Pointer), 0, Priority) +#define Y_PREFETCH_WRITE(Pointer, Priority) __builtin_prefetch((const void*)(Pointer), 1, Priority) +#endif + +/** + * @def Y_FORCE_INLINE + * + * Macro to use in place of 'inline' in function declaration/definition to force + * it to be inlined. + */ +#if !defined(Y_FORCE_INLINE) +#if defined(CLANG_COVERAGE) +#/* excessive __always_inline__ might significantly slow down compilation of an instrumented unit */ +#define Y_FORCE_INLINE inline +#elif defined(_MSC_VER) +#define Y_FORCE_INLINE __forceinline +#elif defined(__GNUC__) +#/* Clang also defines __GNUC__ (as 4) */ +#define Y_FORCE_INLINE inline __attribute__((__always_inline__)) +#else +#define Y_FORCE_INLINE inline +#endif +#endif + +/** + * @def Y_NO_INLINE + * + * Macro to use in place of 'inline' in function declaration/definition to + * prevent it from being inlined. 
+ */ +#if !defined(Y_NO_INLINE) +#if defined(_MSC_VER) +#define Y_NO_INLINE __declspec(noinline) +#elif defined(__GNUC__) || defined(__INTEL_COMPILER) +#/* Clang also defines __GNUC__ (as 4) */ +#define Y_NO_INLINE __attribute__((__noinline__)) +#else +#define Y_NO_INLINE +#endif +#endif + +//to cheat compiler about strict aliasing or similar problems +#if defined(__GNUC__) +#define Y_FAKE_READ(X) \ + do { \ + __asm__ __volatile__("" \ + : \ + : "m"(X)); \ + } while (0) + +#define Y_FAKE_WRITE(X) \ + do { \ + __asm__ __volatile__("" \ + : "=m"(X)); \ + } while (0) +#endif + +#if !defined(Y_FAKE_READ) +#define Y_FAKE_READ(X) +#endif + +#if !defined(Y_FAKE_WRITE) +#define Y_FAKE_WRITE(X) +#endif + +#ifndef Y_PREFETCH_READ +#define Y_PREFETCH_READ(Pointer, Priority) (void)(const void*)(Pointer), (void)Priority +#endif + +#ifndef Y_PREFETCH_WRITE +#define Y_PREFETCH_WRITE(Pointer, Priority) (void)(const void*)(Pointer), (void)Priority +#endif + +#ifndef Y_LIKELY +#define Y_LIKELY(Cond) (Cond) +#define Y_UNLIKELY(Cond) (Cond) +#endif + +#ifdef __GNUC__ +#define _packed __attribute__((packed)) +#else +#define _packed +#endif + +#if defined(__GNUC__) +#define Y_WARN_UNUSED_RESULT __attribute__((warn_unused_result)) +#endif + +#ifndef Y_WARN_UNUSED_RESULT +#define Y_WARN_UNUSED_RESULT +#endif + +#if defined(__GNUC__) +#define Y_HIDDEN __attribute__((visibility("hidden"))) +#endif + +#if !defined(Y_HIDDEN) +#define Y_HIDDEN +#endif + +#if defined(__GNUC__) +#define Y_PUBLIC __attribute__((visibility("default"))) +#endif + +#if !defined(Y_PUBLIC) +#define Y_PUBLIC +#endif + +#if !defined(Y_UNUSED) && !defined(__cplusplus) +#define Y_UNUSED(var) (void)(var) +#endif +#if !defined(Y_UNUSED) && defined(__cplusplus) +template +constexpr Y_FORCE_INLINE int Y_UNUSED(Types&&...) { + return 0; +}; +#endif + +/** + * @def Y_ASSUME + * + * Macro that tells the compiler that it can generate optimized code + * as if the given expression will always evaluate true. + * The behavior is undefined if it ever evaluates false. + * + * @code + * // factored into a function so that it's testable + * inline int Avg(int x, int y) { + * if (x >= 0 && y >= 0) { + * return (static_cast(x) + static_cast(y)) >> 1; + * } else { + * // a slower implementation + * } + * } + * + * // we know that xs and ys are non-negative from domain knowledge, + * // but we can't change the types of xs and ys because of API constrains + * int Foo(const TVector& xs, const TVector& ys) { + * TVector avgs; + * avgs.resize(xs.size()); + * for (size_t i = 0; i < xs.size(); ++i) { + * auto x = xs[i]; + * auto y = ys[i]; + * Y_ASSUME(x >= 0); + * Y_ASSUME(y >= 0); + * xs[i] = Avg(x, y); + * } + * } + * @endcode + */ +#if defined(__GNUC__) +#define Y_ASSUME(condition) ((condition) ? (void)0 : __builtin_unreachable()) +#elif defined(_MSC_VER) +#define Y_ASSUME(condition) __assume(condition) +#else +#define Y_ASSUME(condition) Y_UNUSED(condition) +#endif + +#ifdef __cplusplus +[[noreturn]] +#endif +Y_HIDDEN void _YandexAbort(); + +/** + * @def Y_UNREACHABLE + * + * Macro that marks the rest of the code branch unreachable. + * The behavior is undefined if it's ever reached. 
+ * + * @code + * switch (i % 3) { + * case 0: + * return foo; + * case 1: + * return bar; + * case 2: + * return baz; + * default: + * Y_UNREACHABLE(); + * } + * @endcode + */ +#if defined(__GNUC__) || defined(_MSC_VER) +#define Y_UNREACHABLE() Y_ASSUME(0) +#else +#define Y_UNREACHABLE() _YandexAbort() +#endif + +#if defined(undefined_sanitizer_enabled) +#define _ubsan_enabled_ +#endif + +#ifdef __clang__ + +#if __has_feature(thread_sanitizer) +#define _tsan_enabled_ +#endif +#if __has_feature(memory_sanitizer) +#define _msan_enabled_ +#endif +#if __has_feature(address_sanitizer) +#define _asan_enabled_ +#endif + +#else + +#if defined(thread_sanitizer_enabled) || defined(__SANITIZE_THREAD__) +#define _tsan_enabled_ +#endif +#if defined(memory_sanitizer_enabled) +#define _msan_enabled_ +#endif +#if defined(address_sanitizer_enabled) || defined(__SANITIZE_ADDRESS__) +#define _asan_enabled_ +#endif + +#endif + +#if defined(_asan_enabled_) || defined(_msan_enabled_) || defined(_tsan_enabled_) || defined(_ubsan_enabled_) +#define _san_enabled_ +#endif + +#if defined(_MSC_VER) +#define __PRETTY_FUNCTION__ __FUNCSIG__ +#endif + +#if defined(__GNUC__) +#define Y_WEAK __attribute__((weak)) +#else +#define Y_WEAK +#endif + +#if defined(__CUDACC_VER_MAJOR__) +#define Y_CUDA_AT_LEAST(x, y) (__CUDACC_VER_MAJOR__ > x || (__CUDACC_VER_MAJOR__ == x && __CUDACC_VER_MINOR__ >= y)) +#else +#define Y_CUDA_AT_LEAST(x, y) 0 +#endif + +// NVidia CUDA C++ Compiler did not know about noexcept keyword until version 9.0 +#if !Y_CUDA_AT_LEAST(9, 0) +#if defined(__CUDACC__) && !defined(noexcept) +#define noexcept throw () +#endif +#endif + +#if defined(__GNUC__) +#define Y_COLD __attribute__((cold)) +#define Y_LEAF __attribute__((leaf)) +#define Y_WRAPPER __attribute__((artificial)) +#else +#define Y_COLD +#define Y_LEAF +#define Y_WRAPPER +#endif + +/** + * @def Y_PRAGMA + * + * Macro for use in other macros to define compiler pragma + * See below for other usage examples + * + * @code + * #if defined(__clang__) || defined(__GNUC__) + * #define Y_PRAGMA_NO_WSHADOW \ + * Y_PRAGMA("GCC diagnostic ignored \"-Wshadow\"") + * #elif defined(_MSC_VER) + * #define Y_PRAGMA_NO_WSHADOW \ + * Y_PRAGMA("warning(disable:4456 4457") + * #else + * #define Y_PRAGMA_NO_WSHADOW + * #endif + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA(x) _Pragma(x) +#elif defined(_MSC_VER) +#define Y_PRAGMA(x) __pragma(x) +#else +#define Y_PRAGMA(x) +#endif + +/** + * @def Y_PRAGMA_DIAGNOSTIC_PUSH + * + * Cross-compiler pragma to save diagnostic settings + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Diagnostic-Pragmas.html + * MSVC: https://msdn.microsoft.com/en-us/library/2c8f766e.aspx + * Clang: https://clang.llvm.org/docs/UsersManual.html#controlling-diagnostics-via-pragmas + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_DIAGNOSTIC_PUSH \ + Y_PRAGMA("GCC diagnostic push") +#elif defined(_MSC_VER) +#define Y_PRAGMA_DIAGNOSTIC_PUSH \ + Y_PRAGMA(warning(push)) +#else +#define Y_PRAGMA_DIAGNOSTIC_PUSH +#endif + +/** + * @def Y_PRAGMA_DIAGNOSTIC_POP + * + * Cross-compiler pragma to restore diagnostic settings + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Diagnostic-Pragmas.html + * MSVC: https://msdn.microsoft.com/en-us/library/2c8f766e.aspx + * Clang: https://clang.llvm.org/docs/UsersManual.html#controlling-diagnostics-via-pragmas + * + * @code + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || 
defined(__GNUC__) +#define Y_PRAGMA_DIAGNOSTIC_POP \ + Y_PRAGMA("GCC diagnostic pop") +#elif defined(_MSC_VER) +#define Y_PRAGMA_DIAGNOSTIC_POP \ + Y_PRAGMA(warning(pop)) +#else +#define Y_PRAGMA_DIAGNOSTIC_POP +#endif + +/** + * @def Y_PRAGMA_NO_WSHADOW + * + * Cross-compiler pragma to disable warnings about shadowing variables + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * Y_PRAGMA_NO_WSHADOW + * + * // some code which use variable shadowing, e.g.: + * + * for (int i = 0; i < 100; ++i) { + * Use(i); + * + * for (int i = 42; i < 100500; ++i) { // this i is shadowing previous i + * AnotherUse(i); + * } + * } + * + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_NO_WSHADOW \ + Y_PRAGMA("GCC diagnostic ignored \"-Wshadow\"") +#elif defined(_MSC_VER) +#define Y_PRAGMA_NO_WSHADOW \ + Y_PRAGMA(warning(disable : 4456 4457)) +#else +#define Y_PRAGMA_NO_WSHADOW +#endif + +/** + * @ def Y_PRAGMA_NO_UNUSED_FUNCTION + * + * Cross-compiler pragma to disable warnings about unused functions + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html + * Clang: https://clang.llvm.org/docs/DiagnosticsReference.html#wunused-function + * MSVC: there is no such warning + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * Y_PRAGMA_NO_UNUSED_FUNCTION + * + * // some code which introduces a function which later will not be used, e.g.: + * + * void Foo() { + * } + * + * int main() { + * return 0; // Foo() never called + * } + * + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_NO_UNUSED_FUNCTION \ + Y_PRAGMA("GCC diagnostic ignored \"-Wunused-function\"") +#else +#define Y_PRAGMA_NO_UNUSED_FUNCTION +#endif + +/** + * @ def Y_PRAGMA_NO_UNUSED_PARAMETER + * + * Cross-compiler pragma to disable warnings about unused function parameters + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html + * Clang: https://clang.llvm.org/docs/DiagnosticsReference.html#wunused-parameter + * MSVC: https://msdn.microsoft.com/en-us/library/26kb9fy0.aspx + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * Y_PRAGMA_NO_UNUSED_PARAMETER + * + * // some code which introduces a function with unused parameter, e.g.: + * + * void foo(int a) { + * // a is not referenced + * } + * + * int main() { + * foo(1); + * return 0; + * } + * + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_NO_UNUSED_PARAMETER \ + Y_PRAGMA("GCC diagnostic ignored \"-Wunused-parameter\"") +#elif defined(_MSC_VER) +#define Y_PRAGMA_NO_UNUSED_PARAMETER \ + Y_PRAGMA(warning(disable : 4100)) +#else +#define Y_PRAGMA_NO_UNUSED_PARAMETER +#endif + +/** + * @def Y_PRAGMA_NO_DEPRECATED + * + * Cross compiler pragma to disable warnings and errors about deprecated + * + * @see + * GCC: https://gcc.gnu.org/onlinedocs/gcc/Warning-Options.html + * Clang: https://clang.llvm.org/docs/DiagnosticsReference.html#wdeprecated + * MSVC: https://docs.microsoft.com/en-us/cpp/error-messages/compiler-warnings/compiler-warning-level-3-c4996?view=vs-2017 + * + * @code + * Y_PRAGMA_DIAGNOSTIC_PUSH + * Y_PRAGMA_NO_DEPRECATED + * + * [deprecated] void foo() { + * // ... 
+ * } + * + * int main() { + * foo(); + * return 0; + * } + * + * Y_PRAGMA_DIAGNOSTIC_POP + * @endcode + */ +#if defined(__clang__) || defined(__GNUC__) +#define Y_PRAGMA_NO_DEPRECATED \ + Y_PRAGMA("GCC diagnostic ignored \"-Wdeprecated\"") +#elif defined(_MSC_VER) +#define Y_PRAGMA_NO_DEPRECATED \ + Y_PRAGMA(warning(disable : 4996)) +#else +#define Y_PRAGMA_NO_DEPRECATED +#endif + +#if defined(__clang__) || defined(__GNUC__) +/** + * @def Y_CONST_FUNCTION + methods and functions, marked with this method are promised to: + 1. do not have side effects + 2. this method do not read global memory + NOTE: this attribute can't be set for methods that depend on data, pointed by this + this allow compilers to do hard optimization of that functions + NOTE: in common case this attribute can't be set if method have pointer-arguments + NOTE: as result there no any reason to discard result of such method +*/ +#define Y_CONST_FUNCTION [[gnu::const]] +#endif + +#if !defined(Y_CONST_FUNCTION) +#define Y_CONST_FUNCTION +#endif + +#if defined(__clang__) || defined(__GNUC__) +/** + * @def Y_PURE_FUNCTION + methods and functions, marked with this method are promised to: + 1. do not have side effects + 2. result will be the same if no global memory changed + this allow compilers to do hard optimization of that functions + NOTE: as result there no any reason to discard result of such method +*/ +#define Y_PURE_FUNCTION [[gnu::pure]] +#endif + +#if !defined(Y_PURE_FUNCTION) +#define Y_PURE_FUNCTION +#endif + +/** + * @ def Y_HAVE_INT128 + * + * Defined when the compiler supports __int128 extension + * + * @code + * + * #if defined(Y_HAVE_INT128) + * __int128 myVeryBigInt = 12345678901234567890; + * #endif + * + * @endcode + */ +#if defined(__SIZEOF_INT128__) +#define Y_HAVE_INT128 1 +#endif + +/** + * XRAY macro must be passed to compiler if XRay is enabled. + * + * Define everything XRay-specific as a macro so that it doesn't cause errors + * for compilers that doesn't support XRay. 
+ */ +#if defined(XRAY) && defined(__cplusplus) +#include +#define Y_XRAY_ALWAYS_INSTRUMENT [[clang::xray_always_instrument]] +#define Y_XRAY_NEVER_INSTRUMENT [[clang::xray_never_instrument]] +#define Y_XRAY_CUSTOM_EVENT(__string, __length) \ + do { \ + __xray_customevent(__string, __length); \ + } while (0) +#else +#define Y_XRAY_ALWAYS_INSTRUMENT +#define Y_XRAY_NEVER_INSTRUMENT +#define Y_XRAY_CUSTOM_EVENT(__string, __length) \ + do { \ + } while (0) +#endif diff --git a/contrib/lfalloc/src/util/system/defaults.h b/contrib/lfalloc/src/util/system/defaults.h new file mode 100644 index 00000000000..19196a28b2b --- /dev/null +++ b/contrib/lfalloc/src/util/system/defaults.h @@ -0,0 +1,168 @@ +#pragma once + +#include "platform.h" + +#if defined _unix_ +#define LOCSLASH_C '/' +#define LOCSLASH_S "/" +#else +#define LOCSLASH_C '\\' +#define LOCSLASH_S "\\" +#endif // _unix_ + +#if defined(__INTEL_COMPILER) && defined(__cplusplus) +#include +#endif + +// low and high parts of integers +#if !defined(_win_) +#include +#endif + +#if defined(BSD) || defined(_android_) + +#if defined(BSD) +#include +#endif + +#if defined(_android_) +#include +#endif + +#if (BYTE_ORDER == LITTLE_ENDIAN) +#define _little_endian_ +#elif (BYTE_ORDER == BIG_ENDIAN) +#define _big_endian_ +#else +#error unknown endian not supported +#endif + +#elif (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(WHATEVER_THAT_HAS_BIG_ENDIAN) +#define _big_endian_ +#else +#define _little_endian_ +#endif + +// alignment +#if (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(__alpha__) || defined(__ia64__) || defined(WHATEVER_THAT_NEEDS_ALIGNING_QUADS) +#define _must_align8_ +#endif + +#if (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(__alpha__) || defined(__ia64__) || defined(WHATEVER_THAT_NEEDS_ALIGNING_LONGS) +#define _must_align4_ +#endif + +#if (defined(_sun_) && !defined(__i386__)) || defined(_hpux_) || defined(__alpha__) || defined(__ia64__) || defined(WHATEVER_THAT_NEEDS_ALIGNING_SHORTS) +#define _must_align2_ +#endif + +#if defined(__GNUC__) +#define alias_hack __attribute__((__may_alias__)) +#endif + +#ifndef alias_hack +#define alias_hack +#endif + +#include "types.h" + +#if defined(__STDC_VERSION__) && (__STDC_VERSION__ >= 199901L) +#define PRAGMA(x) _Pragma(#x) +#define RCSID(idstr) PRAGMA(comment(exestr, idstr)) +#else +#define RCSID(idstr) static const char rcsid[] = idstr +#endif + +#include "compiler.h" + +#ifdef _win_ +#include +#elif defined(_sun_) +#include +#endif + +#ifdef NDEBUG +#define Y_IF_DEBUG(X) +#else +#define Y_IF_DEBUG(X) X +#endif + +/** + * @def Y_ARRAY_SIZE + * + * This macro is needed to get number of elements in a statically allocated fixed size array. The + * expression is a compile-time constant and therefore can be used in compile time computations. + * + * @code + * enum ENumbers { + * EN_ONE, + * EN_TWO, + * EN_SIZE + * } + * + * const char* NAMES[] = { + * "one", + * "two" + * } + * + * static_assert(Y_ARRAY_SIZE(NAMES) == EN_SIZE, "you should define `NAME` for each enumeration"); + * @endcode + * + * This macro also catches type errors. If you see a compiler error like "warning: division by zero + * is undefined" when using `Y_ARRAY_SIZE` then you are probably giving it a pointer. + * + * Since all of our code is expected to work on a 64 bit platform where pointers are 8 bytes we may + * falsefully accept pointers to types of sizes that are divisors of 8 (1, 2, 4 and 8). 
+ */ +#if defined(__cplusplus) +namespace NArraySizePrivate { + template + struct TArraySize; + + template + struct TArraySize { + enum { + Result = N + }; + }; + + template + struct TArraySize { + enum { + Result = N + }; + }; +} + +#define Y_ARRAY_SIZE(arr) ((size_t)::NArraySizePrivate::TArraySize::Result) +#else +#undef Y_ARRAY_SIZE +#define Y_ARRAY_SIZE(arr) \ + ((sizeof(arr) / sizeof((arr)[0])) / static_cast(!(sizeof(arr) % sizeof((arr)[0])))) +#endif + +#undef Y_ARRAY_BEGIN +#define Y_ARRAY_BEGIN(arr) (arr) + +#undef Y_ARRAY_END +#define Y_ARRAY_END(arr) ((arr) + Y_ARRAY_SIZE(arr)) + +/** + * Concatenates two symbols, even if one of them is itself a macro. + */ +#define Y_CAT(X, Y) Y_CAT_I(X, Y) +#define Y_CAT_I(X, Y) Y_CAT_II(X, Y) +#define Y_CAT_II(X, Y) X##Y + +#define Y_STRINGIZE(X) UTIL_PRIVATE_STRINGIZE_AUX(X) +#define UTIL_PRIVATE_STRINGIZE_AUX(X) #X + +#if defined(__COUNTER__) +#define Y_GENERATE_UNIQUE_ID(N) Y_CAT(N, __COUNTER__) +#endif + +#if !defined(Y_GENERATE_UNIQUE_ID) +#define Y_GENERATE_UNIQUE_ID(N) Y_CAT(N, __LINE__) +#endif + +#define NPOS ((size_t)-1) diff --git a/contrib/lfalloc/src/util/system/platform.h b/contrib/lfalloc/src/util/system/platform.h new file mode 100644 index 00000000000..0687f239a2e --- /dev/null +++ b/contrib/lfalloc/src/util/system/platform.h @@ -0,0 +1,242 @@ +#pragma once + +// What OS ? +// our definition has the form _{osname}_ + +#if defined(_WIN64) +#define _win64_ +#define _win32_ +#elif defined(__WIN32__) || defined(_WIN32) // _WIN32 is also defined by the 64-bit compiler for backward compatibility +#define _win32_ +#else +#define _unix_ +#if defined(__sun__) || defined(sun) || defined(sparc) || defined(__sparc) +#define _sun_ +#endif +#if defined(__hpux__) +#define _hpux_ +#endif +#if defined(__linux__) +#define _linux_ +#endif +#if defined(__FreeBSD__) +#define _freebsd_ +#endif +#if defined(__CYGWIN__) +#define _cygwin_ +#endif +#if defined(__APPLE__) +#define _darwin_ +#endif +#if defined(__ANDROID__) +#define _android_ +#endif +#endif + +#if defined(__IOS__) +#define _ios_ +#endif + +#if defined(_linux_) +#if defined(_musl_) +//nothing to do +#elif defined(_android_) +#define _bionic_ +#else +#define _glibc_ +#endif +#endif + +#if defined(_darwin_) +#define unix +#define __unix__ +#endif + +#if defined(_win32_) || defined(_win64_) +#define _win_ +#endif + +#if defined(__arm__) || defined(__ARM__) || defined(__ARM_NEON) || defined(__aarch64__) || defined(_M_ARM) +#if defined(__arm64) || defined(__arm64__) || defined(__aarch64__) +#define _arm64_ +#else +#define _arm32_ +#endif +#endif + +#if defined(_arm64_) || defined(_arm32_) +#define _arm_ +#endif + +/* __ia64__ and __x86_64__ - defined by GNU C. + * _M_IA64, _M_X64, _M_AMD64 - defined by Visual Studio. + * + * Microsoft can define _M_IX86, _M_AMD64 (before Visual Studio 8) + * or _M_X64 (starting in Visual Studio 8). 
+ */ +#if defined(__x86_64__) || defined(_M_X64) || defined(_M_AMD64) +#define _x86_64_ +#endif + +#if defined(__i386__) || defined(_M_IX86) +#define _i386_ +#endif + +#if defined(__ia64__) || defined(_M_IA64) +#define _ia64_ +#endif + +#if defined(__powerpc__) +#define _ppc_ +#endif + +#if defined(__powerpc64__) +#define _ppc64_ +#endif + +#if !defined(sparc) && !defined(__sparc) && !defined(__hpux__) && !defined(__alpha__) && !defined(_ia64_) && !defined(_x86_64_) && !defined(_arm_) && !defined(_i386_) && !defined(_ppc_) && !defined(_ppc64_) +#error "platform not defined, please, define one" +#endif + +#if defined(_x86_64_) || defined(_i386_) +#define _x86_ +#endif + +#if defined(__MIC__) +#define _mic_ +#define _k1om_ +#endif + +// stdio or MessageBox +#if defined(__CONSOLE__) || defined(_CONSOLE) +#define _console_ +#endif +#if (defined(_win_) && !defined(_console_)) +#define _windows_ +#elif !defined(_console_) +#define _console_ +#endif + +#if defined(__SSE__) || defined(SSE_ENABLED) +#define _sse_ +#endif + +#if defined(__SSE2__) || defined(SSE2_ENABLED) +#define _sse2_ +#endif + +#if defined(__SSE3__) || defined(SSE3_ENABLED) +#define _sse3_ +#endif + +#if defined(__SSSE3__) || defined(SSSE3_ENABLED) +#define _ssse3_ +#endif + +#if defined(POPCNT_ENABLED) +#define _popcnt_ +#endif + +#if defined(__DLL__) || defined(_DLL) +#define _dll_ +#endif + +// 16, 32 or 64 +#if defined(__sparc_v9__) || defined(_x86_64_) || defined(_ia64_) || defined(_arm64_) || defined(_ppc64_) +#define _64_ +#else +#define _32_ +#endif + +/* All modern 64-bit Unix systems use scheme LP64 (long, pointers are 64-bit). + * Microsoft uses a different scheme: LLP64 (long long, pointers are 64-bit). + * + * Scheme LP64 LLP64 + * char 8 8 + * short 16 16 + * int 32 32 + * long 64 32 + * long long 64 64 + * pointer 64 64 + */ + +#if defined(_32_) +#define SIZEOF_PTR 4 +#elif defined(_64_) +#define SIZEOF_PTR 8 +#endif + +#define PLATFORM_DATA_ALIGN SIZEOF_PTR + +#if !defined(SIZEOF_PTR) +#error todo +#endif + +#define SIZEOF_CHAR 1 +#define SIZEOF_UNSIGNED_CHAR 1 +#define SIZEOF_SHORT 2 +#define SIZEOF_UNSIGNED_SHORT 2 +#define SIZEOF_INT 4 +#define SIZEOF_UNSIGNED_INT 4 + +#if defined(_32_) +#define SIZEOF_LONG 4 +#define SIZEOF_UNSIGNED_LONG 4 +#elif defined(_64_) +#if defined(_win_) +#define SIZEOF_LONG 4 +#define SIZEOF_UNSIGNED_LONG 4 +#else +#define SIZEOF_LONG 8 +#define SIZEOF_UNSIGNED_LONG 8 +#endif // _win_ +#endif // _32_ + +#if !defined(SIZEOF_LONG) +#error todo +#endif + +#define SIZEOF_LONG_LONG 8 +#define SIZEOF_UNSIGNED_LONG_LONG 8 + +#undef SIZEOF_SIZE_T // in case we include which defines it, too +#define SIZEOF_SIZE_T SIZEOF_PTR + +#if defined(__INTEL_COMPILER) +#pragma warning(disable 1292) +#pragma warning(disable 1469) +#pragma warning(disable 193) +#pragma warning(disable 271) +#pragma warning(disable 383) +#pragma warning(disable 424) +#pragma warning(disable 444) +#pragma warning(disable 584) +#pragma warning(disable 593) +#pragma warning(disable 981) +#pragma warning(disable 1418) +#pragma warning(disable 304) +#pragma warning(disable 810) +#pragma warning(disable 1029) +#pragma warning(disable 1419) +#pragma warning(disable 177) +#pragma warning(disable 522) +#pragma warning(disable 858) +#pragma warning(disable 111) +#pragma warning(disable 1599) +#pragma warning(disable 411) +#pragma warning(disable 304) +#pragma warning(disable 858) +#pragma warning(disable 444) +#pragma warning(disable 913) +#pragma warning(disable 310) +#pragma warning(disable 167) +#pragma warning(disable 180) 
+#pragma warning(disable 1572) +#endif + +#if defined(_MSC_VER) +#undef _WINSOCKAPI_ +#define _WINSOCKAPI_ +#undef NOMINMAX +#define NOMINMAX +#endif diff --git a/contrib/lfalloc/src/util/system/types.h b/contrib/lfalloc/src/util/system/types.h new file mode 100644 index 00000000000..af4f0adb13d --- /dev/null +++ b/contrib/lfalloc/src/util/system/types.h @@ -0,0 +1,117 @@ +#pragma once + +// DO_NOT_STYLE + +#include "platform.h" + +#include + +typedef int8_t i8; +typedef int16_t i16; +typedef uint8_t ui8; +typedef uint16_t ui16; + +typedef int yssize_t; +#define PRIYSZT "d" + +#if defined(_darwin_) && defined(_32_) +typedef unsigned long ui32; +typedef long i32; +#else +typedef uint32_t ui32; +typedef int32_t i32; +#endif + +#if defined(_darwin_) && defined(_64_) +typedef unsigned long ui64; +typedef long i64; +#else +typedef uint64_t ui64; +typedef int64_t i64; +#endif + +#define LL(number) INT64_C(number) +#define ULL(number) UINT64_C(number) + +// Macro for size_t and ptrdiff_t types +#if defined(_32_) +# if defined(_darwin_) +# define PRISZT "lu" +# undef PRIi32 +# define PRIi32 "li" +# undef SCNi32 +# define SCNi32 "li" +# undef PRId32 +# define PRId32 "li" +# undef SCNd32 +# define SCNd32 "li" +# undef PRIu32 +# define PRIu32 "lu" +# undef SCNu32 +# define SCNu32 "lu" +# undef PRIx32 +# define PRIx32 "lx" +# undef SCNx32 +# define SCNx32 "lx" +# elif !defined(_cygwin_) +# define PRISZT PRIu32 +# else +# define PRISZT "u" +# endif +# define SCNSZT SCNu32 +# define PRIPDT PRIi32 +# define SCNPDT SCNi32 +# define PRITMT PRIi32 +# define SCNTMT SCNi32 +#elif defined(_64_) +# if defined(_darwin_) +# define PRISZT "lu" +# undef PRIu64 +# define PRIu64 PRISZT +# undef PRIx64 +# define PRIx64 "lx" +# undef PRIX64 +# define PRIX64 "lX" +# undef PRId64 +# define PRId64 "ld" +# undef PRIi64 +# define PRIi64 "li" +# undef SCNi64 +# define SCNi64 "li" +# undef SCNu64 +# define SCNu64 "lu" +# undef SCNx64 +# define SCNx64 "lx" +# else +# define PRISZT PRIu64 +# endif +# define SCNSZT SCNu64 +# define PRIPDT PRIi64 +# define SCNPDT SCNi64 +# define PRITMT PRIi64 +# define SCNTMT SCNi64 +#else +# error "Unsupported platform" +#endif + +// SUPERLONG +#if !defined(DONT_USE_SUPERLONG) && !defined(SUPERLONG_MAX) +#define SUPERLONG_MAX ~LL(0) +typedef i64 SUPERLONG; +#endif + +// UNICODE +// UCS-2, native byteorder +typedef ui16 wchar16; +// internal symbol type: UTF-16LE +typedef wchar16 TChar; +typedef ui32 wchar32; + +#if defined(_MSC_VER) +#include +typedef SSIZE_T ssize_t; +#define HAVE_SSIZE_T 1 +#include +#endif + +#include diff --git a/dbms/CMakeLists.txt b/dbms/CMakeLists.txt index 1306039e9c3..5c1d73b7a74 100644 --- a/dbms/CMakeLists.txt +++ b/dbms/CMakeLists.txt @@ -155,7 +155,6 @@ if (USE_EMBEDDED_COMPILER) target_include_directories (dbms SYSTEM BEFORE PUBLIC ${LLVM_INCLUDE_DIRS}) endif () - if (CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO" OR CMAKE_BUILD_TYPE_UC STREQUAL "MINSIZEREL") # Won't generate debug info for files with heavy template instantiation to achieve faster linking and lower size. 
set_source_files_properties( @@ -214,6 +213,10 @@ target_link_libraries (clickhouse_common_io target_include_directories(clickhouse_common_io SYSTEM BEFORE PUBLIC ${RE2_INCLUDE_DIR}) +if (USE_LFALLOC) + target_include_directories (clickhouse_common_io SYSTEM BEFORE PUBLIC ${LFALLOC_INCLUDE_DIR}) +endif () + if(CPUID_LIBRARY) target_link_libraries(clickhouse_common_io PRIVATE ${CPUID_LIBRARY}) endif() diff --git a/dbms/src/Common/LFAllocator.cpp b/dbms/src/Common/LFAllocator.cpp new file mode 100644 index 00000000000..71396d341ab --- /dev/null +++ b/dbms/src/Common/LFAllocator.cpp @@ -0,0 +1,53 @@ +#include + +#if USE_LFALLOC +#include "LFAllocator.h" + +#include +#include + +namespace DB +{ + +void * LFAllocator::alloc(size_t size, size_t alignment) +{ + if (alignment == 0) + return LFAlloc(size); + else + { + void * ptr; + int res = LFPosixMemalign(&ptr, alignment, size); + return res ? nullptr : ptr; + } +} + +void LFAllocator::free(void * buf, size_t) +{ + LFFree(buf); +} + +void * LFAllocator::realloc(void * old_ptr, size_t, size_t new_size, size_t alignment) +{ + if (old_ptr == nullptr) + { + void * result = LFAllocator::alloc(new_size, alignment); + return result; + } + if (new_size == 0) + { + LFFree(old_ptr); + return nullptr; + } + + void * new_ptr = LFAllocator::alloc(new_size, alignment); + if (new_ptr == nullptr) + return nullptr; + size_t old_size = LFGetSize(old_ptr); + memcpy(new_ptr, old_ptr, ((old_size < new_size) ? old_size : new_size)); + LFFree(old_ptr); + return new_ptr; +} + +} + +#endif diff --git a/dbms/src/Common/LFAllocator.h b/dbms/src/Common/LFAllocator.h new file mode 100644 index 00000000000..f2a10cc4508 --- /dev/null +++ b/dbms/src/Common/LFAllocator.h @@ -0,0 +1,22 @@ +#pragma once + +#include + +#if !USE_LFALLOC +#error "do not include this file until USE_LFALLOC is set to 1" +#endif + +#include + +namespace DB +{ +struct LFAllocator +{ + static void * alloc(size_t size, size_t alignment = 0); + + static void free(void * buf, size_t); + + static void * realloc(void * buf, size_t, size_t new_size, size_t alignment = 0); +}; + +} diff --git a/dbms/src/Common/config.h.in b/dbms/src/Common/config.h.in index c323afe369e..d6fc6d146f0 100644 --- a/dbms/src/Common/config.h.in +++ b/dbms/src/Common/config.h.in @@ -25,6 +25,8 @@ #cmakedefine01 USE_BROTLI #cmakedefine01 USE_SSL #cmakedefine01 USE_HYPERSCAN +#cmakedefine01 USE_LFALLOC +#cmakedefine01 USE_LFALLOC_RANDOM_HINT #cmakedefine01 CLICKHOUSE_SPLIT_BINARY #cmakedefine01 LLVM_HAS_RTTI diff --git a/dbms/src/DataStreams/MarkInCompressedFile.h b/dbms/src/DataStreams/MarkInCompressedFile.h index 3a1d9aa0f19..b2219f4d55f 100644 --- a/dbms/src/DataStreams/MarkInCompressedFile.h +++ b/dbms/src/DataStreams/MarkInCompressedFile.h @@ -6,6 +6,10 @@ #include #include +#include +#if USE_LFALLOC +#include +#endif namespace DB { @@ -33,7 +37,9 @@ struct MarkInCompressedFile return "(" + DB::toString(offset_in_compressed_file) + "," + DB::toString(offset_in_decompressed_block) + ")"; } }; - +#if USE_LFALLOC +using MarksInCompressedFile = PODArray; +#else using MarksInCompressedFile = PODArray; - +#endif } diff --git a/dbms/src/IO/UncompressedCache.h b/dbms/src/IO/UncompressedCache.h index 86f1530e5b3..2347c6d7a28 100644 --- a/dbms/src/IO/UncompressedCache.h +++ b/dbms/src/IO/UncompressedCache.h @@ -6,6 +6,11 @@ #include #include +#include +#if USE_LFALLOC +#include +#endif + namespace ProfileEvents { @@ -20,7 +25,11 @@ namespace DB struct UncompressedCacheCell { +#if USE_LFALLOC + Memory data; +#else Memory<> data; +#endif 
size_t compressed_size; UInt32 additional_bytes; }; diff --git a/dbms/src/Interpreters/Compiler.cpp b/dbms/src/Interpreters/Compiler.cpp index 83fbf2918dc..abdb0969121 100644 --- a/dbms/src/Interpreters/Compiler.cpp +++ b/dbms/src/Interpreters/Compiler.cpp @@ -2,6 +2,7 @@ #include #include #include +#include #include #include #include @@ -261,6 +262,9 @@ void Compiler::compile( " -I " << compiler_headers << "/dbms/src/" " -isystem " << compiler_headers << "/contrib/cityhash102/include/" " -isystem " << compiler_headers << "/contrib/libpcg-random/include/" + #if USE_LFALLOC + " -isystem " << compiler_headers << "/contrib/lfalloc/src/" + #endif " -isystem " << compiler_headers << INTERNAL_DOUBLE_CONVERSION_INCLUDE_DIR " -isystem " << compiler_headers << INTERNAL_Poco_Foundation_INCLUDE_DIR " -isystem " << compiler_headers << INTERNAL_Boost_INCLUDE_DIRS diff --git a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in index 97358ac02c9..6d6d5f32e0c 100644 --- a/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in +++ b/dbms/src/Storages/System/StorageSystemBuildOptions.generated.cpp.in @@ -57,6 +57,8 @@ const char * auto_config_build[] "USE_BROTLI", "@USE_BROTLI@", "USE_SSL", "@USE_SSL@", "USE_HYPERSCAN", "@USE_HYPERSCAN@", + "USE_LFALLOC", "@USE_LFALLOC@", + "USE_LFALLOC_RANDOM_HINT", "@USE_LFALLOC_RANDOM_HINT@", nullptr, nullptr }; diff --git a/libs/libcommon/CMakeLists.txt b/libs/libcommon/CMakeLists.txt index c0be7e218e1..bb6fbf79946 100644 --- a/libs/libcommon/CMakeLists.txt +++ b/libs/libcommon/CMakeLists.txt @@ -61,7 +61,6 @@ add_library (common ${LINK_MODE} if (USE_JEMALLOC) message (STATUS "Link jemalloc: ${JEMALLOC_LIBRARIES}") set (MALLOC_LIBRARIES ${JEMALLOC_LIBRARIES}) - elseif (USE_TCMALLOC) if (DEBUG_TCMALLOC AND NOT GPERFTOOLS_TCMALLOC_MINIMAL_DEBUG) message (FATAL_ERROR "Requested DEBUG_TCMALLOC but debug library is not found. You should install Google Perftools. Example: sudo apt-get install libgoogle-perftools-dev")
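To close out: a hedged usage sketch, not part of the patch, showing how the new DB::LFAllocator shim from dbms/src/Common/LFAllocator.h is meant to be used. It mirrors the static alloc/realloc/free interface that ClickHouse containers such as PODArray and Memory expect from their allocator template parameter, which is what the MarkInCompressedFile and UncompressedCache changes above rely on; the function name and sizes below are illustrative only.

```
#include <cstring>

#include <Common/config.h>
#if USE_LFALLOC
#include <Common/LFAllocator.h>

void lfallocator_example()
{
    /// Plain allocation, same shape as Allocator's static interface.
    void * buf = DB::LFAllocator::alloc(1024);
    memset(buf, 0, 1024);

    /// Non-zero alignment is routed through LFPosixMemalign; per the shim above,
    /// alignments larger than 4096 are not supported.
    void * aligned = DB::LFAllocator::alloc(1024, 64);

    /// realloc ignores the passed old size and asks lfalloc (LFGetSize) instead,
    /// copying min(old, new) bytes into a freshly allocated block.
    buf = DB::LFAllocator::realloc(buf, 1024, 4096);

    DB::LFAllocator::free(buf, 4096);
    DB::LFAllocator::free(aligned, 1024);
}
#endif
```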