Introduced ThreadStatus. [#CLICKHOUSE-2910]

2024-11-10 01:25:21 +00:00 · 2018-02-01 20:55:08 +03:00 · 2018-02-01 20:55:08 +03:00 · efdda9cc9b
commit efdda9cc9b
parent 1b3573f61f
33 changed files with 705 additions and 221 deletions
--- a/dbms/src/AggregateFunctions/AggregateFunctionCount.h
+++ b/dbms/src/AggregateFunctions/AggregateFunctionCount.h
@ -42,6 +42,9 @@ public:

    void merge(AggregateDataPtr place, ConstAggregateDataPtr rhs, Arena *) const override
    {
+        if (!place || !rhs)
+            throw Exception("nullptr", ErrorCodes::LOGICAL_ERROR);
+
        data(place).count += data(rhs).count;
    }

--- a/dbms/src/Common/ActionBlocker.h
+++ b/dbms/src/Common/ActionBlocker.h
@ -7,10 +7,12 @@ namespace DB

 /// An atomic variable that is used to block and interrupt certain actions
 /// If it is not zero then actions related with it should be considered as interrupted
-class ActionBlocker
+template <typename TCounter=std::atomic<int>>
+class ActionBlockerImpl
 {
 private:
-    mutable std::atomic<int> counter{0};
+    using ActionBlocker = ActionBlockerImpl<TCounter>;
+    mutable TCounter counter{0};

 public:
    bool isCancelled() const { return counter > 0; }
@ -60,4 +62,7 @@ public:
    };
 };

+using ActionBlocker = ActionBlockerImpl<std::atomic<int>>;
+using ActionBlockerSingleThread = ActionBlockerImpl<int>;
+
 }
--- a/dbms/src/Common/MemoryTracker.cpp
+++ b/dbms/src/Common/MemoryTracker.cpp
@ -3,9 +3,7 @@
 #include <Common/Exception.h>
 #include <Common/formatReadable.h>
 #include <IO/WriteHelpers.h>
-#include <iomanip>
-
-#include <Common/MemoryTracker.h>
+#include <Common/ThreadStatus.h>


 namespace DB
@ -56,13 +54,16 @@ void MemoryTracker::logPeakMemoryUsage() const

 void MemoryTracker::alloc(Int64 size)
 {
+    if (blocker.isCancelled())
+        return;
+
    /** Using memory_order_relaxed means that if allocations are done simultaneously,
      *  we allow exception about memory limit exceeded to be thrown only on next allocation.
      * So, we allow over-allocations.
      */
    Int64 will_be = size + amount.fetch_add(size, std::memory_order_relaxed);

-    if (!next.load(std::memory_order_relaxed))
+    if (!parent.load(std::memory_order_relaxed))
        CurrentMetrics::add(metric, size);

    Int64 current_limit = limit.load(std::memory_order_relaxed);
@ -102,13 +103,16 @@ void MemoryTracker::alloc(Int64 size)
    if (will_be > peak.load(std::memory_order_relaxed))        /// Races doesn't matter. Could rewrite with CAS, but not worth.
        peak.store(will_be, std::memory_order_relaxed);

-    if (auto loaded_next = next.load(std::memory_order_relaxed))
+    if (auto loaded_next = parent.load(std::memory_order_relaxed))
        loaded_next->alloc(size);
 }


 void MemoryTracker::free(Int64 size)
 {
+    if (blocker.isCancelled())
+        return;
+
    Int64 new_amount = amount.fetch_sub(size, std::memory_order_relaxed) - size;

    /** Sometimes, query could free some data, that was allocated outside of query context.
@ -123,7 +127,7 @@ void MemoryTracker::free(Int64 size)
        size += new_amount;
    }

-    if (auto loaded_next = next.load(std::memory_order_relaxed))
+    if (auto loaded_next = parent.load(std::memory_order_relaxed))
        loaded_next->free(size);
    else
        CurrentMetrics::sub(metric, size);
@ -132,7 +136,7 @@ void MemoryTracker::free(Int64 size)

 void MemoryTracker::reset()
 {
-    if (!next.load(std::memory_order_relaxed))
+    if (!parent.load(std::memory_order_relaxed))
        CurrentMetrics::sub(metric, amount.load(std::memory_order_relaxed));

    amount.store(0, std::memory_order_relaxed);
@ -149,29 +153,29 @@ void MemoryTracker::setOrRaiseLimit(Int64 value)
        ;
 }

-#if __APPLE__ && __clang__
-__thread MemoryTracker * current_memory_tracker = nullptr;
-#else
-thread_local MemoryTracker * current_memory_tracker = nullptr;
-#endif

 namespace CurrentMemoryTracker
 {
    void alloc(Int64 size)
    {
-        if (current_memory_tracker)
-            current_memory_tracker->alloc(size);
+        if (DB::current_thread)
+            DB::current_thread->memory_tracker.alloc(size);
    }

    void realloc(Int64 old_size, Int64 new_size)
    {
-        if (current_memory_tracker)
-            current_memory_tracker->alloc(new_size - old_size);
+        if (DB::current_thread)
+            DB::current_thread->memory_tracker.alloc(new_size - old_size);
    }

    void free(Int64 size)
    {
-        if (current_memory_tracker)
-            current_memory_tracker->free(size);
+        if (DB::current_thread)
+            DB::current_thread->memory_tracker.free(size);
    }
 }
+
+DB::ActionBlockerSingleThread::BlockHolder getCurrentMemoryTrackerBlocker()
+{
+    return (DB::current_thread) ? DB::current_thread->memory_tracker.blocker.cancel() : DB::ActionBlockerSingleThread::BlockHolder(nullptr);
+}
--- a/dbms/src/Common/MemoryTracker.h
+++ b/dbms/src/Common/MemoryTracker.h
@ -3,6 +3,7 @@
 #include <atomic>
 #include <common/Types.h>
 #include <Common/CurrentMetrics.h>
+#include <Common/ActionBlocker.h>


 namespace CurrentMetrics
@ -26,7 +27,7 @@ class MemoryTracker

    /// Singly-linked list. All information will be passed to subsequent memory trackers also (it allows to implement trackers hierarchy).
    /// In terms of tree nodes it is the list of parents. Lifetime of these trackers should "include" lifetime of current tracker.
-    std::atomic<MemoryTracker *> next {};
+    std::atomic<MemoryTracker *> parent {};

    /// You could specify custom metric to track memory usage.
    CurrentMetrics::Metric metric = CurrentMetrics::MemoryTracking;
@ -37,6 +38,7 @@ class MemoryTracker
 public:
    MemoryTracker() {}
    MemoryTracker(Int64 limit_) : limit(limit_) {}
+    MemoryTracker(MemoryTracker * parent_) : parent(parent_) {}

    ~MemoryTracker();

@ -79,9 +81,15 @@ public:
    }

    /// next should be changed only once: from nullptr to some value.
-    void setNext(MemoryTracker * elem)
+    /// NOTE: It is not true in MergeListElement
+    void setParent(MemoryTracker * elem)
    {
-        next.store(elem, std::memory_order_relaxed);
+        parent.store(elem, std::memory_order_relaxed);
+    }
+
+    MemoryTracker * getParent()
+    {
+        return parent.load(std::memory_order_relaxed);
    }

    /// The memory consumption could be shown in realtime via CurrentMetrics counter
@ -100,21 +108,13 @@ public:

    /// Prints info about peak memory consumption into log.
    void logPeakMemoryUsage() const;
+
+    /// To be able to temporarily stop memory tracker
+    DB::ActionBlockerSingleThread blocker;
 };


-/** The MemoryTracker object is quite difficult to pass to all places where significant amounts of memory are allocated.
-  * Therefore, a thread-local pointer to used MemoryTracker is set, or nullptr if MemoryTracker does not need to be used.
-  * This pointer is set when memory consumption is monitored in current thread.
-  * So, you just need to pass it to all the threads that handle one request.
-  */
-#if __APPLE__ && __clang__
-extern __thread MemoryTracker * current_memory_tracker;
-#else
-extern thread_local MemoryTracker * current_memory_tracker;
-#endif
-
-/// Convenience methods, that use current_memory_tracker if it is available.
+/// Convenience methods, that use current thread's memory_tracker if it is available.
 namespace CurrentMemoryTracker
 {
    void alloc(Int64 size);
@ -123,20 +123,4 @@ namespace CurrentMemoryTracker
 }


-#include <boost/noncopyable.hpp>
-
-struct TemporarilyDisableMemoryTracker : private boost::noncopyable
-{
-    MemoryTracker * memory_tracker;
-
-    TemporarilyDisableMemoryTracker()
-    {
-        memory_tracker = current_memory_tracker;
-        current_memory_tracker = nullptr;
-    }
-
-    ~TemporarilyDisableMemoryTracker()
-    {
-        current_memory_tracker = memory_tracker;
-    }
-};
+DB::ActionBlockerSingleThread::BlockHolder getCurrentMemoryTrackerBlocker();
--- a/dbms/src/Common/ProfileEvents.cpp
+++ b/dbms/src/Common/ProfileEvents.cpp
@ -1,4 +1,5 @@
 #include <Common/ProfileEvents.h>
+#include <Common/ThreadStatus.h>


 /// Available events. Add something here as you wish.
@ -132,30 +133,77 @@
    M(RWLockAcquiredReadLocks) \
    M(RWLockAcquiredWriteLocks) \
    M(RWLockReadersWaitMilliseconds) \
-    M(RWLockWritersWaitMilliseconds)
+    M(RWLockWritersWaitMilliseconds) \
+    \
+    M(RealTimeMicroseconds) \
+    M(RusageUserTimeMicroseconds) \
+    M(RusageSystemTimeMicroseconds) \
+    M(RusagePageReclaims) \
+    M(RusagePageVoluntaryContextSwitches) \
+    M(RusagePageInvoluntaryContextSwitches)

 namespace ProfileEvents
 {
-    #define M(NAME) extern const Event NAME = __COUNTER__;
+
+#define M(NAME) extern const Event NAME = __COUNTER__;
+    APPLY_FOR_EVENTS(M)
+#undef M
+constexpr Event END = __COUNTER__;
+
+/// Global variable, initialized by zeros.
+Counter global_counters_array[END] {};
+/// Initialize global counters statically
+Counters global_counters(global_counters_array);
+
+const Event Counters::num_counters = END;
+
+
+Counters::Counters(Level level, Counters * parent)
+    : parent(parent), level(level),
+      counters_holder(new Counter[num_counters] {})
+{
+    counters = counters_holder.get();
+}
+
+void Counters::resetCounters()
+{
+    if (counters)
+    {
+        for (Event i = 0; i < num_counters; ++i)
+            counters[i].store(0, std::memory_order_relaxed);
+    }
+}
+
+void Counters::reset()
+{
+    parent = nullptr;
+    resetCounters();
+}
+
+const char * getDescription(Event event)
+{
+    static const char * descriptions[] =
+    {
+    #define M(NAME) #NAME,
        APPLY_FOR_EVENTS(M)
    #undef M
-    constexpr Event END = __COUNTER__;
+    };

-    std::atomic<Count> counters[END] {};    /// Global variable, initialized by zeros.
+    return descriptions[event];
+}

-    const char * getDescription(Event event)
-    {
-        static const char * descriptions[] =
-        {
-        #define M(NAME) #NAME,
-            APPLY_FOR_EVENTS(M)
-        #undef M
-        };

-        return descriptions[event];
-    }
+Event end() { return END; }
+
+
+void increment(Event event, Count amount)
+{
+    if (DB::current_thread)
+        DB::current_thread->performance_counters.increment(event, amount);
+    else
+        global_counters.increment(event, amount);
+}

-    Event end() { return END; }
 }

 #undef APPLY_FOR_EVENTS
--- a/dbms/src/Common/ProfileEvents.h
+++ b/dbms/src/Common/ProfileEvents.h
@ -2,6 +2,7 @@

 #include <stddef.h>
 #include <atomic>
+#include <memory>


 /** Implements global counters for various events happening in the application
@ -14,19 +15,66 @@ namespace ProfileEvents
    /// Event identifier (index in array).
    using Event = size_t;
    using Count = size_t;
+    using Counter = std::atomic<Count>;
+
+    enum class Level
+    {
+        Global = 0,
+        User,
+        Process,
+        Thread
+    };
+
+    struct Counters
+    {
+        Counter * counters = nullptr;
+        Counters * parent = nullptr;
+        const Level level = Level::Thread;
+        std::unique_ptr<Counter[]> counters_holder;
+
+        Counters() = default;
+
+        Counters(Level level, Counters * parent = nullptr);
+
+        /// Global level static initializer
+        Counters(Counter * allocated_counters)
+            :  counters(allocated_counters), parent(nullptr), level(Level::Global) {}
+
+        inline Counter & operator[] (Event event)
+        {
+            return counters[event];
+        }
+
+        inline void increment(Event event, Count amount = 1)
+        {
+            Counters * current = this;
+            do
+            {
+                current->counters[event].fetch_add(amount, std::memory_order_relaxed);
+                current = current->parent;
+            } while (current != nullptr);
+        }
+
+        /// Reset metrics and parent
+        void reset();
+
+        /// Reset metrics
+        void resetCounters();
+
+        static const Event num_counters;
+    };
+
+
+    /// Counters - how many times each event happened.
+    extern Counters global_counters;
+
+
+    /// Increment a counter for event. Thread-safe.
+    void increment(Event event, Count amount = 1);

    /// Get text description of event by identifier. Returns statically allocated string.
    const char * getDescription(Event event);

-    /// Counters - how many times each event happened.
-    extern std::atomic<Count> counters[];
-
-    /// Increment a counter for event. Thread-safe.
-    inline void increment(Event event, Count amount = 1)
-    {
-        counters[event].fetch_add(amount, std::memory_order_relaxed);
-    }
-
    /// Get index just after last event identifier.
    Event end();
 }
--- a/dbms/src/Common/ThreadStatus.cpp
+++ b/dbms/src/Common/ThreadStatus.cpp
@ -0,0 +1,223 @@
+#include "ThreadStatus.h"
+#include <Poco/Ext/ThreadNumber.h>
+#include <Poco/Logger.h>
+#include <common/logger_useful.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <Interpreters/ProcessList.h>
+
+
+namespace ProfileEvents
+{
+    extern const Event RealTimeMicroseconds;
+    extern const Event RusageUserTimeMicroseconds;
+    extern const Event RusageSystemTimeMicroseconds;
+    extern const Event RusagePageReclaims;
+    extern const Event RusagePageVoluntaryContextSwitches;
+    extern const Event RusagePageInvoluntaryContextSwitches;
+}
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+
+static UInt64 getCurrentTimeMicroseconds(clockid_t clock_type = CLOCK_MONOTONIC)
+{
+    struct timespec ts;
+    clock_gettime(clock_type, &ts);
+    return ts.tv_sec * 1000000ULL + ts.tv_nsec / 1000UL;
+}
+
+
+struct RusageCounters
+{
+    /// In microseconds
+    UInt64 real_time = 0;
+    UInt64 user_time = 0;
+    UInt64 sys_time = 0;
+
+    UInt64 page_reclaims = 0;
+    UInt64 voluntary_context_switches = 0;
+    UInt64 involuntary_context_switches = 0;
+
+    RusageCounters() = default;
+    RusageCounters(const struct rusage & rusage, UInt64 real_time_)
+    {
+        set(rusage, real_time_);
+    }
+
+    static RusageCounters zeros(UInt64 real_time)
+    {
+        RusageCounters res;
+        res.real_time = real_time;
+        return res;
+    }
+
+    static RusageCounters current()
+    {
+        RusageCounters res;
+        res.setFromCurrent();
+        return res;
+    }
+
+    void setFromCurrent()
+    {
+        struct rusage rusage;
+        getrusage(RUSAGE_THREAD, &rusage);
+        set(rusage, getCurrentTimeMicroseconds());
+    }
+
+    void set(const struct rusage & rusage, UInt64 real_time_)
+    {
+        real_time = real_time_;
+        user_time = rusage.ru_utime.tv_sec * 1000000UL + rusage.ru_utime.tv_usec;
+        sys_time = rusage.ru_utime.tv_sec * 1000000UL + rusage.ru_utime.tv_usec;
+
+        page_reclaims = static_cast<UInt64>(rusage.ru_minflt);
+        voluntary_context_switches = static_cast<UInt64>(rusage.ru_nvcsw);
+        involuntary_context_switches = static_cast<UInt64>(rusage.ru_nivcsw);
+    }
+
+    static void incrementProfileEvents(const RusageCounters & cur, const RusageCounters & prev)
+    {
+        ProfileEvents::increment(ProfileEvents::RealTimeMicroseconds, cur.real_time - prev.real_time);
+        ProfileEvents::increment(ProfileEvents::RusageUserTimeMicroseconds, cur.user_time - prev.user_time);
+        ProfileEvents::increment(ProfileEvents::RusageSystemTimeMicroseconds, cur.sys_time - prev.sys_time);
+        ProfileEvents::increment(ProfileEvents::RusagePageReclaims, cur.page_reclaims - prev.page_reclaims);
+        ProfileEvents::increment(ProfileEvents::RusagePageVoluntaryContextSwitches, cur.voluntary_context_switches - prev.voluntary_context_switches);
+        ProfileEvents::increment(ProfileEvents::RusagePageInvoluntaryContextSwitches, cur.involuntary_context_switches - prev.involuntary_context_switches);
+    }
+
+    static void updateProfileEvents(RusageCounters & last_counters)
+    {
+        RusageCounters current = RusageCounters::current();
+        RusageCounters::incrementProfileEvents(current, last_counters);
+        last_counters = current;
+    }
+};
+
+
+struct ThreadStatus::Payload
+{
+    RusageCounters last_rusage;
+};
+
+
+//static void QueryThreadStatusOnThreadFinish(void * arg)
+//{
+//    auto thread_status = static_cast<ThreadStatus *>(arg);
+//    thread_status->is_finished = true;
+//    LOG_DEBUG(thread_status->log, "Thread " << thread_status->poco_thread_number << " is finished");
+//}
+
+
+ThreadStatus::ThreadStatus()
+    : poco_thread_number(Poco::ThreadNumber::get()),
+      performance_counters(ProfileEvents::Level::Thread),
+      log(&Poco::Logger::get("ThreadStatus"))
+{
+    LOG_DEBUG(log, "Thread " << poco_thread_number << " created");
+}
+
+ThreadStatus::~ThreadStatus()
+{
+    LOG_DEBUG(log, "Thread " << poco_thread_number << " destroyed");
+}
+
+void ThreadStatus::init(QueryStatus * parent_query_, ProfileEvents::Counters * parent_counters, MemoryTracker * parent_memory_tracker)
+{
+    if (!initialized)
+    {
+        if (auto counters_parent = performance_counters.parent)
+            if (counters_parent != parent_counters)
+                LOG_WARNING(current_thread->log, "Parent performance counters are already set, overwrite");
+
+        if (auto tracker_parent = memory_tracker.getParent())
+            if (tracker_parent != parent_memory_tracker)
+                LOG_WARNING(current_thread->log, "Parent memory tracker is already set, overwrite");
+
+        return;
+    }
+
+    initialized = true;
+    parent_query = parent_query_;
+    performance_counters.parent = parent_counters;
+    memory_tracker.setParent(parent_memory_tracker);
+    memory_tracker.setDescription("(for thread)");
+
+    onStart();
+}
+
+void ThreadStatus::onStart()
+{
+    payload = std::make_shared<Payload>();
+
+    /// First init of thread rusage counters, set real time to zero, other metrics remain as is
+    payload->last_rusage.setFromCurrent();
+    RusageCounters::incrementProfileEvents(payload->last_rusage, RusageCounters::zeros(payload->last_rusage.real_time));
+}
+
+void ThreadStatus::onExit()
+{
+    if (!initialized || !payload)
+        return;
+
+    RusageCounters::updateProfileEvents(payload->last_rusage);
+}
+
+void ThreadStatus::reset()
+{
+    parent_query = nullptr;
+    performance_counters.reset();
+    memory_tracker.reset();
+    memory_tracker.setParent(nullptr);
+    initialized = false;
+}
+
+
+void ThreadStatus::setCurrentThreadParentQuery(QueryStatus * parent_process)
+{
+    if (!current_thread)
+        throw Exception("Thread #" + std::to_string(Poco::ThreadNumber::get()) + " status was not initialized", ErrorCodes::LOGICAL_ERROR);
+
+    if (!parent_process)
+    {
+        current_thread->init(parent_process, nullptr, nullptr);
+        return;
+    }
+
+    current_thread->init(parent_process, &parent_process->performance_counters, &parent_process->memory_tracker);
+
+    {
+        std::lock_guard lock(parent_process->threads_mutex);
+        auto res = parent_process->thread_statuses.emplace(current_thread->poco_thread_number, current_thread);
+
+        if (!res.second && res.first->second.get() != current_thread.get())
+            throw Exception("Thread " + std::to_string(current_thread->poco_thread_number) + " is set twice", ErrorCodes::LOGICAL_ERROR);
+    }
+}
+
+void ThreadStatus::setCurrentThreadFromSibling(const ThreadStatusPtr & sibling_thread)
+{
+    if (!current_thread)
+        throw Exception("Thread #" + std::to_string(Poco::ThreadNumber::get()) + " status was not initialized", ErrorCodes::LOGICAL_ERROR);
+
+    if (sibling_thread == nullptr)
+        throw Exception("Sibling thread was not initialized", ErrorCodes::LOGICAL_ERROR);
+
+    current_thread->init(sibling_thread->parent_query, sibling_thread->performance_counters.parent, sibling_thread->memory_tracker.getParent());
+}
+
+
+thread_local ThreadStatusPtr current_thread = ThreadStatus::create();
+
+
+}
--- a/dbms/src/Common/ThreadStatus.h
+++ b/dbms/src/Common/ThreadStatus.h
@ -0,0 +1,54 @@
+#pragma once
+#include <Common/ProfileEvents.h>
+#include <Common/MemoryTracker.h>
+#include <memory>
+#include <ext/shared_ptr_helper.h>
+
+namespace Poco
+{
+    class Logger;
+}
+
+
+namespace DB
+{
+
+struct QueryStatus;
+struct ThreadStatus;
+using ThreadStatusPtr = std::shared_ptr<ThreadStatus>;
+
+
+struct ThreadStatus : public ext::shared_ptr_helper<ThreadStatus>, public std::enable_shared_from_this<ThreadStatus>
+{
+    UInt32 poco_thread_number = 0;
+    QueryStatus * parent_query = nullptr;
+
+    ProfileEvents::Counters performance_counters;
+    MemoryTracker memory_tracker;
+
+    void init(QueryStatus * parent_query_, ProfileEvents::Counters * parent_counters, MemoryTracker * parent_memory_tracker);
+    void onStart();
+    void onExit();
+
+    /// Reset all references and metrics
+    void reset();
+
+    static void setCurrentThreadParentQuery(QueryStatus * parent_process);
+    static void setCurrentThreadFromSibling(const ThreadStatusPtr & sibling_thread);
+
+    ~ThreadStatus();
+
+protected:
+    ThreadStatus();
+
+    bool initialized = false;
+    Poco::Logger * log;
+
+    struct Payload;
+    std::shared_ptr<Payload> payload;
+};
+
+
+extern thread_local ThreadStatusPtr current_thread;
+
+}
--- a/dbms/src/DataStreams/AsynchronousBlockInputStream.h
+++ b/dbms/src/DataStreams/AsynchronousBlockInputStream.h
@ -7,6 +7,8 @@
 #include <Common/CurrentMetrics.h>
 #include <common/ThreadPool.h>
 #include <Common/MemoryTracker.h>
+#include <Common/ThreadStatus.h>
+#include <Poco/Ext/ThreadNumber.h>


 namespace CurrentMetrics
@ -97,7 +99,8 @@ protected:
        /// If there were no calculations yet, calculate the first block synchronously
        if (!started)
        {
-            calculate(current_memory_tracker);
+            ThreadStatusPtr main_thread = current_thread;
+            calculate(main_thread);
            started = true;
        }
        else    /// If the calculations are already in progress - wait for the result
@ -121,12 +124,12 @@ protected:
    void next()
    {
        ready.reset();
-        pool.schedule(std::bind(&AsynchronousBlockInputStream::calculate, this, current_memory_tracker));
+        pool.schedule([this, main_thread=current_thread] () { calculate(main_thread); });
    }


    /// Calculations that can be performed in a separate thread
-    void calculate(MemoryTracker * memory_tracker)
+    void calculate(ThreadStatusPtr main_thread)
    {
        CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread};

@ -135,8 +138,9 @@ protected:
            if (first)
            {
                first = false;
+                if (main_thread)
+                    ThreadStatus::setCurrentThreadFromSibling(main_thread);
                setThreadName("AsyncBlockInput");
-                current_memory_tracker = memory_tracker;
                children.back()->readPrefix();
            }

--- a/dbms/src/DataStreams/BlockIO.h
+++ b/dbms/src/DataStreams/BlockIO.h
@ -12,8 +12,7 @@ class ProcessListEntry;
 struct BlockIO
 {
    /** process_list_entry should be destroyed after in and after out,
-      *  since in and out contain pointer to an object inside process_list_entry
-      *  (MemoryTracker * current_memory_tracker),
+      *  since in and out contain pointer to objects inside process_list_entry (query-level MemoryTracker for example),
      *  which could be used before destroying of in and out.
      */
    std::shared_ptr<ProcessListEntry> process_list_entry;
--- a/dbms/src/DataStreams/CountingBlockOutputStream.h
+++ b/dbms/src/DataStreams/CountingBlockOutputStream.h
@ -20,7 +20,7 @@ public:
        progress_callback = callback;
    }

-    void setProcessListElement(ProcessListElement * elem)
+    void setProcessListElement(QueryStatus * elem)
    {
        process_elem = elem;
    }
@ -43,7 +43,7 @@ protected:
    BlockOutputStreamPtr stream;
    Progress progress;
    ProgressCallback progress_callback;
-    ProcessListElement * process_elem = nullptr;
+    QueryStatus * process_elem = nullptr;
 };

 }
--- a/dbms/src/DataStreams/IProfilingBlockInputStream.cpp
+++ b/dbms/src/DataStreams/IProfilingBlockInputStream.cpp
@ -379,7 +379,7 @@ void IProfilingBlockInputStream::setProgressCallback(const ProgressCallback & ca
 }


-void IProfilingBlockInputStream::setProcessListElement(ProcessListElement * elem)
+void IProfilingBlockInputStream::setProcessListElement(QueryStatus * elem)
 {
    process_list_elem = elem;

--- a/dbms/src/DataStreams/IProfilingBlockInputStream.h
+++ b/dbms/src/DataStreams/IProfilingBlockInputStream.h
@ -18,7 +18,7 @@ namespace ErrorCodes
 }

 class QuotaForIntervals;
-struct ProcessListElement;
+struct QueryStatus;
 class IProfilingBlockInputStream;

 using ProfilingBlockInputStreamPtr = std::shared_ptr<IProfilingBlockInputStream>;
@ -101,7 +101,7 @@ public:
      * Based on this information, the quota and some restrictions will be checked.
      * This information will also be available in the SHOW PROCESSLIST request.
      */
-    void setProcessListElement(ProcessListElement * elem);
+    void setProcessListElement(QueryStatus * elem);

    /** Set the approximate total number of rows to read.
      */
@ -192,7 +192,7 @@ protected:
    std::atomic<bool> is_cancelled{false};
    bool is_killed{false};
    ProgressCallback progress_callback;
-    ProcessListElement * process_list_elem = nullptr;
+    QueryStatus * process_list_elem = nullptr;

    /// Additional information that can be generated during the work process.

--- a/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.cpp
+++ b/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.cpp
@ -3,6 +3,7 @@
 #include <Common/CurrentMetrics.h>
 #include <Common/MemoryTracker.h>
 #include <DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.h>
+#include <Common/ThreadStatus.h>


 namespace CurrentMetrics
@ -175,10 +176,11 @@ void MergingAggregatedMemoryEfficientBlockInputStream::start()
        {
            auto & child = children[i];

-            auto memory_tracker = current_memory_tracker;
-            reading_pool->schedule([&child, memory_tracker]
+            auto main_thread = current_thread;
+            reading_pool->schedule([&child, main_thread]
            {
-                current_memory_tracker = memory_tracker;
+                if (main_thread)
+                    ThreadStatus::setCurrentThreadFromSibling(main_thread);
                setThreadName("MergeAggReadThr");
                CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread};
                child->readPrefix();
@ -196,8 +198,7 @@ void MergingAggregatedMemoryEfficientBlockInputStream::start()
          */

        for (size_t i = 0; i < merging_threads; ++i)
-            pool.schedule(std::bind(&MergingAggregatedMemoryEfficientBlockInputStream::mergeThread,
-                this, current_memory_tracker));
+            pool.schedule([this] () { mergeThread(current_thread); } );
    }
 }

@ -293,10 +294,11 @@ void MergingAggregatedMemoryEfficientBlockInputStream::finalize()
 }


-void MergingAggregatedMemoryEfficientBlockInputStream::mergeThread(MemoryTracker * memory_tracker)
+void MergingAggregatedMemoryEfficientBlockInputStream::mergeThread(ThreadStatusPtr main_thread)
 {
+    if (main_thread)
+        ThreadStatus::setCurrentThreadFromSibling(main_thread);
    setThreadName("MergeAggMergThr");
-    current_memory_tracker = memory_tracker;
    CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread};

    try
@ -480,10 +482,11 @@ MergingAggregatedMemoryEfficientBlockInputStream::BlocksToMerge MergingAggregate
        {
            if (need_that_input(input))
            {
-                auto memory_tracker = current_memory_tracker;
-                reading_pool->schedule([&input, &read_from_input, memory_tracker]
+                auto main_thread = current_thread;
+                reading_pool->schedule([&input, &read_from_input, main_thread]
                {
-                    current_memory_tracker = memory_tracker;
+                    if (main_thread)
+                        ThreadStatus::setCurrentThreadFromSibling(main_thread);
                    setThreadName("MergeAggReadThr");
                    CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread};
                    read_from_input(input);
--- a/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.h
+++ b/dbms/src/DataStreams/MergingAggregatedMemoryEfficientBlockInputStream.h
@ -3,6 +3,7 @@
 #include <Interpreters/Aggregator.h>
 #include <DataStreams/IProfilingBlockInputStream.h>
 #include <Common/ConcurrentBoundedQueue.h>
+#include <Common/ThreadStatus.h>
 #include <common/ThreadPool.h>
 #include <condition_variable>

@ -151,7 +152,7 @@ private:

    std::unique_ptr<ParallelMergeData> parallel_merge_data;

-    void mergeThread(MemoryTracker * memory_tracker);
+    void mergeThread(ThreadStatusPtr main_thread);

    void finalize();
 };
--- a/dbms/src/DataStreams/ParallelInputsProcessor.h
+++ b/dbms/src/DataStreams/ParallelInputsProcessor.h
@ -12,6 +12,7 @@
 #include <Common/setThreadName.h>
 #include <Common/CurrentMetrics.h>
 #include <Common/MemoryTracker.h>
+#include <Common/ThreadStatus.h>


 /** Allows to process multiple block input streams (sources) in parallel, using specified number of threads.
@ -103,10 +104,11 @@ public:
    /// Start background threads, start work.
    void process()
    {
+        QueryStatus * current_query = current_thread ? current_thread->parent_query : nullptr;
        active_threads = max_threads;
        threads.reserve(max_threads);
        for (size_t i = 0; i < max_threads; ++i)
-            threads.emplace_back(std::bind(&ParallelInputsProcessor::thread, this, current_memory_tracker, i));
+            threads.emplace_back([=] () { thread(current_query, i); } );
    }

    /// Ask all sources to stop earlier than they run out.
@ -174,9 +176,9 @@ private:
        }
    }

-    void thread(MemoryTracker * memory_tracker, size_t thread_num)
+    void thread(QueryStatus * query_status, size_t thread_num)
    {
-        current_memory_tracker = memory_tracker;
+        ThreadStatus::setCurrentThreadParentQuery(query_status);
        std::exception_ptr exception;

        setThreadName("ParalInputsProc");
--- a/dbms/src/Interpreters/Aggregator.cpp
+++ b/dbms/src/Interpreters/Aggregator.cpp
@ -24,6 +24,7 @@
 #include <Common/typeid_cast.h>
 #include <Common/demangle.h>
 #include <Interpreters/config_compile.h>
+#include "Common/ThreadStatus.h"


 namespace ProfileEvents
@ -139,8 +140,10 @@ Aggregator::Aggregator(const Params & params_)
    : params(params_),
    isCancelled([]() { return false; })
 {
-    if (current_memory_tracker)
-        memory_usage_before_aggregation = current_memory_tracker->get();
+    /// Use query-level memory tracker
+    if (current_thread)
+        if (auto memory_tracker = current_thread->memory_tracker.getParent())
+            memory_usage_before_aggregation = memory_tracker->get();

    aggregate_functions.resize(params.aggregates_size);
    for (size_t i = 0; i < params.aggregates_size; ++i)
@ -798,8 +801,9 @@ bool Aggregator::executeOnBlock(const Block & block, AggregatedDataVariants & re

    size_t result_size = result.sizeWithoutOverflowRow();
    Int64 current_memory_usage = 0;
-    if (current_memory_tracker)
-        current_memory_usage = current_memory_tracker->get();
+    if (current_thread)
+        if (auto memory_tracker = current_thread->memory_tracker.getParent())
+            current_memory_usage = memory_tracker->get();

    auto result_size_bytes = current_memory_usage - memory_usage_before_aggregation;    /// Here all the results in the sum are taken into account, from different threads.

@ -1271,9 +1275,10 @@ BlocksList Aggregator::prepareBlocksAndFillTwoLevelImpl(
    bool final,
    ThreadPool * thread_pool) const
 {
-    auto converter = [&](size_t bucket, MemoryTracker * memory_tracker)
+    auto converter = [&](size_t bucket, const ThreadStatusPtr & main_thread)
    {
-        current_memory_tracker = memory_tracker;
+        if (main_thread)
+            ThreadStatus::setCurrentThreadFromSibling(main_thread);
        return convertOneBucketToBlock(data_variants, method, final, bucket);
    };

@ -1288,7 +1293,7 @@ BlocksList Aggregator::prepareBlocksAndFillTwoLevelImpl(
            if (method.data.impls[bucket].empty())
                continue;

-            tasks[bucket] = std::packaged_task<Block()>(std::bind(converter, bucket, current_memory_tracker));
+            tasks[bucket] = std::packaged_task<Block()>(std::bind(converter, bucket, current_thread));

            if (thread_pool)
                thread_pool->schedule([bucket, &tasks] { tasks[bucket](); });
@ -1550,7 +1555,7 @@ void NO_INLINE Aggregator::mergeSingleLevelDataImpl(


 template <typename Method>
-void NO_INLINE Aggregator::mergeBucketImpl(
+void NO_INLINE Aggregator:: mergeBucketImpl(
    ManyAggregatedDataVariants & data, Int32 bucket, Arena * arena) const
 {
    /// We connect all aggregation results to the first.
@ -1717,13 +1722,14 @@ private:
        if (max_scheduled_bucket_num >= NUM_BUCKETS)
            return;

-        parallel_merge_data->pool.schedule(std::bind(&MergingAndConvertingBlockInputStream::thread, this,
-            max_scheduled_bucket_num, current_memory_tracker));
+        parallel_merge_data->pool.schedule([this, main_thread=current_thread] () { thread(max_scheduled_bucket_num, main_thread); });
    }

-    void thread(Int32 bucket_num, MemoryTracker * memory_tracker)
+    void thread(Int32 bucket_num, const ThreadStatusPtr & main_thread)
    {
-        current_memory_tracker = memory_tracker;
+        if (main_thread)
+            ThreadStatus::setCurrentThreadFromSibling(main_thread);
+
        setThreadName("MergingAggregtd");
        CurrentMetrics::Increment metric_increment{CurrentMetrics::QueryThread};

@ -2028,9 +2034,10 @@ void Aggregator::mergeStream(const BlockInputStreamPtr & stream, AggregatedDataV

        LOG_TRACE(log, "Merging partially aggregated two-level data.");

-        auto merge_bucket = [&bucket_to_blocks, &result, this](Int32 bucket, Arena * aggregates_pool, MemoryTracker * memory_tracker)
+        auto merge_bucket = [&bucket_to_blocks, &result, this](Int32 bucket, Arena * aggregates_pool, const ThreadStatusPtr & main_thread)
        {
-            current_memory_tracker = memory_tracker;
+            if (main_thread)
+                ThreadStatus::setCurrentThreadFromSibling(main_thread);

            for (Block & block : bucket_to_blocks[bucket])
            {
@ -2064,7 +2071,7 @@ void Aggregator::mergeStream(const BlockInputStreamPtr & stream, AggregatedDataV
            result.aggregates_pools.push_back(std::make_shared<Arena>());
            Arena * aggregates_pool = result.aggregates_pools.back().get();

-            auto task = std::bind(merge_bucket, bucket, aggregates_pool, current_memory_tracker);
+            auto task = std::bind(merge_bucket, bucket, aggregates_pool, current_thread);

            if (thread_pool)
                thread_pool->schedule(task);
--- a/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp
+++ b/dbms/src/Interpreters/ClusterProxy/executeQuery.cpp
@ -42,7 +42,7 @@ BlockInputStreams executeQuery(
    ThrottlerPtr user_level_throttler;
    if (settings.limits.max_network_bandwidth_for_user)
        if (auto process_list_element = context.getProcessListElement())
-            if (auto user_process_list = process_list_element->user_process_list)
+            if (auto user_process_list = process_list_element->getUserProcessList())
                user_level_throttler = user_process_list->user_throttler;

    /// Network bandwidth limit, if needed.
--- a/dbms/src/Interpreters/Context.h
+++ b/dbms/src/Interpreters/Context.h
@ -45,7 +45,7 @@ class Compiler;
 class MarkCache;
 class UncompressedCache;
 class ProcessList;
-struct ProcessListElement;
+struct QueryStatus;
 class Macros;
 struct Progress;
 class Clusters;
@ -99,7 +99,7 @@ private:
    Settings settings;                                  /// Setting for query execution.
    using ProgressCallback = std::function<void(const Progress & progress)>;
    ProgressCallback progress_callback;                 /// Callback for tracking progress of query execution.
-    ProcessListElement * process_list_elem = nullptr;   /// For tracking total resource usage for query.
+    QueryStatus * process_list_elem = nullptr;   /// For tracking total resource usage for query.

    String default_format;  /// Format, used when server formats data by itself and if query does not have FORMAT specification.
                            /// Thus, used in HTTP interface. If not specified - then some globally default format is used.
@ -285,9 +285,9 @@ public:
    /** Set in executeQuery and InterpreterSelectQuery. Then it is used in IProfilingBlockInputStream,
      *  to update and monitor information about the total number of resources spent for the query.
      */
-    void setProcessListElement(ProcessListElement * elem);
+    void setProcessListElement(QueryStatus * elem);
    /// Can return nullptr if the query was not inserted into the ProcessList.
-    ProcessListElement * getProcessListElement() const;
+    QueryStatus * getProcessListElement() const;

    /// List all queries.
    ProcessList & getProcessList();
--- a/dbms/src/Interpreters/ExternalLoader.cpp
+++ b/dbms/src/Interpreters/ExternalLoader.cpp
@ -66,7 +66,7 @@ void ExternalLoader::init(bool throw_on_error)
    {
        /// During synchronous loading of external dictionaries at moment of query execution,
        /// we should not use per query memory limit.
-        TemporarilyDisableMemoryTracker temporarily_disable_memory_tracker;
+        auto temporarily_disable_memory_tracker = getCurrentMemoryTrackerBlocker();

        reloadAndUpdate(throw_on_error);
    }
--- a/dbms/src/Interpreters/ProcessList.cpp
+++ b/dbms/src/Interpreters/ProcessList.cpp
@ -5,6 +5,8 @@
 #include <IO/WriteHelpers.h>
 #include <DataStreams/IProfilingBlockInputStream.h>
 #include <Common/typeid_cast.h>
+#include <common/logger_useful.h>
+#include <pthread.h>


 namespace DB
@ -26,7 +28,7 @@ ProcessList::EntryPtr ProcessList::insert(
    {
        std::lock_guard<std::mutex> lock(mutex);

-        if (!is_kill_query && max_size && cur_size >= max_size
+        if (!is_kill_query && max_size && processes.size() >= max_size
            && (!settings.queue_max_wait_ms.totalMilliseconds() || !have_space.tryWait(mutex, settings.queue_max_wait_ms.totalMilliseconds())))
            throw Exception("Too many simultaneous queries. Maximum: " + toString(max_size), ErrorCodes::TOO_MUCH_SIMULTANEOUS_QUERIES);

@ -70,36 +72,44 @@ ProcessList::EntryPtr ProcessList::insert(
            }
        }

-        ++cur_size;
-
-        res = std::make_shared<Entry>(*this, cont.emplace(cont.end(),
+        auto process_it = processes.emplace(processes.end(),
            query_, client_info,
            settings.limits.max_memory_usage, settings.memory_tracker_fault_probability,
-            priorities.insert(settings.priority)));
+            priorities.insert(settings.priority));
+
+        res = std::make_shared<Entry>(*this, process_it);

        if (!client_info.current_query_id.empty())
        {
            ProcessListForUser & user_process_list = user_to_queries[client_info.current_user];
-            user_process_list.queries[client_info.current_query_id] = &res->get();
+            user_process_list.queries[client_info.current_query_id] = &*process_it;

-            if (current_memory_tracker)
+            /// Limits are only raised (to be more relaxed) or set to something instead of zero,
+            ///  because settings for different queries will interfere each other:
+            ///  setting from one query effectively sets values for all other queries.
+
+            /// Track memory usage for all simultaneously running queries.
+            /// You should specify this value in configuration for default profile,
+            ///  not for specific users, sessions or queries,
+            ///  because this setting is effectively global.
+            total_memory_tracker.setOrRaiseLimit(settings.limits.max_memory_usage_for_all_queries);
+            total_memory_tracker.setDescription("(total)");
+
+            /// Track memory usage for all simultaneously running queries from single user.
+            user_process_list.user_memory_tracker.setParent(&total_memory_tracker);
+            user_process_list.user_memory_tracker.setOrRaiseLimit(settings.limits.max_memory_usage_for_user);
+            user_process_list.user_memory_tracker.setDescription("(for user)");
+
+            /// Query-level memory tracker is already set in the QueryStatus constructor
+
+            if (!current_thread)
+                throw Exception("Thread is not initialized", ErrorCodes::LOGICAL_ERROR);
+
+            if (current_thread)
            {
-                /// Limits are only raised (to be more relaxed) or set to something instead of zero,
-                ///  because settings for different queries will interfere each other:
-                ///  setting from one query effectively sets values for all other queries.
-
-                /// Track memory usage for all simultaneously running queries from single user.
-                user_process_list.user_memory_tracker.setOrRaiseLimit(settings.limits.max_memory_usage_for_user);
-                user_process_list.user_memory_tracker.setDescription("(for user)");
-                current_memory_tracker->setNext(&user_process_list.user_memory_tracker);
-
-                /// Track memory usage for all simultaneously running queries.
-                /// You should specify this value in configuration for default profile,
-                ///  not for specific users, sessions or queries,
-                ///  because this setting is effectively global.
-                total_memory_tracker.setOrRaiseLimit(settings.limits.max_memory_usage_for_all_queries);
-                total_memory_tracker.setDescription("(total)");
-                user_process_list.user_memory_tracker.setNext(&total_memory_tracker);
+                current_thread->setCurrentThreadParentQuery(&*process_it);
+                current_thread->memory_tracker.setOrRaiseLimit(settings.limits.max_memory_usage);
+                current_thread->memory_tracker.setDescription("(for thread)");
            }

            if (settings.limits.max_network_bandwidth_for_user && !user_process_list.user_throttler)
@ -108,7 +118,7 @@ ProcessList::EntryPtr ProcessList::insert(
                    "Network bandwidth limit for a user exceeded.");
            }

-            res->get().user_process_list = &user_process_list;
+            process_it->setUserProcessList(&user_process_list);
        }
    }

@ -121,6 +131,26 @@ ProcessListEntry::~ProcessListEntry()
    /// Destroy all streams to avoid long lock of ProcessList
    it->releaseQueryStreams();

+    /// Finalize all threads statuses
+    {
+        std::lock_guard lock(it->threads_mutex);
+
+        for (auto & elem : it->thread_statuses)
+        {
+            auto & thread_status = elem.second;
+            thread_status->onExit();
+            thread_status->reset();
+            thread_status.reset();
+        }
+
+        it->thread_statuses.clear();
+    }
+
+    /// Also reset query master thread status
+    /// NOTE: we can't destroy it, since master threads are selected from fixed thread pool
+    if (current_thread)
+        current_thread->reset();
+
    std::lock_guard<std::mutex> lock(parent.mutex);

    /// The order of removing memory_trackers is important.
@ -130,7 +160,7 @@ ProcessListEntry::~ProcessListEntry()
    bool is_cancelled = it->is_cancelled;

    /// This removes the memory_tracker of one request.
-    parent.cont.erase(it);
+    parent.processes.erase(it);

    auto user_process_list = parent.user_to_queries.find(user);
    if (user_process_list != parent.user_to_queries.end())
@ -154,11 +184,10 @@ ProcessListEntry::~ProcessListEntry()
            parent.user_to_queries.erase(user_process_list);
    }

-    --parent.cur_size;
    parent.have_space.signal();

    /// This removes memory_tracker for all requests. At this time, no other memory_trackers live.
-    if (parent.cur_size == 0)
+    if (parent.processes.size() == 0)
    {
        /// Reset MemoryTracker, similarly (see above).
        parent.total_memory_tracker.logPeakMemoryUsage();
@ -167,7 +196,28 @@ ProcessListEntry::~ProcessListEntry()
 }


-void ProcessListElement::setQueryStreams(const BlockIO & io)
+QueryStatus::QueryStatus(
+    const String & query_,
+    const ClientInfo & client_info_,
+    size_t max_memory_usage,
+    double memory_tracker_fault_probability,
+    QueryPriorities::Handle && priority_handle_)
+    :
+    query(query_),
+    client_info(client_info_),
+    priority_handle(std::move(priority_handle_)),
+    performance_counters(ProfileEvents::Level::Process),
+    num_queries_increment{CurrentMetrics::Query}
+{
+    memory_tracker.setOrRaiseLimit(max_memory_usage);
+    memory_tracker.setDescription("(for query)");
+
+    if (memory_tracker_fault_probability)
+        memory_tracker.setFaultProbability(memory_tracker_fault_probability);
+}
+
+
+void QueryStatus::setQueryStreams(const BlockIO & io)
 {
    std::lock_guard<std::mutex> lock(query_streams_mutex);

@ -176,7 +226,7 @@ void ProcessListElement::setQueryStreams(const BlockIO & io)
    query_streams_initialized = true;
 }

-void ProcessListElement::releaseQueryStreams()
+void QueryStatus::releaseQueryStreams()
 {
    std::lock_guard<std::mutex> lock(query_streams_mutex);

@ -186,14 +236,14 @@ void ProcessListElement::releaseQueryStreams()
    query_stream_out.reset();
 }

-bool ProcessListElement::streamsAreReleased()
+bool QueryStatus::streamsAreReleased()
 {
    std::lock_guard<std::mutex> lock(query_streams_mutex);

    return query_streams_released;
 }

-bool ProcessListElement::tryGetQueryStreams(BlockInputStreamPtr & in, BlockOutputStreamPtr & out) const
+bool QueryStatus::tryGetQueryStreams(BlockInputStreamPtr & in, BlockOutputStreamPtr & out) const
 {
    std::lock_guard<std::mutex> lock(query_streams_mutex);

@ -206,7 +256,15 @@ bool ProcessListElement::tryGetQueryStreams(BlockInputStreamPtr & in, BlockOutpu
 }


-void ProcessList::addTemporaryTable(ProcessListElement & elem, const String & table_name, const StoragePtr & storage)
+void QueryStatus::setUserProcessList(ProcessListForUser * user_process_list_)
+{
+    user_process_list = user_process_list_;
+    performance_counters.parent = &user_process_list->user_performance_counters;
+    memory_tracker.setParent(&user_process_list->user_memory_tracker);
+}
+
+
+void ProcessList::addTemporaryTable(QueryStatus & elem, const String & table_name, const StoragePtr & storage)
 {
    std::lock_guard<std::mutex> lock(mutex);

@ -214,7 +272,7 @@ void ProcessList::addTemporaryTable(ProcessListElement & elem, const String & ta
 }


-ProcessListElement * ProcessList::tryGetProcessListElement(const String & current_query_id, const String & current_user)
+QueryStatus * ProcessList::tryGetProcessListElement(const String & current_query_id, const String & current_user)
 {
    auto user_it = user_to_queries.find(current_user);
    if (user_it != user_to_queries.end())
@ -234,7 +292,7 @@ ProcessList::CancellationCode ProcessList::sendCancelToQuery(const String & curr
 {
    std::lock_guard<std::mutex> lock(mutex);

-    ProcessListElement * elem = tryGetProcessListElement(current_query_id, current_user);
+    QueryStatus * elem = tryGetProcessListElement(current_query_id, current_user);

    if (!elem)
        return CancellationCode::NotFound;
@ -260,4 +318,10 @@ ProcessList::CancellationCode ProcessList::sendCancelToQuery(const String & curr
    return CancellationCode::QueryIsNotInitializedYet;
 }

+
+ProcessListForUser::ProcessListForUser()
+: user_performance_counters(ProfileEvents::Level::User, &ProfileEvents::global_counters)
+{}
+
+
 }
--- a/dbms/src/Interpreters/ProcessList.h
+++ b/dbms/src/Interpreters/ProcessList.h
@ -5,16 +5,21 @@
 #include <memory>
 #include <mutex>
 #include <unordered_map>
+#include <shared_mutex>
+
 #include <Poco/Condition.h>
-#include <Common/Stopwatch.h>
 #include <Core/Defines.h>
 #include <IO/Progress.h>
+#include <Common/Stopwatch.h>
 #include <Common/MemoryTracker.h>
+#include <Common/CurrentMetrics.h>
+#include <Common/ProfileEvents.h>
+#include <Common/Throttler.h>
 #include <Interpreters/QueryPriorities.h>
 #include <Interpreters/ClientInfo.h>
-#include <Common/CurrentMetrics.h>
+#include <Common/ThreadStatus.h>
 #include <DataStreams/BlockIO.h>
-#include <Common/Throttler.h>
+#include "ThreadPerformanceProfile.h"


 namespace CurrentMetrics
@ -30,7 +35,10 @@ using StoragePtr = std::shared_ptr<IStorage>;
 using Tables = std::map<String, StoragePtr>;
 struct Settings;
 class IAST;
+
 struct ProcessListForUser;
+struct QueryStatus;
+struct ThreadStatus;


 /** List of currently executing queries.
@ -40,7 +48,7 @@ struct ProcessListForUser;
 /** Information of process list element.
  * To output in SHOW PROCESSLIST query. Does not contain any complex objects, that do something on copy or destructor.
  */
-struct ProcessInfo
+struct QueryStatusInfo
 {
    String query;
    double elapsed_seconds;
@ -55,7 +63,7 @@ struct ProcessInfo


 /// Query and information about its execution.
-struct ProcessListElement
+struct QueryStatus
 {
    String query;
    ClientInfo client_info;
@ -67,19 +75,25 @@ struct ProcessListElement
    /// Progress of output stream
    Progress progress_out;

-    MemoryTracker memory_tracker;
-
    QueryPriorities::Handle priority_handle;

-    CurrentMetrics::Increment num_queries {CurrentMetrics::Query};
+    ProfileEvents::Counters performance_counters;
+    MemoryTracker memory_tracker;
+
+    mutable std::shared_mutex threads_mutex;
+    using QueryThreadStatuses = std::map<int, ThreadStatusPtr>; /// Key is Poco's thread_id
+    QueryThreadStatuses thread_statuses;
+
+    CurrentMetrics::Increment num_queries_increment{CurrentMetrics::Query};

    bool is_cancelled = false;

    /// Temporary tables could be registered here. Modify under mutex.
    Tables temporary_tables;

-    /// Be careful using it. For example, queries field could be modified concurrently.
-    const ProcessListForUser * user_process_list = nullptr;
+    void setUserProcessList(ProcessListForUser * user_process_list_);
+    /// Be careful using it. For example, queries field of ProcessListForUser could be modified concurrently.
+    const ProcessListForUser * getUserProcessList() const { return user_process_list; }

 protected:

@ -94,27 +108,20 @@ protected:
    bool query_streams_initialized{false};
    bool query_streams_released{false};

+    ProcessListForUser * user_process_list = nullptr;
+
 public:

-    ProcessListElement(
+    QueryStatus(
        const String & query_,
        const ClientInfo & client_info_,
        size_t max_memory_usage,
        double memory_tracker_fault_probability,
-        QueryPriorities::Handle && priority_handle_)
-        : query(query_), client_info(client_info_), memory_tracker(max_memory_usage),
-        priority_handle(std::move(priority_handle_))
-    {
-        memory_tracker.setDescription("(for query)");
-        current_memory_tracker = &memory_tracker;
+        QueryPriorities::Handle && priority_handle_);

-        if (memory_tracker_fault_probability)
-            memory_tracker.setFaultProbability(memory_tracker_fault_probability);
-    }
-
-    ~ProcessListElement()
+    ~QueryStatus()
    {
-        current_memory_tracker = nullptr;
+        // TODO: master thread should be reset
    }

    bool updateProgressIn(const Progress & value)
@ -134,9 +141,9 @@ public:
    }


-    ProcessInfo getInfo() const
+    QueryStatusInfo getInfo() const
    {
-        ProcessInfo res;
+        QueryStatusInfo res;

        res.query             = query;
        res.client_info       = client_info;
@ -168,10 +175,13 @@ public:
 /// Data about queries for one user.
 struct ProcessListForUser
 {
+    ProcessListForUser();
+
    /// Query_id -> ProcessListElement *
-    using QueryToElement = std::unordered_map<String, ProcessListElement *>;
+    using QueryToElement = std::unordered_map<String, QueryStatus *>;
    QueryToElement queries;

+    ProfileEvents::Counters user_performance_counters;
    /// Limit and counter for memory of all simultaneously running queries of single user.
    MemoryTracker user_memory_tracker;

@ -187,7 +197,7 @@ class ProcessList;
 class ProcessListEntry
 {
 private:
-    using Container = std::list<ProcessListElement>;
+    using Container = std::list<QueryStatus>;

    ProcessList & parent;
    Container::iterator it;
@ -197,11 +207,11 @@ public:

    ~ProcessListEntry();

-    ProcessListElement * operator->() { return &*it; }
-    const ProcessListElement * operator->() const { return &*it; }
+    QueryStatus * operator->() { return &*it; }
+    const QueryStatus * operator->() const { return &*it; }

-    ProcessListElement & get() { return *it; }
-    const ProcessListElement & get() const { return *it; }
+    QueryStatus & get() { return *it; }
+    const QueryStatus & get() const { return *it; }
 };


@ -209,12 +219,12 @@ class ProcessList
 {
    friend class ProcessListEntry;
 public:
-    using Element = ProcessListElement;
+    using Element = QueryStatus;
    using Entry = ProcessListEntry;

    /// list, for iterators not to invalidate. NOTE: could replace with cyclic buffer, but not worth.
    using Container = std::list<Element>;
-    using Info = std::vector<ProcessInfo>;
+    using Info = std::vector<QueryStatusInfo>;
    /// User -> queries
    using UserToQueries = std::unordered_map<String, ProcessListForUser>;

@ -223,8 +233,7 @@ private:
    mutable Poco::Condition have_space;        /// Number of currently running queries has become less than maximum.

    /// List of queries
-    Container cont;
-    size_t cur_size;        /// In C++03 or C++11 and old ABI, std::list::size is not O(1).
+    Container processes;
    size_t max_size;        /// 0 means no limit. Otherwise, when limit exceeded, an exception is thrown.

    /// Stores per-user info: queries, statistics and limits
@ -237,10 +246,10 @@ private:
    MemoryTracker total_memory_tracker;

    /// Call under lock. Finds process with specified current_user and current_query_id.
-    ProcessListElement * tryGetProcessListElement(const String & current_query_id, const String & current_user);
+    QueryStatus * tryGetProcessListElement(const String & current_query_id, const String & current_user);

 public:
-    ProcessList(size_t max_size_ = 0) : cur_size(0), max_size(max_size_) {}
+    ProcessList(size_t max_size_ = 0) : max_size(max_size_) {}

    using EntryPtr = std::shared_ptr<ProcessListEntry>;

@ -252,7 +261,7 @@ public:
    EntryPtr insert(const String & query_, const IAST * ast, const ClientInfo & client_info, const Settings & settings);

    /// Number of currently executing queries.
-    size_t size() const { return cur_size; }
+    size_t size() const { return processes.size(); }

    /// Get current state of process list.
    Info getInfo() const
@ -260,8 +269,8 @@ public:
        std::lock_guard<std::mutex> lock(mutex);

        Info res;
-        res.reserve(cur_size);
-        for (const auto & elem : cont)
+        res.reserve(processes.size());
+        for (const auto & elem : processes)
            res.emplace_back(elem.getInfo());

        return res;
@ -274,7 +283,7 @@ public:
    }

    /// Register temporary table. Then it is accessible by query_id and name.
-    void addTemporaryTable(ProcessListElement & elem, const String & table_name, const StoragePtr & storage);
+    void addTemporaryTable(QueryStatus & elem, const String & table_name, const StoragePtr & storage);

    enum class CancellationCode
    {
--- a/dbms/src/Interpreters/ThreadPerformanceProfile.h
+++ b/dbms/src/Interpreters/ThreadPerformanceProfile.h
@ -0,0 +1,23 @@
+#pragma once
+
+#include <list>
+#include <thread>
+#include <time.h>
+
+#include <sys/time.h>
+#include <sys/resource.h>
+
+#include <Poco/Ext/ThreadNumber.h>
+
+#include <Core/Types.h>
+#include <Common/Exception.h>
+#include <Common/MemoryTracker.h>
+
+
+
+namespace DB
+{
+
+
+
+}
--- a/dbms/src/Interpreters/executeQuery.cpp
+++ b/dbms/src/Interpreters/executeQuery.cpp
@ -272,7 +272,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
            /// Also make possible for caller to log successful query finish and exception during execution.
            res.finish_callback = [elem, &context, log_queries] (IBlockInputStream * stream_in, IBlockOutputStream * stream_out) mutable
            {
-                ProcessListElement * process_list_elem = context.getProcessListElement();
+                QueryStatus * process_list_elem = context.getProcessListElement();

                if (!process_list_elem)
                    return;
@ -337,7 +337,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
                elem.query_duration_ms = 1000 * (elem.event_time - elem.query_start_time);
                elem.exception = getCurrentExceptionMessage(false);

-                ProcessListElement * process_list_elem = context.getProcessListElement();
+                QueryStatus * process_list_elem = context.getProcessListElement();

                if (process_list_elem)
                {
--- a/dbms/src/Server/MetricsTransmitter.cpp
+++ b/dbms/src/Server/MetricsTransmitter.cpp
@ -81,7 +81,7 @@ void MetricsTransmitter::transmit(std::vector<ProfileEvents::Count> & prev_count
    {
        for (size_t i = 0, end = ProfileEvents::end(); i < end; ++i)
        {
-            const auto counter = ProfileEvents::counters[i].load(std::memory_order_relaxed);
+            const auto counter = ProfileEvents::global_counters[i].load(std::memory_order_relaxed);
            const auto counter_increment = counter - prev_counters[i];
            prev_counters[i] = counter;

--- a/dbms/src/Storages/Distributed/DistributedBlockOutputStream.cpp
+++ b/dbms/src/Storages/Distributed/DistributedBlockOutputStream.cpp
@ -32,6 +32,8 @@
 #include <future>
 #include <condition_variable>
 #include <mutex>
+#include <Common/ThreadStatus.h>
+


 namespace CurrentMetrics
@ -184,8 +186,8 @@ void DistributedBlockOutputStream::waitForJobs()

 ThreadPool::Job DistributedBlockOutputStream::runWritingJob(DistributedBlockOutputStream::JobInfo & job)
 {
-    auto memory_tracker = current_memory_tracker;
-    return [this, memory_tracker, &job]()
+    auto main_thread = current_thread;
+    return [this, main_thread, &job]()
    {
        SCOPE_EXIT({
            std::lock_guard<std::mutex> lock(mutex);
@ -193,9 +195,9 @@ ThreadPool::Job DistributedBlockOutputStream::runWritingJob(DistributedBlockOutp
            cond_var.notify_one();
        });

-        if (!current_memory_tracker)
+        if (current_thread)
        {
-            current_memory_tracker = memory_tracker;
+            ThreadStatus::setCurrentThreadFromSibling(main_thread);
            setThreadName("DistrOutStrProc");
        }

--- a/dbms/src/Storages/MergeTree/BackgroundProcessingPool.cpp
+++ b/dbms/src/Storages/MergeTree/BackgroundProcessingPool.cpp
@ -6,6 +6,7 @@
 #include <IO/WriteHelpers.h>
 #include <common/logger_useful.h>
 #include <Storages/MergeTree/BackgroundProcessingPool.h>
+#include <Common/ThreadStatus.h>

 #include <pcg_random.hpp>
 #include <random>
@ -112,10 +113,8 @@ BackgroundProcessingPool::~BackgroundProcessingPool()
 void BackgroundProcessingPool::threadFunction()
 {
    setThreadName("BackgrProcPool");
-
-    MemoryTracker memory_tracker;
-    memory_tracker.setMetric(CurrentMetrics::MemoryTrackingInBackgroundProcessingPool);
-    current_memory_tracker = &memory_tracker;
+    if (current_thread)
+        ThreadStatus::setCurrentThreadParentQuery(nullptr);

    pcg64 rng(randomSeed());
    std::this_thread::sleep_for(std::chrono::duration<double>(std::uniform_real_distribution<double>(0, sleep_seconds_random_part)(rng)));
@ -199,8 +198,6 @@ void BackgroundProcessingPool::threadFunction()
            task->iterator = tasks.emplace(next_time_to_execute, task);
        }
    }
-
-    current_memory_tracker = nullptr;
 }

 }
--- a/dbms/src/Storages/MergeTree/MergeList.cpp
+++ b/dbms/src/Storages/MergeTree/MergeList.cpp
@ -1,6 +1,8 @@
 #include <Storages/MergeTree/MergeList.h>
 #include <Common/CurrentMetrics.h>
 #include <Poco/Ext/ThreadNumber.h>
+#include <Common/ThreadStatus.h>
+

 namespace CurrentMetrics
 {
@ -20,11 +22,12 @@ MergeListElement::MergeListElement(const std::string & database, const std::stri
        source_part_names.emplace_back(source_part->name);

    /// Each merge is executed into separate background processing pool thread
-    background_pool_task_memory_tracker = current_memory_tracker;
-    if (background_pool_task_memory_tracker)
+    background_thread_memory_tracker = current_thread ? &current_thread->memory_tracker : nullptr;
+    if (background_thread_memory_tracker)
    {
        memory_tracker.setMetric(CurrentMetrics::MemoryTrackingForMerges);
-        background_pool_task_memory_tracker->setNext(&memory_tracker);
+        background_thread_memory_tracker_prev_parent = background_thread_memory_tracker->getParent();
+        background_thread_memory_tracker->setParent(&memory_tracker);
    }
 }

@ -56,8 +59,8 @@ MergeInfo MergeListElement::getInfo() const
 MergeListElement::~MergeListElement()
 {
    /// Unplug memory_tracker from current background processing pool thread
-    if (background_pool_task_memory_tracker)
-        background_pool_task_memory_tracker->setNext(nullptr);
+    if (background_thread_memory_tracker)
+        background_thread_memory_tracker->setParent(background_thread_memory_tracker_prev_parent);
 }

 }
--- a/dbms/src/Storages/MergeTree/MergeList.h
+++ b/dbms/src/Storages/MergeTree/MergeList.h
@ -66,7 +66,8 @@ struct MergeListElement : boost::noncopyable
    std::atomic<UInt64> columns_written{};

    MemoryTracker memory_tracker;
-    MemoryTracker * background_pool_task_memory_tracker;
+    MemoryTracker * background_thread_memory_tracker;
+    MemoryTracker * background_thread_memory_tracker_prev_parent = nullptr;

    /// Poco thread number used in logs
    UInt32 thread_number;
--- a/dbms/src/Storages/MergeTree/MergeTreeReader.cpp
+++ b/dbms/src/Storages/MergeTree/MergeTreeReader.cpp
@ -267,7 +267,7 @@ void MergeTreeReader::Stream::loadMarks()
    auto load = [&]() -> MarkCache::MappedPtr
    {
        /// Memory for marks must not be accounted as memory usage for query, because they are stored in shared cache.
-        TemporarilyDisableMemoryTracker temporarily_disable_memory_tracker;
+        auto temporarily_disable_memory_tracker = getCurrentMemoryTrackerBlocker();

        size_t file_size = Poco::File(path).getSize();
        size_t expected_file_size = sizeof(MarkInCompressedFile) * marks_count;
--- a/dbms/src/Storages/MergeTree/MergedBlockOutputStream.cpp
+++ b/dbms/src/Storages/MergeTree/MergedBlockOutputStream.cpp
@ -438,7 +438,7 @@ void MergedBlockOutputStream::writeImpl(const Block & block, const IColumn::Perm
          * And otherwise it will look like excessively growing memory consumption in context of query.
          *  (observed in long INSERT SELECTs)
          */
-        TemporarilyDisableMemoryTracker temporarily_disable_memory_tracker;
+        auto temporarily_disable_memory_tracker = getCurrentMemoryTrackerBlocker();

        /// Write index. The index contains Primary Key value for each `index_granularity` row.
        for (size_t i = index_offset; i < rows; i += storage.index_granularity)
--- a/dbms/src/Storages/StorageBuffer.cpp
+++ b/dbms/src/Storages/StorageBuffer.cpp
@ -180,7 +180,7 @@ static void appendBlock(const Block & from, Block & to)
        try
        {
            /// Avoid "memory limit exceeded" exceptions during rollback.
-            TemporarilyDisableMemoryTracker temporarily_disable_memory_tracker;
+            auto temporarily_disable_memory_tracker = getCurrentMemoryTrackerBlocker();

            for (size_t column_no = 0, columns = to.columns(); column_no < columns; ++column_no)
            {
--- a/dbms/src/Storages/System/StorageSystemEvents.cpp
+++ b/dbms/src/Storages/System/StorageSystemEvents.cpp
@ -36,7 +36,7 @@ BlockInputStreams StorageSystemEvents::read(

    for (size_t i = 0, end = ProfileEvents::end(); i < end; ++i)
    {
-        UInt64 value = ProfileEvents::counters[i];
+        UInt64 value = ProfileEvents::global_counters[i];

        if (0 != value)
        {