// Mirror copy of an upstream file (page residue, not part of the header itself).
// Synced 2024-12-14 10:22:10 +00:00; 318 lines, 11 KiB.
#pragma once
#include <deque>
#include <functional>
#include <mutex>
#include <future>
#include <condition_variable>
#include <set>
#include <variant>
#include <utility>
#include <boost/circular_buffer.hpp>
#include <boost/noncopyable.hpp>
#include <Poco/Event.h>
#include <Storages/MergeTree/IExecutableTask.h>
#include <base/defines.h>
#include <Common/CurrentMetrics.h>
#include <Common/Exception.h>
#include <Common/Stopwatch.h>
#include <Common/ThreadPool_fwd.h>
namespace DB
/// Error codes used by this header. Only declared here (`extern`); the definition lives elsewhere.
namespace ErrorCodes
{
    extern const int NOT_IMPLEMENTED;
}
/// Forward declaration, so that the pointer alias below can be named inside the struct itself
/// (it is used by the struct's static comparator).
struct TaskRuntimeData;
/// Shared-ownership handle to the runtime record of a task.
using TaskRuntimeDataPtr = std::shared_ptr<TaskRuntimeData>;
* Has RAII class to determine how many tasks are waiting for the execution and executing at the moment.
* Also has some flags and primitives to wait for current task to be executed.
struct TaskRuntimeData
TaskRuntimeData(ExecutableTaskPtr && task_, CurrentMetrics::Metric metric_)
: task(std::move(task_))
, metric(metric_)
/// Increment and decrement a metric with sequentially consistent memory order
/// This is needed, because in unit test this metric is read from another thread
/// and some invariant is checked. With relaxed memory order we could read stale value
/// for this metric, that's why test can be failed.
ExecutableTaskPtr task;
CurrentMetrics::Metric metric;
/// Guarded by MergeTreeBackgroundExecutor<>::mutex
bool is_currently_deleting{false};
/// Actually autoreset=false is needed only for unit test
/// where multiple threads could remove tasks corresponding to the same storage
/// This scenario in not possible in reality.
Poco::Event is_done{/*autoreset=*/false};
/// This is equal to task->getPriority() not to do useless virtual calls in comparator
Priority priority;
/// By default priority queue will have max element at top
static bool comparePtrByPriority(const TaskRuntimeDataPtr & lhs, const TaskRuntimeDataPtr & rhs)
return lhs->priority > rhs->priority;
/// Simplest First-in-First-out queue, ignores priority.
class RoundRobinRuntimeQueue
TaskRuntimeDataPtr pop()
auto result = std::move(queue.front());
return result;
void push(TaskRuntimeDataPtr item)
void remove(StorageID id)
auto it = std::remove_if(queue.begin(), queue.end(),
[&] (auto && item) -> bool { return item->task->getStorageID() == id; });
queue.erase(it, queue.end());
void setCapacity(size_t count) { queue.set_capacity(count); }
bool empty() { return queue.empty(); }
[[noreturn]] void updatePolicy(std::string_view)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method updatePolicy() is not implemented");
static constexpr std::string_view name = "round_robin";
boost::circular_buffer<TaskRuntimeDataPtr> queue{0};
/// Uses a heap to pop a task with minimal priority.
class PriorityRuntimeQueue
TaskRuntimeDataPtr pop()
std::pop_heap(buffer.begin(), buffer.end(), TaskRuntimeData::comparePtrByPriority);
auto result = std::move(buffer.back());
return result;
void push(TaskRuntimeDataPtr item)
item->priority = item->task->getPriority();
std::push_heap(buffer.begin(), buffer.end(), TaskRuntimeData::comparePtrByPriority);
void remove(StorageID id)
std::erase_if(buffer, [&] (auto && item) -> bool { return item->task->getStorageID() == id; });
std::make_heap(buffer.begin(), buffer.end(), TaskRuntimeData::comparePtrByPriority);
void setCapacity(size_t count) { buffer.reserve(count); }
bool empty() { return buffer.empty(); }
[[noreturn]] void updatePolicy(std::string_view)
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method updatePolicy() is not implemented");
static constexpr std::string_view name = "shortest_task_first";
std::vector<TaskRuntimeDataPtr> buffer;
/// Queue that can dynamically change scheduling policy
template <class ... Policies>
class DynamicRuntimeQueueImpl
TaskRuntimeDataPtr pop()
return std::visit<TaskRuntimeDataPtr>([&] (auto && queue) { return queue.pop(); }, impl);
void push(TaskRuntimeDataPtr item)
std::visit([&] (auto && queue) { queue.push(std::move(item)); }, impl);
void remove(StorageID id)
std::visit([&] (auto && queue) { queue.remove(id); }, impl);
void setCapacity(size_t count)
capacity = count;
std::visit([&] (auto && queue) { queue.setCapacity(count); }, impl);
bool empty()
return std::visit<bool>([&] (auto && queue) { return queue.empty(); }, impl);
// Change policy. It does nothing if new policy is unknown or equals current policy.
void updatePolicy(std::string_view name)
// We use this double lambda trick to generate code for all possible pairs of types of old and new queue.
// If types are different it moves tasks from old queue to new one using corresponding pop() and push()
resolve<Policies...>(name, [&] <class NewQueue> (std::in_place_type_t<NewQueue>)
std::visit([&] (auto && queue)
if constexpr (std::is_same_v<std::decay_t<decltype(queue)>, NewQueue>)
return; // The same policy
NewQueue new_queue;
while (!queue.empty())
impl = std::move(new_queue);
}, impl);
// Find policy with specified `name` and call `func()` if found.
// Tag `std::in_place_type_t<T>` used to help templated lambda to deduce type T w/o creating its instance
template <class T, class ... Ts, class Func>
void resolve(std::string_view name, Func && func)
if (T::name == name)
return func(std::in_place_type<T>);
if constexpr (sizeof...(Ts))
return resolve<Ts...>(name, std::forward<Func>(func));
std::variant<Policies...> impl;
size_t capacity;
// Declared as a real class (not a typedef or an alias) so that it can be forward-declared.
class DynamicRuntimeQueue
    : public DynamicRuntimeQueueImpl<RoundRobinRuntimeQueue, PriorityRuntimeQueue>
{
};
* Executor for a background MergeTree related operations such as merges, mutations, fetches and so on.
* It can execute only successors of ExecutableTask interface.
* Which is a self-written coroutine. It suspends, when returns true from executeStep() method.
* There are two queues of a tasks: pending (main queue for all the tasks) and active (currently executing).
* Pending queue is needed since the number of tasks will be more than thread to execute.
* Pending tasks are tasks that successfully scheduled to an executor or tasks that have some extra steps to execute.
* There is an invariant, that task may occur only in one of these queue. It can occur in both queues only in critical sections.
* Pending: Active:
* |s| |s| |s| |s| |s| |s| |s| |s| |s| |s| |s|
* |s| |s| |s| |s| |s| |s| |s| |s| |s| |s|
* |s| |s| |s| |s| |s| |s| |s|
* |s| |s| |s| |s|
* |s| |s|
* |s|
* Each task is simply a sequence of steps. Heavier tasks have longer sequences.
* When a step of a task is executed, we move tasks to pending queue. And take the next task from pending queue.
* Next task is chosen from pending tasks using one of the scheduling policies (class Queue):
* 1) RoundRobinRuntimeQueue. Uses boost::circular_buffer as FIFO-queue. Next task is taken from queue's head and after one step
* enqueued into queue's tail. With this architecture all merges / mutations are fairly scheduled and never starved.
* All decisions regarding priorities are left to components creating tasks (e.g. SimpleMergeSelector).
* 2) PriorityRuntimeQueue. Uses heap to select task with smallest priority value.
* With this architecture all small merges / mutations will be executed faster, than bigger ones.
* WARNING: Starvation is possible in case of overload.
* We use boost::circular_buffer as a container for active queue to avoid allocations.
* Another nuisance that we face is that background operations always interact with an associated Storage.
* So, when a Storage wants to shutdown, it must wait until all its background operations are finished.
template <class Queue>
class MergeTreeBackgroundExecutor final : boost::noncopyable
String name_,
size_t threads_count_,
size_t max_tasks_count_,
CurrentMetrics::Metric metric_,
CurrentMetrics::Metric max_tasks_metric_,
std::string_view policy = {});
/// Handler for hot-reloading
/// Supports only increasing the number of threads and tasks, because
/// implementing tasks eviction will definitely be too error-prone and buggy.
void increaseThreadsAndMaxTasksCount(size_t new_threads_count, size_t new_max_tasks_count);
size_t getMaxThreads() const;
/// This method can return stale value of max_tasks_count (no mutex locking).
/// It's okay because amount of tasks can be only increased and getting stale value
/// can lead only to some postponing, not logical error.
size_t getMaxTasksCount() const;
bool trySchedule(ExecutableTaskPtr task);
void removeTasksCorrespondingToStorage(StorageID id);
void wait();
/// Update scheduling policy for pending tasks. It does nothing if `new_policy` is the same or unknown.
void updateSchedulingPolicy(std::string_view new_policy)
std::lock_guard lock(mutex);
String name;
size_t threads_count TSA_GUARDED_BY(mutex) = 0;
std::atomic<size_t> max_tasks_count = 0;
CurrentMetrics::Metric metric;
CurrentMetrics::Increment max_tasks_metric;
void routine(TaskRuntimeDataPtr item);
/// libc++ does not provide TSA support for std::unique_lock -> TSA_NO_THREAD_SAFETY_ANALYSIS
void threadFunction() TSA_NO_THREAD_SAFETY_ANALYSIS;
/// Initially it will be empty
Queue pending TSA_GUARDED_BY(mutex);
boost::circular_buffer<TaskRuntimeDataPtr> active TSA_GUARDED_BY(mutex);
mutable std::mutex mutex;
std::condition_variable has_tasks TSA_GUARDED_BY(mutex);
bool shutdown TSA_GUARDED_BY(mutex) = false;
std::unique_ptr<ThreadPool> pool;
LoggerPtr log = getLogger("MergeTreeBackgroundExecutor");
/// Explicit instantiation declarations: these specializations are not implicitly instantiated by
/// translation units that include this header. The matching explicit instantiation definitions
/// are expected in a source file (not visible here).
extern template class MergeTreeBackgroundExecutor<RoundRobinRuntimeQueue>;
extern template class MergeTreeBackgroundExecutor<PriorityRuntimeQueue>;
extern template class MergeTreeBackgroundExecutor<DynamicRuntimeQueue>;