ClickHouse/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h
2024-01-25 14:31:49 +03:00

318 lines
11 KiB
C++

#pragma once
#include <deque>
#include <functional>
#include <mutex>
#include <future>
#include <condition_variable>
#include <set>
#include <variant>
#include <utility>
#include <boost/circular_buffer.hpp>
#include <boost/noncopyable.hpp>
#include <Poco/Event.h>
#include <Storages/MergeTree/IExecutableTask.h>
#include <base/defines.h>
#include <Common/CurrentMetrics.h>
#include <Common/Exception.h>
#include <Common/Stopwatch.h>
#include <Common/ThreadPool_fwd.h>
namespace DB
{
namespace ErrorCodes
{
extern const int NOT_IMPLEMENTED;
}
struct TaskRuntimeData;
using TaskRuntimeDataPtr = std::shared_ptr<TaskRuntimeData>;
/**
* Has RAII class to determine how many tasks are waiting for the execution and executing at the moment.
* Also has some flags and primitives to wait for current task to be executed.
*/
struct TaskRuntimeData
{
TaskRuntimeData(ExecutableTaskPtr && task_, CurrentMetrics::Metric metric_)
: task(std::move(task_))
, metric(metric_)
{
/// Increment and decrement a metric with sequentially consistent memory order
/// This is needed, because in unit test this metric is read from another thread
/// and some invariant is checked. With relaxed memory order we could read stale value
/// for this metric, that's why test can be failed.
CurrentMetrics::values[metric].fetch_add(1);
}
~TaskRuntimeData()
{
CurrentMetrics::values[metric].fetch_sub(1);
}
ExecutableTaskPtr task;
CurrentMetrics::Metric metric;
/// Guarded by MergeTreeBackgroundExecutor<>::mutex
bool is_currently_deleting{false};
/// Actually autoreset=false is needed only for unit test
/// where multiple threads could remove tasks corresponding to the same storage
/// This scenario in not possible in reality.
Poco::Event is_done{/*autoreset=*/false};
/// This is equal to task->getPriority() not to do useless virtual calls in comparator
Priority priority;
/// By default priority queue will have max element at top
static bool comparePtrByPriority(const TaskRuntimeDataPtr & lhs, const TaskRuntimeDataPtr & rhs)
{
return lhs->priority > rhs->priority;
}
};
/// Simplest First-in-First-out queue, ignores priority.
class RoundRobinRuntimeQueue
{
public:
TaskRuntimeDataPtr pop()
{
auto result = std::move(queue.front());
queue.pop_front();
return result;
}
void push(TaskRuntimeDataPtr item)
{
queue.push_back(std::move(item));
}
void remove(StorageID id)
{
auto it = std::remove_if(queue.begin(), queue.end(),
[&] (auto && item) -> bool { return item->task->getStorageID() == id; });
queue.erase(it, queue.end());
}
void setCapacity(size_t count) { queue.set_capacity(count); }
bool empty() { return queue.empty(); }
[[noreturn]] void updatePolicy(std::string_view)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method updatePolicy() is not implemented");
}
static constexpr std::string_view name = "round_robin";
private:
boost::circular_buffer<TaskRuntimeDataPtr> queue{0};
};
/// Uses a heap to pop a task with minimal priority.
class PriorityRuntimeQueue
{
public:
TaskRuntimeDataPtr pop()
{
std::pop_heap(buffer.begin(), buffer.end(), TaskRuntimeData::comparePtrByPriority);
auto result = std::move(buffer.back());
buffer.pop_back();
return result;
}
void push(TaskRuntimeDataPtr item)
{
item->priority = item->task->getPriority();
buffer.push_back(std::move(item));
std::push_heap(buffer.begin(), buffer.end(), TaskRuntimeData::comparePtrByPriority);
}
void remove(StorageID id)
{
std::erase_if(buffer, [&] (auto && item) -> bool { return item->task->getStorageID() == id; });
std::make_heap(buffer.begin(), buffer.end(), TaskRuntimeData::comparePtrByPriority);
}
void setCapacity(size_t count) { buffer.reserve(count); }
bool empty() { return buffer.empty(); }
[[noreturn]] void updatePolicy(std::string_view)
{
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Method updatePolicy() is not implemented");
}
static constexpr std::string_view name = "shortest_task_first";
private:
std::vector<TaskRuntimeDataPtr> buffer;
};
/// Queue that can dynamically change scheduling policy
template <class ... Policies>
class DynamicRuntimeQueueImpl
{
public:
TaskRuntimeDataPtr pop()
{
return std::visit<TaskRuntimeDataPtr>([&] (auto && queue) { return queue.pop(); }, impl);
}
void push(TaskRuntimeDataPtr item)
{
std::visit([&] (auto && queue) { queue.push(std::move(item)); }, impl);
}
void remove(StorageID id)
{
std::visit([&] (auto && queue) { queue.remove(id); }, impl);
}
void setCapacity(size_t count)
{
capacity = count;
std::visit([&] (auto && queue) { queue.setCapacity(count); }, impl);
}
bool empty()
{
return std::visit<bool>([&] (auto && queue) { return queue.empty(); }, impl);
}
// Change policy. It does nothing if new policy is unknown or equals current policy.
void updatePolicy(std::string_view name)
{
// We use this double lambda trick to generate code for all possible pairs of types of old and new queue.
// If types are different it moves tasks from old queue to new one using corresponding pop() and push()
resolve<Policies...>(name, [&] <class NewQueue> (std::in_place_type_t<NewQueue>)
{
std::visit([&] (auto && queue)
{
if constexpr (std::is_same_v<std::decay_t<decltype(queue)>, NewQueue>)
return; // The same policy
NewQueue new_queue;
new_queue.setCapacity(capacity);
while (!queue.empty())
new_queue.push(queue.pop());
impl = std::move(new_queue);
}, impl);
});
}
private:
// Find policy with specified `name` and call `func()` if found.
// Tag `std::in_place_type_t<T>` used to help templated lambda to deduce type T w/o creating its instance
template <class T, class ... Ts, class Func>
void resolve(std::string_view name, Func && func)
{
if (T::name == name)
return func(std::in_place_type<T>);
if constexpr (sizeof...(Ts))
return resolve<Ts...>(name, std::forward<Func>(func));
}
std::variant<Policies...> impl;
size_t capacity;
};
// Avoid typedef and alias to facilitate forward declaration
class DynamicRuntimeQueue : public DynamicRuntimeQueueImpl<RoundRobinRuntimeQueue, PriorityRuntimeQueue> {};
/**
* Executor for a background MergeTree related operations such as merges, mutations, fetches and so on.
* It can execute only successors of ExecutableTask interface.
* Which is a self-written coroutine. It suspends, when returns true from executeStep() method.
*
* There are two queues of a tasks: pending (main queue for all the tasks) and active (currently executing).
* Pending queue is needed since the number of tasks will be more than thread to execute.
* Pending tasks are tasks that successfully scheduled to an executor or tasks that have some extra steps to execute.
* There is an invariant, that task may occur only in one of these queue. It can occur in both queues only in critical sections.
*
* Pending: Active:
*
* |s| |s| |s| |s| |s| |s| |s| |s| |s| |s| |s|
* |s| |s| |s| |s| |s| |s| |s| |s| |s| |s|
* |s| |s| |s| |s| |s| |s| |s|
* |s| |s| |s| |s|
* |s| |s|
* |s|
*
* Each task is simply a sequence of steps. Heavier tasks have longer sequences.
* When a step of a task is executed, we move tasks to pending queue. And take the next task from pending queue.
* Next task is chosen from pending tasks using one of the scheduling policies (class Queue):
* 1) RoundRobinRuntimeQueue. Uses boost::circular_buffer as FIFO-queue. Next task is taken from queue's head and after one step
* enqueued into queue's tail. With this architecture all merges / mutations are fairly scheduled and never starved.
* All decisions regarding priorities are left to components creating tasks (e.g. SimpleMergeSelector).
* 2) PriorityRuntimeQueue. Uses heap to select task with smallest priority value.
* With this architecture all small merges / mutations will be executed faster, than bigger ones.
* WARNING: Starvation is possible in case of overload.
*
* We use boost::circular_buffer as a container for active queue to avoid allocations.
* Another nuisance that we face is that background operations always interact with an associated Storage.
* So, when a Storage wants to shutdown, it must wait until all its background operations are finished.
*/
template <class Queue>
class MergeTreeBackgroundExecutor final : boost::noncopyable
{
public:
MergeTreeBackgroundExecutor(
String name_,
size_t threads_count_,
size_t max_tasks_count_,
CurrentMetrics::Metric metric_,
CurrentMetrics::Metric max_tasks_metric_,
std::string_view policy = {});
~MergeTreeBackgroundExecutor();
/// Handler for hot-reloading
/// Supports only increasing the number of threads and tasks, because
/// implementing tasks eviction will definitely be too error-prone and buggy.
void increaseThreadsAndMaxTasksCount(size_t new_threads_count, size_t new_max_tasks_count);
size_t getMaxThreads() const;
/// This method can return stale value of max_tasks_count (no mutex locking).
/// It's okay because amount of tasks can be only increased and getting stale value
/// can lead only to some postponing, not logical error.
size_t getMaxTasksCount() const;
bool trySchedule(ExecutableTaskPtr task);
void removeTasksCorrespondingToStorage(StorageID id);
void wait();
/// Update scheduling policy for pending tasks. It does nothing if `new_policy` is the same or unknown.
void updateSchedulingPolicy(std::string_view new_policy)
{
std::lock_guard lock(mutex);
pending.updatePolicy(new_policy);
}
private:
String name;
size_t threads_count TSA_GUARDED_BY(mutex) = 0;
std::atomic<size_t> max_tasks_count = 0;
CurrentMetrics::Metric metric;
CurrentMetrics::Increment max_tasks_metric;
void routine(TaskRuntimeDataPtr item);
/// libc++ does not provide TSA support for std::unique_lock -> TSA_NO_THREAD_SAFETY_ANALYSIS
void threadFunction() TSA_NO_THREAD_SAFETY_ANALYSIS;
/// Initially it will be empty
Queue pending TSA_GUARDED_BY(mutex);
boost::circular_buffer<TaskRuntimeDataPtr> active TSA_GUARDED_BY(mutex);
mutable std::mutex mutex;
std::condition_variable has_tasks TSA_GUARDED_BY(mutex);
bool shutdown TSA_GUARDED_BY(mutex) = false;
std::unique_ptr<ThreadPool> pool;
LoggerPtr log = getLogger("MergeTreeBackgroundExecutor");
};
extern template class MergeTreeBackgroundExecutor<RoundRobinRuntimeQueue>;
extern template class MergeTreeBackgroundExecutor<PriorityRuntimeQueue>;
extern template class MergeTreeBackgroundExecutor<DynamicRuntimeQueue>;
}