Merge pull request #64985 from ClickHouse/allocation-free-scheduler-activation

Faster processing of scheduler queue activations
This commit is contained in:
Sergei Trifonov 2024-06-18 14:18:56 +00:00 committed by GitHub
commit 741e453510
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
4 changed files with 341 additions and 143 deletions

View File

@ -11,10 +11,10 @@
#include <Poco/Util/XMLConfiguration.h>
#include <boost/noncopyable.hpp>
#include <boost/intrusive/list.hpp>
#include <chrono>
#include <deque>
#include <queue>
#include <algorithm>
#include <functional>
#include <memory>
@ -30,6 +30,8 @@ namespace ErrorCodes
}
class ISchedulerNode;
class EventQueue;
using EventId = UInt64;
inline const Poco::Util::AbstractConfiguration & emptyConfig()
{
@ -82,6 +84,115 @@ struct SchedulerNodeInfo
}
};
/*
* Node of hierarchy for scheduling requests for resource. Base class for all
* kinds of scheduling elements (queues, policies, constraints and schedulers).
*
* Root node is a scheduler, which has it's thread to dequeue requests,
* execute requests (see ResourceRequest) and process events in a thread-safe manner.
* Immediate children of the scheduler represent independent resources.
* Each resource has it's own hierarchy to achieve required scheduling policies.
* Non-leaf nodes do not hold requests, but keep scheduling state
* (e.g. consumption history, amount of in-flight requests, etc).
* Leafs of hierarchy are queues capable of holding pending requests.
*
* scheduler (SchedulerRoot)
* / \
* constraint constraint (SemaphoreConstraint)
* | |
* policy policy (PriorityPolicy)
* / \ / \
* q1 q2 q3 q4 (FifoQueue)
*
* Dequeueing request from an inner node will dequeue request from one of active leaf-queues in its subtree.
* Node is considered to be active iff:
* - it has at least one pending request in one of leaves of it's subtree;
* - and enforced constraints, if any, are satisfied
* (e.g. amount of concurrent requests is not greater than some number).
*
* All methods must be called only from scheduler thread for thread-safety.
*/
class ISchedulerNode : public boost::intrusive::list_base_hook<>, private boost::noncopyable
{
public:
explicit ISchedulerNode(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
: event_queue(event_queue_)
, info(config, config_prefix)
{}
virtual ~ISchedulerNode() = default;
/// Checks if two nodes configuration is equal
virtual bool equals(ISchedulerNode * other)
{
return info.equals(other->info);
}
/// Attach new child
virtual void attachChild(const std::shared_ptr<ISchedulerNode> & child) = 0;
/// Detach and destroy child
virtual void removeChild(ISchedulerNode * child) = 0;
/// Get attached child by name
virtual ISchedulerNode * getChild(const String & child_name) = 0;
/// Activation of child due to the first pending request
/// Should be called on leaf node (i.e. queue) to propagate activation signal through chain to the root
virtual void activateChild(ISchedulerNode * child) = 0;
/// Returns true iff node is active
virtual bool isActive() = 0;
/// Returns number of active children
virtual size_t activeChildren() = 0;
/// Returns the first request to be executed as the first component of resulting pair.
/// The second pair component is `true` iff node is still active after dequeueing.
virtual std::pair<ResourceRequest *, bool> dequeueRequest() = 0;
/// Returns full path string using names of every parent
String getPath()
{
String result;
ISchedulerNode * ptr = this;
while (ptr->parent)
{
result = "/" + ptr->basename + result;
ptr = ptr->parent;
}
return result.empty() ? "/" : result;
}
/// Attach to a parent (used by attachChild)
virtual void setParent(ISchedulerNode * parent_)
{
parent = parent_;
}
protected:
/// Notify parents about the first pending request or constraint becoming satisfied.
/// Postponed to be handled in scheduler thread, so it is intended to be called from outside.
void scheduleActivation();
public:
EventQueue * const event_queue;
String basename;
SchedulerNodeInfo info;
ISchedulerNode * parent = nullptr;
EventId activation_event_id = 0; // Valid for `ISchedulerNode` placed in EventQueue::activations
/// Introspection
std::atomic<UInt64> dequeued_requests{0};
std::atomic<UInt64> canceled_requests{0};
std::atomic<ResourceCost> dequeued_cost{0};
std::atomic<ResourceCost> canceled_cost{0};
std::atomic<UInt64> busy_periods{0};
};
using SchedulerNodePtr = std::shared_ptr<ISchedulerNode>;
/*
* Simple waitable thread-safe FIFO task queue.
* Intended to hold postponed events for later handling (usually by scheduler thread).
@ -89,57 +200,70 @@ struct SchedulerNodeInfo
class EventQueue
{
public:
using Event = std::function<void()>;
using Task = std::function<void()>;
static constexpr EventId not_postponed = 0;
using TimePoint = std::chrono::system_clock::time_point;
using Duration = std::chrono::system_clock::duration;
static constexpr UInt64 not_postponed = 0;
struct Event
{
const EventId event_id;
Task task;
Event(EventId event_id_, Task && task_)
: event_id(event_id_)
, task(std::move(task_))
{}
};
struct Postponed
{
TimePoint key;
UInt64 id; // for canceling
std::unique_ptr<Event> event;
EventId event_id; // for canceling
std::unique_ptr<Task> task;
Postponed(TimePoint key_, UInt64 id_, Event && event_)
Postponed(TimePoint key_, EventId event_id_, Task && task_)
: key(key_)
, id(id_)
, event(std::make_unique<Event>(std::move(event_)))
, event_id(event_id_)
, task(std::make_unique<Task>(std::move(task_)))
{}
bool operator<(const Postponed & rhs) const
{
return std::tie(key, id) > std::tie(rhs.key, rhs.id); // reversed for min-heap
return std::tie(key, event_id) > std::tie(rhs.key, rhs.event_id); // reversed for min-heap
}
};
/// Add an `event` to be processed after `until` time point.
/// Returns a unique id for canceling.
[[nodiscard]] UInt64 postpone(TimePoint until, Event && event)
/// Returns a unique event id for canceling.
[[nodiscard]] EventId postpone(TimePoint until, Task && task)
{
std::unique_lock lock{mutex};
if (postponed.empty() || until < postponed.front().key)
pending.notify_one();
auto id = ++last_id;
postponed.emplace_back(until, id, std::move(event));
auto event_id = ++last_event_id;
postponed.emplace_back(until, event_id, std::move(task));
std::push_heap(postponed.begin(), postponed.end());
return id;
return event_id;
}
/// Cancel a postponed event using its unique id.
/// NOTE: Only postponed events can be canceled.
/// NOTE: If you need to cancel enqueued event, consider doing your actions inside another enqueued
/// NOTE: event instead. This ensures that all previous events are processed.
bool cancelPostponed(UInt64 postponed_id)
bool cancelPostponed(EventId postponed_event_id)
{
if (postponed_id == not_postponed)
if (postponed_event_id == not_postponed)
return false;
std::unique_lock lock{mutex};
for (auto i = postponed.begin(), e = postponed.end(); i != e; ++i)
{
if (i->id == postponed_id)
if (i->event_id == postponed_event_id)
{
postponed.erase(i);
// It is O(n), but we do not expect either big heaps or frequent cancels. So it is fine.
// It is O(n), but we do not expect neither big heaps nor frequent cancels. So it is fine.
std::make_heap(postponed.begin(), postponed.end());
return true;
}
@ -148,11 +272,23 @@ public:
}
/// Add an `event` for immediate processing
void enqueue(Event && event)
void enqueue(Task && task)
{
std::unique_lock lock{mutex};
bool was_empty = queue.empty();
queue.emplace_back(event);
bool was_empty = events.empty() && activations.empty();
auto event_id = ++last_event_id;
events.emplace_back(event_id, std::move(task));
if (was_empty)
pending.notify_one();
}
/// Add an activation `event` for immediate processing. Activations use a separate queue for performance reasons.
void enqueueActivation(ISchedulerNode * node)
{
std::unique_lock lock{mutex};
bool was_empty = events.empty() && activations.empty();
node->activation_event_id = ++last_event_id;
activations.push_back(*node);
if (was_empty)
pending.notify_one();
}
@ -163,7 +299,7 @@ public:
bool forceProcess()
{
std::unique_lock lock{mutex};
if (!queue.empty())
if (!events.empty() || !activations.empty())
{
processQueue(std::move(lock));
return true;
@ -181,7 +317,7 @@ public:
bool tryProcess()
{
std::unique_lock lock{mutex};
if (!queue.empty())
if (!events.empty() || !activations.empty())
{
processQueue(std::move(lock));
return true;
@ -205,7 +341,7 @@ public:
std::unique_lock lock{mutex};
while (true)
{
if (!queue.empty())
if (!events.empty() || !activations.empty())
{
processQueue(std::move(lock));
return;
@ -269,141 +405,63 @@ private:
void processQueue(std::unique_lock<std::mutex> && lock)
{
Event event = std::move(queue.front());
queue.pop_front();
if (events.empty())
return processActivation(std::move(lock));
if (activations.empty())
return processEvent(std::move(lock));
if (activations.front().activation_event_id < events.front().event_id)
return processActivation(std::move(lock));
else
return processEvent(std::move(lock));
}
void processActivation(std::unique_lock<std::mutex> && lock)
{
ISchedulerNode * node = &activations.front();
activations.pop_front();
node->activation_event_id = 0;
lock.unlock(); // do not hold queue mutex while processing events
event();
node->parent->activateChild(node);
}
void processEvent(std::unique_lock<std::mutex> && lock)
{
Task task = std::move(events.front().task);
events.pop_front();
lock.unlock(); // do not hold queue mutex while processing events
task();
}
void processPostponed(std::unique_lock<std::mutex> && lock)
{
Event event = std::move(*postponed.front().event);
Task task = std::move(*postponed.front().task);
std::pop_heap(postponed.begin(), postponed.end());
postponed.pop_back();
lock.unlock(); // do not hold queue mutex while processing events
event();
task();
}
std::mutex mutex;
std::condition_variable pending;
std::deque<Event> queue;
// `events` and `activations` logically represent one ordered queue. To preserve the common order we use `EventId`
// Activations are stored in a separate queue for performance reasons (mostly to avoid any allocations)
std::deque<Event> events;
boost::intrusive::list<ISchedulerNode> activations;
std::vector<Postponed> postponed;
UInt64 last_id = 0;
EventId last_event_id = 0;
std::atomic<TimePoint> manual_time{TimePoint()}; // for tests only
};
/*
* Node of hierarchy for scheduling requests for resource. Base class for all
* kinds of scheduling elements (queues, policies, constraints and schedulers).
*
* Root node is a scheduler, which has it's thread to dequeue requests,
* execute requests (see ResourceRequest) and process events in a thread-safe manner.
* Immediate children of the scheduler represent independent resources.
* Each resource has it's own hierarchy to achieve required scheduling policies.
* Non-leaf nodes do not hold requests, but keep scheduling state
* (e.g. consumption history, amount of in-flight requests, etc).
* Leafs of hierarchy are queues capable of holding pending requests.
*
* scheduler (SchedulerRoot)
* / \
* constraint constraint (SemaphoreConstraint)
* | |
* policy policy (PriorityPolicy)
* / \ / \
* q1 q2 q3 q4 (FifoQueue)
*
* Dequeueing request from an inner node will dequeue request from one of active leaf-queues in its subtree.
* Node is considered to be active iff:
* - it has at least one pending request in one of leaves of it's subtree;
* - and enforced constraints, if any, are satisfied
* (e.g. amount of concurrent requests is not greater than some number).
*
* All methods must be called only from scheduler thread for thread-safety.
*/
class ISchedulerNode : private boost::noncopyable
inline void ISchedulerNode::scheduleActivation()
{
public:
explicit ISchedulerNode(EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
: event_queue(event_queue_)
, info(config, config_prefix)
{}
virtual ~ISchedulerNode() = default;
/// Checks if two nodes configuration is equal
virtual bool equals(ISchedulerNode * other)
if (likely(parent))
{
return info.equals(other->info);
// The same as `enqueue([this] { parent->activateChild(this); });` but faster
event_queue->enqueueActivation(this);
}
/// Attach new child
virtual void attachChild(const std::shared_ptr<ISchedulerNode> & child) = 0;
/// Detach and destroy child
virtual void removeChild(ISchedulerNode * child) = 0;
/// Get attached child by name
virtual ISchedulerNode * getChild(const String & child_name) = 0;
/// Activation of child due to the first pending request
/// Should be called on leaf node (i.e. queue) to propagate activation signal through chain to the root
virtual void activateChild(ISchedulerNode * child) = 0;
/// Returns true iff node is active
virtual bool isActive() = 0;
/// Returns number of active children
virtual size_t activeChildren() = 0;
/// Returns the first request to be executed as the first component of resulting pair.
/// The second pair component is `true` iff node is still active after dequeueing.
virtual std::pair<ResourceRequest *, bool> dequeueRequest() = 0;
/// Returns full path string using names of every parent
String getPath()
{
String result;
ISchedulerNode * ptr = this;
while (ptr->parent)
{
result = "/" + ptr->basename + result;
ptr = ptr->parent;
}
return result.empty() ? "/" : result;
}
/// Attach to a parent (used by attachChild)
virtual void setParent(ISchedulerNode * parent_)
{
parent = parent_;
}
protected:
/// Notify parents about the first pending request or constraint becoming satisfied.
/// Postponed to be handled in scheduler thread, so it is intended to be called from outside.
void scheduleActivation()
{
if (likely(parent))
{
event_queue->enqueue([this] { parent->activateChild(this); });
}
}
public:
EventQueue * const event_queue;
String basename;
SchedulerNodeInfo info;
ISchedulerNode * parent = nullptr;
/// Introspection
std::atomic<UInt64> dequeued_requests{0};
std::atomic<UInt64> canceled_requests{0};
std::atomic<ResourceCost> dequeued_cost{0};
std::atomic<ResourceCost> canceled_cost{0};
std::atomic<UInt64> busy_periods{0};
};
using SchedulerNodePtr = std::shared_ptr<ISchedulerNode>;
}
}

View File

@ -0,0 +1,143 @@
#include <chrono>
#include <gtest/gtest.h>
#include <Common/Scheduler/ISchedulerNode.h>
using namespace DB;
class FakeSchedulerNode : public ISchedulerNode
{
public:
explicit FakeSchedulerNode(String & log_, EventQueue * event_queue_, const Poco::Util::AbstractConfiguration & config = emptyConfig(), const String & config_prefix = {})
: ISchedulerNode(event_queue_, config, config_prefix)
, log(log_)
{}
void attachChild(const SchedulerNodePtr & child) override
{
log += " +" + child->basename;
}
void removeChild(ISchedulerNode * child) override
{
log += " -" + child->basename;
}
ISchedulerNode * getChild(const String & /* child_name */) override
{
return nullptr;
}
void activateChild(ISchedulerNode * child) override
{
log += " A" + child->basename;
}
bool isActive() override
{
return false;
}
size_t activeChildren() override
{
return 0;
}
std::pair<ResourceRequest *, bool> dequeueRequest() override
{
log += " D";
return {nullptr, false};
}
private:
String & log;
};
struct QueueTest {
String log;
EventQueue event_queue;
FakeSchedulerNode root_node;
QueueTest()
: root_node(log, &event_queue)
{}
SchedulerNodePtr makeNode(const String & name)
{
auto node = std::make_shared<FakeSchedulerNode>(log, &event_queue);
node->basename = name;
node->setParent(&root_node);
return std::static_pointer_cast<ISchedulerNode>(node);
}
void process(EventQueue::TimePoint now, const String & expected_log, size_t limit = size_t(-1))
{
event_queue.setManualTime(now);
for (;limit > 0; limit--)
{
if (!event_queue.tryProcess())
break;
}
EXPECT_EQ(log, expected_log);
log.clear();
}
void activate(const SchedulerNodePtr & node)
{
event_queue.enqueueActivation(node.get());
}
void event(const String & text)
{
event_queue.enqueue([this, text] { log += " " + text; });
}
EventId postpone(EventQueue::TimePoint until, const String & text)
{
return event_queue.postpone(until, [this, text] { log += " " + text; });
}
void cancel(EventId event_id)
{
event_queue.cancelPostponed(event_id);
}
};
TEST(SchedulerEventQueue, Smoke)
{
QueueTest t;
using namespace std::chrono_literals;
EventQueue::TimePoint start = std::chrono::system_clock::now();
t.process(start, "", 0);
// Activations
auto node1 = t.makeNode("1");
auto node2 = t.makeNode("2");
t.activate(node2);
t.activate(node1);
t.process(start + 42s, " A2 A1");
// Events
t.event("E1");
t.event("E2");
t.process(start + 100s, " E1 E2");
// Postponed events
t.postpone(start + 200s, "P200");
auto p190 = t.postpone(start + 200s, "P190");
t.postpone(start + 150s, "P150");
t.postpone(start + 175s, "P175");
t.process(start + 180s, " P150 P175");
t.event("E3");
t.cancel(p190);
t.process(start + 300s, " E3 P200");
// Ordering of events and activations
t.event("E1");
t.activate(node1);
t.event("E2");
t.activate(node2);
t.process(start + 300s, " E1 A1 E2 A2");
}

View File

@ -5,8 +5,6 @@
#include <Common/Scheduler/Nodes/FairPolicy.h>
#include <Common/Scheduler/Nodes/ThrottlerConstraint.h>
#include "Common/Scheduler/ISchedulerNode.h"
#include "Common/Scheduler/ResourceRequest.h"
using namespace DB;

View File

@ -12,7 +12,6 @@
#include <Common/Scheduler/Nodes/ThrottlerConstraint.h>
#include <Common/Scheduler/Nodes/FifoQueue.h>
#include <Interpreters/Context.h>
#include "Common/Scheduler/ResourceRequest.h"
namespace DB