ClickHouse/src/Storages/MergeTree/ParallelReplicasReadingCoordinator.cpp


#include <Storages/MergeTree/ParallelReplicasReadingCoordinator.h>
#include <algorithm>
#include <mutex>
#include <numeric>
#include <vector>
#include <map>
#include <set>
#include <consistent_hashing.h>
#include <Common/logger_useful.h>
#include <Common/SipHash.h>
#include <Common/thread_local_rng.h>
#include <base/types.h>
#include "IO/WriteBufferFromString.h"
#include "Storages/MergeTree/RangesInDataPart.h"
#include "Storages/MergeTree/RequestResponse.h"
#include <Storages/MergeTree/MarkRange.h>
#include <Storages/MergeTree/IntersectionsIndexes.h>
#include <fmt/format.h>
namespace DB
{
namespace ErrorCodes
{
extern const int LOGICAL_ERROR;
}
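/// Base interface for coordinator implementations: keeps per-replica statistics
/// and declares the announcement/request handlers that every coordination mode implements.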
class ParallelReplicasReadingCoordinator::ImplInterface
{
public:
struct Stat
{
size_t number_of_requests{0};
size_t sum_marks{0};
};
using Stats = std::vector<Stat>;
static String toString(Stats stats)
{
String result = "Statistics: ";
for (size_t i = 0; i < stats.size(); ++i)
result += fmt::format("-- replica {}, requests: {} marks: {} ", i, stats[i].number_of_requests, stats[i].sum_marks);
return result;
}
Stats stats;
std::mutex mutex;
size_t replicas_count;
explicit ImplInterface(size_t replicas_count_)
: stats{replicas_count_}
, replicas_count(replicas_count_)
{}
virtual ~ImplInterface() = default;
virtual ParallelReadResponse handleRequest(ParallelReadRequest request) = 0;
virtual void handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement) = 0;
};
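/// A part to be read together with the set of replicas that have announced it.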
struct Part
{
mutable RangesInDataPartDescription description;
// FIXME: This is needed to put this struct in a set
// and modify it through an iterator
mutable std::set<size_t> replicas;
bool operator<(const Part & rhs) const { return description.info < rhs.description.info; }
};
using Parts = std::set<Part>;
using PartRefs = std::deque<Parts::iterator>;
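/// Coordinator for CoordinationMode::Default: each part gets a preferred replica
/// via consistent hashing of its name, parts missing on their preferred replica
/// are kept in a delayed queue, and replicas may steal each other's tasks.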
class DefaultCoordinator : public ParallelReplicasReadingCoordinator::ImplInterface
{
public:
using ParallelReadRequestPtr = std::unique_ptr<ParallelReadRequest>;
using PartToMarkRanges = std::map<PartToRead::PartAndProjectionNames, HalfIntervals>;
explicit DefaultCoordinator(size_t replicas_count_)
: ParallelReplicasReadingCoordinator::ImplInterface(replicas_count_)
, announcements(replicas_count_)
, reading_state(replicas_count_)
{
}
~DefaultCoordinator() override;
struct PartitionReading
{
PartSegments part_ranges;
PartToMarkRanges mark_ranges_in_part;
};
using PartitionToBlockRanges = std::map<String, PartitionReading>;
PartitionToBlockRanges partitions;
size_t sent_initial_requests{0};
std::vector<InitialAllRangesAnnouncement> announcements;
Parts all_parts_to_read;
/// Contains only parts which we haven't started to read from
PartRefs delayed_parts;
/// Per-replica preferred parts split by consistent hash
/// Once a replica has finished all its own tasks, it can steal tasks from other replicas
std::vector<PartRefs> reading_state;
Poco::Logger * log = &Poco::Logger::get("DefaultCoordinator");
std::atomic<bool> state_initialized{false};
ParallelReadResponse handleRequest(ParallelReadRequest request) override;
void handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement) override;
void updateReadingState(const InitialAllRangesAnnouncement & announcement);
void finalizeReadingState();
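/// Pick the preferred replica for a part by hashing its name.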
size_t computeConsistentHash(const MergeTreePartInfo & info) const
{
auto hash = SipHash();
hash.update(info.getPartNameV1());
return ConsistentHashing(hash.get64(), replicas_count);
}
void selectPartsAndRanges(const PartRefs & container, size_t replica_num, size_t min_number_of_marks, size_t & current_mark_size, ParallelReadResponse & response) const;
};
DefaultCoordinator::~DefaultCoordinator()
{
LOG_INFO(log, "Coordination done: {}", toString(stats));
}
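/// Register the parts announced by a replica: deduplicate against the global state
/// and route each new part either to its preferred replica's queue or to the delayed queue.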
void DefaultCoordinator::updateReadingState(const InitialAllRangesAnnouncement & announcement)
{
PartRefs parts_diff;
/// To get rid of duplicates
for (const auto & part: announcement.description)
{
auto the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
[&part] (const Part & other) { return other.description.info.getPartNameV1() == part.info.getPartNameV1(); });
/// We have the same part - add the info about presence on current replica to it
if (the_same_it != all_parts_to_read.end())
{
the_same_it->replicas.insert(announcement.replica_num);
continue;
}
auto covering_or_the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
[&part] (const Part & other) { return !other.description.info.isDisjoint(part.info); });
/// It is a covering part, or we already have a covering one - skip it
if (covering_or_the_same_it != all_parts_to_read.end())
continue;
auto new_part = Part{
.description = part,
.replicas = {announcement.replica_num}
};
auto [insert_it, _] = all_parts_to_read.insert(new_part);
parts_diff.push_back(insert_it);
}
/// Split all parts by consistent hash
while (!parts_diff.empty())
{
auto current_part_it = parts_diff.front();
parts_diff.pop_front();
auto consistent_hash = computeConsistentHash(current_part_it->description.info);
/// Check whether the new part can go straight to its preferred replica's queue
if (current_part_it->replicas.contains(consistent_hash))
{
reading_state[consistent_hash].emplace_back(current_part_it);
continue;
}
/// Add to delayed parts
delayed_parts.emplace_back(current_part_it);
}
}
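/// Assign the remaining delayed parts: to the preferred replica if it has the part,
/// otherwise to a random replica that does, then log the resulting state.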
void DefaultCoordinator::finalizeReadingState()
{
/// Drain the whole delayed queue
while (!delayed_parts.empty())
{
auto current_part_it = delayed_parts.front();
auto consistent_hash = computeConsistentHash(current_part_it->description.info);
if (current_part_it->replicas.contains(consistent_hash))
{
reading_state[consistent_hash].emplace_back(current_part_it);
delayed_parts.pop_front();
continue;
}
/// In this situation just assign to a random replica which has this part
auto replica = *(std::next(current_part_it->replicas.begin(), thread_local_rng() % current_part_it->replicas.size()));
reading_state[replica].emplace_back(current_part_it);
delayed_parts.pop_front();
}
String description;
for (const auto & part : all_parts_to_read)
{
description += part.description.describe();
description += fmt::format("Replicas: ({}) --- ", fmt::join(part.replicas, ","));
}
LOG_INFO(log, "Reading state is fully initialized: {}", description);
}
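/// Merge a replica's announcement into the reading state; once every replica
/// has announced, the distribution of parts is finalized.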
void DefaultCoordinator::handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement)
{
std::lock_guard lock(mutex);
updateReadingState(announcement);
stats[announcement.replica_num].number_of_requests += 1;
++sent_initial_requests;
LOG_INFO(log, "{} {}", sent_initial_requests, replicas_count);
if (sent_initial_requests == replicas_count)
finalizeReadingState();
}
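/// Collect mark ranges from the parts in `container` that are present on `replica_num`,
/// splitting large ranges as needed, until at least `min_number_of_marks` marks are gathered.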
void DefaultCoordinator::selectPartsAndRanges(const PartRefs & container, size_t replica_num, size_t min_number_of_marks, size_t & current_mark_size, ParallelReadResponse & response) const
{
for (const auto & part : container)
{
if (current_mark_size >= min_number_of_marks)
{
LOG_TEST(log, "Current mark size {} is bigger than min_number_marks {}", current_mark_size, min_number_of_marks);
break;
}
if (part->description.ranges.empty())
{
LOG_TEST(log, "Part {} is already empty in reading state", part->description.info.getPartNameV1());
continue;
}
if (std::find(part->replicas.begin(), part->replicas.end(), replica_num) == part->replicas.end())
{
LOG_TEST(log, "Not found part {} on replica {}", part->description.info.getPartNameV1(), replica_num);
continue;
}
response.description.push_back({
.info = part->description.info,
.ranges = {},
});
while (!part->description.ranges.empty() && current_mark_size < min_number_of_marks)
{
auto & range = part->description.ranges.front();
if (range.getNumberOfMarks() > min_number_of_marks)
{
auto new_range = range;
range.begin += min_number_of_marks;
new_range.end = new_range.begin + min_number_of_marks;
response.description.back().ranges.emplace_back(new_range);
current_mark_size += new_range.getNumberOfMarks();
continue;
}
current_mark_size += part->description.ranges.front().getNumberOfMarks();
response.description.back().ranges.emplace_back(part->description.ranges.front());
part->description.ranges.pop_front();
}
}
}
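/// Serve a read request: first from the replica's own queue, then from the delayed queue,
/// and finally by stealing tasks assigned to other replicas.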
ParallelReadResponse DefaultCoordinator::handleRequest(ParallelReadRequest request)
{
std::lock_guard lock(mutex);
LOG_TRACE(log, "Handling request from replica {}, minimal marks size is {}", request.replica_num, request.min_number_of_marks);
size_t current_mark_size = 0;
ParallelReadResponse response;
/// 1. Try to select from preferred set of parts for current replica
selectPartsAndRanges(reading_state[request.replica_num], request.replica_num, request.min_number_of_marks, current_mark_size, response);
/// 2. Try to use parts from delayed queue
while (!delayed_parts.empty() && current_mark_size < request.min_number_of_marks)
{
auto part = delayed_parts.front();
delayed_parts.pop_front();
reading_state[request.replica_num].emplace_back(part);
selectPartsAndRanges(reading_state[request.replica_num], request.replica_num, request.min_number_of_marks, current_mark_size, response);
}
/// 3. Try to steal tasks from other replicas
if (current_mark_size < request.min_number_of_marks)
{
for (size_t i = 0; i < replicas_count; ++i)
{
if (i != request.replica_num)
selectPartsAndRanges(reading_state[i], request.replica_num, request.min_number_of_marks, current_mark_size, response);
if (current_mark_size >= request.min_number_of_marks)
break;
}
}
stats[request.replica_num].number_of_requests += 1;
stats[request.replica_num].sum_marks += current_mark_size;
if (response.description.empty())
response.finish = true;
LOG_TRACE(log, "Going to respond to replica {} with {}", request.replica_num, response.describe());
return response;
}
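/// Coordinator for WithOrder/ReverseOrder modes: each replica reads only the parts
/// it announced, and the coordinator hands out their ranges in forward or reverse mark order.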
template <CoordinationMode mode>
class InOrderCoordinator : public ParallelReplicasReadingCoordinator::ImplInterface
{
public:
explicit InOrderCoordinator([[ maybe_unused ]] size_t replicas_count_)
: ParallelReplicasReadingCoordinator::ImplInterface(replicas_count_)
{}
~InOrderCoordinator() override
{
LOG_INFO(log, "Coordination done: {}", toString(stats));
}
ParallelReadResponse handleRequest([[ maybe_unused ]] ParallelReadRequest request) override;
void handleInitialAllRangesAnnouncement([[ maybe_unused ]] InitialAllRangesAnnouncement announcement) override;
Parts all_parts_to_read;
Poco::Logger * log = &Poco::Logger::get(fmt::format("{}{}", magic_enum::enum_name(mode), "Coordinator"));
};
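/// Record the announced parts, skipping duplicates and parts covered by (or covering)
/// already known ones, and keep each new part's ranges sorted.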
template <CoordinationMode mode>
void InOrderCoordinator<mode>::handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement)
{
std::lock_guard lock(mutex);
LOG_TRACE(log, "Received an announecement {}", announcement.describe());
/// To get rid of duplicates
for (const auto & part: announcement.description)
{
auto the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
[&part] (const Part & other) { return other.description.info == part.info; });
/// We have the same part - add the info about presence on current replica to it
if (the_same_it != all_parts_to_read.end())
{
the_same_it->replicas.insert(announcement.replica_num);
continue;
}
auto covering_or_the_same_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
[&part] (const Part & other) { return other.description.info.contains(part.info) || part.info.contains(other.description.info); });
/// It is a covering part, or we already have a covering one - skip it
if (covering_or_the_same_it != all_parts_to_read.end())
continue;
auto new_part = Part{
.description = part,
.replicas = {announcement.replica_num}
};
auto insert_it = all_parts_to_read.insert(new_part);
auto & ranges = insert_it.first->description.ranges;
std::sort(ranges.begin(), ranges.end());
}
}
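/// Give the replica up to min_number_of_marks marks per requested part,
/// taking ranges from the back for ReverseOrder or from the front for WithOrder.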
template <CoordinationMode mode>
ParallelReadResponse InOrderCoordinator<mode>::handleRequest(ParallelReadRequest request)
{
std::lock_guard lock(mutex);
if (request.mode != mode)
throw Exception(ErrorCodes::LOGICAL_ERROR,
"Replica {} decided to read in {} mode, not in {}. This is a bug",
request.replica_num, magic_enum::enum_name(request.mode), magic_enum::enum_name(mode));
LOG_TRACE(log, "Got request from replica {}, data {}", request.replica_num, request.describe());
ParallelReadResponse response;
response.description = request.description;
size_t overall_number_of_marks = 0;
for (auto & part : response.description)
{
auto global_part_it = std::find_if(all_parts_to_read.begin(), all_parts_to_read.end(),
[&part] (const Part & other) { return other.description.info == part.info; });
if (global_part_it == all_parts_to_read.end())
continue;
if (!global_part_it->replicas.contains(request.replica_num))
throw Exception(ErrorCodes::LOGICAL_ERROR, "Part {} doesn't exist on replica {} according to the global state", part.info.getPartNameV1(), request.replica_num);
size_t current_mark_size = 0;
/// Now we can recommend to read more intervals
if constexpr (mode == CoordinationMode::ReverseOrder)
{
while (!global_part_it->description.ranges.empty() && current_mark_size < request.min_number_of_marks)
{
auto range = global_part_it->description.ranges.back();
if (range.getNumberOfMarks() > request.min_number_of_marks)
{
auto new_range = range;
range.end -= request.min_number_of_marks;
new_range.begin = new_range.end - request.min_number_of_marks;
global_part_it->description.ranges.back() = range;
part.ranges.emplace_front(new_range);
current_mark_size += new_range.getNumberOfMarks();
continue;
}
current_mark_size += global_part_it->description.ranges.back().getNumberOfMarks();
part.ranges.emplace_front(global_part_it->description.ranges.back());
global_part_it->description.ranges.pop_back();
}
}
else if constexpr (mode == CoordinationMode::WithOrder)
{
while (!global_part_it->description.ranges.empty() && current_mark_size < request.min_number_of_marks)
{
auto range = global_part_it->description.ranges.front();
if (range.getNumberOfMarks() > request.min_number_of_marks)
{
auto new_range = range;
range.begin += request.min_number_of_marks;
new_range.end = new_range.begin + request.min_number_of_marks;
global_part_it->description.ranges.front() = range;
part.ranges.emplace_back(new_range);
current_mark_size += new_range.getNumberOfMarks();
continue;
}
current_mark_size += global_part_it->description.ranges.front().getNumberOfMarks();
part.ranges.emplace_back(global_part_it->description.ranges.front());
global_part_it->description.ranges.pop_front();
}
}
overall_number_of_marks += current_mark_size;
}
if (!overall_number_of_marks)
response.finish = true;
stats[request.replica_num].number_of_requests += 1;
stats[request.replica_num].sum_marks += overall_number_of_marks;
LOG_TRACE(log, "Going to respond to replica {} with {}", request.replica_num, response.describe());
return response;
}
void ParallelReplicasReadingCoordinator::handleInitialAllRangesAnnouncement(InitialAllRangesAnnouncement announcement)
{
if (!pimpl)
initialize();
return pimpl->handleInitialAllRangesAnnouncement(announcement);
}
ParallelReadResponse ParallelReplicasReadingCoordinator::handleRequest(ParallelReadRequest request)
{
if (!pimpl)
initialize();
return pimpl->handleRequest(std::move(request));
}
void ParallelReplicasReadingCoordinator::setMode(CoordinationMode mode_)
{
mode = mode_;
}
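/// Lazily create the implementation that matches the configured coordination mode.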
void ParallelReplicasReadingCoordinator::initialize()
{
switch (mode)
{
case CoordinationMode::Default:
pimpl = std::make_unique<DefaultCoordinator>(replicas_count);
return;
case CoordinationMode::WithOrder:
pimpl = std::make_unique<InOrderCoordinator<CoordinationMode::WithOrder>>(replicas_count);
return;
case CoordinationMode::ReverseOrder:
pimpl = std::make_unique<InOrderCoordinator<CoordinationMode::ReverseOrder>>(replicas_count);
return;
}
}
ParallelReplicasReadingCoordinator::ParallelReplicasReadingCoordinator(size_t replicas_count_) : replicas_count(replicas_count_) {}
ParallelReplicasReadingCoordinator::~ParallelReplicasReadingCoordinator() = default;
}