mirror of https://github.com/ClickHouse/ClickHouse.git
commit 847128152e ("Merge"), parent 95fc39c559
@@ -12,10 +12,11 @@ class MergeTreeDataMerger
public:
    MergeTreeDataMerger(MergeTreeData & data_) : data(data_), log(&Logger::get("MergeTreeDataMerger")), canceled(false) {}

-   typedef boost::function<bool (const MergeTreeData::DataPartPtr &, const MergeTreeData::DataPartPtr &)> AllowedMergingPredicate;
+   typedef std::function<bool (const MergeTreeData::DataPartPtr &, const MergeTreeData::DataPartPtr &)> AllowedMergingPredicate;

    /** Selects which parts to merge. Uses a bunch of heuristics.
      * If merge_anything_for_old_months is set, the restriction on the size ratio is lifted for parts from past months.
      * If available_disk_space > 0, selects parts so that there is enough disk space for their merge, with a margin.
      *
      * can_merge is a function that decides whether a pair of adjacent parts may be merged.
      * It must coordinate merges with inserts and with other merges, ensuring that:
@@ -24,6 +25,7 @@ public:
      */
    bool selectPartsToMerge(
        MergeTreeData::DataPartsVector & what,
+       String & merged_name,
        size_t available_disk_space,
        bool merge_anything_for_old_months,
        bool aggressive,
@@ -31,7 +33,7 @@ public:
        const AllowedMergingPredicate & can_merge);

    /// Merges the parts. Returns the name of the new part, or an empty string if the merge was canceled.
-   String mergeParts(const MergeTreeData::DataPartsVector & parts);
+   String mergeParts(const MergeTreeData::DataPartsVector & parts, const String & merged_name);

    /// Approximate amount of disk space needed for the merge, with a margin.
    size_t estimateDiskSpaceForMerge(const MergeTreeData::DataPartsVector & parts);
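This header change splits name selection from the merge itself: selectPartsToMerge now also reports the name the result part must get, and mergeParts takes that name instead of recomputing it. A minimal sketch of the intended calling pattern, modeled on the StorageMergeTree::mergeThread change further below (merger, disk_space, and canMergeParts are assumed from that context):

    MergeTreeData::DataPartsVector parts;
    String merged_name;
    auto can_merge = std::bind(&StorageMergeTree::canMergeParts, this,
        std::placeholders::_1, std::placeholders::_2);

    /// First pass with the strict size-ratio rules, second pass with the relaxation for old months.
    if (merger.selectPartsToMerge(parts, merged_name, disk_space, false, false, false, can_merge) ||
        merger.selectPartsToMerge(parts, merged_name, disk_space, true, false, false, can_merge))
        merger.mergeParts(parts, merged_name);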
@@ -5,8 +5,9 @@
#include <DB/Storages/MergeTree/MergeTreeDataMerger.h>
#include <DB/Storages/MergeTree/MergeTreeDataWriter.h>
#include <DB/Storages/MergeTree/MergeTreeDataSelectExecutor.h>
-#include "MergeTree/ReplicatedMergeTreePartsExchange.h"
+#include <DB/Storages/MergeTree/ReplicatedMergeTreePartsExchange.h>
#include <zkutil/ZooKeeper.h>
#include <zkutil/LeaderElection.h>
#include <statdaemons/threadpool.hpp>

namespace DB
@@ -66,6 +67,44 @@ public:
private:
    friend class ReplicatedMergeTreeBlockOutputStream;

+   struct CurrentlyMergingPartsTagger
+   {
+       Strings parts;
+       StorageReplicatedMergeTree & storage;
+
+       CurrentlyMergingPartsTagger(const Strings & parts_, StorageReplicatedMergeTree & storage_)
+           : parts(parts_), storage(storage_)
+       {
+           Poco::ScopedLock<Poco::FastMutex> lock(storage.currently_merging_mutex);
+           for (const auto & name : parts)
+           {
+               if (storage.currently_merging.count(name))
+                   throw Exception("Tagging already tagged part " + name + ". This is a bug.", ErrorCodes::LOGICAL_ERROR);
+           }
+           storage.currently_merging.insert(parts.begin(), parts.end());
+       }
+
+       ~CurrentlyMergingPartsTagger()
+       {
+           try
+           {
+               Poco::ScopedLock<Poco::FastMutex> lock(storage.currently_merging_mutex);
+               for (const auto & name : parts)
+               {
+                   if (!storage.currently_merging.count(name))
+                       throw Exception("Untagging already untagged part " + name + ". This is a bug.", ErrorCodes::LOGICAL_ERROR);
+                   storage.currently_merging.erase(name);
+               }
+           }
+           catch (...)
+           {
+               tryLogCurrentException(__PRETTY_FUNCTION__);
+           }
+       }
+   };
+
+   typedef Poco::SharedPtr<CurrentlyMergingPartsTagger> CurrentlyMergingPartsTaggerPtr;
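CurrentlyMergingPartsTagger is a scope guard: the part names are inserted into currently_merging under the mutex on construction and erased in the destructor, even if the merge throws. A hedged sketch of the intended use (storage and parts_to_merge are assumed from the surrounding code):

    {
        /// Reserve the parts for the lifetime of this scope; canMergeParts() will refuse them meanwhile.
        CurrentlyMergingPartsTaggerPtr tagger = new CurrentlyMergingPartsTagger(parts_to_merge, storage);
        /// ... execute the merge ...
    }   /// the destructor removes the names from currently_merging again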

struct LogEntry
{
    enum Type
@@ -80,6 +119,14 @@ private:
    String new_part_name;
    Strings parts_to_merge;

+   CurrentlyMergingPartsTaggerPtr currently_merging_tagger;
+
+   void tagPartsAsCurrentlyMerging(StorageReplicatedMergeTree & storage)
+   {
+       if (type == MERGE_PARTS)
+           currently_merging_tagger = new CurrentlyMergingPartsTagger(parts_to_merge, storage);
+   }
+
    void writeText(WriteBuffer & out) const;
    void readText(ReadBuffer & in);
@@ -135,7 +182,9 @@ private:
    MergeTreeData data;
    MergeTreeDataSelectExecutor reader;
    MergeTreeDataWriter writer;
+   MergeTreeDataMerger merger;
    ReplicatedMergeTreePartsFetcher fetcher;
+   zkutil::LeaderElectionPtr leader_election;

    typedef std::vector<std::thread> Threads;

@@ -145,6 +194,15 @@ private:
    /// Threads that execute tasks from the queue.
    Threads queue_threads;

+   /// Thread that selects parts to merge.
+   std::thread merge_selecting_thread;
+
+   typedef std::set<String> StringSet;
+
+   /// Parts for which the queue contains a merge task.
+   StringSet currently_merging;
+   Poco::FastMutex currently_merging_mutex;
+
    Logger * log;

    volatile bool shutdown_called;
@@ -190,7 +248,7 @@ private:
      */
    void checkParts();

    /// Working with the queue and the log.
    /// Executing tasks from the queue.

    /** Puts entries from ZooKeeper (/replicas/me/queue/) into the queue.
      */
@@ -218,6 +276,17 @@ private:
      */
    void queueThread();

+   void becomeLeader();
+
+   /// Selecting parts to merge.
+
+   /** In an infinite loop, selects parts to merge and writes the result to the log.
+     */
+   void mergeSelectingThread();
+
+   /// Called while choosing parts to merge.
+   bool canMergeParts(const MergeTreeData::DataPartPtr & left, const MergeTreeData::DataPartPtr & right);
+
    /// Exchanging parts.

    /** Throws an exception if no replica has the part.
@@ -35,7 +35,7 @@ static const double DISK_USAGE_COEFFICIENT_TO_RESERVE = 1.4;
/// 4) If one of the threads is merging large parts, the other should merge only small pieces
/// 5) As the logarithm of the total size of the pieces in a merge grows, raise the balance requirement

-bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & parts, size_t available_disk_space,
+bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & parts, String & merged_name, size_t available_disk_space,
    bool merge_anything_for_old_months, bool aggressive, bool only_small, const AllowedMergingPredicate & can_merge)
{
    LOG_DEBUG(log, "Selecting parts to merge");
@@ -44,6 +44,9 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa

    DateLUTSingleton & date_lut = DateLUTSingleton::instance();

+   if (available_disk_space == 0)
+       available_disk_space = std::numeric_limits<size_t>::max();
+
    size_t min_max = -1U;
    size_t min_min = -1U;
    int max_len = 0;
@@ -209,13 +212,25 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa
    {
        parts.clear();

        DayNum_t left_date = DayNum_t(std::numeric_limits<UInt16>::max());
        DayNum_t right_date = DayNum_t(std::numeric_limits<UInt16>::min());
        UInt32 level = 0;

        MergeTreeData::DataParts::iterator it = best_begin;
        for (int i = 0; i < max_len; ++i)
        {
            parts.push_back(*it);

            level = std::max(level, parts[i]->level);
            left_date = std::min(left_date, parts[i]->left_date);
            right_date = std::max(right_date, parts[i]->right_date);

            ++it;
        }

        merged_name = MergeTreeData::getPartName(
            left_date, right_date, parts.front()->left, parts.back()->right, level + 1);

        LOG_DEBUG(log, "Selected " << parts.size() << " parts from " << parts.front()->name << " to " << parts.back()->name);
    }
    else
@@ -228,7 +243,7 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa


/// parts must be sorted.
-String MergeTreeDataMerger::mergeParts(const MergeTreeData::DataPartsVector & parts)
+String MergeTreeDataMerger::mergeParts(const MergeTreeData::DataPartsVector & parts, const String & merged_name)
{
    LOG_DEBUG(log, "Merging " << parts.size() << " parts: from " << parts.front()->name << " to " << parts.back()->name);

@@ -237,25 +252,9 @@ String MergeTreeDataMerger::mergeParts(const MergeTreeData::DataPartsVector & pa
    for (const auto & it : columns_list)
        all_column_names.push_back(it.first);

-   DateLUTSingleton & date_lut = DateLUTSingleton::instance();
-
    MergeTreeData::MutableDataPartPtr new_data_part = std::make_shared<MergeTreeData::DataPart>(data);
-   new_data_part->left_date = std::numeric_limits<UInt16>::max();
-   new_data_part->right_date = std::numeric_limits<UInt16>::min();
-   new_data_part->left = parts.front()->left;
-   new_data_part->right = parts.back()->right;
-   new_data_part->level = 0;
-   for (size_t i = 0; i < parts.size(); ++i)
-   {
-       new_data_part->level = std::max(new_data_part->level, parts[i]->level);
-       new_data_part->left_date = std::min(new_data_part->left_date, parts[i]->left_date);
-       new_data_part->right_date = std::max(new_data_part->right_date, parts[i]->right_date);
-   }
-   ++new_data_part->level;
-   new_data_part->name = MergeTreeData::getPartName(
-       new_data_part->left_date, new_data_part->right_date, new_data_part->left, new_data_part->right, new_data_part->level);
-   new_data_part->left_month = date_lut.toFirstDayNumOfMonth(new_data_part->left_date);
-   new_data_part->right_month = date_lut.toFirstDayNumOfMonth(new_data_part->right_date);
+   data.parsePartName(merged_name, *new_data_part);
+   new_data_part->name = merged_name;

    /** Read from all the parts, merge, and write into a new one.
      * Along the way, compute the expression for sorting.
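For reference, the values passed to getPartName above (left/right date, left/right block number, level) appear to be concatenated into the classic MergeTree part-name layout YYYYMMDD_YYYYMMDD_<left>_<right>_<level>; that layout is an assumption based on the arguments, not something this diff shows. A hypothetical stand-alone illustration (formatPartName is not a real function in the tree):

    #include <cstdio>

    static void formatPartName(char * buf, unsigned left_date, unsigned right_date,
        unsigned left, unsigned right, unsigned level)
    {
        /// Dates as YYYYMMDD, then the block-number range, then the merge level.
        std::snprintf(buf, 128, "%08u_%08u_%u_%u_%u", left_date, right_date, left, right, level);
    }

    int main()
    {
        char name[128];
        formatPartName(name, 20140301, 20140331, 1, 42, 3);
        std::puts(name);    /// prints: 20140301_20140331_1_42_3
    }

Passing merged_name into mergeParts and parsing it back with parsePartName ensures the executing replica produces a part with exactly the name that was selected and logged, rather than recomputing it.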
@@ -145,12 +145,13 @@ void StorageMergeTree::mergeThread(bool while_can, bool aggressive)
        /// By the end of this logical block the destructor must have been called, so that the removed parts can then be determined correctly
        /// The destructor must be called with currently_merging_mutex not locked.
        CurrentlyMergingPartsTaggerPtr merging_tagger;
+       String merged_name;

        {
            Poco::ScopedLock<Poco::FastMutex> lock(currently_merging_mutex);

            MergeTreeData::DataPartsVector parts;
-           auto can_merge = boost::bind(&StorageMergeTree::canMergeParts, this, _1, _2);
+           auto can_merge = std::bind(&StorageMergeTree::canMergeParts, this, std::placeholders::_1, std::placeholders::_2);
            bool only_small = false;

            /// If there is an active merge of large parts, restrict ourselves to merging only small pieces.
@@ -163,14 +164,14 @@ void StorageMergeTree::mergeThread(bool while_can, bool aggressive)
                }
            }

-           if (!merger.selectPartsToMerge(parts, disk_space, false, aggressive, only_small, can_merge) &&
-               !merger.selectPartsToMerge(parts, disk_space, true, aggressive, only_small, can_merge))
+           if (!merger.selectPartsToMerge(parts, merged_name, disk_space, false, aggressive, only_small, can_merge) &&
+               !merger.selectPartsToMerge(parts, merged_name, disk_space, true, aggressive, only_small, can_merge))
                break;

            merging_tagger = new CurrentlyMergingPartsTagger(parts, merger.estimateDiskSpaceForMerge(parts), *this);
        }

-       merger.mergeParts(merging_tagger->parts);
+       merger.mergeParts(merging_tagger->parts, merged_name);
    }

    if (shutdown_called)
@@ -13,6 +13,7 @@ const auto QUEUE_UPDATE_SLEEP = std::chrono::seconds(5);
const auto QUEUE_NO_WORK_SLEEP = std::chrono::seconds(5);
const auto QUEUE_ERROR_SLEEP = std::chrono::seconds(1);
const auto QUEUE_AFTER_WORK_SLEEP = std::chrono::seconds(0);
+const auto MERGE_SELECTING_SLEEP = std::chrono::seconds(5);


StorageReplicatedMergeTree::StorageReplicatedMergeTree(
@@ -31,10 +32,10 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
    :
    context(context_), zookeeper(context.getZooKeeper()),
    path(path_), name(name_), full_path(path + escapeForFileName(name) + '/'), zookeeper_path(zookeeper_path_),
-   replica_name(replica_name_),
+   replica_name(replica_name_), is_leader_node(false),
    data(full_path, columns_, context_, primary_expr_ast_, date_column_name_, sampling_expression_,
        index_granularity_, mode_, sign_column_, settings_),
-   reader(data), writer(data), fetcher(data),
+   reader(data), writer(data), merger(data), fetcher(data),
    log(&Logger::get("StorageReplicatedMergeTree")),
    shutdown_called(false)
{
@@ -62,6 +63,9 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
    loadQueue();
    activateReplica();

+   leader_election = new zkutil::LeaderElection(zookeeper_path + "/leader_election", zookeeper,
+       std::bind(&StorageReplicatedMergeTree::becomeLeader, this), replica_name);
+
    queue_updating_thread = std::thread(&StorageReplicatedMergeTree::queueUpdatingThread, this);
    for (size_t i = 0; i < settings_.replication_threads; ++i)
        queue_threads.push_back(std::thread(&StorageReplicatedMergeTree::queueThread, this));
@@ -126,10 +130,10 @@ void StorageReplicatedMergeTree::createTable()

    zookeeper.create(zookeeper_path + "/metadata", metadata.str(), zkutil::CreateMode::Persistent);

    /// Create the necessary "directories".
    zookeeper.create(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent);
    zookeeper.create(zookeeper_path + "/blocks", "", zkutil::CreateMode::Persistent);
    zookeeper.create(zookeeper_path + "/block_numbers", "", zkutil::CreateMode::Persistent);
+   zookeeper.create(zookeeper_path + "/leader_election", "", zkutil::CreateMode::Persistent);
    zookeeper.create(zookeeper_path + "/temp", "", zkutil::CreateMode::Persistent);
}
@@ -264,6 +268,7 @@ void StorageReplicatedMergeTree::loadQueue()
        String s = zookeeper.get(replica_path + "/queue/" + child);
        LogEntry entry = LogEntry::parse(s);
        entry.znode_name = child;
+       entry.tagPartsAsCurrentlyMerging(*this);
        queue.push_back(entry);
    }
}
@@ -349,6 +354,7 @@ void StorageReplicatedMergeTree::pullLogsToQueue()

    String path_created = dynamic_cast<zkutil::OpResult::Create &>((*results)[0]).getPathCreated();
    entry.znode_name = path.substr(path.find_last_of('/') + 1);
+   entry.tagPartsAsCurrentlyMerging(*this);
    queue.push_back(entry);

    ++iterator.index;
@@ -385,8 +391,16 @@ void StorageReplicatedMergeTree::queueUpdatingThread()
{
    while (!shutdown_called)
    {
-       pullLogsToQueue();
-       optimizeQueue();
+       try
+       {
+           pullLogsToQueue();
+           optimizeQueue();
+       }
+       catch (...)
+       {
+           tryLogCurrentException(__PRETTY_FUNCTION__);
+       }

        std::this_thread::sleep_for(QUEUE_UPDATE_SLEEP);
    }
}
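The added try/catch is not cosmetic: an exception escaping a std::thread's top-level function calls std::terminate() and takes the whole process down. A tiny stand-alone demonstration of the failure mode being guarded against:

    #include <thread>
    #include <stdexcept>

    int main()
    {
        /// Without a catch inside the thread body, this aborts the entire process.
        std::thread t([] { throw std::runtime_error("ZooKeeper hiccup"); });
        t.join();
    }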
@@ -423,24 +437,9 @@ void StorageReplicatedMergeTree::queueThread()

            success = true;
        }
-       catch (const Exception & e)
-       {
-           LOG_ERROR(log, "Code: " << e.code() << ". " << e.displayText() << std::endl
-               << std::endl
-               << "Stack trace:" << std::endl
-               << e.getStackTrace().toString());
-       }
-       catch (const Poco::Exception & e)
-       {
-           LOG_ERROR(log, "Poco::Exception: " << e.code() << ". " << e.displayText());
-       }
-       catch (const std::exception & e)
-       {
-           LOG_ERROR(log, "std::exception: " << e.what());
-       }
        catch (...)
        {
-           LOG_ERROR(log, "Unknown exception");
+           tryLogCurrentException(__PRETTY_FUNCTION__);
        }

        if (shutdown_called)
@@ -462,6 +461,118 @@ void StorageReplicatedMergeTree::queueThread()
    }
}

void StorageReplicatedMergeTree::mergeSelectingThread()
{
    pullLogsToQueue();

    while (!shutdown_called && is_leader_node)
    {
        size_t merges_queued = 0;

        {
            Poco::ScopedLock<Poco::FastMutex> lock(queue_mutex);

            for (const auto & entry : queue)
                if (entry.type == LogEntry::MERGE_PARTS)
                    ++merges_queued;
        }

        if (merges_queued >= data.settings.merging_threads)
        {
            std::this_thread::sleep_for(MERGE_SELECTING_SLEEP);
            continue;
        }

        /// Is there an active merge of large parts?
        bool has_big_merge = false;

        {
            Poco::ScopedLock<Poco::FastMutex> lock(currently_merging_mutex);

            for (const auto & name : currently_merging)
            {
                MergeTreeData::DataPartPtr part = data.getContainingPart(name);
                if (!part)
                    continue;
                if (part->name != name)
                    throw Exception("Assertion failed in mergeSelectingThread().", ErrorCodes::LOGICAL_ERROR);
                if (part->size * data.index_granularity > 25 * 1024 * 1024)
                {
                    has_big_merge = true;
                    break;
                }
            }
        }

        bool success = false;

        try
        {
            Poco::ScopedLock<Poco::FastMutex> lock(currently_merging_mutex);

            MergeTreeData::DataPartsVector parts;
            String merged_name;
            auto can_merge = std::bind(&StorageReplicatedMergeTree::canMergeParts, this, std::placeholders::_1, std::placeholders::_2);

            if (merger.selectPartsToMerge(parts, merged_name, 0, false, false, has_big_merge, can_merge) ||
                merger.selectPartsToMerge(parts, merged_name, 0, true, false, has_big_merge, can_merge))
            {
                LogEntry entry;
                entry.type = LogEntry::MERGE_PARTS;
                entry.new_part_name = merged_name;

                for (const auto & part : parts)
                {
                    entry.parts_to_merge.push_back(part->name);
                }

                zookeeper.create(replica_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential);

                /// This entry must be loaded into the queue before parts are selected for merging the next time.
                pullLogsToQueue();

                success = true;
            }
        }
        catch (...)
        {
            tryLogCurrentException(__PRETTY_FUNCTION__);
        }

        if (shutdown_called)
            break;

        if (!success)
            std::this_thread::sleep_for(QUEUE_AFTER_WORK_SLEEP);
    }
}
bool StorageReplicatedMergeTree::canMergeParts(const MergeTreeData::DataPartPtr & left, const MergeTreeData::DataPartPtr & right)
{
    if (currently_merging.count(left->name) || currently_merging.count(right->name))
        return false;

    /// The parts can be merged if every block number between them is abandoned, i.e. does not correspond to any block.
    for (UInt64 number = left->right + 1; number <= right->left - 1; ++number)
    {
        String number_str = toString(number);
        while (number_str.size() < 10)
            number_str = '0' + number_str;
        String path = zookeeper_path + "/block_numbers/block-" + number_str;

        if (AbandonableLockInZooKeeper::check(path, zookeeper) != AbandonableLockInZooKeeper::ABANDONED)
            return false;
    }

    return true;
}
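The manual while-loop above left-pads the block number to 10 digits, matching the 10-digit suffix that ZooKeeper itself appends to sequential nodes, so the block-NNNNNNNNNN znode names sort in numeric order. An equivalent stand-alone sketch using standard stream manipulators:

    #include <iomanip>
    #include <iostream>
    #include <sstream>

    int main()
    {
        std::ostringstream s;
        s << "block-" << std::setw(10) << std::setfill('0') << 42;
        std::cout << s.str() << '\n';    /// prints: block-0000000042
    }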

void StorageReplicatedMergeTree::becomeLeader()
{
    is_leader_node = true;
    merge_selecting_thread = std::thread(&StorageReplicatedMergeTree::mergeSelectingThread, this);
}

String StorageReplicatedMergeTree::findActiveReplicaHavingPart(const String & part_name)
{
    Strings replicas = zookeeper.getChildren(zookeeper_path + "/replicas");
@@ -516,11 +627,14 @@ void StorageReplicatedMergeTree::shutdown()
{
    if (shutdown_called)
        return;
+   leader_election = nullptr;
    shutdown_called = true;
    replica_is_active_node = nullptr;
    endpoint_holder = nullptr;

    LOG_TRACE(log, "Waiting for threads to finish");
+   if (is_leader_node)
+       merge_selecting_thread.join();
    queue_updating_thread.join();
    for (auto & thread : queue_threads)
        thread.join();
libs/libzkutil/include/zkutil/LeaderElection.h (new file, 103 lines)
@@ -0,0 +1,103 @@
#pragma once

#include <zkutil/ZooKeeper.h>
#include <algorithm>
#include <functional>
#include <thread>
#include <Yandex/logger_useful.h>


namespace zkutil
{

/** Implements the leader election method described here: http://zookeeper.apache.org/doc/r3.4.5/recipes.html#sc_leaderElection
  */
class LeaderElection
{
public:
    typedef std::function<void()> LeadershipHandler;

    /** handler is called when this instance becomes the leader.
      */
    LeaderElection(const std::string & path_, ZooKeeper & zookeeper_, LeadershipHandler handler_, const std::string & identifier_ = "")
        : path(path_), zookeeper(zookeeper_), handler(handler_), identifier(identifier_),
        shutdown(false), log(&Logger::get("LeaderElection"))
    {
        node = EphemeralNodeHolder::createSequential(path + "/leader_election-", zookeeper, identifier);

        std::string node_path = node->getPath();
        node_name = node_path.substr(node_path.find_last_of('/') + 1);

        thread = std::thread(&LeaderElection::threadFunction, this);
    }

    ~LeaderElection()
    {
        shutdown = true;
        thread.join();
    }

private:
    std::string path;
    ZooKeeper & zookeeper;
    LeadershipHandler handler;
    std::string identifier;

    EphemeralNodeHolderPtr node;
    std::string node_name;

    std::thread thread;
    volatile bool shutdown;

    Logger * log;

    void threadFunction()
    {
        try
        {
            while (!shutdown)
            {
                Strings children = zookeeper.getChildren(path);
                std::sort(children.begin(), children.end());
                auto it = std::lower_bound(children.begin(), children.end(), node_name);
                if (it == children.end() || *it != node_name)
                    throw Poco::Exception("Assertion failed in LeaderElection");

                if (it == children.begin())
                {
                    handler();
                    return;
                }

                /// Watch the next-smaller node; getChildren returns bare names, so the full path must be rebuilt.
                WatchFuture future;
                if (zookeeper.exists(path + "/" + *(it - 1), nullptr, &future))
                {
                    while (!shutdown)
                        if (future.wait_for(std::chrono::seconds(2)) != std::future_status::timeout)
                            break;
                }
            }
        }
        catch (const DB::Exception & e)
        {
            LOG_ERROR(log, "Exception in LeaderElection: Code: " << e.code() << ". " << e.displayText() << std::endl
                << std::endl
                << "Stack trace:" << std::endl
                << e.getStackTrace().toString());
        }
        catch (const Poco::Exception & e)
        {
            LOG_ERROR(log, "Poco::Exception in LeaderElection: " << e.code() << ". " << e.displayText());
        }
        catch (const std::exception & e)
        {
            LOG_ERROR(log, "std::exception in LeaderElection: " << e.what());
        }
        catch (...)
        {
            LOG_ERROR(log, "Unknown exception in LeaderElection");
        }
    }
};

typedef Poco::SharedPtr<LeaderElection> LeaderElectionPtr;

}
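A minimal usage sketch, mirroring how StorageReplicatedMergeTree wires this class up earlier in this commit (the path, connection, and identifier are illustrative):

    /// Assumes a connected zkutil::ZooKeeper `zk` and an existing
    /// "/clickhouse/my_table/leader_election" node (createTable() above creates it).
    zkutil::LeaderElectionPtr election = new zkutil::LeaderElection(
        "/clickhouse/my_table/leader_election", zk,
        [] { /* became leader: e.g. start the merge-selecting thread */ },
        "replica_1");

    /// Resetting the pointer (election = nullptr) joins the watcher thread and
    /// releases the ephemeral node; shutdown() above relies on exactly this order.

Note that the handler fires at most once per object: as soon as this instance owns the smallest child node, threadFunction() calls handler() and returns.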
@@ -144,7 +144,12 @@ public:
        : path(path_), zookeeper(zookeeper_)
    {
        if (create)
-           zookeeper.create(path, data, sequential ? CreateMode::EphemeralSequential : CreateMode::Ephemeral);
+           path = zookeeper.create(path, data, sequential ? CreateMode::EphemeralSequential : CreateMode::Ephemeral);
    }

+   std::string getPath() const
+   {
+       return path;
+   }
+
    static Ptr create(const std::string & path, ZooKeeper & zookeeper, const std::string & data = "")
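Capturing the return value of create() matters for sequential nodes: ZooKeeper appends a monotonically increasing 10-digit suffix, so the path that was requested is not the path that was created. In outline (names illustrative):

    /// Requested:  /table/leader_election/leader_election-
    /// Created:    /table/leader_election/leader_election-0000000007
    /// Only the stored return value makes getPath() - and hence the node_name
    /// comparison in LeaderElection::threadFunction() - work correctly.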