This commit is contained in:
Michael Kolupaev 2014-04-04 14:37:33 +04:00
parent 95fc39c559
commit 847128152e
7 changed files with 343 additions and 50 deletions

View File

@ -12,10 +12,11 @@ class MergeTreeDataMerger
public:
MergeTreeDataMerger(MergeTreeData & data_) : data(data_), log(&Logger::get("MergeTreeDataMerger")), canceled(false) {}
typedef boost::function<bool (const MergeTreeData::DataPartPtr &, const MergeTreeData::DataPartPtr &)> AllowedMergingPredicate;
typedef std::function<bool (const MergeTreeData::DataPartPtr &, const MergeTreeData::DataPartPtr &)> AllowedMergingPredicate;
/** Выбирает, какие куски слить. Использует кучу эвристик.
* Если merge_anything_for_old_months, для кусков за прошедшие месяцы снимается ограничение на соотношение размеров.
* Если available_disk_space > 0, выбирает куски так, чтобы места на диске хватило с запасом для их слияния.
*
* can_merge - функция, определяющая, можно ли объединить пару соседних кусков.
* Эта функция должна координировать слияния со вставками и другими слияниями, обеспечивая, что:
@ -24,6 +25,7 @@ public:
*/
bool selectPartsToMerge(
MergeTreeData::DataPartsVector & what,
String & merged_name,
size_t available_disk_space,
bool merge_anything_for_old_months,
bool aggressive,
@ -31,7 +33,7 @@ public:
const AllowedMergingPredicate & can_merge);
/// Сливает куски. Возвращает название нового куска. Если слияние отменили, возвращает пустую строку.
String mergeParts(const MergeTreeData::DataPartsVector & parts);
String mergeParts(const MergeTreeData::DataPartsVector & parts, const String & merged_name);
/// Примерное количество места на диске, нужное для мерджа. С запасом.
size_t estimateDiskSpaceForMerge(const MergeTreeData::DataPartsVector & parts);

View File

@ -5,8 +5,9 @@
#include <DB/Storages/MergeTree/MergeTreeDataMerger.h>
#include <DB/Storages/MergeTree/MergeTreeDataWriter.h>
#include <DB/Storages/MergeTree/MergeTreeDataSelectExecutor.h>
#include "MergeTree/ReplicatedMergeTreePartsExchange.h"
#include <DB/Storages/MergeTree/ReplicatedMergeTreePartsExchange.h>
#include <zkutil/ZooKeeper.h>
#include <zkutil/LeaderElection.h>
#include <statdaemons/threadpool.hpp>
namespace DB
@ -66,6 +67,44 @@ public:
private:
friend class ReplicatedMergeTreeBlockOutputStream;
/// RAII tag: while alive, registers a set of part names in storage.currently_merging
/// (under storage.currently_merging_mutex) so the same parts are not selected for two
/// concurrent merges; removes them again on destruction.
struct CurrentlyMergingPartsTagger
{
	Strings parts;
	StorageReplicatedMergeTree & storage;

	/// Throws LOGICAL_ERROR if any of the parts is already tagged — that would be a bug.
	CurrentlyMergingPartsTagger(const Strings & parts_, StorageReplicatedMergeTree & storage_)
		: parts(parts_), storage(storage_)
	{
		Poco::ScopedLock<Poco::FastMutex> lock(storage.currently_merging_mutex);
		for (const auto & name : parts)
		{
			if (storage.currently_merging.count(name))
				throw Exception("Tagging already tagged part " + name + ". This is a bug.", ErrorCodes::LOGICAL_ERROR);
		}
		storage.currently_merging.insert(parts.begin(), parts.end());
	}

	~CurrentlyMergingPartsTagger()
	{
		/// Destructors must not throw; log and swallow instead.
		try
		{
			Poco::ScopedLock<Poco::FastMutex> lock(storage.currently_merging_mutex);
			for (const auto & name : parts)
			{
				if (!storage.currently_merging.count(name))
					throw Exception("Untagging already untagged part " + name + ". This is a bug.", ErrorCodes::LOGICAL_ERROR);
				storage.currently_merging.erase(name);
			}
		}
		catch (...)
		{
			tryLogCurrentException(__PRETTY_FUNCTION__);
		}
	}
};
typedef Poco::SharedPtr<CurrentlyMergingPartsTagger> CurrentlyMergingPartsTaggerPtr;
struct LogEntry
{
enum Type
@ -80,6 +119,14 @@ private:
String new_part_name;
Strings parts_to_merge;
CurrentlyMergingPartsTaggerPtr currently_merging_tagger;
/// For a MERGE_PARTS entry, mark its source parts (parts_to_merge) as currently
/// merging via an RAII tagger; the tag is released when this entry is destroyed.
/// Other entry types are left untouched.
void tagPartsAsCurrentlyMerging(StorageReplicatedMergeTree & storage)
{
	if (type == MERGE_PARTS)
		currently_merging_tagger = new CurrentlyMergingPartsTagger(parts_to_merge, storage);
}
void writeText(WriteBuffer & out) const;
void readText(ReadBuffer & in);
@ -135,7 +182,9 @@ private:
MergeTreeData data;
MergeTreeDataSelectExecutor reader;
MergeTreeDataWriter writer;
MergeTreeDataMerger merger;
ReplicatedMergeTreePartsFetcher fetcher;
zkutil::LeaderElectionPtr leader_election;
typedef std::vector<std::thread> Threads;
@ -145,6 +194,15 @@ private:
/// Потоки, выполняющие действия из очереди.
Threads queue_threads;
/// Поток, выбирающий куски для слияния.
std::thread merge_selecting_thread;
typedef std::set<String> StringSet;
/// Куски, для которых в очереди есть задание на слияние.
StringSet currently_merging;
Poco::FastMutex currently_merging_mutex;
Logger * log;
volatile bool shutdown_called;
@ -190,7 +248,7 @@ private:
*/
void checkParts();
/// Работа с очередью и логом.
/// Выполнение заданий из очереди.
/** Кладет в queue записи из ZooKeeper (/replicas/me/queue/).
*/
@ -218,6 +276,17 @@ private:
*/
void queueThread();
void becomeLeader();
/// Выбор кусков для слияния.
/** В бесконечном цикле выбираются куски для слияния, и задания на слияние записываются в лог.
*/
void mergeSelectingThread();
/// Вызывается во время выбора кусков для слияния.
bool canMergeParts(const MergeTreeData::DataPartPtr & left, const MergeTreeData::DataPartPtr & right);
/// Обмен кусками.
/** Бросает исключение, если куска ни у кого нет.

View File

@ -35,7 +35,7 @@ static const double DISK_USAGE_COEFFICIENT_TO_RESERVE = 1.4;
/// 4) Если в одном из потоков идет мердж крупных кусков, то во втором сливать только маленькие кусочки
/// 5) С ростом логарифма суммарного размера кусочков в мердже увеличиваем требование сбалансированности
bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & parts, size_t available_disk_space,
bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & parts, String & merged_name, size_t available_disk_space,
bool merge_anything_for_old_months, bool aggressive, bool only_small, const AllowedMergingPredicate & can_merge)
{
LOG_DEBUG(log, "Selecting parts to merge");
@ -44,6 +44,9 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa
DateLUTSingleton & date_lut = DateLUTSingleton::instance();
if (available_disk_space == 0)
available_disk_space = std::numeric_limits<size_t>::max();
size_t min_max = -1U;
size_t min_min = -1U;
int max_len = 0;
@ -209,13 +212,25 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa
{
parts.clear();
DayNum_t left_date = DayNum_t(std::numeric_limits<UInt16>::max());
DayNum_t right_date = DayNum_t(std::numeric_limits<UInt16>::min());
UInt32 level = 0;
MergeTreeData::DataParts::iterator it = best_begin;
for (int i = 0; i < max_len; ++i)
{
parts.push_back(*it);
level = std::max(level, parts[i]->level);
left_date = std::min(left_date, parts[i]->left_date);
right_date = std::max(right_date, parts[i]->right_date);
++it;
}
merged_name = MergeTreeData::getPartName(
left_date, right_date, parts.front()->left, parts.back()->right, level + 1);
LOG_DEBUG(log, "Selected " << parts.size() << " parts from " << parts.front()->name << " to " << parts.back()->name);
}
else
@ -228,7 +243,7 @@ bool MergeTreeDataMerger::selectPartsToMerge(MergeTreeData::DataPartsVector & pa
/// parts должны быть отсортированы.
String MergeTreeDataMerger::mergeParts(const MergeTreeData::DataPartsVector & parts)
String MergeTreeDataMerger::mergeParts(const MergeTreeData::DataPartsVector & parts, const String & merged_name)
{
LOG_DEBUG(log, "Merging " << parts.size() << " parts: from " << parts.front()->name << " to " << parts.back()->name);
@ -237,25 +252,9 @@ String MergeTreeDataMerger::mergeParts(const MergeTreeData::DataPartsVector & pa
for (const auto & it : columns_list)
all_column_names.push_back(it.first);
DateLUTSingleton & date_lut = DateLUTSingleton::instance();
MergeTreeData::MutableDataPartPtr new_data_part = std::make_shared<MergeTreeData::DataPart>(data);
new_data_part->left_date = std::numeric_limits<UInt16>::max();
new_data_part->right_date = std::numeric_limits<UInt16>::min();
new_data_part->left = parts.front()->left;
new_data_part->right = parts.back()->right;
new_data_part->level = 0;
for (size_t i = 0; i < parts.size(); ++i)
{
new_data_part->level = std::max(new_data_part->level, parts[i]->level);
new_data_part->left_date = std::min(new_data_part->left_date, parts[i]->left_date);
new_data_part->right_date = std::max(new_data_part->right_date, parts[i]->right_date);
}
++new_data_part->level;
new_data_part->name = MergeTreeData::getPartName(
new_data_part->left_date, new_data_part->right_date, new_data_part->left, new_data_part->right, new_data_part->level);
new_data_part->left_month = date_lut.toFirstDayNumOfMonth(new_data_part->left_date);
new_data_part->right_month = date_lut.toFirstDayNumOfMonth(new_data_part->right_date);
data.parsePartName(merged_name, *new_data_part);
new_data_part->name = merged_name;
/** Читаем из всех кусков, сливаем и пишем в новый.
* Попутно вычисляем выражение для сортировки.

View File

@ -145,12 +145,13 @@ void StorageMergeTree::mergeThread(bool while_can, bool aggressive)
/// К концу этого логического блока должен быть вызван деструктор, чтобы затем корректно определить удаленные куски
/// Нужно вызывать деструктор под незалоченным currently_merging_mutex.
CurrentlyMergingPartsTaggerPtr merging_tagger;
String merged_name;
{
Poco::ScopedLock<Poco::FastMutex> lock(currently_merging_mutex);
MergeTreeData::DataPartsVector parts;
auto can_merge = boost::bind(&StorageMergeTree::canMergeParts, this, _1, _2);
auto can_merge = std::bind(&StorageMergeTree::canMergeParts, this, std::placeholders::_1, std::placeholders::_2);
bool only_small = false;
/// Если есть активный мердж крупных кусков, то ограничиваемся мерджем только маленьких частей.
@ -163,14 +164,14 @@ void StorageMergeTree::mergeThread(bool while_can, bool aggressive)
}
}
if (!merger.selectPartsToMerge(parts, disk_space, false, aggressive, only_small, can_merge) &&
!merger.selectPartsToMerge(parts, disk_space, true, aggressive, only_small, can_merge))
if (!merger.selectPartsToMerge(parts, merged_name, disk_space, false, aggressive, only_small, can_merge) &&
!merger.selectPartsToMerge(parts, merged_name, disk_space, true, aggressive, only_small, can_merge))
break;
merging_tagger = new CurrentlyMergingPartsTagger(parts, merger.estimateDiskSpaceForMerge(parts), *this);
}
merger.mergeParts(merging_tagger->parts);
merger.mergeParts(merging_tagger->parts, merged_name);
}
if (shutdown_called)

View File

@ -13,6 +13,7 @@ const auto QUEUE_UPDATE_SLEEP = std::chrono::seconds(5);
const auto QUEUE_NO_WORK_SLEEP = std::chrono::seconds(5);
const auto QUEUE_ERROR_SLEEP = std::chrono::seconds(1);
const auto QUEUE_AFTER_WORK_SLEEP = std::chrono::seconds(0);
const auto MERGE_SELECTING_SLEEP = std::chrono::seconds(5);
StorageReplicatedMergeTree::StorageReplicatedMergeTree(
@ -31,10 +32,10 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
:
context(context_), zookeeper(context.getZooKeeper()),
path(path_), name(name_), full_path(path + escapeForFileName(name) + '/'), zookeeper_path(zookeeper_path_),
replica_name(replica_name_),
replica_name(replica_name_), is_leader_node(false),
data( full_path, columns_, context_, primary_expr_ast_, date_column_name_, sampling_expression_,
index_granularity_,mode_, sign_column_, settings_),
reader(data), writer(data), fetcher(data),
reader(data), writer(data), merger(data), fetcher(data),
log(&Logger::get("StorageReplicatedMergeTree")),
shutdown_called(false)
{
@ -62,6 +63,9 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
loadQueue();
activateReplica();
leader_election = new zkutil::LeaderElection(zookeeper_path + "/leader_election", zookeeper,
std::bind(&StorageReplicatedMergeTree::becomeLeader, this), replica_name);
queue_updating_thread = std::thread(&StorageReplicatedMergeTree::queueUpdatingThread, this);
for (size_t i = 0; i < settings_.replication_threads; ++i)
queue_threads.push_back(std::thread(&StorageReplicatedMergeTree::queueThread, this));
@ -126,10 +130,10 @@ void StorageReplicatedMergeTree::createTable()
zookeeper.create(zookeeper_path + "/metadata", metadata.str(), zkutil::CreateMode::Persistent);
/// Создадим нужные "директории".
zookeeper.create(zookeeper_path + "/replicas", "", zkutil::CreateMode::Persistent);
zookeeper.create(zookeeper_path + "/blocks", "", zkutil::CreateMode::Persistent);
zookeeper.create(zookeeper_path + "/block_numbers", "", zkutil::CreateMode::Persistent);
zookeeper.create(zookeeper_path + "/leader_election", "", zkutil::CreateMode::Persistent);
zookeeper.create(zookeeper_path + "/temp", "", zkutil::CreateMode::Persistent);
}
@ -264,6 +268,7 @@ void StorageReplicatedMergeTree::loadQueue()
String s = zookeeper.get(replica_path + "/queue/" + child);
LogEntry entry = LogEntry::parse(s);
entry.znode_name = child;
entry.tagPartsAsCurrentlyMerging(*this);
queue.push_back(entry);
}
}
@ -349,6 +354,7 @@ void StorageReplicatedMergeTree::pullLogsToQueue()
String path_created = dynamic_cast<zkutil::OpResult::Create &>((*results)[0]).getPathCreated();
entry.znode_name = path.substr(path.find_last_of('/') + 1);
entry.tagPartsAsCurrentlyMerging(*this);
queue.push_back(entry);
++iterator.index;
@ -385,8 +391,16 @@ void StorageReplicatedMergeTree::queueUpdatingThread()
{
while (!shutdown_called)
{
pullLogsToQueue();
optimizeQueue();
try
{
pullLogsToQueue();
optimizeQueue();
}
catch (...)
{
tryLogCurrentException(__PRETTY_FUNCTION__);
}
std::this_thread::sleep_for(QUEUE_UPDATE_SLEEP);
}
}
@ -423,24 +437,9 @@ void StorageReplicatedMergeTree::queueThread()
success = true;
}
catch (const Exception & e)
{
LOG_ERROR(log, "Code: " << e.code() << ". " << e.displayText() << std::endl
<< std::endl
<< "Stack trace:" << std::endl
<< e.getStackTrace().toString());
}
catch (const Poco::Exception & e)
{
LOG_ERROR(log, "Poco::Exception: " << e.code() << ". " << e.displayText());
}
catch (const std::exception & e)
{
LOG_ERROR(log, "std::exception: " << e.what());
}
catch (...)
{
LOG_ERROR(log, "Unknown exception");
tryLogCurrentException(__PRETTY_FUNCTION__);
}
if (shutdown_called)
@ -462,6 +461,118 @@ void StorageReplicatedMergeTree::queueThread()
}
}
/** Leader-only loop: while this replica is the leader, periodically selects parts
  * to merge and writes MERGE_PARTS entries to the replication log in ZooKeeper.
  * Started from becomeLeader(); exits when shutdown_called or leadership is lost.
  */
void StorageReplicatedMergeTree::mergeSelectingThread()
{
	pullLogsToQueue();

	while (!shutdown_called && is_leader_node)
	{
		/// Don't schedule more merges than there are merging threads to run them.
		size_t merges_queued = 0;

		{
			Poco::ScopedLock<Poco::FastMutex> lock(queue_mutex);

			for (const auto & entry : queue)
				if (entry.type == LogEntry::MERGE_PARTS)
					++merges_queued;
		}

		if (merges_queued >= data.settings.merging_threads)
		{
			std::this_thread::sleep_for(MERGE_SELECTING_SLEEP);
			continue;
		}

		/// Is there an active merge of big parts?
		bool has_big_merge = false;

		{
			Poco::ScopedLock<Poco::FastMutex> lock(currently_merging_mutex);

			for (const auto & name : currently_merging)
			{
				MergeTreeData::DataPartPtr part = data.getContainingPart(name);
				if (!part)
					continue;
				if (part->name != name)
					throw Exception("Assertion failed in mergeSelectingThread().", ErrorCodes::LOGICAL_ERROR);
				/// NOTE(review): 25 MiB (rows * granularity) threshold for a "big" merge
				/// looks like a magic constant — confirm intended value.
				if (part->size * data.index_granularity > 25 * 1024 * 1024)
				{
					has_big_merge = true;
					break;
				}
			}
		}

		bool success = false;

		try
		{
			Poco::ScopedLock<Poco::FastMutex> lock(currently_merging_mutex);

			MergeTreeData::DataPartsVector parts;
			String merged_name;
			auto can_merge = std::bind(&StorageReplicatedMergeTree::canMergeParts, this, std::placeholders::_1, std::placeholders::_2);

			/// If a big merge is running, restrict this selection to small parts
			/// (only_small flag); second call relaxes size-ratio rules for old months.
			if (merger.selectPartsToMerge(parts, merged_name, 0, false, false, has_big_merge, can_merge) ||
				merger.selectPartsToMerge(parts, merged_name, 0, true, false, has_big_merge, can_merge))
			{
				LogEntry entry;
				entry.type = LogEntry::MERGE_PARTS;
				entry.new_part_name = merged_name;

				for (const auto & part : parts)
				{
					entry.parts_to_merge.push_back(part->name);
				}

				zookeeper.create(replica_path + "/log/log-", entry.toString(), zkutil::CreateMode::PersistentSequential);

				/// This entry must be loaded into the queue before parts are selected for merging next time.
				pullLogsToQueue();

				success = true;
			}
		}
		catch (...)
		{
			tryLogCurrentException(__PRETTY_FUNCTION__);
		}

		if (shutdown_called)
			break;

		if (!success)
			std::this_thread::sleep_for(QUEUE_AFTER_WORK_SLEEP);
	}
}
/// Predicate passed to MergeTreeDataMerger::selectPartsToMerge: decides whether two
/// adjacent parts may be merged. Returns false if either part is already scheduled
/// for a merge, or if some block number strictly between them is not abandoned.
bool StorageReplicatedMergeTree::canMergeParts(const MergeTreeData::DataPartPtr & left, const MergeTreeData::DataPartPtr & right)
{
	if (currently_merging.count(left->name) || currently_merging.count(right->name))
		return false;

	/// Parts may be merged only if all block numbers between them are abandoned,
	/// i.e. do not correspond to any blocks.
	/// NOTE: `number < right->left` instead of `number <= right->left - 1` avoids
	/// unsigned underflow (and a near-infinite loop) when right->left == 0.
	for (UInt64 number = left->right + 1; number < right->left; ++number)
	{
		/// Block-number znodes are zero-padded to 10 digits.
		String number_str = toString(number);
		if (number_str.size() < 10)
			number_str = String(10 - number_str.size(), '0') + number_str;
		String path = zookeeper_path + "/block_numbers/block-" + number_str;

		if (AbandonableLockInZooKeeper::check(path, zookeeper) != AbandonableLockInZooKeeper::ABANDONED)
			return false;
	}

	return true;
}
/// Callback invoked by leader_election when this replica becomes the leader:
/// marks the node as leader and starts the merge-selecting thread.
void StorageReplicatedMergeTree::becomeLeader()
{
	is_leader_node = true;
	merge_selecting_thread = std::thread(&StorageReplicatedMergeTree::mergeSelectingThread, this);
}
String StorageReplicatedMergeTree::findActiveReplicaHavingPart(const String & part_name)
{
Strings replicas = zookeeper.getChildren(zookeeper_path + "/replicas");
@ -516,11 +627,14 @@ void StorageReplicatedMergeTree::shutdown()
{
if (shutdown_called)
return;
leader_election = nullptr;
shutdown_called = true;
replica_is_active_node = nullptr;
endpoint_holder = nullptr;
LOG_TRACE(log, "Waiting for threads to finish");
if (is_leader_node)
merge_selecting_thread.join();
queue_updating_thread.join();
for (auto & thread : queue_threads)
thread.join();

View File

@ -0,0 +1,103 @@
#pragma once
#include <zkutil/ZooKeeper.h>
#include <functional>
#include <Yandex/logger_useful.h>
namespace zkutil
{
/** Реализует метод выбора лидера, описанный здесь: http://zookeeper.apache.org/doc/r3.4.5/recipes.html#sc_leaderElection
*/
/** Implements the leader election recipe described at
  * http://zookeeper.apache.org/doc/r3.4.5/recipes.html#sc_leaderElection
  */
class LeaderElection
{
public:
	typedef std::function<void()> LeadershipHandler;

	/** handler is called when this instance becomes the leader.
	  */
	LeaderElection(const std::string & path_, ZooKeeper & zookeeper_, LeadershipHandler handler_, const std::string & identifier_ = "")
		: path(path_), zookeeper(zookeeper_), handler(handler_), identifier(identifier_),
		shutdown(false), log(&Logger::get("LeaderElection"))
	{
		node = EphemeralNodeHolder::createSequential(path + "/leader_election-", zookeeper, identifier);

		std::string node_path = node->getPath();
		node_name = node_path.substr(node_path.find_last_of('/') + 1);

		thread = std::thread(&LeaderElection::threadFunction, this);
	}

	~LeaderElection()
	{
		shutdown = true;
		thread.join();
	}

private:
	std::string path;
	ZooKeeper & zookeeper;
	LeadershipHandler handler;
	std::string identifier;

	EphemeralNodeHolderPtr node;
	std::string node_name;

	std::thread thread;
	volatile bool shutdown;

	Logger * log;

	void threadFunction()
	{
		try
		{
			while (!shutdown)
			{
				Strings children = zookeeper.getChildren(path);
				std::sort(children.begin(), children.end());
				auto it = std::lower_bound(children.begin(), children.end(), node_name);
				if (it == children.end() || *it != node_name)
					throw Poco::Exception("Assertion failed in LeaderElection");

				/// The smallest sequential node is the leader.
				if (it == children.begin())
				{
					handler();
					return;
				}

				/// Watch the node immediately preceding ours. getChildren() returns bare
				/// child names, so the parent path must be prepended before calling exists().
				WatchFuture future;
				if (zookeeper.exists(path + "/" + *(it - 1), nullptr, &future))
				{
					/// Poll the watch with a timeout so shutdown is noticed promptly.
					while (!shutdown)
						if (future.wait_for(std::chrono::seconds(2)) != std::future_status::timeout)
							break;
				}
			}
		}
		catch (const DB::Exception & e)
		{
			LOG_ERROR(log, "Exception in LeaderElection: Code: " << e.code() << ". " << e.displayText() << std::endl
				<< std::endl
				<< "Stack trace:" << std::endl
				<< e.getStackTrace().toString());
		}
		catch (const Poco::Exception & e)
		{
			LOG_ERROR(log, "Poco::Exception in LeaderElection: " << e.code() << ". " << e.displayText());
		}
		catch (const std::exception & e)
		{
			LOG_ERROR(log, "std::exception in LeaderElection: " << e.what());
		}
		catch (...)
		{
			LOG_ERROR(log, "Unknown exception in LeaderElection");
		}
	}
};
typedef Poco::SharedPtr<LeaderElection> LeaderElectionPtr;
}

View File

@ -144,7 +144,12 @@ public:
: path(path_), zookeeper(zookeeper_)
{
if (create)
zookeeper.create(path, data, sequential ? CreateMode::EphemeralSequential : CreateMode::Ephemeral);
path = zookeeper.create(path, data, sequential ? CreateMode::EphemeralSequential : CreateMode::Ephemeral);
}
/// Returns the node's path as actually created in ZooKeeper (for sequential
/// nodes this is the path returned by create(), including the assigned suffix).
std::string getPath() const
{
	return path;
}
static Ptr create(const std::string & path, ZooKeeper & zookeeper, const std::string & data = "")