2017-04-01 09:19:00 +00:00
|
|
|
#include <Storages/MergeTree/ReplicatedMergeTreeCleanupThread.h>
|
|
|
|
#include <Storages/StorageReplicatedMergeTree.h>
|
|
|
|
#include <Common/setThreadName.h>
|
2017-07-24 20:12:59 +00:00
|
|
|
#include <Poco/Timestamp.h>
|
2014-10-15 01:22:06 +00:00
|
|
|
|
|
|
|
|
|
|
|
namespace DB
|
|
|
|
{
|
|
|
|
|
2016-01-11 21:46:36 +00:00
|
|
|
/// Error codes referenced by this translation unit; only declared here, defined elsewhere.
namespace ErrorCodes
{
    /// Thrown when an expected ZooKeeper node (/log or /blocks) is missing.
    extern const int NOT_FOUND_NODE;
}
|
|
|
|
|
|
|
|
|
2014-10-15 01:22:06 +00:00
|
|
|
/// Creates the cleanup worker for `storage_` and starts its background thread, which executes run().
///
/// The thread is deliberately launched from the constructor body rather than from the
/// member-initializer list: members are initialized in declaration order, and a thread started
/// in the initializer list could enter run() before `cached_block_stats` has been constructed.
/// (NOTE(review): the declaration order lives in the header, which is not visible here;
/// starting the thread last is correct regardless of that order.)
ReplicatedMergeTreeCleanupThread::ReplicatedMergeTreeCleanupThread(StorageReplicatedMergeTree & storage_)
    : storage(storage_),
    log(&Logger::get(storage.database_name + "." + storage.table_name + " (StorageReplicatedMergeTree, CleanupThread)")),
    cached_block_stats(std::make_unique<NodesStatCache>())
{
    /// Every member is fully initialized at this point, so it is safe to let run() touch them.
    thread = decltype(thread)([this] { run(); });
}
|
2014-10-15 01:22:06 +00:00
|
|
|
|
|
|
|
|
|
|
|
void ReplicatedMergeTreeCleanupThread::run()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
setThreadName("ReplMTCleanup");
|
2015-09-24 18:54:21 +00:00
|
|
|
|
2017-07-24 20:12:59 +00:00
|
|
|
const auto CLEANUP_SLEEP_MS = storage.data.settings.cleanup_delay_period * 1000;
|
2014-10-15 01:22:06 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
while (!storage.shutdown_called)
|
|
|
|
{
|
|
|
|
try
|
|
|
|
{
|
|
|
|
iterate();
|
|
|
|
}
|
|
|
|
catch (...)
|
|
|
|
{
|
|
|
|
tryLogCurrentException(__PRETTY_FUNCTION__);
|
|
|
|
}
|
2014-10-15 01:22:06 +00:00
|
|
|
|
2017-10-03 19:04:56 +00:00
|
|
|
storage.cleanup_thread_event.tryWait(CLEANUP_SLEEP_MS);
|
2017-04-01 07:20:54 +00:00
|
|
|
}
|
2014-10-15 01:22:06 +00:00
|
|
|
|
2017-04-01 07:20:54 +00:00
|
|
|
LOG_DEBUG(log, "Cleanup thread finished");
|
2014-10-15 01:22:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void ReplicatedMergeTreeCleanupThread::iterate()
|
|
|
|
{
|
2017-10-06 11:30:57 +00:00
|
|
|
storage.clearOldPartsAndRemoveFromZK();
|
2017-04-01 07:20:54 +00:00
|
|
|
storage.data.clearOldTemporaryDirectories();
|
|
|
|
|
|
|
|
if (storage.is_leader_node)
|
|
|
|
{
|
|
|
|
clearOldLogs();
|
|
|
|
clearOldBlocks();
|
|
|
|
}
|
2014-10-15 01:22:06 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
void ReplicatedMergeTreeCleanupThread::clearOldLogs()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
auto zookeeper = storage.getZooKeeper();
|
|
|
|
|
|
|
|
zkutil::Stat stat;
|
|
|
|
if (!zookeeper->exists(storage.zookeeper_path + "/log", &stat))
|
|
|
|
throw Exception(storage.zookeeper_path + "/log doesn't exist", ErrorCodes::NOT_FOUND_NODE);
|
|
|
|
|
|
|
|
int children_count = stat.numChildren;
|
|
|
|
|
|
|
|
/// We will wait for 1.1 times more records to accumulate than necessary.
|
|
|
|
if (static_cast<double>(children_count) < storage.data.settings.replicated_logs_to_keep * 1.1)
|
|
|
|
return;
|
|
|
|
|
|
|
|
Strings replicas = zookeeper->getChildren(storage.zookeeper_path + "/replicas", &stat);
|
|
|
|
UInt64 min_pointer = std::numeric_limits<UInt64>::max();
|
|
|
|
for (const String & replica : replicas)
|
|
|
|
{
|
|
|
|
String pointer = zookeeper->get(storage.zookeeper_path + "/replicas/" + replica + "/log_pointer");
|
|
|
|
if (pointer.empty())
|
|
|
|
return;
|
|
|
|
min_pointer = std::min(min_pointer, parse<UInt64>(pointer));
|
|
|
|
}
|
|
|
|
|
|
|
|
Strings entries = zookeeper->getChildren(storage.zookeeper_path + "/log");
|
|
|
|
std::sort(entries.begin(), entries.end());
|
|
|
|
|
|
|
|
/// We will not touch the last `replicated_logs_to_keep` records.
|
2017-09-20 14:41:07 +00:00
|
|
|
entries.erase(entries.end() - std::min(entries.size(), storage.data.settings.replicated_logs_to_keep.value), entries.end());
|
2017-04-01 07:20:54 +00:00
|
|
|
/// We will not touch records that are no less than `min_pointer`.
|
|
|
|
entries.erase(std::lower_bound(entries.begin(), entries.end(), "log-" + padIndex(min_pointer)), entries.end());
|
|
|
|
|
|
|
|
if (entries.empty())
|
|
|
|
return;
|
|
|
|
|
|
|
|
zkutil::Ops ops;
|
|
|
|
for (size_t i = 0; i < entries.size(); ++i)
|
|
|
|
{
|
|
|
|
ops.emplace_back(std::make_unique<zkutil::Op::Remove>(storage.zookeeper_path + "/log/" + entries[i], -1));
|
|
|
|
|
2017-08-10 15:19:36 +00:00
|
|
|
if (ops.size() > 4 * zkutil::MULTI_BATCH_SIZE || i + 1 == entries.size())
|
2017-04-01 07:20:54 +00:00
|
|
|
{
|
|
|
|
/// Simultaneously with clearing the log, we check to see if replica was added since we received replicas list.
|
|
|
|
ops.emplace_back(std::make_unique<zkutil::Op::Check>(storage.zookeeper_path + "/replicas", stat.version));
|
|
|
|
zookeeper->multi(ops);
|
|
|
|
ops.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
LOG_DEBUG(log, "Removed " << entries.size() << " old log entries: " << entries.front() << " - " << entries.back());
|
2014-10-15 01:22:06 +00:00
|
|
|
}
|
|
|
|
|
2017-08-10 15:19:36 +00:00
|
|
|
|
2017-08-09 21:09:44 +00:00
|
|
|
namespace
|
|
|
|
{
|
|
|
|
|
2017-08-10 15:19:36 +00:00
|
|
|
/// Just a subset of zkutil::Stat fields required for the cache
|
2017-08-09 21:09:44 +00:00
|
|
|
struct RequiredStat
|
|
|
|
{
|
2017-08-10 15:19:36 +00:00
|
|
|
int64_t ctime = 0;
|
|
|
|
int32_t numChildren = 0;
|
2017-08-09 21:09:44 +00:00
|
|
|
|
|
|
|
RequiredStat() = default;
|
|
|
|
RequiredStat(const RequiredStat &) = default;
|
|
|
|
explicit RequiredStat(const zkutil::Stat & s) : ctime(s.ctime), numChildren(s.numChildren) {};
|
|
|
|
explicit RequiredStat(Int64 ctime_) : ctime(ctime_) {}
|
|
|
|
};
|
|
|
|
|
|
|
|
}
|
|
|
|
|
2017-08-10 15:19:36 +00:00
|
|
|
/// Just a node name with its ZooKeeper's stat
|
2017-08-09 21:09:44 +00:00
|
|
|
struct ReplicatedMergeTreeCleanupThread::NodeWithStat
|
|
|
|
{
|
|
|
|
String node;
|
|
|
|
RequiredStat stat;
|
|
|
|
|
|
|
|
NodeWithStat() = default;
|
|
|
|
NodeWithStat(const String & node_, const RequiredStat & stat_) : node(node_), stat(stat_) {}
|
|
|
|
|
|
|
|
static bool greaterByTime (const NodeWithStat & lhs, const NodeWithStat & rhs)
|
|
|
|
{
|
2017-08-10 15:19:36 +00:00
|
|
|
return std::greater<void>()(std::forward_as_tuple(lhs.stat.ctime, lhs.node), std::forward_as_tuple(rhs.stat.ctime, rhs.node));
|
2017-08-09 21:09:44 +00:00
|
|
|
}
|
|
|
|
};
|
|
|
|
|
2017-08-10 15:19:36 +00:00
|
|
|
/// Use simple map node_name -> zkutil::Stat (only required fields) as the cache
|
|
|
|
/// It is not declared in the header explicitly to hide extra implementation dependent structs like RequiredStat
|
|
|
|
class ReplicatedMergeTreeCleanupThread::NodesStatCache : public std::map<String, RequiredStat> {};
|
|
|
|
|
2014-10-15 01:22:06 +00:00
|
|
|
|
|
|
|
void ReplicatedMergeTreeCleanupThread::clearOldBlocks()
|
|
|
|
{
|
2017-04-01 07:20:54 +00:00
|
|
|
auto zookeeper = storage.getZooKeeper();
|
2014-12-12 20:50:32 +00:00
|
|
|
|
2017-08-09 21:09:44 +00:00
|
|
|
std::vector<NodeWithStat> timed_blocks;
|
|
|
|
getBlocksSortedByTime(zookeeper, timed_blocks);
|
|
|
|
|
|
|
|
if (timed_blocks.empty())
|
|
|
|
return;
|
|
|
|
|
|
|
|
/// Use ZooKeeper's first node (last according to time) timestamp as "current" time.
|
|
|
|
Int64 current_time = timed_blocks.front().stat.ctime;
|
2017-09-01 17:21:03 +00:00
|
|
|
Int64 time_threshold = std::max(static_cast<Int64>(0), current_time - static_cast<Int64>(1000 * storage.data.settings.replicated_deduplication_window_seconds));
|
2017-08-10 15:19:36 +00:00
|
|
|
|
|
|
|
/// Virtual node, all nodes that are "greater" than this one will be deleted
|
2017-08-09 21:09:44 +00:00
|
|
|
NodeWithStat block_threshold("", RequiredStat(time_threshold));
|
|
|
|
|
2017-09-20 14:41:07 +00:00
|
|
|
size_t current_deduplication_window = std::min(timed_blocks.size(), storage.data.settings.replicated_deduplication_window.value);
|
2017-08-09 21:09:44 +00:00
|
|
|
auto first_outdated_block_fixed_threshold = timed_blocks.begin() + current_deduplication_window;
|
|
|
|
auto first_outdated_block_time_threshold = std::upper_bound(timed_blocks.begin(), timed_blocks.end(), block_threshold, NodeWithStat::greaterByTime);
|
|
|
|
auto first_outdated_block = std::min(first_outdated_block_fixed_threshold, first_outdated_block_time_threshold);
|
|
|
|
|
|
|
|
/// TODO After about half a year, we could remain only multi op, because there will be no obsolete children nodes.
|
|
|
|
zkutil::Ops ops;
|
|
|
|
for (auto it = first_outdated_block; it != timed_blocks.end(); ++it)
|
|
|
|
{
|
|
|
|
String path = storage.zookeeper_path + "/blocks/" + it->node;
|
|
|
|
|
|
|
|
if (it->stat.numChildren == 0)
|
|
|
|
{
|
|
|
|
ops.emplace_back(new zkutil::Op::Remove(path, -1));
|
|
|
|
if (ops.size() >= zkutil::MULTI_BATCH_SIZE)
|
|
|
|
{
|
2017-08-15 13:00:08 +00:00
|
|
|
zookeeper->multi(ops);
|
2017-08-09 21:09:44 +00:00
|
|
|
ops.clear();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
else
|
|
|
|
zookeeper->removeRecursive(path);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!ops.empty())
|
|
|
|
{
|
2017-08-15 13:00:08 +00:00
|
|
|
zookeeper->multi(ops);
|
2017-08-09 21:09:44 +00:00
|
|
|
ops.clear();
|
|
|
|
}
|
|
|
|
|
|
|
|
auto num_nodes_to_delete = timed_blocks.end() - first_outdated_block;
|
2017-08-15 13:00:08 +00:00
|
|
|
LOG_TRACE(log, "Cleared " << num_nodes_to_delete << " old blocks from ZooKeeper");
|
2017-08-09 21:09:44 +00:00
|
|
|
}
|
|
|
|
|
|
|
|
|
|
|
|
/// Fills `timed_blocks` with all children of <zookeeper_path>/blocks together with their stats,
/// sorted by NodeWithStat::greaterByTime (largest ctime first).
///
/// Stats of blocks seen on a previous call are taken from `cached_block_stats`; stats of new blocks
/// are requested asynchronously from ZooKeeper and added to the cache.
///
/// @param zookeeper     session used for all requests
/// @param timed_blocks  output; any previous content is discarded
///
/// Throws Exception(NOT_FOUND_NODE) if listing /blocks does not return ZOK, and
/// zkutil::KeeperException(ZNONODE) if a listed block disappears before its stat is fetched.
void ReplicatedMergeTreeCleanupThread::getBlocksSortedByTime(zkutil::ZooKeeperPtr & zookeeper, std::vector<NodeWithStat> & timed_blocks)
{
    timed_blocks.clear();

    Strings blocks;
    zkutil::Stat stat;
    if (ZOK != zookeeper->tryGetChildren(storage.zookeeper_path + "/blocks", blocks, &stat))
        throw Exception(storage.zookeeper_path + "/blocks doesn't exist", ErrorCodes::NOT_FOUND_NODE);

    /// Drop already deleted blocks from the cache, so that cached_block_stats is a subset of blocks
    {
        NameSet blocks_set(blocks.begin(), blocks.end());
        for (auto it = cached_block_stats->begin(); it != cached_block_stats->end();)
        {
            if (!blocks_set.count(it->first))
                it = cached_block_stats->erase(it);
            else
                ++it;
        }
    }

    /// After the pruning above every cache key is one of `blocks`, so this unsigned subtraction
    /// cannot wrap around (unlike subtracting from the signed stat.numChildren).
    const size_t not_cached_blocks = blocks.size() - cached_block_stats->size();
    if (not_cached_blocks)
    {
        LOG_TRACE(log, "Checking " << stat.numChildren << " blocks (" << not_cached_blocks << " are not cached)"
            << " to clear old ones from ZooKeeper. This might take several minutes.");
    }

    /// Final sizes are known in advance: one output element per block, one request per uncached block.
    timed_blocks.reserve(blocks.size());
    std::vector<std::pair<String, zkutil::ZooKeeper::ExistsFuture>> exists_futures;
    exists_futures.reserve(not_cached_blocks);

    for (const String & block : blocks)
    {
        auto it = cached_block_stats->find(block);
        if (it == cached_block_stats->end())
        {
            /// New block. Fetch its stat asynchronously
            exists_futures.emplace_back(block, zookeeper->asyncExists(storage.zookeeper_path + "/blocks/" + block));
        }
        else
        {
            /// Cached block
            timed_blocks.emplace_back(block, it->second);
        }
    }

    /// Wait for the requested stats and put them into both the cache and the result
    for (auto & elem : exists_futures)
    {
        zkutil::ZooKeeper::StatAndExists status = elem.second.get();
        if (!status.exists)
            throw zkutil::KeeperException("A block node was suddenly deleted", ZNONODE);

        const RequiredStat required_stat(status.stat);
        cached_block_stats->emplace(elem.first, required_stat);
        timed_blocks.emplace_back(elem.first, required_stat);
    }

    std::sort(timed_blocks.begin(), timed_blocks.end(), NodeWithStat::greaterByTime);
}
|
2017-07-24 20:12:59 +00:00
|
|
|
|
2014-10-15 01:22:06 +00:00
|
|
|
|
2017-08-09 21:09:44 +00:00
|
|
|
/// Waits for the background thread to finish, if it is joinable.
/// The destructor does not request the stop itself: run() exits only once storage.shutdown_called is set.
ReplicatedMergeTreeCleanupThread::~ReplicatedMergeTreeCleanupThread()
{
    if (!thread.joinable())
        return;

    thread.join();
}
|
|
|
|
|
|
|
|
}
|