ClickHouse/src/Storages/MergeTree/MergeTreeDeduplicationLog.cpp

233 lines
6.4 KiB
C++
Raw Normal View History

2021-03-31 15:20:30 +00:00
#include <Storages/MergeTree/MergeTreeDeduplicationLog.h>
#include <filesystem>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/join.hpp>
#include <boost/algorithm/string/trim.hpp>
2021-04-02 16:45:18 +00:00
#include <IO/ReadBufferFromFile.h>
2021-04-02 12:37:42 +00:00
#include <IO/WriteHelpers.h>
#include <IO/ReadHelpers.h>
2021-03-31 15:20:30 +00:00
namespace DB
{
namespace
{
2021-04-02 11:46:42 +00:00
enum class MergeTreeDeduplicationOp : uint8_t
{
ADD = 1,
DROP = 2,
};
struct MergeTreeDeduplicationLogRecord
{
MergeTreeDeduplicationOp operation;
std::string part_name;
std::string block_id;
};
void writeRecord(const MergeTreeDeduplicationLogRecord & record, WriteBuffer & out)
{
writeIntText(static_cast<uint8_t>(record.operation), out);
2021-04-02 16:45:18 +00:00
writeChar('\t', out);
writeString(record.part_name, out);
writeChar('\t', out);
writeString(record.block_id, out);
2021-04-02 11:46:42 +00:00
writeChar('\n', out);
2021-04-02 16:45:18 +00:00
out.next();
2021-04-02 11:46:42 +00:00
}
void readRecord(MergeTreeDeduplicationLogRecord & record, ReadBuffer & in)
{
uint8_t op;
readIntText(op, in);
record.operation = static_cast<MergeTreeDeduplicationOp>(op);
2021-04-02 16:45:18 +00:00
assertChar('\t', in);
readString(record.part_name, in);
assertChar('\t', in);
readString(record.block_id, in);
2021-04-02 11:46:42 +00:00
assertChar('\n', in);
}
2021-03-31 15:20:30 +00:00
std::string getLogPath(const std::string & prefix, size_t number)
{
std::filesystem::path path(prefix);
path /= std::filesystem::path(std::string{"deduplication_log_"} + std::to_string(number) + ".txt");
return path;
}
size_t getLogNumber(const std::string & path_str)
{
std::filesystem::path path(path_str);
std::string filename = path.stem();
Strings filename_parts;
boost::split(filename_parts, filename, boost::is_any_of("_"));
return parse<size_t>(filename_parts[2]);
}
}
MergeTreeDeduplicationLog::MergeTreeDeduplicationLog(
const std::string & logs_dir_,
size_t deduplication_window_,
2021-04-02 11:46:42 +00:00
const MergeTreeDataFormatVersion & format_version_)
2021-03-31 15:20:30 +00:00
: logs_dir(logs_dir_)
, deduplication_window(deduplication_window_)
2021-04-02 11:46:42 +00:00
, rotate_interval(deduplication_window_ * 2) /// actually it doesn't matter
, format_version(format_version_)
, deduplication_map(deduplication_window)
2021-03-31 15:20:30 +00:00
{}
void MergeTreeDeduplicationLog::load()
{
namespace fs = std::filesystem;
if (!fs::exists(logs_dir))
fs::create_directories(logs_dir);
for (const auto & p : fs::directory_iterator(logs_dir))
{
auto path = p.path();
auto log_number = getLogNumber(path);
2021-04-02 11:46:42 +00:00
existing_logs[log_number] = {path, 0};
2021-03-31 15:20:30 +00:00
}
2021-04-02 11:46:42 +00:00
/// Order important
for (auto & [log_number, desc] : existing_logs)
{
desc.entries_count = loadSingleLog(desc.path);
current_log_number = log_number;
}
rotateAndDropIfNeeded();
2021-04-02 16:45:18 +00:00
if (!current_writer)
current_writer = std::make_unique<WriteBufferFromFile>(existing_logs.rbegin()->second.path, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY);
2021-03-31 15:20:30 +00:00
}
2021-04-02 11:46:42 +00:00
size_t MergeTreeDeduplicationLog::loadSingleLog(const std::string & path)
2021-03-31 15:20:30 +00:00
{
ReadBufferFromFile read_buf(path);
2021-04-02 11:46:42 +00:00
size_t total_entries = 0;
2021-03-31 15:20:30 +00:00
while (!read_buf.eof())
{
2021-04-02 11:46:42 +00:00
MergeTreeDeduplicationLogRecord record;
readRecord(record, read_buf);
if (record.operation == MergeTreeDeduplicationOp::DROP)
deduplication_map.erase(record.block_id);
else
deduplication_map.insert(record.block_id, MergeTreePartInfo::fromPartName(record.part_name, format_version));
total_entries++;
2021-03-31 15:20:30 +00:00
}
2021-04-02 11:46:42 +00:00
return total_entries;
2021-03-31 15:20:30 +00:00
}
void MergeTreeDeduplicationLog::rotate()
{
2021-04-02 11:46:42 +00:00
current_log_number++;
auto new_path = getLogPath(logs_dir, current_log_number);
MergeTreeDeduplicationLogNameDescription log_description{new_path, 0};
existing_logs.emplace(current_log_number, log_description);
if (current_writer)
current_writer->sync();
current_writer = std::make_unique<WriteBufferFromFile>(log_description.path, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY);
}
void MergeTreeDeduplicationLog::dropOutdatedLogs()
{
size_t current_sum = 0;
2021-04-02 12:37:42 +00:00
size_t remove_from_value = 0;
for (auto itr = existing_logs.rbegin(); itr != existing_logs.rend(); ++itr)
2021-04-02 11:46:42 +00:00
{
auto & description = itr->second;
if (current_sum > deduplication_window)
{
2021-04-02 12:37:42 +00:00
remove_from_value = itr->first;
break;
2021-04-02 11:46:42 +00:00
}
2021-04-02 12:37:42 +00:00
current_sum += description.entries_count;
}
if (remove_from_value != 0)
{
for (auto itr = existing_logs.begin(); itr != existing_logs.end();)
2021-04-02 11:46:42 +00:00
{
2021-04-02 13:34:36 +00:00
size_t number = itr->first;
std::filesystem::remove(itr->second.path);
itr = existing_logs.erase(itr);
if (remove_from_value == number)
break;
2021-04-02 11:46:42 +00:00
}
}
2021-04-02 12:37:42 +00:00
2021-04-02 11:46:42 +00:00
}
void MergeTreeDeduplicationLog::rotateAndDropIfNeeded()
{
2021-04-02 16:45:18 +00:00
if (existing_logs.empty() || existing_logs[current_log_number].entries_count >= rotate_interval)
2021-04-02 11:46:42 +00:00
{
rotate();
dropOutdatedLogs();
}
2021-04-02 16:45:18 +00:00
2021-04-02 11:46:42 +00:00
}
2021-04-02 16:45:18 +00:00
std::pair<MergeTreePartInfo, bool> MergeTreeDeduplicationLog::addPart(const std::string & block_id, const MergeTreePartInfo & part_info)
2021-04-02 11:46:42 +00:00
{
std::lock_guard lock(state_mutex);
2021-03-31 15:20:30 +00:00
2021-04-02 11:46:42 +00:00
if (deduplication_map.contains(block_id))
2021-04-02 16:45:18 +00:00
{
auto info = deduplication_map.get(block_id);
return std::make_pair(info, false);
}
2021-03-31 15:20:30 +00:00
2021-04-02 11:46:42 +00:00
assert(current_writer != nullptr);
2021-04-02 16:45:18 +00:00
2021-04-02 11:46:42 +00:00
MergeTreeDeduplicationLogRecord record;
record.operation = MergeTreeDeduplicationOp::ADD;
2021-04-02 16:45:18 +00:00
record.part_name = part_info.getPartName();
2021-04-02 11:46:42 +00:00
record.block_id = block_id;
writeRecord(record, *current_writer);
existing_logs[current_log_number].entries_count++;
2021-04-02 16:45:18 +00:00
deduplication_map.insert(record.block_id, part_info);
rotateAndDropIfNeeded();
2021-04-02 11:46:42 +00:00
2021-04-02 16:45:18 +00:00
return std::make_pair(part_info, true);
2021-04-02 11:46:42 +00:00
}
2021-04-02 16:45:18 +00:00
void MergeTreeDeduplicationLog::dropPart(const MergeTreePartInfo & drop_part_info)
2021-04-02 11:46:42 +00:00
{
std::lock_guard lock(state_mutex);
assert(current_writer != nullptr);
2021-04-02 16:45:18 +00:00
for (auto itr = deduplication_map.begin(); itr != deduplication_map.end();)
2021-04-02 11:46:42 +00:00
{
2021-04-02 13:34:36 +00:00
const auto & part_info = itr->value;
2021-04-02 16:45:18 +00:00
if (drop_part_info.contains(part_info))
2021-04-02 11:46:42 +00:00
{
2021-04-02 13:34:36 +00:00
MergeTreeDeduplicationLogRecord record;
2021-04-02 11:46:42 +00:00
record.operation = MergeTreeDeduplicationOp::DROP;
record.part_name = part_info.getPartName();
2021-04-02 13:34:36 +00:00
record.block_id = itr->key;
2021-04-02 11:46:42 +00:00
writeRecord(record, *current_writer);
2021-04-02 16:45:18 +00:00
2021-04-02 11:46:42 +00:00
existing_logs[current_log_number].entries_count++;
2021-04-02 16:45:18 +00:00
++itr;
deduplication_map.erase(record.block_id);
2021-04-02 11:46:42 +00:00
rotateAndDropIfNeeded();
}
2021-04-02 16:45:18 +00:00
else
2021-04-02 11:46:42 +00:00
{
2021-04-02 16:45:18 +00:00
++itr;
2021-04-02 11:46:42 +00:00
}
}
2021-03-31 15:20:30 +00:00
}
}