#include <Storages/MergeTree/MergeTreeDeduplicationLog.h>

#include <IO/ReadBufferFromFile.h>
#include <IO/WriteBufferFromFile.h>
#include <IO/ReadHelpers.h>
#include <IO/WriteHelpers.h>
#include <Common/Exception.h>
#include <boost/algorithm/string/split.hpp>
#include <boost/algorithm/string/classification.hpp>
#include <filesystem>

namespace DB
{

namespace
{

/// Deduplication operation: part was dropped or added
enum class MergeTreeDeduplicationOp : uint8_t
{
    ADD = 1,
    DROP = 2,
};

/// Record for deduplication on disk
struct MergeTreeDeduplicationLogRecord
{
    MergeTreeDeduplicationOp operation;
    std::string part_name;
    std::string block_id;
};

void writeRecord(const MergeTreeDeduplicationLogRecord & record, WriteBuffer & out)
{
    writeIntText(static_cast<uint8_t>(record.operation), out);
    writeChar('\t', out);
    writeString(record.part_name, out);
    writeChar('\t', out);
    writeString(record.block_id, out);
    writeChar('\n', out);
    out.next();
}

void readRecord(MergeTreeDeduplicationLogRecord & record, ReadBuffer & in)
{
    uint8_t op;
    readIntText(op, in);
    record.operation = static_cast<MergeTreeDeduplicationOp>(op);
    assertChar('\t', in);
    readString(record.part_name, in);
    assertChar('\t', in);
    readString(record.block_id, in);
    assertChar('\n', in);
}

std::string getLogPath(const std::string & prefix, size_t number)
{
    std::filesystem::path path(prefix);
    path /= std::filesystem::path(std::string{"deduplication_log_"} + std::to_string(number) + ".txt");
    return path;
}

size_t getLogNumber(const std::string & path_str)
{
    std::filesystem::path path(path_str);
    std::string filename = path.stem();
    Strings filename_parts;
    boost::split(filename_parts, filename, boost::is_any_of("_"));

    return parse<size_t>(filename_parts[2]);
}

}

MergeTreeDeduplicationLog::MergeTreeDeduplicationLog(
    const std::string & logs_dir_,
    size_t deduplication_window_,
    const MergeTreeDataFormatVersion & format_version_)
    : logs_dir(logs_dir_)
    , deduplication_window(deduplication_window_)
    , rotate_interval(deduplication_window_ * 2) /// actually it doesn't matter
    , format_version(format_version_)
    , deduplication_map(deduplication_window)
{
    namespace fs = std::filesystem;
    if (deduplication_window != 0 && !fs::exists(logs_dir))
        fs::create_directories(logs_dir);
}

void MergeTreeDeduplicationLog::load()
{
    namespace fs = std::filesystem;
    if (!fs::exists(logs_dir))
        return;

    for (const auto & p : fs::directory_iterator(logs_dir))
    {
        const auto & path = p.path();
        auto log_number = getLogNumber(path);
        existing_logs[log_number] = {path, 0};
    }

    /// We should know which logs exist even in the case
    /// of deduplication_window = 0
    if (!existing_logs.empty())
        current_log_number = existing_logs.rbegin()->first;

    if (deduplication_window != 0)
    {
        /// Order is important: we load the history from the beginning to the end
        for (auto & [log_number, desc] : existing_logs)
        {
            try
            {
                desc.entries_count = loadSingleLog(desc.path);
            }
            catch (...)
            {
                tryLogCurrentException(__PRETTY_FUNCTION__, "Error while loading MergeTree deduplication log on path " + desc.path);
            }
        }

        /// Start a new log, drop previous ones if needed
        rotateAndDropIfNeeded();

        /// Can happen in case we have an unfinished log
        if (!current_writer)
            current_writer = std::make_unique<WriteBufferFromFile>(existing_logs.rbegin()->second.path, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY);
    }
}

size_t MergeTreeDeduplicationLog::loadSingleLog(const std::string & path)
{
    ReadBufferFromFile read_buf(path);

    size_t total_entries = 0;
    while (!read_buf.eof())
    {
        MergeTreeDeduplicationLogRecord record;
        readRecord(record, read_buf);
        if (record.operation == MergeTreeDeduplicationOp::DROP)
            deduplication_map.erase(record.block_id);
        else
            deduplication_map.insert(record.block_id, MergeTreePartInfo::fromPartName(record.part_name, format_version));
        total_entries++;
    }
    return total_entries;
}

void MergeTreeDeduplicationLog::rotate()
{
    /// We don't deduplicate anything, so we don't need any writers
    if (deduplication_window == 0)
        return;

    current_log_number++;
    auto new_path = getLogPath(logs_dir, current_log_number);
    MergeTreeDeduplicationLogNameDescription log_description{new_path, 0};
    existing_logs.emplace(current_log_number, log_description);

    if (current_writer)
        current_writer->sync();

    current_writer = std::make_unique<WriteBufferFromFile>(log_description.path, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY);
}

void MergeTreeDeduplicationLog::dropOutdatedLogs()
{
    size_t current_sum = 0;
    size_t remove_from_value = 0;
    /// Go from the end to the beginning
    for (auto itr = existing_logs.rbegin(); itr != existing_logs.rend(); ++itr)
    {
        if (current_sum > deduplication_window)
        {
            /// We have more logs than required, all older files (including the current one) can be dropped
            remove_from_value = itr->first;
            break;
        }

        auto & description = itr->second;
        current_sum += description.entries_count;
    }

    /// If we found some logs to drop
    if (remove_from_value != 0)
    {
        /// Go from the beginning to the end and drop all outdated logs
        for (auto itr = existing_logs.begin(); itr != existing_logs.end();)
        {
            size_t number = itr->first;
            std::filesystem::remove(itr->second.path);
            itr = existing_logs.erase(itr);
            if (remove_from_value == number)
                break;
        }
    }
}

void MergeTreeDeduplicationLog::rotateAndDropIfNeeded()
{
    /// If we don't have logs at all or already have enough records in the current one
    if (existing_logs.empty() || existing_logs[current_log_number].entries_count >= rotate_interval)
    {
        rotate();
        dropOutdatedLogs();
    }
}

std::pair<MergeTreePartInfo, bool> MergeTreeDeduplicationLog::addPart(const std::string & block_id, const MergeTreePartInfo & part_info)
{
    std::lock_guard lock(state_mutex);

    /// We support the zero case because the user may want to disable deduplication with
    /// an ALTER MODIFY SETTING query. It's much simpler to handle the zero case
    /// here than to destroy the whole object, check for null pointers from different
    /// threads and so on.
    if (deduplication_window == 0)
        return std::make_pair(part_info, true);

    /// If we already have this block, let's deduplicate it
    if (deduplication_map.contains(block_id))
    {
        auto info = deduplication_map.get(block_id);
        return std::make_pair(info, false);
    }

    assert(current_writer != nullptr);

    /// Create a new record
    MergeTreeDeduplicationLogRecord record;
    record.operation = MergeTreeDeduplicationOp::ADD;
    record.part_name = part_info.getPartName();
    record.block_id = block_id;
    /// Write it to disk
    writeRecord(record, *current_writer);
    /// We have one more record in the current log
    existing_logs[current_log_number].entries_count++;
    /// Add to the deduplication map
    deduplication_map.insert(record.block_id, part_info);
    /// Rotate and drop old logs if needed
    rotateAndDropIfNeeded();

    return std::make_pair(part_info, true);
}

void MergeTreeDeduplicationLog::dropPart(const MergeTreePartInfo & drop_part_info)
{
    std::lock_guard lock(state_mutex);

    /// We support the zero case because the user may want to disable deduplication with
    /// an ALTER MODIFY SETTING query. It's much simpler to handle the zero case
    /// here than to destroy the whole object, check for null pointers from different
    /// threads and so on.
    if (deduplication_window == 0)
        return;

    assert(current_writer != nullptr);

    for (auto itr = deduplication_map.begin(); itr != deduplication_map.end(); /* no increment here, we are erasing from the map */)
    {
        const auto & part_info = itr->value;
        /// The part is covered by the dropped part, let's remove it from
        /// the deduplication history
        if (drop_part_info.contains(part_info))
        {
            /// Create a drop record
            MergeTreeDeduplicationLogRecord record;
            record.operation = MergeTreeDeduplicationOp::DROP;
            record.part_name = part_info.getPartName();
            record.block_id = itr->key;
            /// Write it to disk
            writeRecord(record, *current_writer);
            /// We have one more record on disk
            existing_logs[current_log_number].entries_count++;

            /// Increment itr before erase, otherwise it will be invalidated
            ++itr;
            /// Remove the block_id from the in-memory table
            deduplication_map.erase(record.block_id);

            /// Rotate and drop old logs if needed
            rotateAndDropIfNeeded();
        }
        else
        {
            ++itr;
        }
    }
}

void MergeTreeDeduplicationLog::setDeduplicationWindowSize(size_t deduplication_window_)
{
    std::lock_guard lock(state_mutex);

    deduplication_window = deduplication_window_;
    rotate_interval = deduplication_window * 2;

    /// If the setting is set for the first time with an ALTER MODIFY SETTING query
    if (deduplication_window != 0 && !std::filesystem::exists(logs_dir))
        std::filesystem::create_directories(logs_dir);

    deduplication_map.setMaxSize(deduplication_window);
    rotateAndDropIfNeeded();

    /// Can happen in case we have an unfinished log
    if (!current_writer)
        current_writer = std::make_unique<WriteBufferFromFile>(existing_logs.rbegin()->second.path, DBMS_DEFAULT_BUFFER_SIZE, O_APPEND | O_CREAT | O_WRONLY);
}

}
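
/** Illustrative usage sketch (names such as `deduplication_log`, `block_id` and `new_part_info`
  * are hypothetical, not part of this file): the caller that wrote a block registers the new part
  * via addPart(). If the second element of the returned pair is false, the block is a duplicate
  * and the info of the already-registered part is returned instead, so the commit can be skipped.
  *
  *     auto [covering_part, is_new] = deduplication_log->addPart(block_id, new_part_info);
  *     if (!is_new)
  *     {
  *         /// Duplicate block: the data is already covered by `covering_part`, skip the commit.
  *     }
  */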