/// NOTE: The original include targets were lost in extraction; these four paths are a
/// best-effort reconstruction based on the identifiers used below.
#include <Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h>
#include <Storages/MergeTree/checkDataPart.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <Common/ProfileEvents.h>

namespace ProfileEvents
{
    extern const Event ReplicatedPartChecks;
    extern const Event ReplicatedPartChecksFailed;
    extern const Event ReplicatedDataLoss;
}

namespace DB
{

namespace ErrorCodes
{
    extern const int TABLE_DIFFERS_TOO_MUCH;
}

static const auto PART_CHECK_ERROR_SLEEP_MS = 5 * 1000;


ReplicatedMergeTreePartCheckThread::ReplicatedMergeTreePartCheckThread(StorageReplicatedMergeTree & storage_)
    : storage(storage_)
    , log_name(storage.database_name + "." + storage.table_name + " (ReplicatedMergeTreePartCheckThread)")
    , log(&Logger::get(log_name))
{
    task = storage.global_context.getSchedulePool().createTask(log_name, [this] { run(); });
    task->schedule();
}

ReplicatedMergeTreePartCheckThread::~ReplicatedMergeTreePartCheckThread()
{
    stop();
}

void ReplicatedMergeTreePartCheckThread::start()
{
    std::lock_guard lock(start_stop_mutex);
    need_stop = false;
    task->activateAndSchedule();
}

void ReplicatedMergeTreePartCheckThread::stop()
{
    /// Based on the discussion in https://github.com/yandex/ClickHouse/pull/1489#issuecomment-344756259:
    /// with the schedule pool it is safe to call stop() twice in a row, or start() multiple times.
    std::lock_guard lock(start_stop_mutex);

    need_stop = true;
    task->deactivate();
}

void ReplicatedMergeTreePartCheckThread::enqueuePart(const String & name, time_t delay_to_check_seconds)
{
    std::lock_guard lock(parts_mutex);

    if (parts_set.count(name))
        return;

    parts_queue.emplace_back(name, time(nullptr) + delay_to_check_seconds);
    parts_set.insert(name);
    task->schedule();
}

size_t ReplicatedMergeTreePartCheckThread::size() const
{
    std::lock_guard lock(parts_mutex);
    return parts_set.size();
}


void ReplicatedMergeTreePartCheckThread::searchForMissingPart(const String & part_name)
{
    auto zookeeper = storage.getZooKeeper();
    String part_path = storage.replica_path + "/parts/" + part_name;

    /// If the part is in ZooKeeper, remove it from there and add the task to download it to the queue.
    if (zookeeper->exists(part_path))
    {
        LOG_WARNING(log, "Part " << part_name << " exists in ZooKeeper but not locally. "
            "Removing from ZooKeeper and queueing a fetch.");
        ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);

        storage.removePartAndEnqueueFetch(part_name);
        return;
    }

    /// If the part is not in ZooKeeper, we'll check if it's at least somewhere.
    auto part_info = MergeTreePartInfo::fromPartName(part_name, storage.data.format_version);

    /** The logic is as follows:
      * - if some live or inactive replica has such a part, or a part covering it,
      *   it is Ok, nothing is needed: it will be downloaded when processing the queue, when the replica comes to life;
      *   or, if the replica never comes to life, the administrator will delete it or create a new replica
      *   with the same address, and then everything will be reviewed from the beginning;
      * - if no one has such a part or a part covering it, then:
      *   - if there are two smaller parts, one with the same min block and the other with the same
      *     max block, we hope that all parts in between are present too, and the needed part
      *     will appear on other replicas as a result of a merge;
      *   - otherwise, consider the part lost and delete the entry from the queue.
      *
      * Note that this logic is not perfect - some part in the interior may be missing and the
      * needed part will never appear.
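      *
      * A made-up example of the heuristic above: suppose part all_0_5_1 is missing and no replica
      * has a part covering it, but one replica has all_0_2_1 (same min block, 0) and another has
      * all_4_5_1 (same max block, 5). We then assume the intermediate blocks exist somewhere too,
      * and that a future merge will produce a part covering all_0_5_1, so the queue entry is kept.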
      * But precisely determining whether the part will appear as a result of a merge is complicated -
      * we can't just check if all block numbers covered by the missing part are present somewhere
      * (because gaps between blocks are possible), and to determine the constituent parts of the
      * merge we would need to query the replication log (both the common log and the queues of the
      * individual replicas) and then, if the constituent parts are in turn not found, solve the
      * problem recursively for them.
      *
      * Considering the part lost when it is not in fact lost is very dangerous because it leads
      * to divergent replicas and intersecting parts. So we err on the side of caution
      * and don't delete the queue entry when in doubt.
      */
    LOG_WARNING(log, "Checking if anyone has a part covering " << part_name << ".");

    bool found_part_with_the_same_min_block = false;
    bool found_part_with_the_same_max_block = false;

    Strings replicas = zookeeper->getChildren(storage.zookeeper_path + "/replicas");
    for (const String & replica : replicas)
    {
        Strings parts = zookeeper->getChildren(storage.zookeeper_path + "/replicas/" + replica + "/parts");
        for (const String & part_on_replica : parts)
        {
            auto part_on_replica_info = MergeTreePartInfo::fromPartName(part_on_replica, storage.data.format_version);

            if (part_on_replica_info.contains(part_info))
            {
                LOG_WARNING(log, "Found part " << part_on_replica << " on " << replica
                    << " that covers the missing part " << part_name);
                return;
            }

            if (part_info.contains(part_on_replica_info))
            {
                if (part_on_replica_info.min_block == part_info.min_block)
                    found_part_with_the_same_min_block = true;
                if (part_on_replica_info.max_block == part_info.max_block)
                    found_part_with_the_same_max_block = true;

                if (found_part_with_the_same_min_block && found_part_with_the_same_max_block)
                {
                    LOG_WARNING(log, "Found parts with the same min block and with the same max block as the missing part "
                        << part_name << ". Hoping that it will eventually appear as a result of a merge.");
                    return;
                }
            }
        }
    }

    /// No one has such a part, and a merge is impossible.

    String not_found_msg;
    if (found_part_with_the_same_max_block)
        not_found_msg = "a smaller part with the same max block.";
    else if (found_part_with_the_same_min_block)
        not_found_msg = "a smaller part with the same min block.";
    else
        not_found_msg = "smaller parts with either the same min block or the same max block.";

    LOG_ERROR(log, "No replica has part covering " << part_name
        << " and a merge is impossible: we didn't find " << not_found_msg);

    ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);

    /// Is it in the replication queue? If it is, remove it, because the task cannot be processed.
    if (!storage.queue.remove(zookeeper, part_name))
    {
        /// The part was not in our queue. Why did this happen?
        LOG_ERROR(log, "Missing part " << part_name << " is not in our queue.");
        return;
    }

    /** This situation is possible if the part became corrupted on all the replicas that had it.
      * For example, a replica that had just written it lost power before the data was flushed from cache to disk.
      */
    LOG_ERROR(log, "Part " << part_name << " is lost forever.");
    ProfileEvents::increment(ProfileEvents::ReplicatedDataLoss);
}


void ReplicatedMergeTreePartCheckThread::checkPart(const String & part_name)
{
    LOG_WARNING(log, "Checking part " << part_name);
    ProfileEvents::increment(ProfileEvents::ReplicatedPartChecks);

    /// If the part is still in the PreCommitted -> Committed transition, it is not lost
    /// and there is no need to go searching for it on other replicas.
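    /// (Assumption, for context: a PreCommitted part has already been written to the local
    /// filesystem but is not yet visible in the active set; it becomes Committed once the
    /// corresponding ZooKeeper transaction succeeds.)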
    /// To definitely find the needed part, if it exists (or a part covering it),
    /// we first search among the PreCommitted parts.
    auto part = storage.data.getPartIfExists(part_name, {MergeTreeDataPartState::PreCommitted});
    if (!part)
        part = storage.data.getActiveContainingPart(part_name);

    /// We do not have this or a covering part.
    if (!part)
    {
        searchForMissingPart(part_name);
    }
    /// We have this part, and it's active. We will check whether we need this part and whether it has the right data.
    else if (part->name == part_name)
    {
        auto zookeeper = storage.getZooKeeper();
        auto table_lock = storage.lockStructure(false);

        /// If the part is in ZooKeeper, compare its data with its local checksums, and those with the checksums in ZooKeeper.
        if (zookeeper->exists(storage.replica_path + "/parts/" + part_name))
        {
            LOG_WARNING(log, "Checking data of part " << part_name << ".");

            try
            {
                auto zk_checksums = MinimalisticDataPartChecksums::deserializeFrom(
                    zookeeper->get(storage.replica_path + "/parts/" + part_name + "/checksums"));
                zk_checksums.checkEqual(part->checksums, true);

                auto zk_columns = NamesAndTypesList::parse(
                    zookeeper->get(storage.replica_path + "/parts/" + part_name + "/columns"));
                if (part->columns != zk_columns)
                    throw Exception("Columns of local part " + part_name + " are different from ZooKeeper",
                        ErrorCodes::TABLE_DIFFERS_TOO_MUCH);

                checkDataPart(
                    storage.data.getFullPath() + part_name,
                    storage.data.index_granularity,
                    true,
                    storage.data.primary_key_data_types,
                    [this] { return need_stop.load(); });

                if (need_stop)
                {
                    LOG_INFO(log, "Checking part was cancelled.");
                    return;
                }

                LOG_INFO(log, "Part " << part_name << " looks good.");
            }
            catch (const Exception &)
            {
                /// TODO Better to check the error code.

                tryLogCurrentException(log, __PRETTY_FUNCTION__);

                LOG_ERROR(log, "Part " << part_name << " looks broken. Removing it and queueing a fetch.");
                ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);

                storage.removePartAndEnqueueFetch(part_name);

                /// Delete the part locally.
                storage.data.forgetPartAndMoveToDetached(part, "broken_");
            }
        }
        else if (part->modification_time + MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER < time(nullptr))
        {
            /// If the part is not in ZooKeeper, delete it locally.
            /// Probably, someone has just written the part and has not yet added it to ZK.
            /// Therefore, delete only if the part is old (not very reliable).
            ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);

            LOG_ERROR(log, "Unexpected part " << part_name << " in filesystem. Removing.");
            storage.data.forgetPartAndMoveToDetached(part, "unexpected_");
        }
        else
        {
            /// TODO We need to make sure that the part is still checked after some time.
            /// Otherwise, it's possible that the part was not added to ZK,
            /// but remained in the filesystem and in the set of active parts.
            /// And then for a long time (until the server restarts), the data on the replicas will differ.

            LOG_TRACE(log, "Young part " << part_name
                << " with age " << (time(nullptr) - part->modification_time)
                << " seconds hasn't been added to ZooKeeper yet. It's ok.");
        }
    }
    else
    {
        /// If we have a covering part, ignore all the problems with this part.
        /// In the worst case, errors will still appear in the error log for `old_parts_lifetime` seconds until the part is removed as an old one.
        LOG_WARNING(log, "We have part " << part->name << " covering part " << part_name);
    }
}


void ReplicatedMergeTreePartCheckThread::run()
{
    if (need_stop)
        return;

    try
    {
        time_t current_time = time(nullptr);

        /// Take a part from the queue for verification.
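        /// parts_queue holds (part_name, check_after_time) pairs (see enqueuePart); we pick the
        /// first entry whose scheduled check time has already arrived.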
        PartsToCheckQueue::iterator selected = parts_queue.end();    /// The end iterator of std::list does not get invalidated.
        /// NOTE: min_check_time is computed below but is not currently used to schedule the next check.
        time_t min_check_time = std::numeric_limits<time_t>::max();

        {
            std::lock_guard lock(parts_mutex);

            if (parts_queue.empty())
            {
                if (!parts_set.empty())
                {
                    LOG_ERROR(log, "Non-empty parts_set with empty parts_queue. This is a bug.");
                    parts_set.clear();
                }
            }
            else
            {
                for (auto it = parts_queue.begin(); it != parts_queue.end(); ++it)
                {
                    if (it->second <= current_time)
                    {
                        selected = it;
                        break;
                    }

                    if (it->second < min_check_time)
                        min_check_time = it->second;
                }
            }
        }

        if (selected == parts_queue.end())
            return;

        checkPart(selected->first);

        if (need_stop)
            return;

        /// Remove the part from the check queue.
        {
            std::lock_guard lock(parts_mutex);

            if (parts_queue.empty())
            {
                LOG_ERROR(log, "Someone erased the part being checked from parts_queue. This is a bug.");
            }
            else
            {
                parts_set.erase(selected->first);
                parts_queue.erase(selected);
            }
        }

        task->schedule();
    }
    catch (const Coordination::Exception & e)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);

        if (e.code == Coordination::ZSESSIONEXPIRED)
            return;

        task->scheduleAfter(PART_CHECK_ERROR_SLEEP_MS);
    }
    catch (...)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);
        task->scheduleAfter(PART_CHECK_ERROR_SLEEP_MS);
    }
}

}
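/** Usage sketch (hypothetical driver code, not part of this file): other replication code notices
  * a suspicious or missing part and hands it to this thread, e.g.:
  *
  *     /// Re-check part "all_0_5_1" in 60 seconds:
  *     storage.part_check_thread.enqueuePart("all_0_5_1", 60);
  *
  * run() later picks the entry up once its check time arrives and calls checkPart(), which either
  * validates the local data, removes the broken part and queues a fetch, or falls back to
  * searchForMissingPart().
  */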