ClickHouse/src/Storages/MergeTree/ReplicatedMergeTreePartCheckThread.cpp

#include <Storages/MergeTree/ReplicatedMergeTreePartCheckThread.h>
#include <Storages/MergeTree/checkDataPart.h>
#include <Storages/MergeTree/ReplicatedMergeTreePartHeader.h>
#include <Storages/StorageReplicatedMergeTree.h>
#include <Interpreters/Context.h>


namespace ProfileEvents
{
    extern const Event ReplicatedPartChecks;
    extern const Event ReplicatedPartChecksFailed;
    extern const Event ReplicatedDataLoss;
}

namespace DB
{

namespace ErrorCodes
{
    extern const int TABLE_DIFFERS_TOO_MUCH;
}

static const auto PART_CHECK_ERROR_SLEEP_MS = 5 * 1000;


ReplicatedMergeTreePartCheckThread::ReplicatedMergeTreePartCheckThread(StorageReplicatedMergeTree & storage_)
    : storage(storage_)
    , log_name(storage.getStorageID().getFullTableName() + " (ReplicatedMergeTreePartCheckThread)")
    , log(&Poco::Logger::get(log_name))
{
    task = storage.global_context.getSchedulePool().createTask(log_name, [this] { run(); });
    task->schedule();
}

ReplicatedMergeTreePartCheckThread::~ReplicatedMergeTreePartCheckThread()
{
    stop();
}

void ReplicatedMergeTreePartCheckThread::start()
{
    std::lock_guard lock(start_stop_mutex);
    need_stop = false;
    task->activateAndSchedule();
}

void ReplicatedMergeTreePartCheckThread::stop()
{
    //based on discussion on https://github.com/ClickHouse/ClickHouse/pull/1489#issuecomment-344756259
    //using the schedule pool there is no problem in case stop is called two time in row and the start multiple times

    std::lock_guard lock(start_stop_mutex);
    need_stop = true;
    task->deactivate();
}

void ReplicatedMergeTreePartCheckThread::enqueuePart(const String & name, time_t delay_to_check_seconds)
{
    std::lock_guard lock(parts_mutex);

    if (parts_set.count(name))
        return;

    parts_queue.emplace_back(name, time(nullptr) + delay_to_check_seconds);
    parts_set.insert(name);
    task->schedule();
}


size_t ReplicatedMergeTreePartCheckThread::size() const
{
    std::lock_guard lock(parts_mutex);
    return parts_set.size();
}


ReplicatedMergeTreePartCheckThread::MissingPartSearchResult ReplicatedMergeTreePartCheckThread::searchForMissingPartOnOtherReplicas(const String & part_name)
{
    auto zookeeper = storage.getZooKeeper();

    /// If the part is not in ZooKeeper, we'll check if it's at least somewhere.
    auto part_info = MergeTreePartInfo::fromPartName(part_name, storage.format_version);

    /** The logic is as follows:
        * - if some live or inactive replica has such a part, or a part covering it
        *   - it is Ok, nothing is needed, it is then downloaded when processing the queue, when the replica comes to life;
        *   - or, if the replica never comes to life, then the administrator will delete or create a new replica with the same address and see everything from the beginning;
        * - if no one has such part or a part covering it, then
        *   - if there are two smaller parts, one with the same min block and the other with the same
        *     max block, we hope that all parts in between are present too and the needed part
        *     will appear on other replicas as a result of a merge.
        *   - otherwise, consider the part lost and delete the entry from the queue.
        *
        *   Note that this logic is not perfect - some part in the interior may be missing and the
        *   needed part will never appear. But precisely determining whether the part will appear as
        *   a result of a merge is complicated - we can't just check if all block numbers covered
        *   by the missing part are present somewhere (because gaps between blocks are possible)
        *   and to determine the constituent parts of the merge we need to query the replication log
        *   (both the common log and the queues of the individual replicas) and then, if the
        *   constituent parts are in turn not found, solve the problem recursively for them.
        *
        *   Considering the part lost when it is not in fact lost is very dangerous because it leads
        *   to divergent replicas and intersecting parts. So we err on the side of caution
        *   and don't delete the queue entry when in doubt.
        */

    LOG_WARNING(log, "Checking if anyone has a part {} or covering part.", part_name);

    bool found_part_with_the_same_min_block = false;
    bool found_part_with_the_same_max_block = false;

    Strings replicas = zookeeper->getChildren(storage.zookeeper_path + "/replicas");
    for (const String & replica : replicas)
    {
        String replica_path = storage.zookeeper_path + "/replicas/" + replica;

        Strings parts = zookeeper->getChildren(replica_path + "/parts");
        for (const String & part_on_replica : parts)
        {
            auto part_on_replica_info = MergeTreePartInfo::fromPartName(part_on_replica, storage.format_version);

            if (part_info == part_on_replica_info)
            {
                /// Found missing part at ourself. If we are here then something wrong with this part, so skipping.
                if (replica_path == storage.replica_path)
                    continue;

                LOG_WARNING(log, "Found the missing part {} at {} on {}", part_name, part_on_replica, replica);
                return MissingPartSearchResult::FoundAndNeedFetch;
            }

            if (part_on_replica_info.contains(part_info))
            {
                LOG_WARNING(log, "Found part {} on {} that covers the missing part {}", part_on_replica, replica, part_name);
                return MissingPartSearchResult::FoundAndDontNeedFetch;
            }

            if (part_info.contains(part_on_replica_info))
            {
                if (part_on_replica_info.min_block == part_info.min_block)
                    found_part_with_the_same_min_block = true;
                if (part_on_replica_info.max_block == part_info.max_block)
                    found_part_with_the_same_max_block = true;

                if (found_part_with_the_same_min_block && found_part_with_the_same_max_block)
                {
                    LOG_WARNING(log, "Found parts with the same min block and with the same max block as the missing part {}. Hoping that it will eventually appear as a result of a merge.", part_name);
                    return MissingPartSearchResult::FoundAndDontNeedFetch;
                }
            }
        }
    }

    /// No one has such a part and the merge is impossible.
    String not_found_msg;
    if (found_part_with_the_same_max_block)
        not_found_msg = "a smaller part with the same max block.";
    else if (found_part_with_the_same_min_block)
        not_found_msg = "a smaller part with the same min block.";
    else
        not_found_msg = "smaller parts with either the same min block or the same max block.";
    LOG_ERROR(log, "No replica has part covering {} and a merge is impossible: we didn't find {}", part_name, not_found_msg);

    return MissingPartSearchResult::LostForever;
}

void ReplicatedMergeTreePartCheckThread::searchForMissingPartAndFetchIfPossible(const String & part_name, bool exists_in_zookeeper)
{
    auto zookeeper = storage.getZooKeeper();
    auto missing_part_search_result = searchForMissingPartOnOtherReplicas(part_name);

    /// If the part is in ZooKeeper, remove it from there and add the task to download it to the queue.
    if (exists_in_zookeeper)
    {
        /// If part found on some other replica
        if (missing_part_search_result == MissingPartSearchResult::FoundAndNeedFetch)
        {
            LOG_WARNING(log, "Part {} exists in ZooKeeper but not locally and found on other replica. Removing from ZooKeeper and queueing a fetch.", part_name);
            storage.removePartAndEnqueueFetch(part_name);
        }
        else /// If we have covering part on other replica or part is lost forever we don't need to fetch anything
        {
            LOG_WARNING(log, "Part {} exists in ZooKeeper but not locally and not found on other replica. Removing it from ZooKeeper.", part_name);
            storage.removePartFromZooKeeper(part_name);
        }
    }

    ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);

    if (missing_part_search_result == MissingPartSearchResult::LostForever)
    {
        /// Is it in the replication queue? If there is - delete, because the task can not be processed.
        if (!storage.queue.remove(zookeeper, part_name))
        {
            /// The part was not in our queue.
            LOG_WARNING(log, "Missing part {} is not in our queue, this can happen rarely.", part_name);
        }

        /** This situation is possible if on all the replicas where the part was, it deteriorated.
            * For example, a replica that has just written it has power turned off and the data has not been written from cache to disk.
            */
        LOG_ERROR(log, "Part {} is lost forever.", part_name);
        ProfileEvents::increment(ProfileEvents::ReplicatedDataLoss);
    }
}

std::pair<bool, MergeTreeDataPartPtr> ReplicatedMergeTreePartCheckThread::findLocalPart(const String & part_name)
{
    auto zookeeper = storage.getZooKeeper();
    String part_path = storage.replica_path + "/parts/" + part_name;

    /// It's important to check zookeeper first and after that check local storage,
    /// because our checks of local storage and zookeeper are not consistent.
    /// If part exists in zookeeper and doesn't exists in local storage definitely require
    /// to fetch this part. But if we check local storage first and than check zookeeper
    /// some background process can successfully commit part between this checks (both to the local stoarge and zookeeper),
    /// but checker thread will remove part from zookeeper and queue fetch.
    bool exists_in_zookeeper = zookeeper->exists(part_path);

    /// If the part is still in the PreCommitted -> Committed transition, it is not lost
    /// and there is no need to go searching for it on other replicas. To definitely find the needed part
    /// if it exists (or a part containing it) we first search among the PreCommitted parts.
    auto part = storage.getPartIfExists(part_name, {MergeTreeDataPartState::PreCommitted});
    if (!part)
        part = storage.getActiveContainingPart(part_name);

    return std::make_pair(exists_in_zookeeper, part);
}

CheckResult ReplicatedMergeTreePartCheckThread::checkPart(const String & part_name)
{
    LOG_WARNING(log, "Checking part {}", part_name);
    ProfileEvents::increment(ProfileEvents::ReplicatedPartChecks);

    auto [exists_in_zookeeper, part] = findLocalPart(part_name);

    /// We do not have this or a covering part.
    if (!part)
    {
        searchForMissingPartAndFetchIfPossible(part_name, exists_in_zookeeper);
        return {part_name, false, "Part is missing, will search for it"};
    }

    /// We have this part, and it's active. We will check whether we need this part and whether it has the right data.
    if (part->name == part_name)
    {
        auto zookeeper = storage.getZooKeeper();
        auto table_lock = storage.lockForShare(RWLockImpl::NO_QUERY, storage.getSettings()->lock_acquire_timeout_for_background_operations);

        auto local_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksums(
            part->getColumns(), part->checksums);

        String part_path = storage.replica_path + "/parts/" + part_name;
        String part_znode;
        /// If the part is in ZooKeeper, check its data with its checksums, and them with ZooKeeper.
        if (zookeeper->tryGet(part_path, part_znode))
        {
            LOG_WARNING(log, "Checking data of part {}.", part_name);

            try
            {
                ReplicatedMergeTreePartHeader zk_part_header;
                if (!part_znode.empty())
                    zk_part_header = ReplicatedMergeTreePartHeader::fromString(part_znode);
                else
                {
                    String columns_znode = zookeeper->get(part_path + "/columns");
                    String checksums_znode = zookeeper->get(part_path + "/checksums");
                    zk_part_header = ReplicatedMergeTreePartHeader::fromColumnsAndChecksumsZNodes(
                        columns_znode, checksums_znode);
                }

                if (local_part_header.getColumnsHash() != zk_part_header.getColumnsHash())
                    throw Exception("Columns of local part " + part_name + " are different from ZooKeeper", ErrorCodes::TABLE_DIFFERS_TOO_MUCH);

                zk_part_header.getChecksums().checkEqual(local_part_header.getChecksums(), true);

                checkDataPart(
                    part,
                    true,
                    [this] { return need_stop.load(); });

                if (need_stop)
                {
                    LOG_INFO(log, "Checking part was cancelled.");
                    return {part_name, false, "Checking part was cancelled"};
                }

                LOG_INFO(log, "Part {} looks good.", part_name);
            }
            catch (const Exception & e)
            {
                /// Don't count the part as broken if there is not enough memory to load it.
                /// In fact, there can be many similar situations.
                /// But it is OK, because there is a safety guard against deleting too many parts.
                if (isNotEnoughMemoryErrorCode(e.code()))
                    throw;

                tryLogCurrentException(log, __PRETTY_FUNCTION__);

                String message = "Part " + part_name + " looks broken. Removing it and will try to fetch.";
                LOG_ERROR(log, message);

                /// Part is broken, let's try to find it and fetch.
                searchForMissingPartAndFetchIfPossible(part_name, exists_in_zookeeper);

                /// Delete part locally.
                storage.forgetPartAndMoveToDetached(part, "broken");
                return {part_name, false, message};
            }
        }
        else if (part->modification_time + MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER < time(nullptr))
        {
            /// If the part is not in ZooKeeper, delete it locally.
            /// Probably, someone just wrote down the part, and has not yet added to ZK.
            /// Therefore, delete only if the part is old (not very reliable).
            ProfileEvents::increment(ProfileEvents::ReplicatedPartChecksFailed);

            String message = "Unexpected part " + part_name + " in filesystem. Removing.";
            LOG_ERROR(log, message);
            storage.forgetPartAndMoveToDetached(part, "unexpected");
            return {part_name, false, message};
        }
        else
        {
            /// TODO You need to make sure that the part is still checked after a while.
            /// Otherwise, it's possible that the part was not added to ZK,
            ///  but remained in the filesystem and in a number of active parts.
            /// And then for a long time (before restarting), the data on the replicas will be different.

            LOG_TRACE(log, "Young part {} with age {} seconds hasn't been added to ZooKeeper yet. It's ok.", part_name, (time(nullptr) - part->modification_time));
        }
    }
    else
    {
        /// If we have a covering part, ignore all the problems with this part.
        /// In the worst case, errors will still appear `old_parts_lifetime` seconds in error log until the part is removed as the old one.
        LOG_WARNING(log, "We have part {} covering part {}", part->name, part_name);
    }

    return {part_name, true, ""};
}


void ReplicatedMergeTreePartCheckThread::run()
{
    if (need_stop)
        return;

    try
    {
        time_t current_time = time(nullptr);

        /// Take part from the queue for verification.
        PartsToCheckQueue::iterator selected = parts_queue.end();    /// end from std::list is not get invalidated
        time_t min_check_time = std::numeric_limits<time_t>::max();

        {
            std::lock_guard lock(parts_mutex);

            if (parts_queue.empty())
            {
                if (!parts_set.empty())
                {
                    LOG_ERROR(log, "Non-empty parts_set with empty parts_queue. This is a bug.");
                    parts_set.clear();
                }
            }
            else
            {
                for (auto it = parts_queue.begin(); it != parts_queue.end(); ++it)
                {
                    if (it->second <= current_time)
                    {
                        selected = it;
                        break;
                    }

                    if (it->second < min_check_time)
                        min_check_time = it->second;
                }
            }
        }

        if (selected == parts_queue.end())
            return;

        checkPart(selected->first);

        if (need_stop)
            return;

        /// Remove the part from check queue.
        {
            std::lock_guard lock(parts_mutex);

            if (parts_queue.empty())
            {
                LOG_ERROR(log, "Someone erased checking part from parts_queue. This is a bug.");
            }
            else
            {
                parts_set.erase(selected->first);
                parts_queue.erase(selected);
            }
        }

        task->schedule();
    }
    catch (const Coordination::Exception & e)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);

        if (e.code == Coordination::Error::ZSESSIONEXPIRED)
            return;

        task->scheduleAfter(PART_CHECK_ERROR_SLEEP_MS);
    }
    catch (...)
    {
        tryLogCurrentException(log, __PRETTY_FUNCTION__);
        task->scheduleAfter(PART_CHECK_ERROR_SLEEP_MS);
    }
}

}