Mirror of https://github.com/ClickHouse/ClickHouse.git
Trying to print less scary messages in case of No such key for outdated parts
Commit ad279940d6 (parent c8dcd34abe). The hunks below touch the S3 read buffer (ReadBufferFromS3::nextImpl), the S3Exception type, and MergeTree part loading (MergeTreeData::loadDataPartsFromDisk).
@@ -131,18 +131,18 @@ bool ReadBufferFromS3::nextImpl()
             ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds());
             break;
         }
-        catch (const Exception & e)
+        catch (Exception & e)
         {
             watch.stop();
             ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds());
             ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1);

-            if (const auto * s3_exception = dynamic_cast<const S3Exception *>(&e))
+            if (auto * s3_exception = dynamic_cast<S3Exception *>(&e))
             {
                 /// It doesn't make sense to retry Access Denied or No Such Key
                 if (!s3_exception->isRetryableError())
                 {
-                    tryLogCurrentException(log, fmt::format("while reading key: {}, from bucket: {}", key, bucket));
+                    s3_exception->addMessage("while reading key: {}, from bucket: {}", key, bucket);
                     throw;
                 }
             }
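
The hunk above changes how non-retryable S3 errors are reported: the exception is caught by non-const reference, and instead of logging the key/bucket context as a separate message before the rethrow, the context is appended to the exception itself via addMessage(), so the caller sees one enriched error. A minimal self-contained sketch of that pattern follows; the StorageException and readWithRetries names are made up for illustration and stand in for ClickHouse's S3Exception and read loop, and the sketch takes a plain string where the real addMessage() takes a format string with arguments.

#include <exception>
#include <iostream>
#include <string>

/// Illustrative stand-in for an S3-style exception: it carries a message, knows
/// whether a retry makes sense, and lets callers append context to the message
/// before rethrowing, roughly what addMessage() does in the hunk above.
class StorageException : public std::exception
{
public:
    StorageException(std::string msg, bool retryable) : message(std::move(msg)), retryable_error(retryable) {}

    bool isRetryableError() const { return retryable_error; }
    void addMessage(const std::string & extra) { message += ": " + extra; }
    const char * what() const noexcept override { return message.c_str(); }

private:
    std::string message;
    bool retryable_error;
};

/// Sketch of the retry loop pattern: transient errors are retried, non-retryable
/// ones get their context attached and are rethrown once.
void readWithRetries(int max_attempts)
{
    for (int attempt = 0; attempt < max_attempts; ++attempt)
    {
        try
        {
            /// ... issue the S3 GET here; simulate a permanent failure:
            throw StorageException("No such key", /*retryable=*/ false);
        }
        catch (StorageException & e)   /// non-const: we want to mutate the exception
        {
            if (!e.isRetryableError())
            {
                e.addMessage("while reading key: data.bin, from bucket: my-bucket");
                throw;                 /// rethrows the same, now annotated, object
            }
            /// retryable: fall through and try the next attempt
        }
    }
}

int main()
{
    try
    {
        readWithRetries(3);
    }
    catch (const StorageException & e)
    {
        std::cout << e.what() << "\n";  /// "No such key: while reading key: data.bin, ..."
    }
}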
@@ -55,7 +55,7 @@ public:
     bool isRetryableError() const;

 private:
-    const Aws::S3::S3Errors code;
+    Aws::S3::S3Errors code;
 };
 }

@@ -27,6 +27,7 @@
 #include <IO/Operators.h>
 #include <IO/ReadBufferFromMemory.h>
 #include <IO/WriteBufferFromString.h>
+#include <IO/S3Common.h>
 #include <Interpreters/Aggregator.h>
 #include <Interpreters/ExpressionAnalyzer.h>
 #include <Interpreters/PartLog.h>
@@ -956,6 +957,7 @@ void MergeTreeData::loadDataPartsFromDisk(
     /// Prepare data parts for parallel loading. Threads will focus on given disk first, then steal
     /// others' tasks when finish current disk part loading process.
     std::vector<std::vector<std::pair<String, DiskPtr>>> threads_parts(num_threads);
+    std::unordered_map<std::string, std::string> parts_broken_because_of_no_such_key;
     std::set<size_t> remaining_thread_parts;
     std::queue<size_t> threads_queue;
     for (size_t i = 0; i < num_threads; ++i)
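
The comment in this hunk describes the loading scheduler: each thread starts with the parts of "its" disk and, once that list is exhausted, steals work from other threads. A rough, self-contained sketch of that idea is below; it is not ClickHouse's actual threads_queue/remaining_thread_parts machinery, just the stealing pattern, with made-up task names and a single mutex over all lists.

#include <cstddef>
#include <iostream>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

int main()
{
    /// One task list per thread (e.g. one per disk), loosely mirroring threads_parts above.
    std::vector<std::vector<std::string>> task_lists{
        {"disk1/part_1", "disk1/part_2", "disk1/part_3"},
        {"disk2/part_4"},
        {"disk3/part_5", "disk3/part_6"}};

    std::mutex mutex;   /// protects all task lists once stealing starts

    auto worker = [&](size_t me)
    {
        while (true)
        {
            std::string task;
            {
                std::lock_guard lock(mutex);

                /// Prefer our own list; once it is empty, steal from any non-empty one.
                size_t source = me;
                if (task_lists[source].empty())
                {
                    source = task_lists.size();
                    for (size_t i = 0; i < task_lists.size(); ++i)
                    {
                        if (!task_lists[i].empty())
                        {
                            source = i;
                            break;
                        }
                    }
                    if (source == task_lists.size())
                        return;   /// nothing left anywhere, this worker is done
                }

                task = std::move(task_lists[source].back());
                task_lists[source].pop_back();
            }

            /// "Load" the part outside the lock.
            std::cout << "thread " << me << " loads " << task << "\n";
        }
    };

    std::vector<std::thread> pool;
    for (size_t i = 0; i < task_lists.size(); ++i)
        pool.emplace_back(worker, i);
    for (auto & t : pool)
        t.join();
}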
@@ -1038,6 +1040,31 @@ void MergeTreeData::loadDataPartsFromDisk(
         {
             part->loadColumnsChecksumsIndexes(require_part_metadata, true);
         }
+#if USE_AWS_S3
+        /// This code looks really strange. Why can it happen? When we remove something from S3 we can receive different kinds of errors during
+        /// interaction with [Zoo]Keeper, S3 and even local disk. In such way we can get into situation where part is partially removed (at least we
+        /// had an intention to remove it) but server was restarted and now we are trying to load this partially removed part. It can throw an exception
+        /// but if this part is actually covered by some other part it's Ok and we shouldn't react to it with <Error> message.
+        ///
+        /// The only known case is related to zookeeper connection loss in zero-copy replication during part unlock from [Zoo]Keeper before removal.
+        catch (const S3Exception & e)
+        {
+            broken = true;
+            if (e.getS3ErrorCode() == Aws::S3::S3Errors::NO_SUCH_KEY)
+            {
+                {
+                    std::lock_guard loading_lock(mutex);
+                    parts_broken_because_of_no_such_key[part->name] = e.displayText();
+                }
+                LOG_WARNING(log, "Part {} on path {} is broken because of NO_SUCH_KEY error in S3. It's Ok if we had [Zoo]Keeper connection failures during part removal."
+                            " Will check that part is covered", part->name, part_path);
+            }
+            else
+            {
+                tryLogCurrentException(log, fmt::format("while loading part {} on path {}", part->name, part_path));
+            }
+        }
+#endif
         catch (const Exception & e)
         {
             /// Don't count the part as broken if there is not enough memory to load it.
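
The new catch block above does not decide immediately whether a NO_SUCH_KEY failure is bad news: it only marks the part as broken and records the error text in parts_broken_because_of_no_such_key under the shared mutex (parts are loaded from several threads), leaving the verdict for later when the set of active parts is known. Below is a self-contained sketch of this collect-now, decide-later pattern; the NoSuchKeyError type, loadPart function, and part names are hypothetical.

#include <iostream>
#include <map>
#include <mutex>
#include <string>
#include <thread>
#include <vector>

/// Hypothetical error type and loader used only for this sketch: a part whose
/// objects were already deleted from storage fails with a "no such key" error.
struct NoSuchKeyError { std::string message; };

void loadPart(const std::string & name)
{
    if (name == "all_2_2_0")    /// pretend this part was partially removed earlier
        throw NoSuchKeyError{"NoSuchKey: object for part " + name + " is gone"};
}

int main()
{
    const std::vector<std::string> parts{"all_1_1_0", "all_2_2_0", "all_3_3_0"};

    std::mutex mutex;   /// protects the map below, filled from several loader threads
    std::map<std::string, std::string> broken_because_of_no_such_key;

    std::vector<std::thread> loaders;
    for (const auto & part : parts)
    {
        loaders.emplace_back([&, part]
        {
            try
            {
                loadPart(part);
            }
            catch (const NoSuchKeyError & e)
            {
                /// Only record the failure here; whether it deserves an error or
                /// merely a debug message is decided after all parts are loaded.
                std::lock_guard lock(mutex);
                broken_because_of_no_such_key[part] = e.message;
            }
        });
    }

    for (auto & t : loaders)
        t.join();

    for (const auto & [name, message] : broken_because_of_no_such_key)
        std::cout << "collected: " << name << " -> " << message << "\n";
}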
@@ -1066,7 +1093,7 @@ void MergeTreeData::loadDataPartsFromDisk(
         }
         catch (...)
         {
-            tryLogCurrentException(log, fmt::format("while calculating part size {} on path {}", part->name, part_path));
+            tryLogCurrentException(log, fmt::format("while calculating part {} on path {}", part->name, part_path));
         }

         std::string part_size_str = "failed to calculate size";
@@ -1172,6 +1199,14 @@ void MergeTreeData::loadDataPartsFromDisk(
     if (has_lightweight_in_parts)
         has_lightweight_delete_parts.store(true);

+    for (const auto & [part_name, exception] : parts_broken_because_of_no_such_key)
+    {
+        if (getActiveContainingPart(part_name) == nullptr)
+            LOG_ERROR(log, "Part {} is broken because of NO_SUCH_KEY error and not covered by any part: {}", part_name, exception);
+        else
+            LOG_DEBUG(log, "Part {} was not completely removed (and NO_SUCH_KEY was thrown on part load), but covered by active part, it's Ok", part_name);
+    }
+
     if (suspicious_broken_parts > settings->max_suspicious_broken_parts && !skip_sanity_checks)
         throw Exception(ErrorCodes::TOO_MANY_UNEXPECTED_DATA_PARTS,
             "Suspiciously many ({} parts, {} in total) broken parts to remove while maximum allowed broken parts count is {}. You can change the maximum value "
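
Once loading has finished, the loop added in the hunk above revisits the collected failures: if getActiveContainingPart() finds an active part covering the broken one, the data is still reachable and only a DEBUG message is emitted; otherwise it is a genuine ERROR. A minimal sketch of that decision step follows; activeContainingPart() here is a simplified stand-in driven by a precomputed coverage map rather than ClickHouse's part-name logic, and the part names are invented.

#include <iostream>
#include <map>
#include <optional>
#include <string>

/// Very simplified stand-in for getActiveContainingPart(): "coverage" is just a
/// lookup in a precomputed map from a broken part to the active part containing
/// its block range (ClickHouse derives this from part names and the active set).
std::optional<std::string> activeContainingPart(
    const std::map<std::string, std::string> & coverage, const std::string & part)
{
    auto it = coverage.find(part);
    if (it == coverage.end())
        return std::nullopt;
    return it->second;
}

int main()
{
    /// Parts that failed to load with NO_SUCH_KEY, collected during loading.
    const std::map<std::string, std::string> broken{
        {"all_2_2_0", "NoSuchKey while loading"},
        {"all_9_9_0", "NoSuchKey while loading"}};

    /// all_2_2_0 had been merged into all_1_3_1 before its removal was interrupted,
    /// so its data is still reachable through the covering part.
    const std::map<std::string, std::string> coverage{{"all_2_2_0", "all_1_3_1"}};

    for (const auto & [name, message] : broken)
    {
        if (auto covering = activeContainingPart(coverage, name))
            std::cout << "DEBUG: part " << name << " was not completely removed, "
                      << "but is covered by " << *covering << ", it's Ok\n";
        else
            std::cout << "ERROR: part " << name << " is broken and not covered by any part: "
                      << message << "\n";
    }
}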