Try to print less scary messages in case of a No such key error for outdated parts

2024-11-23 08:02:02 +00:00 · 2022-09-14 17:25:58 +02:00 · 2022-09-14 17:25:58 +02:00 · ad279940d6
commit ad279940d6
parent c8dcd34abe
3 changed files with 40 additions and 5 deletions
--- a/src/IO/ReadBufferFromS3.cpp
+++ b/src/IO/ReadBufferFromS3.cpp
@@ -131,18 +131,18 @@ bool ReadBufferFromS3::nextImpl()
            ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds());
            break;
        }
-        catch (const Exception & e)
+        catch (Exception & e)
        {
            watch.stop();
            ProfileEvents::increment(ProfileEvents::ReadBufferFromS3Microseconds, watch.elapsedMicroseconds());
            ProfileEvents::increment(ProfileEvents::ReadBufferFromS3RequestsErrors, 1);

-            if (const auto * s3_exception = dynamic_cast<const S3Exception *>(&e))
+            if (auto * s3_exception = dynamic_cast<S3Exception *>(&e))
            {
                /// It doesn't make sense to retry Access Denied or No Such Key
                if (!s3_exception->isRetryableError())
                {
-                    tryLogCurrentException(log, fmt::format("while reading key: {}, from bucket: {}", key, bucket));
+                    s3_exception->addMessage("while reading key: {}, from bucket: {}", key, bucket);
                    throw;
                }
            }
--- a/src/IO/S3Common.h
+++ b/src/IO/S3Common.h
@@ -55,7 +55,7 @@ public:
    bool isRetryableError() const;

 private:
-    const Aws::S3::S3Errors code;
+    Aws::S3::S3Errors code;
 };
 }

--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -27,6 +27,7 @@
 #include <IO/Operators.h>
 #include <IO/ReadBufferFromMemory.h>
 #include <IO/WriteBufferFromString.h>
+#include <IO/S3Common.h>
 #include <Interpreters/Aggregator.h>
 #include <Interpreters/ExpressionAnalyzer.h>
 #include <Interpreters/PartLog.h>
@@ -956,6 +957,7 @@ void MergeTreeData::loadDataPartsFromDisk(
    /// Prepare data parts for parallel loading. Threads will focus on given disk first, then steal
    /// others' tasks when finish current disk part loading process.
    std::vector<std::vector<std::pair<String, DiskPtr>>> threads_parts(num_threads);
+    std::unordered_map<std::string, std::string> parts_broken_because_of_no_such_key;
    std::set<size_t> remaining_thread_parts;
    std::queue<size_t> threads_queue;
    for (size_t i = 0; i < num_threads; ++i)
@@ -1038,6 +1040,31 @@ void MergeTreeData::loadDataPartsFromDisk(
        {
            part->loadColumnsChecksumsIndexes(require_part_metadata, true);
        }
+#if USE_AWS_S3
+        /// This code looks really strange. Why can it happen? When we remove something from S3, we can receive different kinds of errors during
+        /// interaction with [Zoo]Keeper, S3 and even the local disk. In this way we can get into a situation where a part is partially removed (at least we
+        /// had an intention to remove it), but the server was restarted and now we are trying to load this partially removed part. It can throw an exception,
+        /// but if this part is actually covered by some other part, it's Ok and we should not react to it with an <Error> message.
+        ///
+        /// The only known case is related to ZooKeeper connection loss in zero-copy replication during part unlock from [Zoo]Keeper before removal.
+        catch (const S3Exception & e)
+        {
+            broken = true;
+            if (e.getS3ErrorCode() == Aws::S3::S3Errors::NO_SUCH_KEY)
+            {
+                {
+                    std::lock_guard loading_lock(mutex);
+                    parts_broken_because_of_no_such_key[part->name] = e.displayText();
+                }
+                LOG_WARNING(log, "Part {} on path {} is broken because of NO_SUCH_KEY error in S3. It's Ok if we had [Zoo]Keeper connection failures during part removal."
+                            " Will check that part is covered", part->name, part_path);
+            }
+            else
+            {
+                tryLogCurrentException(log, fmt::format("while loading part {} on path {}", part->name, part_path));
+            }
+        }
+#endif
        catch (const Exception & e)
        {
            /// Don't count the part as broken if there is not enough memory to load it.
@@ -1066,7 +1093,7 @@ void MergeTreeData::loadDataPartsFromDisk(
            }
            catch (...)
            {
-                tryLogCurrentException(log, fmt::format("while calculating part size {} on path {}", part->name, part_path));
+                tryLogCurrentException(log, fmt::format("while calculating part {} on path {}", part->name, part_path));
            }

            std::string part_size_str = "failed to calculate size";
@@ -1172,6 +1199,14 @@ void MergeTreeData::loadDataPartsFromDisk(
    if (has_lightweight_in_parts)
        has_lightweight_delete_parts.store(true);

+    for (const auto & [part_name, exception] : parts_broken_because_of_no_such_key)
+    {
+        if (getActiveContainingPart(part_name) == nullptr)
+            LOG_ERROR(log, "Part {} is broken because of NO_SUCH_KEY error and not covered by any part: {}", part_name, exception);
+        else
+            LOG_DEBUG(log, "Part {} was not completely removed (and NO_SUCH_KEY was thrown on part load), but covered by active part, it's Ok", part_name);
+    }
+
    if (suspicious_broken_parts > settings->max_suspicious_broken_parts && !skip_sanity_checks)
        throw Exception(ErrorCodes::TOO_MANY_UNEXPECTED_DATA_PARTS,
            "Suspiciously many ({} parts, {} in total) broken parts to remove while maximum allowed broken parts count is {}. You can change the maximum value "