Merge remote-tracking branch 'upstream/master' into git_analysis

2024-11-22 07:31:57 +00:00 · 2022-11-14 20:07:21 +00:00 · 2022-11-14 20:07:21 +00:00 · fcf8e5bfd0
commit fcf8e5bfd0
parent d557bece04 386b69acae
157 changed files with 3968 additions and 903 deletions
--- a/cmake/sanitize.cmake
+++ b/cmake/sanitize.cmake
@ -16,7 +16,9 @@ endmacro()

 if (SANITIZE)
    if (SANITIZE STREQUAL "address")
-        set (ASAN_FLAGS "-fsanitize=address -fsanitize-address-use-after-scope")
+        # LLVM-15 has a bug in Address Sanitizer, preventing the usage of 'sanitize-address-use-after-scope',
+        # see https://github.com/llvm/llvm-project/issues/58633
+        set (ASAN_FLAGS "-fsanitize=address -fno-sanitize-address-use-after-scope")
        set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}")
        set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${SAN_FLAGS} ${ASAN_FLAGS}")

--- a/docker/server/Dockerfile.ubuntu
+++ b/docker/server/Dockerfile.ubuntu
@ -80,6 +80,16 @@ RUN arch=${TARGETARCH:-amd64} \
    && mkdir -p /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client \
    && chmod ugo+Xrw -R /var/lib/clickhouse /var/log/clickhouse-server /etc/clickhouse-server /etc/clickhouse-client

+# Remove as much of Ubuntu as possible.
+# ClickHouse does not need Ubuntu. It can run on top of Linux kernel without any OS distribution.
+# ClickHouse does not need Docker at all. ClickHouse is above all that.
+# It does not care about Ubuntu, Docker, or other cruft, and neither should you.
+# The fact that this Docker image is based on Ubuntu is just a misconception.
+# Some vulnerability scanners are arguing about Ubuntu, which is not relevant to ClickHouse at all.
+# ClickHouse does not care when you report false vulnerabilities by running some Docker scanners.
+
+RUN apt-get remove --purge -y libksba8 && apt-get autoremove -y
+
 # we need to allow "others" access to clickhouse folder, because docker container
 # can be started with arbitrary uid (openshift usecase)

--- a/docs/en/operations/settings/permissions-for-queries.md
+++ b/docs/en/operations/settings/permissions-for-queries.md
@ -16,44 +16,54 @@ Queries in ClickHouse can be divided into several types:

 The following settings regulate user permissions by the type of query:

-   [readonly](#settings_readonly) — Restricts permissions for all types of queries except DDL queries.
-   [allow_ddl](#settings_allow_ddl) — Restricts permissions for DDL queries.
+## readonly
+Restricts permissions for queries that read data, write data, or change settings.

-`KILL QUERY` can be performed with any settings.
+When set to 1, allows:

-## readonly {#settings_readonly}
+-   All types of read queries (like SELECT and equivalent queries).
+-   Queries that modify only session context (like USE).

-Restricts permissions for reading data, write data and change settings queries.
+When set to 2, allows the above plus:
+- SET and CREATE TEMPORARY TABLE

-See how the queries are divided into types [above](#permissions_for_queries).
+  :::tip
+  Queries like EXISTS, DESCRIBE, EXPLAIN, SHOW PROCESSLIST, etc. are equivalent to SELECT, because they only select from system tables.
+  :::

 Possible values:

-   0 — All queries are allowed.
-   1 — Only read data queries are allowed.
-   2 — Read data and change settings queries are allowed.
+-   0 — Read, Write, and Change settings queries are allowed.
+-   1 — Only Read data queries are allowed.
+-   2 — Read data and Change settings queries are allowed.

+Default value: 0
+
+:::note
 After setting `readonly = 1`, the user can’t change `readonly` and `allow_ddl` settings in the current session.

 When using the `GET` method in the [HTTP interface](../../interfaces/http.md), `readonly = 1` is set automatically. To modify data, use the `POST` method.

-Setting `readonly = 1` prohibit the user from changing all the settings. There is a way to prohibit the user from changing only specific settings. Also there is a way to allow changing only specific settings under `readonly = 1` restrictions. For details see [constraints on settings](../../operations/settings/constraints-on-settings.md).
+Setting `readonly = 1` prohibits the user from changing settings. There is a way to prohibit the user from changing only specific settings. Also there is a way to allow changing only specific settings under `readonly = 1` restrictions. For details see [constraints on settings](../../operations/settings/constraints-on-settings.md).
+:::

-Default value: 0

 ## allow_ddl {#settings_allow_ddl}

 Allows or denies [DDL](https://en.wikipedia.org/wiki/Data_definition_language) queries.

-See how the queries are divided into types [above](#permissions_for_queries).
-
 Possible values:

 -   0 — DDL queries are not allowed.
 -   1 — DDL queries are allowed.

-You can’t execute `SET allow_ddl = 1` if `allow_ddl = 0` for the current session.
-
 Default value: 1

-[Original article](https://clickhouse.com/docs/en/operations/settings/permissions_for_queries/) <!--hide-->
+:::note
+You cannot run `SET allow_ddl = 1` if `allow_ddl = 0` for the current session.
+:::
+
+
+:::note KILL QUERY
+`KILL QUERY` can be performed with any combination of readonly and allow_ddl settings.
+:::
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -3399,6 +3399,17 @@ Use schema from cache for URL with last modification time validation (for urls w

 Default value: `true`.

+## use_structure_from_insertion_table_in_table_functions {#use_structure_from_insertion_table_in_table_functions}
+
+Use structure from insertion table instead of schema inference from data.
+
+Possible values:
+- 0 - disabled
+- 1 - enabled
+- 2 - auto
+
+Default value: 2.
+
 ## compatibility {#compatibility}

 This setting changes other settings according to provided ClickHouse version.
--- a/programs/client/Client.cpp
+++ b/programs/client/Client.cpp
@ -243,6 +243,7 @@ try
    registerAggregateFunctions();

    processConfig();
+    initTtyBuffer(toProgressOption(config().getString("progress", "default")));

    /// Includes delayed_interactive.
    if (is_interactive)
@ -1088,8 +1089,6 @@ void Client::processConfig()
    }
    else
    {
-        std::string progress = config().getString("progress", "tty");
-        need_render_progress = (Poco::icompare(progress, "off") && Poco::icompare(progress, "no") && Poco::icompare(progress, "false") && Poco::icompare(progress, "0"));
        echo_queries = config().getBool("echo", false);
        ignore_error = config().getBool("ignore-error", false);

--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@ -37,6 +37,7 @@
 #include <AggregateFunctions/registerAggregateFunctions.h>
 #include <TableFunctions/registerTableFunctions.h>
 #include <Storages/registerStorages.h>
+#include <Storages/NamedCollections.h>
 #include <Dictionaries/registerDictionaries.h>
 #include <Disks/registerDisks.h>
 #include <Formats/registerFormats.h>
@ -118,6 +119,8 @@ void LocalServer::initialize(Poco::Util::Application & self)
        config().getUInt("max_io_thread_pool_size", 100),
        config().getUInt("max_io_thread_pool_free_size", 0),
        config().getUInt("io_thread_pool_queue_size", 10000));
+
+    NamedCollectionFactory::instance().initialize(config());
 }


@ -414,6 +417,8 @@ try
    registerFormats();

    processConfig();
+    initTtyBuffer(toProgressOption(config().getString("progress", "default")));
+
    applyCmdSettings(global_context);

    if (is_interactive)
@ -489,8 +494,6 @@ void LocalServer::processConfig()
    }
    else
    {
-        std::string progress = config().getString("progress", "tty");
-        need_render_progress = (Poco::icompare(progress, "off") && Poco::icompare(progress, "no") && Poco::icompare(progress, "false") && Poco::icompare(progress, "0"));
        echo_queries = config().hasOption("echo") || config().hasOption("verbose");
        ignore_error = config().getBool("ignore-error", false);
        is_multiquery = true;
--- a/programs/server/MetricsTransmitter.cpp
+++ b/programs/server/MetricsTransmitter.cpp
@ -123,7 +123,7 @@ void MetricsTransmitter::transmit(std::vector<ProfileEvents::Count> & prev_count
    {
        for (const auto & name_value : async_metrics_values)
        {
-            key_vals.emplace_back(asynchronous_metrics_path_prefix + name_value.first, name_value.second);
+            key_vals.emplace_back(asynchronous_metrics_path_prefix + name_value.first, name_value.second.value);
        }
    }

--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -60,6 +60,7 @@
 #include <Storages/System/attachInformationSchemaTables.h>
 #include <Storages/Cache/ExternalDataSourceCache.h>
 #include <Storages/Cache/registerRemoteFileMetadatas.h>
+#include <Storages/NamedCollections.h>
 #include <AggregateFunctions/registerAggregateFunctions.h>
 #include <Functions/UserDefined/IUserDefinedSQLObjectsLoader.h>
 #include <Functions/registerFunctions.h>
@ -732,6 +733,8 @@ int Server::main(const std::vector<std::string> & /*args*/)
        config().getUInt("max_io_thread_pool_free_size", 0),
        config().getUInt("io_thread_pool_queue_size", 10000));

+    NamedCollectionFactory::instance().initialize(config());
+
    /// Initialize global local cache for remote filesystem.
    if (config().has("local_cache_for_remote_fs"))
    {
@ -1279,6 +1282,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
 #if USE_SSL
            CertificateReloader::instance().tryLoad(*config);
 #endif
+            NamedCollectionFactory::instance().reload(*config);
            ProfileEvents::increment(ProfileEvents::MainConfigLoads);

            /// Must be the last.
@ -1486,11 +1490,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
 #endif

    SCOPE_EXIT({
-        /// Stop reloading of the main config. This must be done before `global_context->shutdown()` because
-        /// otherwise the reloading may pass a changed config to some destroyed parts of ContextSharedPart.
-        main_config_reloader.reset();
-        access_control.stopPeriodicReloading();
-
        async_metrics.stop();

        /** Ask to cancel background jobs all table engines,
@ -1789,10 +1788,17 @@ int Server::main(const std::vector<std::string> & /*args*/)

        SCOPE_EXIT_SAFE({
            LOG_DEBUG(log, "Received termination signal.");
-            LOG_DEBUG(log, "Waiting for current connections to close.");
+
+            /// Stop reloading of the main config. This must be done before everything else because it
+            /// can try to access/modify already deleted objects.
+            /// E.g. it can recreate new servers or it may pass a changed config to some destroyed parts of ContextSharedPart.
+            main_config_reloader.reset();
+            access_control.stopPeriodicReloading();

            is_cancelled = true;

+            LOG_DEBUG(log, "Waiting for current connections to close.");
+
            size_t current_connections = 0;
            {
                std::lock_guard lock(servers_lock);
--- a/src/Access/Common/AccessType.h
+++ b/src/Access/Common/AccessType.h
@ -130,6 +130,7 @@ enum class AccessType
    M(SHOW_ROW_POLICIES, "SHOW POLICIES, SHOW CREATE ROW POLICY, SHOW CREATE POLICY", TABLE, SHOW_ACCESS) \
    M(SHOW_QUOTAS, "SHOW CREATE QUOTA", GLOBAL, SHOW_ACCESS) \
    M(SHOW_SETTINGS_PROFILES, "SHOW PROFILES, SHOW CREATE SETTINGS PROFILE, SHOW CREATE PROFILE", GLOBAL, SHOW_ACCESS) \
+    M(SHOW_NAMED_COLLECTIONS, "SHOW NAMED COLLECTIONS", GLOBAL, SHOW_ACCESS) \
    M(SHOW_ACCESS, "", GROUP, ACCESS_MANAGEMENT) \
    M(ACCESS_MANAGEMENT, "", GROUP, ALL) \
    \
--- a/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp
+++ b/src/Analyzer/Passes/NormalizeCountVariantsPass.cpp
@ -33,18 +33,27 @@ public:

        if (function_node->getFunctionName() == "count" && !first_argument_constant_literal.isNull())
        {
+            resolveAsCountAggregateFunction(*function_node);
            function_node->getArguments().getNodes().clear();
        }
-        else if (function_node->getFunctionName() == "sum" && first_argument_constant_literal.getType() == Field::Types::UInt64 &&
+        else if (function_node->getFunctionName() == "sum" &&
+            first_argument_constant_literal.getType() == Field::Types::UInt64 &&
            first_argument_constant_literal.get<UInt64>() == 1)
        {
-            auto result_type = function_node->getResultType();
-            AggregateFunctionProperties properties;
-            auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties);
-            function_node->resolveAsAggregateFunction(std::move(aggregate_function), std::move(result_type));
+            resolveAsCountAggregateFunction(*function_node);
            function_node->getArguments().getNodes().clear();
        }
    }
+private:
+    static inline void resolveAsCountAggregateFunction(FunctionNode & function_node)
+    {
+        auto function_result_type = function_node.getResultType();
+
+        AggregateFunctionProperties properties;
+        auto aggregate_function = AggregateFunctionFactory::instance().get("count", {}, {}, properties);
+
+        function_node.resolveAsAggregateFunction(std::move(aggregate_function), std::move(function_result_type));
+    }
 };

 }
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@ -119,22 +119,27 @@ namespace ProfileEvents
 namespace DB
 {

+ProgressOption toProgressOption(std::string progress)
+{
+    boost::to_upper(progress);
+
+    if (progress == "OFF" || progress == "FALSE" || progress == "0" || progress == "NO")
+        return ProgressOption::OFF;
+    if (progress == "TTY" || progress == "ON" || progress == "TRUE" || progress == "1" || progress == "YES")
+        return ProgressOption::TTY;
+    if (progress == "ERR")
+        return ProgressOption::ERR;
+    if (progress == "DEFAULT")
+        return ProgressOption::DEFAULT;
+
+    throw boost::program_options::validation_error(boost::program_options::validation_error::invalid_option_value);
+}
+
 std::istream& operator>> (std::istream & in, ProgressOption & progress)
 {
    std::string token;
    in >> token;
-
-    boost::to_upper(token);
-
-    if (token == "OFF" || token == "FALSE" || token == "0" || token == "NO")
-        progress = ProgressOption::OFF;
-    else if (token == "TTY" || token == "ON" || token == "TRUE" || token == "1" || token == "YES")
-        progress = ProgressOption::TTY;
-    else if (token == "ERR")
-        progress = ProgressOption::ERR;
-    else
-        throw boost::program_options::validation_error(boost::program_options::validation_error::invalid_option_value);
-
+    progress = toProgressOption(token);
    return in;
 }

@ -662,56 +667,62 @@ void ClientBase::initLogsOutputStream()
    }
 }

-void ClientBase::initTtyBuffer(bool to_err)
+void ClientBase::initTtyBuffer(ProgressOption progress)
 {
-    if (!tty_buf)
+    if (tty_buf)
+        return;
+
+    if (progress == ProgressOption::OFF || (!is_interactive && progress == ProgressOption::DEFAULT))
    {
-        static constexpr auto tty_file_name = "/dev/tty";
+         need_render_progress = false;
+         return;
+    }

-        /// Output all progress bar commands to terminal at once to avoid flicker.
-        /// This size is usually greater than the window size.
-        static constexpr size_t buf_size = 1024;
+    static constexpr auto tty_file_name = "/dev/tty";

-        if (!to_err)
+    /// Output all progress bar commands to terminal at once to avoid flicker.
+    /// This size is usually greater than the window size.
+    static constexpr size_t buf_size = 1024;
+
+    if (is_interactive || progress == ProgressOption::TTY)
+    {
+        std::error_code ec;
+        std::filesystem::file_status tty = std::filesystem::status(tty_file_name, ec);
+
+        if (!ec && exists(tty) && is_character_file(tty)
+            && (tty.permissions() & std::filesystem::perms::others_write) != std::filesystem::perms::none)
        {
-            std::error_code ec;
-            std::filesystem::file_status tty = std::filesystem::status(tty_file_name, ec);
-
-            if (!ec && exists(tty) && is_character_file(tty)
-                && (tty.permissions() & std::filesystem::perms::others_write) != std::filesystem::perms::none)
+            try
            {
-                try
-                {
-                    tty_buf = std::make_unique<WriteBufferFromFile>(tty_file_name, buf_size);
+                tty_buf = std::make_unique<WriteBufferFromFile>(tty_file_name, buf_size);

-                    /// It is possible that the terminal file has writeable permissions
-                    /// but we cannot write anything there. Check it with invisible character.
-                    tty_buf->write('\0');
-                    tty_buf->next();
+                /// It is possible that the terminal file has writeable permissions
+                /// but we cannot write anything there. Check it with invisible character.
+                tty_buf->write('\0');
+                tty_buf->next();

-                    return;
-                }
-                catch (const Exception & e)
-                {
-                    if (tty_buf)
-                        tty_buf.reset();
+                return;
+            }
+            catch (const Exception & e)
+            {
+                if (tty_buf)
+                    tty_buf.reset();

-                    if (e.code() != ErrorCodes::CANNOT_OPEN_FILE)
-                        throw;
+                if (e.code() != ErrorCodes::CANNOT_OPEN_FILE)
+                    throw;

-                    /// It is normal if file exists, indicated as writeable but still cannot be opened.
-                    /// Fallback to other options.
-                }
+                /// It is normal if file exists, indicated as writeable but still cannot be opened.
+                /// Fallback to other options.
            }
        }
-
-        if (stderr_is_a_tty)
-        {
-            tty_buf = std::make_unique<WriteBufferFromFileDescriptor>(STDERR_FILENO, buf_size);
-        }
-        else
-            need_render_progress = false;
    }
+
+    if (stderr_is_a_tty || progress == ProgressOption::ERR)
+    {
+        tty_buf = std::make_unique<WriteBufferFromFileDescriptor>(STDERR_FILENO, buf_size);
+    }
+    else
+        need_render_progress = false;
 }

 void ClientBase::updateSuggest(const ASTPtr & ast)
@ -2324,7 +2335,7 @@ void ClientBase::init(int argc, char ** argv)
        ("stage", po::value<std::string>()->default_value("complete"), "Request query processing up to specified stage: complete,fetch_columns,with_mergeable_state,with_mergeable_state_after_aggregation,with_mergeable_state_after_aggregation_and_limit")
        ("query_kind", po::value<std::string>()->default_value("initial_query"), "One of initial_query/secondary_query/no_query")
        ("query_id", po::value<std::string>(), "query_id")
-        ("progress", po::value<ProgressOption>()->implicit_value(ProgressOption::TTY, "tty")->default_value(ProgressOption::TTY, "tty"), "Print progress of queries execution - to TTY (default): tty|on|1|true|yes; to STDERR: err; OFF: off|0|false|no")
+        ("progress", po::value<ProgressOption>()->implicit_value(ProgressOption::TTY, "tty")->default_value(ProgressOption::DEFAULT, "default"), "Print progress of queries execution - to TTY: tty|on|1|true|yes; to STDERR non-interactive mode: err; OFF: off|0|false|no; DEFAULT - interactive to TTY, non-interactive is off")

        ("disable_suggestion,A", "Disable loading suggestion data. Note that suggestion data is loaded asynchronously through a second connection to ClickHouse server. Also it is reasonable to disable suggestion if you want to paste a query with TAB characters. Shorthand option -A is for those who get used to mysql client.")
        ("time,t", "print query execution time to stderr in non-interactive mode (for benchmarks)")
@ -2379,11 +2390,6 @@ void ClientBase::init(int argc, char ** argv)
    parseAndCheckOptions(options_description, options, common_arguments);
    po::notify(options);

-    if (options["progress"].as<ProgressOption>() == ProgressOption::OFF)
-        need_render_progress = false;
-    else
-        initTtyBuffer(options["progress"].as<ProgressOption>() == ProgressOption::ERR);
-
    if (options.count("version") || options.count("V"))
    {
        showClientVersion();
@ -2437,6 +2443,9 @@ void ClientBase::init(int argc, char ** argv)
    {
        switch (options["progress"].as<ProgressOption>())
        {
+            case DEFAULT:
+                config().setString("progress", "default");
+                break;
            case OFF:
                config().setString("progress", "off");
                break;
--- a/src/Client/ClientBase.h
+++ b/src/Client/ClientBase.h
@ -38,10 +38,12 @@ enum MultiQueryProcessingStage

 enum ProgressOption
 {
+    DEFAULT,
    OFF,
    TTY,
    ERR,
 };
+ProgressOption toProgressOption(std::string progress);
 std::istream& operator>> (std::istream & in, ProgressOption & progress);

 void interruptSignalHandler(int signum);
@ -153,7 +155,6 @@ private:

    void initOutputFormat(const Block & block, ASTPtr parsed_query);
    void initLogsOutputStream();
-    void initTtyBuffer(bool to_err = false);

    String prompt() const;

@ -168,6 +169,8 @@ protected:
    static bool isSyncInsertWithData(const ASTInsertQuery & insert_query, const ContextPtr & context);
    bool processMultiQueryFromFile(const String & file_name);

+    void initTtyBuffer(ProgressOption progress);
+
    bool is_interactive = false; /// Use either interactive line editing interface or batch mode.
    bool is_multiquery = false;
    bool delayed_interactive = false;
--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@ -637,6 +637,8 @@
    M(666, CANNOT_USE_CACHE) \
    M(667, NOT_INITIALIZED) \
    M(668, INVALID_STATE) \
+    M(669, UNKNOWN_NAMED_COLLECTION) \
+    M(670, NAMED_COLLECTION_ALREADY_EXISTS) \
    \
    M(999, KEEPER_EXCEPTION) \
    M(1000, POCO_EXCEPTION) \
--- a/src/Common/Exception.h
+++ b/src/Common/Exception.h
@ -12,6 +12,7 @@

 #include <fmt/format.h>

+
 namespace Poco { class Logger; }


--- a/src/Common/ZooKeeper/TestKeeper.h
+++ b/src/Common/ZooKeeper/TestKeeper.h
@ -34,7 +34,7 @@ using TestKeeperRequestPtr = std::shared_ptr<TestKeeperRequest>;
 class TestKeeper final : public IKeeper
 {
 public:
-    TestKeeper(const zkutil::ZooKeeperArgs & args_);
+    explicit TestKeeper(const zkutil::ZooKeeperArgs & args_);
    ~TestKeeper() override;

    bool isExpired() const override { return expired; }
--- a/src/Common/ZooKeeper/ZooKeeper.h
+++ b/src/Common/ZooKeeper/ZooKeeper.h
@ -156,7 +156,7 @@ public:
    using Ptr = std::shared_ptr<ZooKeeper>;
    using ErrorsList = std::initializer_list<Coordination::Error>;

-    ZooKeeper(const ZooKeeperArgs & args_, std::shared_ptr<DB::ZooKeeperLog> zk_log_ = nullptr);
+    explicit ZooKeeper(const ZooKeeperArgs & args_, std::shared_ptr<DB::ZooKeeperLog> zk_log_ = nullptr);

    /** Config of the form:
        <zookeeper>
--- a/src/Core/MySQL/MySQLReplication.cpp
+++ b/src/Core/MySQL/MySQLReplication.cpp
@ -116,7 +116,8 @@ namespace MySQLReplication
            if (!query.starts_with("XA COMMIT"))
                transaction_complete = false;
        }
-        else if (query.starts_with("SAVEPOINT"))
+        else if (query.starts_with("SAVEPOINT") || query.starts_with("ROLLBACK")
+                 || query.starts_with("RELEASE SAVEPOINT"))
        {
            typ = QUERY_SAVEPOINT;
        }
@ -941,6 +942,7 @@ namespace MySQLReplication
                {
                    case QUERY_EVENT_MULTI_TXN_FLAG:
                    case QUERY_EVENT_XA:
+                    /// Ignore queries that have no impact on the data.
                    case QUERY_SAVEPOINT:
                    {
                        event = std::make_shared<DryRunEvent>(std::move(query->header));
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -614,7 +614,7 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    \
    M(Bool, load_marks_asynchronously, false, "Load MergeTree marks asynchronously", 0) \
    \
-    M(Bool, use_structure_from_insertion_table_in_table_functions, false, "Use structure from insertion table instead of schema inference from data", 0) \
+    M(UInt64, use_structure_from_insertion_table_in_table_functions, 2, "Use structure from insertion table instead of schema inference from data. Possible values: 0 - disabled, 1 - enabled, 2 - auto", 0) \
    \
    M(UInt64, http_max_tries, 10, "Max attempts to read via http.", 0) \
    M(UInt64, http_retry_initial_backoff_ms, 100, "Min milliseconds for backoff, when retrying read via http", 0) \
@ -659,6 +659,11 @@ static constexpr UInt64 operator""_GiB(unsigned long long value)
    M(Bool, multiple_joins_try_to_keep_original_names, false, "Do not add aliases to top level expression list on multiple joins rewrite", 0) \
    M(Bool, optimize_distinct_in_order, true, "Enable DISTINCT optimization if some columns in DISTINCT form a prefix of sorting. For example, prefix of sorting key in merge tree or ORDER BY statement", 0) \
    M(Bool, optimize_sorting_by_input_stream_properties, true, "Optimize sorting by sorting properties of input stream", 0) \
+    M(UInt64, insert_keeper_max_retries, 0, "Max retries for keeper operations during insert", 0) \
+    M(UInt64, insert_keeper_retry_initial_backoff_ms, 100, "Initial backoff timeout for keeper operations during insert", 0) \
+    M(UInt64, insert_keeper_retry_max_backoff_ms, 10000, "Max backoff timeout for keeper operations during insert", 0) \
+    M(Float, insert_keeper_fault_injection_probability, 0.0f, "Approximate probability of failure for a keeper request during insert. Valid value is in interval [0.0f, 1.0f]", 0) \
+    M(UInt64, insert_keeper_fault_injection_seed, 0, "0 - random seed, otherwise the setting value", 0) \
    // End of COMMON_SETTINGS
    // Please add settings related to formats into the FORMAT_FACTORY_SETTINGS and move obsolete settings to OBSOLETE_SETTINGS.

--- a/src/Core/SettingsChangesHistory.h
+++ b/src/Core/SettingsChangesHistory.h
@ -78,6 +78,7 @@ namespace SettingsChangesHistory
 /// It's used to implement `compatibility` setting (see https://github.com/ClickHouse/ClickHouse/issues/35972)
 static std::map<ClickHouseVersion, SettingsChangesHistory::SettingsChanges> settings_changes_history =
 {
+        {"22.11", {{"use_structure_from_insertion_table_in_table_functions", 0, 2, "Improve using structure from insertion table in table functions"}}},
        {"22.9", {{"force_grouping_standard_compatibility", false, true, "Make GROUPING function output the same as in SQL standard and other DBMS"}}},
        {"22.7", {{"cross_to_inner_join_rewrite", 1, 2, "Force rewrite comma join to inner"},
                  {"enable_positional_arguments", false, true, "Enable positional arguments feature by default"},
--- a/src/DataTypes/convertMySQLDataType.cpp
+++ b/src/DataTypes/convertMySQLDataType.cpp
@ -55,7 +55,7 @@ DataTypePtr convertMySQLDataType(MultiEnum<MySQLDataTypesSupport> type_support,
        else
            res = std::make_shared<DataTypeInt16>();
    }
-    else if (type_name == "int" || type_name == "mediumint")
+    else if (type_name == "int" || type_name == "mediumint" || type_name == "integer")
    {
        if (is_unsigned)
            res = std::make_shared<DataTypeUInt32>();
--- a/src/Formats/EscapingRuleUtils.cpp
+++ b/src/Formats/EscapingRuleUtils.cpp
@ -464,6 +464,9 @@ bool tryInferDate(const std::string_view & field)

 bool tryInferDateTime(const std::string_view & field, const FormatSettings & settings)
 {
+    if (field.empty())
+        return false;
+
    ReadBufferFromString buf(field);
    Float64 tmp_float;
    /// Check if it's just a number, and if so, don't try to infer DateTime from it,
--- a/src/Functions/FunctionsConversion.h
+++ b/src/Functions/FunctionsConversion.h
@ -2828,6 +2828,31 @@ private:
        };
    }

+#define GENERATE_INTERVAL_CASE(INTERVAL_KIND) \
+            case IntervalKind::INTERVAL_KIND: \
+                return createFunctionAdaptor(FunctionConvert<DataTypeInterval, NameToInterval##INTERVAL_KIND, PositiveMonotonicity>::create(), from_type);
+
+    static WrapperType createIntervalWrapper(const DataTypePtr & from_type, IntervalKind kind)
+    {
+        switch (kind)
+        {
+            GENERATE_INTERVAL_CASE(Nanosecond)
+            GENERATE_INTERVAL_CASE(Microsecond)
+            GENERATE_INTERVAL_CASE(Millisecond)
+            GENERATE_INTERVAL_CASE(Second)
+            GENERATE_INTERVAL_CASE(Minute)
+            GENERATE_INTERVAL_CASE(Hour)
+            GENERATE_INTERVAL_CASE(Day)
+            GENERATE_INTERVAL_CASE(Week)
+            GENERATE_INTERVAL_CASE(Month)
+            GENERATE_INTERVAL_CASE(Quarter)
+            GENERATE_INTERVAL_CASE(Year)
+        }
+        throw Exception{ErrorCodes::CANNOT_CONVERT_TYPE, "Conversion to unexpected IntervalKind: {}", kind.toString()};
+    }
+
+#undef GENERATE_INTERVAL_CASE
+
    template <typename ToDataType>
    requires IsDataTypeDecimal<ToDataType>
    WrapperType createDecimalWrapper(const DataTypePtr & from_type, const ToDataType * to_type, bool requested_result_is_nullable) const
@ -3853,6 +3878,8 @@ private:
                return createObjectWrapper(from_type, checkAndGetDataType<DataTypeObject>(to_type.get()));
            case TypeIndex::AggregateFunction:
                return createAggregateFunctionWrapper(from_type, checkAndGetDataType<DataTypeAggregateFunction>(to_type.get()));
+            case TypeIndex::Interval:
+                return createIntervalWrapper(from_type, checkAndGetDataType<DataTypeInterval>(to_type.get())->getKind());
            default:
                break;
        }
--- a/src/Functions/UTCTimestamp.cpp
+++ b/src/Functions/UTCTimestamp.cpp
@ -0,0 +1,125 @@
+#include <DataTypes/DataTypeDateTime.h>
+
+#include <Functions/IFunction.h>
+#include <Core/DecimalFunctions.h>
+#include <Functions/FunctionFactory.h>
+#include <Core/Field.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
+}
+
+namespace
+{
+
+/// Get the UTC time. (It is a constant, it is evaluated once for the entire query.)
+class ExecutableFunctionUTCTimestamp : public IExecutableFunction
+{
+public:
+    explicit ExecutableFunctionUTCTimestamp(time_t time_) : time_value(time_) {}
+
+    String getName() const override { return "UTCTimestamp"; }
+
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName &, const DataTypePtr &, size_t input_rows_count) const override
+    {
+        return DataTypeDateTime().createColumnConst(
+                input_rows_count,
+                static_cast<UInt64>(time_value));
+    }
+
+private:
+    time_t time_value;
+};
+
+class FunctionBaseUTCTimestamp : public IFunctionBase
+{
+public:
+    explicit FunctionBaseUTCTimestamp(time_t time_, DataTypes argument_types_, DataTypePtr return_type_)
+        : time_value(time_), argument_types(std::move(argument_types_)), return_type(std::move(return_type_)) {}
+
+    String getName() const override { return "UTCTimestamp"; }
+
+    const DataTypes & getArgumentTypes() const override
+    {
+        return argument_types;
+    }
+
+    const DataTypePtr & getResultType() const override
+    {
+        return return_type;
+    }
+
+    ExecutableFunctionPtr prepare(const ColumnsWithTypeAndName &) const override
+    {
+        return std::make_unique<ExecutableFunctionUTCTimestamp>(time_value);
+    }
+
+    bool isDeterministic() const override { return false; }
+    bool isDeterministicInScopeOfQuery() const override { return true; }
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return false; }
+
+private:
+    time_t time_value;
+    DataTypes argument_types;
+    DataTypePtr return_type;
+};
+
+class UTCTimestampOverloadResolver : public IFunctionOverloadResolver
+{
+public:
+    static constexpr auto name = "UTCTimestamp";
+
+    String getName() const override { return name; }
+
+    bool isDeterministic() const override { return false; }
+
+    bool isVariadic() const override { return false; }
+
+    size_t getNumberOfArguments() const override { return 0; }
+    static FunctionOverloadResolverPtr create(ContextPtr) { return std::make_unique<UTCTimestampOverloadResolver>(); }
+
+    DataTypePtr getReturnTypeImpl(const ColumnsWithTypeAndName & arguments) const override
+    {
+        if (!arguments.empty())
+        {
+            throw Exception("Arguments size of function " + getName() + " should be 0", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
+        }
+
+        return std::make_shared<DataTypeDateTime>();
+    }
+
+    FunctionBasePtr buildImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &) const override
+    {
+        if (!arguments.empty())
+        {
+            throw Exception("Arguments size of function " + getName() + " should be 0", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH);
+        }
+
+        return std::make_unique<FunctionBaseUTCTimestamp>(time(nullptr), DataTypes(), std::make_shared<DataTypeDateTime>("UTC"));
+    }
+};
+
+}
+
+/// UTC_timestamp for MySQL interface support
+REGISTER_FUNCTION(UTCTimestamp)
+{
+    factory.registerFunction<UTCTimestampOverloadResolver>({
+        R"(
+Returns the current date and time at the moment of query analysis. The function is a constant expression.
+Same as `now('UTC')`. Was added only for MySQL support. `now` is preferred.
+
+Example:
+[example:typical]
+)",
+    Documentation::Examples{
+        {"typical", "SELECT UTCTimestamp();"}},
+    Documentation::Categories{"Dates and Times"}}, FunctionFactory::CaseInsensitive);
+    factory.registerAlias("UTC_timestamp", UTCTimestampOverloadResolver::name, FunctionFactory::CaseInsensitive);
+}
+
+}
--- a/src/Functions/dateDiff.cpp
+++ b/src/Functions/dateDiff.cpp
@ -34,6 +34,164 @@ namespace ErrorCodes
 namespace
 {

+/// Shared column-dispatch implementation for "date difference" functions.
+/// For every row it applies `Transform` to both arguments and stores
+/// `Transform(y) - Transform(x)` as Int64 (see calculate()).
+/// The object keeps only the name of the owning function, which is used in exception messages.
+class DateDiffImpl
+{
+public:
+    using ColumnDateTime64 = ColumnDecimal<DateTime64>;
+
+    explicit DateDiffImpl(const String & name_) : name(name_) {}
+
+    /// Entry point. Detects the concrete type of the first argument `x`
+    /// (a full or a constant column of Date, DateTime, Date32 or DateTime64)
+    /// and forwards to the matching dispatcher for the second argument `y`.
+    /// `result` is written by index and is never resized here, so it must already hold one element per row.
+    /// Throws ILLEGAL_COLUMN if `x` is none of the supported column types.
+    template <typename Transform>
+    void dispatchForColumns(
+        const IColumn & x, const IColumn & y,
+        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
+        ColumnInt64::Container & result) const
+    {
+        if (const auto * x_vec_16 = checkAndGetColumn<ColumnDate>(&x))
+            dispatchForSecondColumn<Transform>(*x_vec_16, y, timezone_x, timezone_y, result);
+        else if (const auto * x_vec_32 = checkAndGetColumn<ColumnDateTime>(&x))
+            dispatchForSecondColumn<Transform>(*x_vec_32, y, timezone_x, timezone_y, result);
+        else if (const auto * x_vec_32_s = checkAndGetColumn<ColumnDate32>(&x))
+            dispatchForSecondColumn<Transform>(*x_vec_32_s, y, timezone_x, timezone_y, result);
+        else if (const auto * x_vec_64 = checkAndGetColumn<ColumnDateTime64>(&x))
+            dispatchForSecondColumn<Transform>(*x_vec_64, y, timezone_x, timezone_y, result);
+        else if (const auto * x_const_16 = checkAndGetColumnConst<ColumnDate>(&x))
+            dispatchConstForSecondColumn<Transform>(x_const_16->getValue<UInt16>(), y, timezone_x, timezone_y, result);
+        else if (const auto * x_const_32 = checkAndGetColumnConst<ColumnDateTime>(&x))
+            dispatchConstForSecondColumn<Transform>(x_const_32->getValue<UInt32>(), y, timezone_x, timezone_y, result);
+        else if (const auto * x_const_32_s = checkAndGetColumnConst<ColumnDate32>(&x))
+            dispatchConstForSecondColumn<Transform>(x_const_32_s->getValue<Int32>(), y, timezone_x, timezone_y, result);
+        else if (const auto * x_const_64 = checkAndGetColumnConst<ColumnDateTime64>(&x))
+            dispatchConstForSecondColumn<Transform>(x_const_64->getValue<DecimalField<DateTime64>>(), y, timezone_x, timezone_y, result);
+        else
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                "Illegal column for first argument of function {}, must be Date, Date32, DateTime or DateTime64",
+                name);
+    }
+
+    /// Second-level dispatch for a full (non-constant) first column `x`:
+    /// `y` may be a full column (vectorVector) or a constant column (vectorConstant).
+    /// Throws ILLEGAL_COLUMN if `y` is none of the supported column types.
+    template <typename Transform, typename LeftColumnType>
+    void dispatchForSecondColumn(
+        const LeftColumnType & x, const IColumn & y,
+        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
+        ColumnInt64::Container & result) const
+    {
+        if (const auto * y_vec_16 = checkAndGetColumn<ColumnDate>(&y))
+            vectorVector<Transform>(x, *y_vec_16, timezone_x, timezone_y, result);
+        else if (const auto * y_vec_32 = checkAndGetColumn<ColumnDateTime>(&y))
+            vectorVector<Transform>(x, *y_vec_32, timezone_x, timezone_y, result);
+        else if (const auto * y_vec_32_s = checkAndGetColumn<ColumnDate32>(&y))
+            vectorVector<Transform>(x, *y_vec_32_s, timezone_x, timezone_y, result);
+        else if (const auto * y_vec_64 = checkAndGetColumn<ColumnDateTime64>(&y))
+            vectorVector<Transform>(x, *y_vec_64, timezone_x, timezone_y, result);
+        else if (const auto * y_const_16 = checkAndGetColumnConst<ColumnDate>(&y))
+            vectorConstant<Transform>(x, y_const_16->getValue<UInt16>(), timezone_x, timezone_y, result);
+        else if (const auto * y_const_32 = checkAndGetColumnConst<ColumnDateTime>(&y))
+            vectorConstant<Transform>(x, y_const_32->getValue<UInt32>(), timezone_x, timezone_y, result);
+        else if (const auto * y_const_32_s = checkAndGetColumnConst<ColumnDate32>(&y))
+            vectorConstant<Transform>(x, y_const_32_s->getValue<Int32>(), timezone_x, timezone_y, result);
+        else if (const auto * y_const_64 = checkAndGetColumnConst<ColumnDateTime64>(&y))
+            vectorConstant<Transform>(x, y_const_64->getValue<DecimalField<DateTime64>>(), timezone_x, timezone_y, result);
+        else
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                "Illegal column for second argument of function {}, must be Date, Date32, DateTime or DateTime64",
+                name);
+    }
+
+    /// Second-level dispatch for a constant first argument `x` (already extracted as a plain value).
+    /// Only full (non-constant) columns are accepted for `y`; a constant `y` ends up in the throw below.
+    /// NOTE(review): presumably the constant/constant case never reaches this point because callers enable
+    /// useDefaultImplementationForConstants - confirm for every user of this class.
+    template <typename Transform, typename T1>
+    void dispatchConstForSecondColumn(
+        T1 x, const IColumn & y,
+        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
+        ColumnInt64::Container & result) const
+    {
+        if (const auto * y_vec_16 = checkAndGetColumn<ColumnDate>(&y))
+            constantVector<Transform>(x, *y_vec_16, timezone_x, timezone_y, result);
+        else if (const auto * y_vec_32 = checkAndGetColumn<ColumnDateTime>(&y))
+            constantVector<Transform>(x, *y_vec_32, timezone_x, timezone_y, result);
+        else if (const auto * y_vec_32_s = checkAndGetColumn<ColumnDate32>(&y))
+            constantVector<Transform>(x, *y_vec_32_s, timezone_x, timezone_y, result);
+        else if (const auto * y_vec_64 = checkAndGetColumn<ColumnDateTime64>(&y))
+            constantVector<Transform>(x, *y_vec_64, timezone_x, timezone_y, result);
+        else
+            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
+                "Illegal column for second argument of function {}, must be Date, Date32, DateTime or DateTime64",
+                name);
+    }
+
+    /// Both arguments are full columns: result[i] = calculate(x[i], y[i]).
+    /// The loop runs over x.size() rows; `y` and `result` are indexed with the same `i`
+    /// (assumes they hold at least as many rows as `x` - TODO confirm against callers).
+    template <typename Transform, typename LeftColumnType, typename RightColumnType>
+    void vectorVector(
+        const LeftColumnType & x, const RightColumnType & y,
+        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
+        ColumnInt64::Container & result) const
+    {
+        const auto & x_data = x.getData();
+        const auto & y_data = y.getData();
+
+        /// Each side gets its own TransformDateTime64 built from that side's scale (0 unless it is DateTime64).
+        const auto transform_x = TransformDateTime64<Transform>(getScale(x));
+        const auto transform_y = TransformDateTime64<Transform>(getScale(y));
+        for (size_t i = 0, size = x.size(); i < size; ++i)
+                result[i] = calculate(transform_x, transform_y, x_data[i], y_data[i], timezone_x, timezone_y);
+    }
+
+    /// `x` is a full column, `y` is a constant value: result[i] = calculate(x[i], y).
+    template <typename Transform, typename LeftColumnType, typename T2>
+    void vectorConstant(
+        const LeftColumnType & x, T2 y,
+        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
+        ColumnInt64::Container & result) const
+    {
+        const auto & x_data = x.getData();
+        const auto transform_x = TransformDateTime64<Transform>(getScale(x));
+        const auto transform_y = TransformDateTime64<Transform>(getScale(y));
+        /// The scale is read from `y` above, before the DecimalField wrapper (if any) is stripped.
+        const auto y_value = stripDecimalFieldValue(y);
+
+        for (size_t i = 0, size = x.size(); i < size; ++i)
+            result[i] = calculate(transform_x, transform_y, x_data[i], y_value, timezone_x, timezone_y);
+    }
+
+    /// `x` is a constant value, `y` is a full column: result[i] = calculate(x, y[i]).
+    /// Here the number of rows is taken from `y`.
+    template <typename Transform, typename T1, typename RightColumnType>
+    void constantVector(
+        T1 x, const RightColumnType & y,
+        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
+        ColumnInt64::Container & result) const
+    {
+        const auto & y_data = y.getData();
+        const auto transform_x = TransformDateTime64<Transform>(getScale(x));
+        const auto transform_y = TransformDateTime64<Transform>(getScale(y));
+        /// The scale is read from `x` above, before the DecimalField wrapper (if any) is stripped.
+        const auto x_value = stripDecimalFieldValue(x);
+
+        for (size_t i = 0, size = y.size(); i < size; ++i)
+            result[i] = calculate(transform_x, transform_y, x_value, y_data[i], timezone_x, timezone_y);
+    }
+
+    /// Difference for a single pair of values: transform(y) - transform(x), each evaluated with its own
+    /// time zone and cast to Int64 before the subtraction.
+    template <typename TransformX, typename TransformY, typename T1, typename T2>
+    Int64 calculate(const TransformX & transform_x, const TransformY & transform_y, T1 x, T2 y, const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y) const
+    {
+        return static_cast<Int64>(transform_y.execute(y, timezone_y))
+                - static_cast<Int64>(transform_x.execute(x, timezone_x));
+    }
+
+    /// Returns v.getScale() when `v` is a DateTime64 column or a DecimalField<DateTime64>; returns 0 for any other type.
+    template <typename T>
+    static UInt32 getScale(const T & v)
+    {
+        if constexpr (std::is_same_v<T, ColumnDateTime64>)
+            return v.getScale();
+        else if constexpr (std::is_same_v<T, DecimalField<DateTime64>>)
+            return v.getScale();
+
+        return 0;
+    }
+    /// Returns v.getValue() when `v` is a DecimalField<DateTime64>; any other argument is returned as is.
+    template <typename T>
+    static auto stripDecimalFieldValue(T && v)
+    {
+        if constexpr (std::is_same_v<std::decay_t<T>, DecimalField<DateTime64>>)
+            return v.getValue();
+        else
+            return v;
+    }
+private:
+    /// Name of the function that owns this object; substituted into the ILLEGAL_COLUMN messages above.
+    String name;
+};
+
+
 /** dateDiff('unit', t1, t2, [timezone])
  * t1 and t2 can be Date or DateTime
  *
@ -112,175 +270,89 @@ public:
        const auto & timezone_y = extractTimeZoneFromFunctionArguments(arguments, 3, 2);

        if (unit == "year" || unit == "yy" || unit == "yyyy")
-            dispatchForColumns<ToRelativeYearNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.dispatchForColumns<ToRelativeYearNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
        else if (unit == "quarter" || unit == "qq" || unit == "q")
-            dispatchForColumns<ToRelativeQuarterNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.dispatchForColumns<ToRelativeQuarterNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
        else if (unit == "month" || unit == "mm" || unit == "m")
-            dispatchForColumns<ToRelativeMonthNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.dispatchForColumns<ToRelativeMonthNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
        else if (unit == "week" || unit == "wk" || unit == "ww")
-            dispatchForColumns<ToRelativeWeekNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.dispatchForColumns<ToRelativeWeekNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
        else if (unit == "day" || unit == "dd" || unit == "d")
-            dispatchForColumns<ToRelativeDayNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.dispatchForColumns<ToRelativeDayNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
        else if (unit == "hour" || unit == "hh" || unit == "h")
-            dispatchForColumns<ToRelativeHourNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.dispatchForColumns<ToRelativeHourNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
        else if (unit == "minute" || unit == "mi" || unit == "n")
-            dispatchForColumns<ToRelativeMinuteNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.dispatchForColumns<ToRelativeMinuteNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
        else if (unit == "second" || unit == "ss" || unit == "s")
-            dispatchForColumns<ToRelativeSecondNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
+            impl.dispatchForColumns<ToRelativeSecondNumImpl<ResultPrecision::Extended>>(x, y, timezone_x, timezone_y, res->getData());
        else
            throw Exception(ErrorCodes::BAD_ARGUMENTS,
                "Function {} does not support '{}' unit", getName(), unit);

        return res;
    }
-
 private:
-    template <typename Transform>
-    void dispatchForColumns(
-        const IColumn & x, const IColumn & y,
-        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
-        ColumnInt64::Container & result) const
+    DateDiffImpl impl{name};
+};
+
+
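+/** TimeDiff(t1, t2)
+  * t1 and t2 can be Date, Date32, DateTime or DateTime64.
+  * Returns t2 - t1 as Int64, computed with ToRelativeSecondNumImpl (i.e. in seconds).
+  */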
+class FunctionTimeDiff : public IFunction
+{
+    using ColumnDateTime64 = ColumnDecimal<DateTime64>;
+public:
+    static constexpr auto name = "TimeDiff";
+    static FunctionPtr create(ContextPtr) { return std::make_shared<FunctionTimeDiff>(); }
+
+    String getName() const override
    {
-        if (const auto * x_vec_16 = checkAndGetColumn<ColumnDate>(&x))
-            dispatchForSecondColumn<Transform>(*x_vec_16, y, timezone_x, timezone_y, result);
-        else if (const auto * x_vec_32 = checkAndGetColumn<ColumnDateTime>(&x))
-            dispatchForSecondColumn<Transform>(*x_vec_32, y, timezone_x, timezone_y, result);
-        else if (const auto * x_vec_32_s = checkAndGetColumn<ColumnDate32>(&x))
-            dispatchForSecondColumn<Transform>(*x_vec_32_s, y, timezone_x, timezone_y, result);
-        else if (const auto * x_vec_64 = checkAndGetColumn<ColumnDateTime64>(&x))
-            dispatchForSecondColumn<Transform>(*x_vec_64, y, timezone_x, timezone_y, result);
-        else if (const auto * x_const_16 = checkAndGetColumnConst<ColumnDate>(&x))
-            dispatchConstForSecondColumn<Transform>(x_const_16->getValue<UInt16>(), y, timezone_x, timezone_y, result);
-        else if (const auto * x_const_32 = checkAndGetColumnConst<ColumnDateTime>(&x))
-            dispatchConstForSecondColumn<Transform>(x_const_32->getValue<UInt32>(), y, timezone_x, timezone_y, result);
-        else if (const auto * x_const_32_s = checkAndGetColumnConst<ColumnDate32>(&x))
-            dispatchConstForSecondColumn<Transform>(x_const_32_s->getValue<Int32>(), y, timezone_x, timezone_y, result);
-        else if (const auto * x_const_64 = checkAndGetColumnConst<ColumnDateTime64>(&x))
-            dispatchConstForSecondColumn<Transform>(x_const_64->getValue<DecimalField<DateTime64>>(), y, timezone_x, timezone_y, result);
-        else
-            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
-                "Illegal column for first argument of function {}, must be Date, Date32, DateTime or DateTime64",
+        return name;
+    }
+
+    bool isVariadic() const override { return false; }
+    bool isSuitableForShortCircuitArgumentsExecution(const DataTypesWithConstInfo & /*arguments*/) const override { return true; }
+    size_t getNumberOfArguments() const override { return 2; }
+
+    /// Validates the argument types and returns the result type, which is always Int64.
+    /// Throws NUMBER_OF_ARGUMENTS_DOESNT_MATCH unless exactly two arguments are given, and
+    /// ILLEGAL_TYPE_OF_ARGUMENT unless each argument is Date, Date32, DateTime or DateTime64.
+    DataTypePtr getReturnTypeImpl(const DataTypes & arguments) const override
+    {
+        /// All three checks use the same (error code, format string, args...) form of Exception.
+        if (arguments.size() != 2)
+            throw Exception(ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH,
+                "Number of arguments for function {} doesn't match: passed {}, should be 2",
+                getName(), arguments.size());
+
+        if (!isDate(arguments[0]) && !isDate32(arguments[0]) && !isDateTime(arguments[0]) && !isDateTime64(arguments[0]))
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "First argument for function {} must be Date, Date32, DateTime or DateTime64",
                 getName());
+
+        if (!isDate(arguments[1]) && !isDate32(arguments[1]) && !isDateTime(arguments[1]) && !isDateTime64(arguments[1]))
+            throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT,
+                "Second argument for function {} must be Date, Date32, DateTime or DateTime64",
+                getName());
+
+        return std::make_shared<DataTypeInt64>();
     }

-    template <typename Transform, typename LeftColumnType>
-    void dispatchForSecondColumn(
-        const LeftColumnType & x, const IColumn & y,
-        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
-        ColumnInt64::Container & result) const
+    bool useDefaultImplementationForConstants() const override { return true; }
+    ColumnNumbers getArgumentsThatAreAlwaysConstant() const override { return {}; }
+
+    /// Produces an Int64 column with `input_rows_count` values.
+    /// The work is delegated to DateDiffImpl::dispatchForColumns with ToRelativeSecondNumImpl as the transform;
+    /// DateLUT::instance() is passed as the time zone for both arguments.
+    ColumnPtr executeImpl(const ColumnsWithTypeAndName & arguments, const DataTypePtr &, size_t input_rows_count) const override
    {
-        if (const auto * y_vec_16 = checkAndGetColumn<ColumnDate>(&y))
-            vectorVector<Transform>(x, *y_vec_16, timezone_x, timezone_y, result);
-        else if (const auto * y_vec_32 = checkAndGetColumn<ColumnDateTime>(&y))
-            vectorVector<Transform>(x, *y_vec_32, timezone_x, timezone_y, result);
-        else if (const auto * y_vec_32_s = checkAndGetColumn<ColumnDate32>(&y))
-            vectorVector<Transform>(x, *y_vec_32_s, timezone_x, timezone_y, result);
-        else if (const auto * y_vec_64 = checkAndGetColumn<ColumnDateTime64>(&y))
-            vectorVector<Transform>(x, *y_vec_64, timezone_x, timezone_y, result);
-        else if (const auto * y_const_16 = checkAndGetColumnConst<ColumnDate>(&y))
-            vectorConstant<Transform>(x, y_const_16->getValue<UInt16>(), timezone_x, timezone_y, result);
-        else if (const auto * y_const_32 = checkAndGetColumnConst<ColumnDateTime>(&y))
-            vectorConstant<Transform>(x, y_const_32->getValue<UInt32>(), timezone_x, timezone_y, result);
-        else if (const auto * y_const_32_s = checkAndGetColumnConst<ColumnDate32>(&y))
-            vectorConstant<Transform>(x, y_const_32_s->getValue<Int32>(), timezone_x, timezone_y, result);
-        else if (const auto * y_const_64 = checkAndGetColumnConst<ColumnDateTime64>(&y))
-            vectorConstant<Transform>(x, y_const_64->getValue<DecimalField<DateTime64>>(), timezone_x, timezone_y, result);
-        else
-            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
-                "Illegal column for second argument of function {}, must be Date, Date32, DateTime or DateTime64",
-                getName());
-    }
-
-    template <typename Transform, typename T1>
-    void dispatchConstForSecondColumn(
-        T1 x, const IColumn & y,
-        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
-        ColumnInt64::Container & result) const
-    {
-        if (const auto * y_vec_16 = checkAndGetColumn<ColumnDate>(&y))
-            constantVector<Transform>(x, *y_vec_16, timezone_x, timezone_y, result);
-        else if (const auto * y_vec_32 = checkAndGetColumn<ColumnDateTime>(&y))
-            constantVector<Transform>(x, *y_vec_32, timezone_x, timezone_y, result);
-        else if (const auto * y_vec_32_s = checkAndGetColumn<ColumnDate32>(&y))
-            constantVector<Transform>(x, *y_vec_32_s, timezone_x, timezone_y, result);
-        else if (const auto * y_vec_64 = checkAndGetColumn<ColumnDateTime64>(&y))
-            constantVector<Transform>(x, *y_vec_64, timezone_x, timezone_y, result);
-        else
-            throw Exception(ErrorCodes::ILLEGAL_COLUMN,
-                "Illegal column for second argument of function {}, must be Date, Date32, DateTime or DateTime64",
-                getName());
-    }
-
-    template <typename Transform, typename LeftColumnType, typename RightColumnType>
-    void vectorVector(
-        const LeftColumnType & x, const RightColumnType & y,
-        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
-        ColumnInt64::Container & result) const
-    {
-        const auto & x_data = x.getData();
-        const auto & y_data = y.getData();
-
-        const auto transform_x = TransformDateTime64<Transform>(getScale(x));
-        const auto transform_y = TransformDateTime64<Transform>(getScale(y));
-        for (size_t i = 0, size = x.size(); i < size; ++i)
-                result[i] = calculate(transform_x, transform_y, x_data[i], y_data[i], timezone_x, timezone_y);
-    }
-
-    template <typename Transform, typename LeftColumnType, typename T2>
-    void vectorConstant(
-        const LeftColumnType & x, T2 y,
-        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
-        ColumnInt64::Container & result) const
-    {
-        const auto & x_data = x.getData();
-        const auto transform_x = TransformDateTime64<Transform>(getScale(x));
-        const auto transform_y = TransformDateTime64<Transform>(getScale(y));
-        const auto y_value = stripDecimalFieldValue(y);
-
-        for (size_t i = 0, size = x.size(); i < size; ++i)
-            result[i] = calculate(transform_x, transform_y, x_data[i], y_value, timezone_x, timezone_y);
-    }
-
-    template <typename Transform, typename T1, typename RightColumnType>
-    void constantVector(
-        T1 x, const RightColumnType & y,
-        const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y,
-        ColumnInt64::Container & result) const
-    {
-        const auto & y_data = y.getData();
-        const auto transform_x = TransformDateTime64<Transform>(getScale(x));
-        const auto transform_y = TransformDateTime64<Transform>(getScale(y));
-        const auto x_value = stripDecimalFieldValue(x);
-
-        for (size_t i = 0, size = y.size(); i < size; ++i)
-            result[i] = calculate(transform_x, transform_y, x_value, y_data[i], timezone_x, timezone_y);
-    }
-
-    template <typename TransformX, typename TransformY, typename T1, typename T2>
-    Int64 calculate(const TransformX & transform_x, const TransformY & transform_y, T1 x, T2 y, const DateLUTImpl & timezone_x, const DateLUTImpl & timezone_y) const
-    {
-        return static_cast<Int64>(transform_y.execute(y, timezone_y))
-             - static_cast<Int64>(transform_x.execute(x, timezone_x));
-    }
-
-    template <typename T>
-    static UInt32 getScale(const T & v)
-    {
-        if constexpr (std::is_same_v<T, ColumnDateTime64>)
-            return v.getScale();
-        else if constexpr (std::is_same_v<T, DecimalField<DateTime64>>)
-            return v.getScale();
-
-        return 0;
-    }
-    template <typename T>
-    static auto stripDecimalFieldValue(T && v)
-    {
-        if constexpr (std::is_same_v<std::decay_t<T>, DecimalField<DateTime64>>)
-            return v.getValue();
-        else
-            return v;
+        const IColumn & x = *arguments[0].column;
+        const IColumn & y = *arguments[1].column;
+
+        size_t rows = input_rows_count;
+        auto res = ColumnInt64::create(rows);
+
+        impl.dispatchForColumns<ToRelativeSecondNumImpl<ResultPrecision::Extended>>(x, y, DateLUT::instance(), DateLUT::instance(), res->getData());
+
+        return res;
    }
+private:
+    DateDiffImpl impl{name};
 };

 }
@ -290,4 +362,18 @@ REGISTER_FUNCTION(DateDiff)
    factory.registerFunction<FunctionDateDiff>({}, FunctionFactory::CaseInsensitive);
 }

+/// Registers FunctionTimeDiff (case-insensitively) together with its embedded documentation:
+/// a description, one example and one category.
+REGISTER_FUNCTION(TimeDiff)
+{
+    factory.registerFunction<FunctionTimeDiff>({R"(
+Returns the difference between two dates or dates with time values. The difference is calculated in units of seconds (see toRelativeSecondNum).
+It is the same as `dateDiff` with the 'second' unit and was added only for MySQL support. `dateDiff` is preferred.
+
+Example:
+[example:typical]
+)",
+    Documentation::Examples{
+        {"typical", "SELECT timeDiff(UTCTimestamp(), now());"}},
+    Documentation::Categories{"Dates and Times"}}, FunctionFactory::CaseInsensitive);
+}
+
 }
--- a/src/Functions/if.cpp
+++ b/src/Functions/if.cpp
@ -904,6 +904,7 @@ private:

            if (cond_col)
            {
+                arg_else_column = arg_else_column->convertToFullColumnIfConst();
                auto result_column = IColumn::mutate(std::move(arg_else_column));
                if (else_is_short)
                    result_column->expand(cond_col->getData(), true);
@ -941,6 +942,7 @@ private:

            if (cond_col)
            {
+                arg_then_column = arg_then_column->convertToFullColumnIfConst();
                auto result_column = IColumn::mutate(std::move(arg_then_column));
                if (then_is_short)
                    result_column->expand(cond_col->getData(), false);
--- a/src/IO/ReadHelpers.h
+++ b/src/IO/ReadHelpers.h
@ -964,15 +964,16 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re
        components.whole = components.whole / common::exp10_i32(scale);
    }

+    bool is_ok = true;
    if constexpr (std::is_same_v<ReturnType, void>)
        datetime64 = DecimalUtils::decimalFromComponents<DateTime64>(components, scale);
    else
-        DecimalUtils::tryGetDecimalFromComponents<DateTime64>(components, scale, datetime64);
+        is_ok = DecimalUtils::tryGetDecimalFromComponents<DateTime64>(components, scale, datetime64);

    datetime64 *= negative_multiplier;


-    return ReturnType(true);
+    return ReturnType(is_ok);
 }

 inline void readDateTimeText(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & time_zone = DateLUT::instance())
--- a/src/IO/parseDateTimeBestEffort.cpp
+++ b/src/IO/parseDateTimeBestEffort.cpp
@ -659,6 +659,9 @@ ReturnType parseDateTime64BestEffortImpl(DateTime64 & res, UInt32 scale, ReadBuf
        fractional *= common::exp10_i64(scale - subsecond.digits);
    }

+    if constexpr (std::is_same_v<ReturnType, bool>)
+        return DecimalUtils::tryGetDecimalFromComponents<DateTime64>(whole, fractional, scale, res);
+
    res = DecimalUtils::decimalFromComponents<DateTime64>(whole, fractional, scale);
    return ReturnType(true);
 }
--- a/src/Interpreters/AsynchronousMetricLog.cpp
+++ b/src/Interpreters/AsynchronousMetricLog.cpp
@ -47,7 +47,7 @@ void AsynchronousMetricLog::addValues(const AsynchronousMetricValues & values)
    for (const auto & [key, value] : values)
    {
        element.metric_name = key;
-        element.value = round(value * precision) / precision;
+        element.value = round(value.value * precision) / precision;

        add(element);
    }
--- a/src/Interpreters/AsynchronousMetricLog.h
+++ b/src/Interpreters/AsynchronousMetricLog.h
@ -1,6 +1,7 @@
 #pragma once

 #include <Interpreters/SystemLog.h>
+#include <Interpreters/AsynchronousMetrics.h>
 #include <Common/ProfileEvents.h>
 #include <Common/CurrentMetrics.h>
 #include <Core/NamesAndTypes.h>
@ -14,12 +15,8 @@
 namespace DB
 {

-using AsynchronousMetricValue = double;
-using AsynchronousMetricValues = std::unordered_map<std::string, AsynchronousMetricValue>;
-
 /** AsynchronousMetricLog is a log of metric values measured at regular time interval.
  */
-
 struct AsynchronousMetricLogElement
 {
    UInt16 event_date;
--- a/src/Interpreters/AsynchronousMetrics.cpp
+++ b/src/Interpreters/AsynchronousMetrics.cpp
@ -15,7 +15,6 @@
 #include <Common/getCurrentProcessFDCount.h>
 #include <Common/getMaxFileDescriptorCount.h>
 #include <Interpreters/Cache/FileCache.h>
-#include <Server/ProtocolServerAdapter.h>
 #include <Storages/MarkCache.h>
 #include <Storages/StorageMergeTree.h>
 #include <Storages/StorageReplicatedMergeTree.h>
@ -24,15 +23,16 @@
 #include <IO/MMappedFileCache.h>
 #include <IO/ReadHelpers.h>
 #include <Databases/IDatabase.h>
+#include <base/errnoToString.h>
 #include <chrono>

-
 #include "config.h"

 #if USE_JEMALLOC
 #    include <jemalloc/jemalloc.h>
 #endif

+
 namespace DB
 {

@ -123,9 +123,9 @@ void AsynchronousMetrics::openSensors()
        {
            LOG_WARNING(
                &Poco::Logger::get("AsynchronousMetrics"),
-                "Thermal monitor '{}' exists but could not be read, error {}.",
+                "Thermal monitor '{}' exists but could not be read: {}.",
                thermal_device_index,
-                e.getErrno());
+                errnoToString(e.getErrno()));
            continue;
        }

@ -252,10 +252,10 @@ void AsynchronousMetrics::openSensorsChips()
            {
                LOG_WARNING(
                    &Poco::Logger::get("AsynchronousMetrics"),
-                    "Hardware monitor '{}', sensor '{}' exists but could not be read, error {}.",
+                    "Hardware monitor '{}', sensor '{}' exists but could not be read: {}.",
                    hwmon_name,
                    sensor_name,
-                    e.getErrno());
+                    errnoToString(e.getErrno()));
                continue;
            }

@ -386,14 +386,15 @@ uint64_t updateJemallocEpoch()
 }

+/// Reads one jemalloc statistic through mallctl() and stores it in `values` under `clickhouse_full_name`
+/// with a fixed description string; the value that was read is also returned.
+/// The return code of mallctl() is not checked. `value` is zero-initialized before the call
+/// (NOTE(review): presumably it stays zero if the call fails - confirm against the jemalloc manual).
 template <typename Value>
-static Value saveJemallocMetricImpl(AsynchronousMetricValues & values,
+static Value saveJemallocMetricImpl(
+    AsynchronousMetricValues & values,
     const std::string & jemalloc_full_name,
     const std::string & clickhouse_full_name)
 {
     Value value{};
     size_t size = sizeof(value);
     mallctl(jemalloc_full_name.c_str(), &value, &size, nullptr, 0);
-    values[clickhouse_full_name] = value;
+    values[clickhouse_full_name] = AsynchronousMetricValue(value, "An internal metric of the low-level memory allocator (jemalloc). See https://jemalloc.net/jemalloc.3.html");
     return value;
 }

@ -570,85 +571,93 @@ void AsynchronousMetrics::update(TimePoint update_time)
    previous_update_time = update_time;

    /// This is also a good indicator of system responsiveness.
-    new_values["Jitter"] = std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - update_time).count() / 1e9;
+    new_values["Jitter"] = { std::chrono::duration_cast<std::chrono::nanoseconds>(current_time - update_time).count() / 1e9,
+        "The difference in time the thread for calculation of the asynchronous metrics was scheduled to wake up and the time it was in fact, woken up."
+        " A proxy-indicator of overall system latency and responsiveness." };

+    if (auto mark_cache = getContext()->getMarkCache())
    {
-        if (auto mark_cache = getContext()->getMarkCache())
-        {
-            new_values["MarkCacheBytes"] = mark_cache->weight();
-            new_values["MarkCacheFiles"] = mark_cache->count();
-        }
+        new_values["MarkCacheBytes"] = { mark_cache->weight(), "Total size of mark cache in bytes" };
+        new_values["MarkCacheFiles"] = { mark_cache->count(), "Total number of mark files cached in the mark cache" };
    }

+    if (auto uncompressed_cache = getContext()->getUncompressedCache())
    {
-        if (auto uncompressed_cache = getContext()->getUncompressedCache())
-        {
-            new_values["UncompressedCacheBytes"] = uncompressed_cache->weight();
-            new_values["UncompressedCacheCells"] = uncompressed_cache->count();
-        }
+        new_values["UncompressedCacheBytes"] = { uncompressed_cache->weight(),
+            "Total size of uncompressed cache in bytes. Uncompressed cache does not usually improve the performance and should be mostly avoided." };
+        new_values["UncompressedCacheCells"] = { uncompressed_cache->count(),
+            "Total number of entries in the uncompressed cache. Each entry represents a decompressed block of data. Uncompressed cache does not usually improve performance and should be mostly avoided." };
    }

+    if (auto index_mark_cache = getContext()->getIndexMarkCache())
    {
-        if (auto index_mark_cache = getContext()->getIndexMarkCache())
-        {
-            new_values["IndexMarkCacheBytes"] = index_mark_cache->weight();
-            new_values["IndexMarkCacheFiles"] = index_mark_cache->count();
-        }
+        new_values["IndexMarkCacheBytes"] = { index_mark_cache->weight(), "Total size of mark cache for secondary indices in bytes." };
+        new_values["IndexMarkCacheFiles"] = { index_mark_cache->count(), "Total number of mark files cached in the mark cache for secondary indices." };
    }

+    if (auto index_uncompressed_cache = getContext()->getIndexUncompressedCache())
    {
-        if (auto index_uncompressed_cache = getContext()->getIndexUncompressedCache())
-        {
-            new_values["IndexUncompressedCacheBytes"] = index_uncompressed_cache->weight();
-            new_values["IndexUncompressedCacheCells"] = index_uncompressed_cache->count();
-        }
+        new_values["IndexUncompressedCacheBytes"] = { index_uncompressed_cache->weight(),
+            "Total size of uncompressed cache in bytes for secondary indices. Uncompressed cache does not usually improve the performance and should be mostly avoided." };
+        new_values["IndexUncompressedCacheCells"] = { index_uncompressed_cache->count(),
+            "Total number of entries in the uncompressed cache for secondary indices. Each entry represents a decompressed block of data. Uncompressed cache does not usually improve performance and should be mostly avoided." };
    }

+    if (auto mmap_cache = getContext()->getMMappedFileCache())
    {
-        if (auto mmap_cache = getContext()->getMMappedFileCache())
-        {
-            new_values["MMapCacheCells"] = mmap_cache->count();
-        }
+        new_values["MMapCacheCells"] = { mmap_cache->count(),
+            "The number of files opened with `mmap` (mapped in memory)."
+            " This is used for queries with the setting `local_filesystem_read_method` set to  `mmap`."
+            " The files opened with `mmap` are kept in the cache to avoid costly TLB flushes."};
    }

    {
        auto caches = FileCacheFactory::instance().getAll();
+        size_t total_bytes = 0;
+        size_t total_files = 0;
+
        for (const auto & [_, cache_data] : caches)
        {
-            new_values["FilesystemCacheBytes"] = cache_data->cache->getUsedCacheSize();
-            new_values["FilesystemCacheFiles"] = cache_data->cache->getFileSegmentsNum();
+            total_bytes += cache_data->cache->getUsedCacheSize();
+            total_files += cache_data->cache->getFileSegmentsNum();
        }
+
+        new_values["FilesystemCacheBytes"] = { total_bytes,
+            "Total bytes in the `cache` virtual filesystem. This cache is hold on disk." };
+        new_values["FilesystemCacheFiles"] = { total_files,
+            "Total number of cached file segments in the `cache` virtual filesystem. This cache is hold on disk." };
    }

 #if USE_ROCKSDB
+    if (auto metadata_cache = getContext()->tryGetMergeTreeMetadataCache())
    {
-        if (auto metadata_cache = getContext()->tryGetMergeTreeMetadataCache())
-        {
-            new_values["MergeTreeMetadataCacheSize"] = metadata_cache->getEstimateNumKeys();
-        }
+        new_values["MergeTreeMetadataCacheSize"] = { metadata_cache->getEstimateNumKeys(),
+            "The size of the metadata cache for tables. This cache is experimental and not used in production." };
    }
 #endif

 #if USE_EMBEDDED_COMPILER
+    if (auto * compiled_expression_cache = CompiledExpressionCacheFactory::instance().tryGetCache())
    {
-        if (auto * compiled_expression_cache = CompiledExpressionCacheFactory::instance().tryGetCache())
-        {
-            new_values["CompiledExpressionCacheBytes"] = compiled_expression_cache->weight();
-            new_values["CompiledExpressionCacheCount"]  = compiled_expression_cache->count();
-        }
+        new_values["CompiledExpressionCacheBytes"] = { compiled_expression_cache->weight(),
+            "Total bytes used for the cache of JIT-compiled code." };
+        new_values["CompiledExpressionCacheCount"] = { compiled_expression_cache->count(),
+            "Total entries in the cache of JIT-compiled code." };
    }
 #endif

+    new_values["Uptime"] = { getContext()->getUptimeSeconds(),
+        "The server uptime in seconds. It includes the time spent for server initialization before accepting connections." };

-    new_values["Uptime"] = getContext()->getUptimeSeconds();
-
+    if (const auto stats = getHashTablesCacheStatistics())
    {
-        if (const auto stats = getHashTablesCacheStatistics())
-        {
-            new_values["HashTableStatsCacheEntries"] = stats->entries;
-            new_values["HashTableStatsCacheHits"] = stats->hits;
-            new_values["HashTableStatsCacheMisses"] = stats->misses;
-        }
+        new_values["HashTableStatsCacheEntries"] = { stats->entries,
+            "The number of entries in the cache of hash table sizes."
+            " The cache for hash table sizes is used for predictive optimization of GROUP BY." };
+        new_values["HashTableStatsCacheHits"] = { stats->hits,
+            "The number of times the prediction of a hash table size was correct." };
+        new_values["HashTableStatsCacheMisses"] = { stats->misses,
+            "The number of times the prediction of a hash table size was incorrect." };
    }

 #if defined(OS_LINUX) || defined(OS_FREEBSD)
@ -660,7 +669,7 @@ void AsynchronousMetrics::update(TimePoint update_time)
    // the following calls will return stale values. It increments and returns
    // the current epoch number, which might be useful to log as a sanity check.
    auto epoch = updateJemallocEpoch();
-    new_values["jemalloc.epoch"] = epoch;
+    new_values["jemalloc.epoch"] = { epoch, "An internal incremental update number of the statistics of jemalloc (Jason Evans' memory allocator), used in all other `jemalloc` metrics." };

    // Collect the statistics themselves.
    saveJemallocMetric<size_t>(new_values, "allocated");
@ -685,13 +694,24 @@ void AsynchronousMetrics::update(TimePoint update_time)
    {
        MemoryStatisticsOS::Data & data = memory_statistics_data;

-        new_values["MemoryVirtual"] = data.virt;
-        new_values["MemoryResident"] = data.resident;
+        new_values["MemoryVirtual"] = { data.virt,
+            "The size of the virtual address space allocated by the server process, in bytes."
+            " The size of the virtual address space is usually much greater than the physical memory consumption, and should not be used as an estimate for the memory consumption."
+            " The large values of this metric are totally normal, and makes only technical sense."};
+        new_values["MemoryResident"] = { data.resident,
+            "The amount of physical memory used by the server process, in bytes." };
 #if !defined(OS_FREEBSD)
-        new_values["MemoryShared"] = data.shared;
+        new_values["MemoryShared"] = { data.shared,
+            "The amount of memory used by the server process, that is also shared by another processes, in bytes."
+            " ClickHouse does not use shared memory, but some memory can be labeled by OS as shared for its own reasons."
+            " This metric does not make a lot of sense to watch, and it exists only for completeness reasons."};
 #endif
-        new_values["MemoryCode"] = data.code;
-        new_values["MemoryDataAndStack"] = data.data_and_stack;
+        new_values["MemoryCode"] = { data.code,
+            "The amount of virtual memory mapped for the pages of machine code of the server process, in bytes." };
+        new_values["MemoryDataAndStack"] = { data.data_and_stack,
+            "The amount of virtual memory mapped for the use of stack and for the allocated memory, in bytes."
+            " It is unspecified whether it includes the per-thread stacks and most of the allocated memory, that is allocated with the 'mmap' system call."
+            " This metric exists only for completeness reasons. I recommend to use the `MemoryResident` metric for monitoring."};

        /// We must update the value of total_memory_tracker periodically.
        /// Otherwise it might be calculated incorrectly - it can include a "drift" of memory amount.
@ -754,11 +774,22 @@ void AsynchronousMetrics::update(TimePoint update_time)
            assertChar('/', *loadavg);
            readText(threads_total, *loadavg);

-            new_values["LoadAverage1"] = loadavg1;
-            new_values["LoadAverage5"] = loadavg5;
-            new_values["LoadAverage15"] = loadavg15;
-            new_values["OSThreadsRunnable"] = threads_runnable;
-            new_values["OSThreadsTotal"] = threads_total;
+#define LOAD_AVERAGE_DOCUMENTATION \
+    " The load represents the number of threads across all the processes (the scheduling entities of the OS kernel)," \
+    " that are currently running by CPU or waiting for IO, or ready to run but not being scheduled at this point of time." \
+    " This number includes all the processes, not only clickhouse-server. The number can be greater than the number of CPU cores," \
+    " if the system is overloaded, and many processes are ready to run but waiting for CPU or IO."
+
+            new_values["LoadAverage1"] = { loadavg1,
+                "The whole system load, averaged with exponential smoothing over 1 minute." LOAD_AVERAGE_DOCUMENTATION };
+            new_values["LoadAverage5"] = { loadavg5,
+                "The whole system load, averaged with exponential smoothing over 5 minutes." LOAD_AVERAGE_DOCUMENTATION };
+            new_values["LoadAverage15"] = { loadavg15,
+                "The whole system load, averaged with exponential smoothing over 15 minutes." LOAD_AVERAGE_DOCUMENTATION };
+            new_values["OSThreadsRunnable"] = { threads_runnable,
+                "The total number of 'runnable' threads, as the OS kernel scheduler seeing it." };
+            new_values["OSThreadsTotal"] = { threads_total,
+                "The total number of threads, as the OS kernel scheduler seeing it." };
        }
        catch (...)
        {
@ -775,7 +806,7 @@ void AsynchronousMetrics::update(TimePoint update_time)
            Float64 uptime_seconds = 0;
            readText(uptime_seconds, *uptime);

-            new_values["OSUptime"] = uptime_seconds;
+            new_values["OSUptime"] = { uptime_seconds, "The uptime of the host server (the machine where ClickHouse is running), in seconds." };
        }
        catch (...)
        {
@ -838,16 +869,43 @@ void AsynchronousMetrics::update(TimePoint update_time)
                        else
                            delta_values_all_cpus = delta_values;

-                        new_values["OSUserTime" + cpu_suffix] = delta_values.user * multiplier;
-                        new_values["OSNiceTime" + cpu_suffix] = delta_values.nice * multiplier;
-                        new_values["OSSystemTime" + cpu_suffix] = delta_values.system * multiplier;
-                        new_values["OSIdleTime" + cpu_suffix] = delta_values.idle * multiplier;
-                        new_values["OSIOWaitTime" + cpu_suffix] = delta_values.iowait * multiplier;
-                        new_values["OSIrqTime" + cpu_suffix] = delta_values.irq * multiplier;
-                        new_values["OSSoftIrqTime" + cpu_suffix] = delta_values.softirq * multiplier;
-                        new_values["OSStealTime" + cpu_suffix] = delta_values.steal * multiplier;
-                        new_values["OSGuestTime" + cpu_suffix] = delta_values.guest * multiplier;
-                        new_values["OSGuestNiceTime" + cpu_suffix] = delta_values.guest_nice * multiplier;
+                        new_values["OSUserTime" + cpu_suffix] = { delta_values.user * multiplier,
+                            "The ratio of time the CPU core was running userspace code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " This includes also the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
+                        new_values["OSNiceTime" + cpu_suffix] = { delta_values.nice * multiplier,
+                            "The ratio of time the CPU core was running userspace code with higher priority. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
+                        new_values["OSSystemTime" + cpu_suffix] = { delta_values.system * multiplier,
+                            "The ratio of time the CPU core was running OS kernel (system) code. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
+                        new_values["OSIdleTime" + cpu_suffix] = { delta_values.idle * multiplier,
+                            "The ratio of time the CPU core was idle (not even ready to run a process waiting for IO) from the OS kernel standpoint. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " This does not include the time when the CPU was under-utilized due to the reasons internal to the CPU (memory loads, pipeline stalls, branch mispredictions, running another SMT core)."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
+                        new_values["OSIOWaitTime" + cpu_suffix] = { delta_values.iowait * multiplier,
+                            "The ratio of time the CPU core was not running the code but when the OS kernel did not run any other process on this CPU as the processes were waiting for IO. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
+                        new_values["OSIrqTime" + cpu_suffix] = { delta_values.irq * multiplier,
+                            "The ratio of time spent for running hardware interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " A high number of this metric may indicate hardware misconfiguration or a very high network load."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
+                        new_values["OSSoftIrqTime" + cpu_suffix] = { delta_values.softirq * multiplier,
+                            "The ratio of time spent for running software interrupt requests on the CPU. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " A high number of this metric may indicate inefficient software running on the system."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
+                        new_values["OSStealTime" + cpu_suffix] = { delta_values.steal * multiplier,
+                            "The ratio of time spent in other operating systems by the CPU when running in a virtualized environment. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " Not every virtualized environments present this metric, and most of them don't."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
+                        new_values["OSGuestTime" + cpu_suffix] = { delta_values.guest * multiplier,
+                            "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " This metric is irrelevant for ClickHouse, but still exists for completeness."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
+                        new_values["OSGuestNiceTime" + cpu_suffix] = { delta_values.guest_nice * multiplier,
+                            "The ratio of time spent running a virtual CPU for guest operating systems under the control of the Linux kernel, when a guest was set to a higher priority (See `man procfs`). This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server."
+                            " This metric is irrelevant for ClickHouse, but still exists for completeness."
+                            " The value for a single CPU core will be in the interval [0..1]. The value for all CPU cores is calculated as a sum across them [0..num cores]."};
                    }

                    prev_values = current_values;
@ -872,14 +930,18 @@ void AsynchronousMetrics::update(TimePoint update_time)
                    UInt64 processes_running = 0;
                    readText(processes_running, *proc_stat);
                    skipToNextLineOrEOF(*proc_stat);
-                    new_values["OSProcessesRunning"] = processes_running;
+                    new_values["OSProcessesRunning"] = { processes_running,
+                        "The number of runnable (running or ready to run) threads by the operating system."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
                }
                else if (name == "procs_blocked")
                {
                    UInt64 processes_blocked = 0;
                    readText(processes_blocked, *proc_stat);
                    skipToNextLineOrEOF(*proc_stat);
-                    new_values["OSProcessesBlocked"] = processes_blocked;
+                    new_values["OSProcessesBlocked"] = { processes_blocked,
+                        "Number of threads blocked waiting for I/O to complete (`man procfs`)."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
                }
                else
                    skipToNextLineOrEOF(*proc_stat);
@ -889,25 +951,45 @@ void AsynchronousMetrics::update(TimePoint update_time)
            {
                ProcStatValuesOther delta_values = current_other_values - proc_stat_values_other;

-                new_values["OSInterrupts"] = delta_values.interrupts;
-                new_values["OSContextSwitches"] = delta_values.context_switches;
-                new_values["OSProcessesCreated"] = delta_values.processes_created;
+                new_values["OSInterrupts"] = { delta_values.interrupts, "The number of interrupts on the host machine. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                new_values["OSContextSwitches"] = { delta_values.context_switches, "The number of context switches that the system underwent on the host machine. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                new_values["OSProcessesCreated"] = { delta_values.processes_created, "The number of processes created. This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };

                /// Also write values normalized to 0..1 by dividing by the number of CPUs.
                /// These values are good to be averaged across the cluster of non-uniform servers.

                if (num_cpus)
                {
-                    new_values["OSUserTimeNormalized"] = delta_values_all_cpus.user * multiplier / num_cpus;
-                    new_values["OSNiceTimeNormalized"] = delta_values_all_cpus.nice * multiplier / num_cpus;
-                    new_values["OSSystemTimeNormalized"] = delta_values_all_cpus.system * multiplier / num_cpus;
-                    new_values["OSIdleTimeNormalized"] = delta_values_all_cpus.idle * multiplier / num_cpus;
-                    new_values["OSIOWaitTimeNormalized"] = delta_values_all_cpus.iowait * multiplier / num_cpus;
-                    new_values["OSIrqTimeNormalized"] = delta_values_all_cpus.irq * multiplier / num_cpus;
-                    new_values["OSSoftIrqTimeNormalized"] = delta_values_all_cpus.softirq * multiplier / num_cpus;
-                    new_values["OSStealTimeNormalized"] = delta_values_all_cpus.steal * multiplier / num_cpus;
-                    new_values["OSGuestTimeNormalized"] = delta_values_all_cpus.guest * multiplier / num_cpus;
-                    new_values["OSGuestNiceTimeNormalized"] = delta_values_all_cpus.guest_nice * multiplier / num_cpus;
+                    new_values["OSUserTimeNormalized"] = { delta_values_all_cpus.user * multiplier / num_cpus,
+                        "The value is similar to `OSUserTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
+                    new_values["OSNiceTimeNormalized"] = { delta_values_all_cpus.nice * multiplier / num_cpus,
+                        "The value is similar to `OSNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
+                    new_values["OSSystemTimeNormalized"] = { delta_values_all_cpus.system * multiplier / num_cpus,
+                        "The value is similar to `OSSystemTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
+                    new_values["OSIdleTimeNormalized"] = { delta_values_all_cpus.idle * multiplier / num_cpus,
+                        "The value is similar to `OSIdleTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
+                    new_values["OSIOWaitTimeNormalized"] = { delta_values_all_cpus.iowait * multiplier / num_cpus,
+                        "The value is similar to `OSIOWaitTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
+                    new_values["OSIrqTimeNormalized"] = { delta_values_all_cpus.irq * multiplier / num_cpus,
+                        "The value is similar to `OSIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
+                    new_values["OSSoftIrqTimeNormalized"] = { delta_values_all_cpus.softirq * multiplier / num_cpus,
+                        "The value is similar to `OSSoftIrqTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
+                    new_values["OSStealTimeNormalized"] = { delta_values_all_cpus.steal * multiplier / num_cpus,
+                        "The value is similar to `OSStealTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
+                    new_values["OSGuestTimeNormalized"] = { delta_values_all_cpus.guest * multiplier / num_cpus,
+                        "The value is similar to `OSGuestTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
+                    new_values["OSGuestNiceTimeNormalized"] = { delta_values_all_cpus.guest_nice * multiplier / num_cpus,
+                        "The value is similar to `OSGuestNiceTime` but divided to the number of CPU cores to be measured in the [0..1] interval regardless of the number of cores."
+                        " This allows you to average the values of this metric across multiple servers in a cluster even if the number of cores is non-uniform, and still get the average resource utilization metric."};
                }
            }

@ -962,39 +1044,47 @@ void AsynchronousMetrics::update(TimePoint update_time)

                if (name == "MemTotal:")
                {
-                    new_values["OSMemoryTotal"] = bytes;
+                    new_values["OSMemoryTotal"] = { bytes, "The total amount of memory on the host system, in bytes." };
                }
                else if (name == "MemFree:")
                {
-                    /// We cannot simply name this metric "Free", because it confuses users.
-                    /// See https://www.linuxatemyram.com/
-                    /// For convenience we also provide OSMemoryFreePlusCached, that should be somewhat similar to OSMemoryAvailable.
-
                    free_plus_cached_bytes += bytes;
-                    new_values["OSMemoryFreeWithoutCached"] = bytes;
+                    new_values["OSMemoryFreeWithoutCached"] = { bytes,
+                        "The amount of free memory on the host system, in bytes."
+                        " This does not include the memory used by the OS page cache memory, in bytes."
+                        " The page cache memory is also available for usage by programs, so the value of this metric can be confusing."
+                        " See the `OSMemoryAvailable` metric instead."
+                        " For convenience we also provide the `OSMemoryFreePlusCached` metric, that should be somewhat similar to OSMemoryAvailable."
+                        " See also https://www.linuxatemyram.com/."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
                }
                else if (name == "MemAvailable:")
                {
-                    new_values["OSMemoryAvailable"] = bytes;
+                    new_values["OSMemoryAvailable"] = { bytes, "The amount of memory available to be used by programs, in bytes. This is very similar to the `OSMemoryFreePlusCached` metric."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
                }
                else if (name == "Buffers:")
                {
-                    new_values["OSMemoryBuffers"] = bytes;
+                    new_values["OSMemoryBuffers"] = { bytes, "The amount of memory used by OS kernel buffers, in bytes. This should be typically small, and large values may indicate a misconfiguration of the OS."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
                }
                else if (name == "Cached:")
                {
                    free_plus_cached_bytes += bytes;
-                    new_values["OSMemoryCached"] = bytes;
+                    new_values["OSMemoryCached"] = { bytes, "The amount of memory used by the OS page cache, in bytes. Typically, almost all available memory is used by the OS page cache - high values of this metric are normal and expected."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
                }
                else if (name == "SwapCached:")
                {
-                    new_values["OSMemorySwapCached"] = bytes;
+                    new_values["OSMemorySwapCached"] = { bytes, "The amount of memory in swap that was also loaded in RAM. Swap should be disabled on production systems. If the value of this metric is large, it indicates a misconfiguration."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
                }

                skipToNextLineOrEOF(*meminfo);
            }

-            new_values["OSMemoryFreePlusCached"] = free_plus_cached_bytes;
+            new_values["OSMemoryFreePlusCached"] = { free_plus_cached_bytes, "The amount of free memory plus OS page cache memory on the host system, in bytes. This memory is available to be used by programs. The value should be very similar to `OSMemoryAvailable`."
+                " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
        }
        catch (...)
        {
@ -1043,7 +1133,7 @@ void AsynchronousMetrics::update(TimePoint update_time)
                    if (auto colon = s.find_first_of(':'))
                    {
                        auto mhz = std::stod(s.substr(colon + 2));
-                        new_values[fmt::format("CPUFrequencyMHz_{}", core_id)] = mhz;
+                        new_values[fmt::format("CPUFrequencyMHz_{}", core_id)] = { mhz, "The current frequency of the CPU, in MHz. Most of the modern CPUs adjust the frequency dynamically for power saving and Turbo Boosting." };
                    }
                }
            }
@ -1062,7 +1152,8 @@ void AsynchronousMetrics::update(TimePoint update_time)

            uint64_t open_files = 0;
            readText(open_files, *file_nr);
-            new_values["OSOpenFiles"] = open_files;
+            new_values["OSOpenFiles"] = { open_files, "The total number of opened files on the host machine."
+                " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
        }
        catch (...)
        {
@ -1083,7 +1174,17 @@ void AsynchronousMetrics::update(TimePoint update_time)

            BlockDeviceStatValues current_values{};
            BlockDeviceStatValues & prev_values = block_device_stats[name];
-            current_values.read(*device);
+
+            try
+            {
+                current_values.read(*device);
+            }
+            catch (const ErrnoException & e)
+            {
+                LOG_DEBUG(log, "Cannot read statistics about the block device '{}': {}.",
+                    name, errnoToString(e.getErrno()));
+                continue;
+            }

            BlockDeviceStatValues delta_values = current_values - prev_values;
            prev_values = current_values;
@ -1097,42 +1198,89 @@ void AsynchronousMetrics::update(TimePoint update_time)
            /// Always in milliseconds according to the docs.
            static constexpr double time_multiplier = 1e-6;

-            new_values["BlockReadOps_" + name] = delta_values.read_ios;
-            new_values["BlockWriteOps_" + name] = delta_values.write_ios;
-            new_values["BlockDiscardOps_" + name] = delta_values.discard_ops;
+#define BLOCK_DEVICE_EXPLANATION \
+    " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." \
+    " Source: `/sys/block`. See https://www.kernel.org/doc/Documentation/block/stat.txt"

-            new_values["BlockReadMerges_" + name] = delta_values.read_merges;
-            new_values["BlockWriteMerges_" + name] = delta_values.write_merges;
-            new_values["BlockDiscardMerges_" + name] = delta_values.discard_merges;
+            new_values["BlockReadOps_" + name] = { delta_values.read_ios,
+                "Number of read operations requested from the block device."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockWriteOps_" + name] = { delta_values.write_ios,
+                "Number of write operations requested from the block device."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockDiscardOps_" + name] = { delta_values.discard_ops,
+                "Number of discard operations requested from the block device. These operations are relevant for SSD."
+                " Discard operations are not used by ClickHouse, but can be used by other processes on the system."
+                BLOCK_DEVICE_EXPLANATION };

-            new_values["BlockReadBytes_" + name] = delta_values.read_sectors * sector_size;
-            new_values["BlockWriteBytes_" + name] = delta_values.write_sectors * sector_size;
-            new_values["BlockDiscardBytes_" + name] = delta_values.discard_sectors * sector_size;
+            new_values["BlockReadMerges_" + name] = { delta_values.read_merges,
+                "Number of read operations requested from the block device and merged together by the OS IO scheduler."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockWriteMerges_" + name] = { delta_values.write_merges,
+                "Number of write operations requested from the block device and merged together by the OS IO scheduler."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockDiscardMerges_" + name] = { delta_values.discard_merges,
+                "Number of discard operations requested from the block device and merged together by the OS IO scheduler."
+                " These operations are relevant for SSD. Discard operations are not used by ClickHouse, but can be used by other processes on the system."
+                BLOCK_DEVICE_EXPLANATION };

-            new_values["BlockReadTime_" + name] = delta_values.read_ticks * time_multiplier;
-            new_values["BlockWriteTime_" + name] = delta_values.write_ticks * time_multiplier;
-            new_values["BlockDiscardTime_" + name] = delta_values.discard_ticks * time_multiplier;
+            new_values["BlockReadBytes_" + name] = { delta_values.read_sectors * sector_size,
+                "Number of bytes read from the block device."
+                " It can be lower than the number of bytes read from the filesystem due to the usage of the OS page cache, that saves IO."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockWriteBytes_" + name] = { delta_values.write_sectors * sector_size,
+                "Number of bytes written to the block device."
+                " It can be lower than the number of bytes written to the filesystem due to the usage of the OS page cache, that saves IO."
+                " A write to the block device may happen later than the corresponding write to the filesystem due to write-through caching."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockDiscardBytes_" + name] = { delta_values.discard_sectors * sector_size,
+                "Number of discarded bytes on the block device."
+                " These operations are relevant for SSD. Discard operations are not used by ClickHouse, but can be used by other processes on the system."
+                BLOCK_DEVICE_EXPLANATION };

-            new_values["BlockInFlightOps_" + name] = delta_values.in_flight_ios;
+            new_values["BlockReadTime_" + name] = { delta_values.read_ticks * time_multiplier,
+                "Time in seconds spent in read operations requested from the block device, summed across all the operations."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockWriteTime_" + name] = { delta_values.write_ticks * time_multiplier,
+                "Time in seconds spent in write operations requested from the block device, summed across all the operations."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockDiscardTime_" + name] = { delta_values.discard_ticks * time_multiplier,
+                "Time in seconds spent in discard operations requested from the block device, summed across all the operations."
+                " These operations are relevant for SSD. Discard operations are not used by ClickHouse, but can be used by other processes on the system."
+                BLOCK_DEVICE_EXPLANATION };

-            new_values["BlockActiveTime_" + name] = delta_values.io_ticks * time_multiplier;
-            new_values["BlockQueueTime_" + name] = delta_values.time_in_queue * time_multiplier;
+            new_values["BlockInFlightOps_" + name] = { delta_values.in_flight_ios,
+                "This value counts the number of I/O requests that have been issued to"
+                " the device driver but have not yet completed. It does not include IO"
+                " requests that are in the queue but not yet issued to the device driver."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockActiveTime_" + name] = { delta_values.io_ticks * time_multiplier,
+                "Time in seconds the block device had the IO requests queued."
+                BLOCK_DEVICE_EXPLANATION };
+            new_values["BlockQueueTime_" + name] = { delta_values.time_in_queue * time_multiplier,
+                "This value counts the number of milliseconds that IO requests have waited"
+                " on this block device. If there are multiple IO requests waiting, this"
+                " value will increase as the product of the number of milliseconds times the"
+                " number of requests waiting."
+                BLOCK_DEVICE_EXPLANATION };

            if (delta_values.in_flight_ios)
            {
                /// TODO Check if these values are meaningful.

-                new_values["BlockActiveTimePerOp_" + name] = delta_values.io_ticks * time_multiplier / delta_values.in_flight_ios;
-                new_values["BlockQueueTimePerOp_" + name] = delta_values.time_in_queue * time_multiplier / delta_values.in_flight_ios;
+                new_values["BlockActiveTimePerOp_" + name] = { delta_values.io_ticks * time_multiplier / delta_values.in_flight_ios,
+                    "Similar to the `BlockActiveTime` metrics, but the value is divided by the number of IO operations to count the per-operation time." };
+                new_values["BlockQueueTimePerOp_" + name] = { delta_values.time_in_queue * time_multiplier / delta_values.in_flight_ios,
+                    "Similar to the `BlockQueueTime` metrics, but the value is divided by the number of IO operations to count the per-operation time." };
            }
        }
    }
    catch (...)
    {
-        tryLogCurrentException(__PRETTY_FUNCTION__);
+        LOG_DEBUG(log, "Cannot read statistics from block devices: {}", getCurrentExceptionMessage(false));

        /// Try to reopen block devices in case of error
-        /// (i.e. ENOENT means that some disk had been replaced, and it may apperas with a new name)
+        /// (e.g. ENOENT or ENODEV means that some disk has been replaced, and it may appear under a new name)
        try
        {
            openBlockDevices();
@ -1211,15 +1359,31 @@ void AsynchronousMetrics::update(TimePoint update_time)

                if (!first_run)
                {
-                    new_values["NetworkReceiveBytes_" + interface_name] = delta_values.recv_bytes;
-                    new_values["NetworkReceivePackets_" + interface_name] = delta_values.recv_packets;
-                    new_values["NetworkReceiveErrors_" + interface_name] = delta_values.recv_errors;
-                    new_values["NetworkReceiveDrop_" + interface_name] = delta_values.recv_drop;
+                    new_values["NetworkReceiveBytes_" + interface_name] = { delta_values.recv_bytes,
+                        "Number of bytes received via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkReceivePackets_" + interface_name] = { delta_values.recv_packets,
+                        "Number of network packets received via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkReceiveErrors_" + interface_name] = { delta_values.recv_errors,
+                        "Number of times an error happened while receiving via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkReceiveDrop_" + interface_name] = { delta_values.recv_drop,
+                        "Number of times a packet was dropped while receiving via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };

-                    new_values["NetworkSendBytes_" + interface_name] = delta_values.send_bytes;
-                    new_values["NetworkSendPackets_" + interface_name] = delta_values.send_packets;
-                    new_values["NetworkSendErrors_" + interface_name] = delta_values.send_errors;
-                    new_values["NetworkSendDrop_" + interface_name] = delta_values.send_drop;
+                    new_values["NetworkSendBytes_" + interface_name] = { delta_values.send_bytes,
+                        "Number of bytes sent via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkSendPackets_" + interface_name] = { delta_values.send_packets,
+                        "Number of network packets sent via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkSendErrors_" + interface_name] = { delta_values.send_errors,
+                        "Number of times an error (e.g. TCP retransmit) happened while sending via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
+                    new_values["NetworkSendDrop_" + interface_name] = { delta_values.send_drop,
+                        "Number of times a packet was dropped while sending via the network interface."
+                        " This is a system-wide metric, it includes all the processes on the host machine, not just clickhouse-server." };
                }
            }
        }
@ -1238,7 +1402,8 @@ void AsynchronousMetrics::update(TimePoint update_time)
            in.rewind();
            Int64 temperature = 0;
            readText(temperature, in);
-            new_values[fmt::format("Temperature{}", i)] = temperature * 0.001;
+            new_values[fmt::format("Temperature{}", i)] = { temperature * 0.001,
+                "The temperature of the corresponding device in ℃. A sensor can return an unrealistic value. Source: `/sys/class/thermal`" };
        }
    }
    catch (...)
@ -1271,13 +1436,17 @@ void AsynchronousMetrics::update(TimePoint update_time)
                }
                catch (const ErrnoException & e)
                {
-                    LOG_DEBUG(&Poco::Logger::get("AsynchronousMetrics"), "Hardware monitor '{}', sensor '{}' exists but could not be read, error {}.", hwmon_name, sensor_name, e.getErrno());
+                    LOG_DEBUG(log, "Hardware monitor '{}', sensor '{}' exists but could not be read: {}.",
+                        hwmon_name, sensor_name, errnoToString(e.getErrno()));
+                    continue;
                }

                if (sensor_name.empty())
-                    new_values[fmt::format("Temperature_{}", hwmon_name)] = temperature * 0.001;
+                    new_values[fmt::format("Temperature_{}", hwmon_name)] = { temperature * 0.001,
+                        "The temperature reported by the corresponding hardware monitor in ℃. A sensor can return an unrealistic value. Source: `/sys/class/hwmon`" };
                else
-                    new_values[fmt::format("Temperature_{}_{}", hwmon_name, sensor_name)] = temperature * 0.001;
+                    new_values[fmt::format("Temperature_{}_{}", hwmon_name, sensor_name)] = { temperature * 0.001,
+                        "The temperature reported by the corresponding hardware monitor and the corresponding sensor in ℃. A sensor can return an unrealistic value. Source: `/sys/class/hwmon`" };
            }
        }
    }
@ -1313,7 +1482,11 @@ void AsynchronousMetrics::update(TimePoint update_time)
                in.rewind();
                uint64_t errors = 0;
                readText(errors, in);
-                new_values[fmt::format("EDAC{}_Correctable", i)] = errors;
+                new_values[fmt::format("EDAC{}_Correctable", i)] = { errors,
+                    "The number of correctable ECC memory errors."
+                    " A high value indicates bad RAM which has to be immediately replaced,"
+                    " because in presence of a high number of corrected errors, a number of silent errors may happen as well, leading to data corruption."
+                    " Source: `/sys/devices/system/edac/mc/`" };
            }

            if (edac[i].second)
@ -1322,7 +1495,11 @@ void AsynchronousMetrics::update(TimePoint update_time)
                in.rewind();
                uint64_t errors = 0;
                readText(errors, in);
-                new_values[fmt::format("EDAC{}_Uncorrectable", i)] = errors;
+                new_values[fmt::format("EDAC{}_Uncorrectable", i)] = { errors,
+                    "The number of uncorrectable ECC memory errors."
+                    " A non-zero value indicates bad RAM which has to be immediately replaced,"
+                    " because it indicates potential data corruption."
+                    " Source: `/sys/devices/system/edac/mc/`" };
            }
        }
    }
@ -1346,24 +1523,36 @@ void AsynchronousMetrics::update(TimePoint update_time)
    {
        auto stat = getStatVFS(getContext()->getPath());

-        new_values["FilesystemMainPathTotalBytes"] = stat.f_blocks * stat.f_frsize;
-        new_values["FilesystemMainPathAvailableBytes"] = stat.f_bavail * stat.f_frsize;
-        new_values["FilesystemMainPathUsedBytes"] = (stat.f_blocks - stat.f_bavail) * stat.f_frsize;
-        new_values["FilesystemMainPathTotalINodes"] = stat.f_files;
-        new_values["FilesystemMainPathAvailableINodes"] = stat.f_favail;
-        new_values["FilesystemMainPathUsedINodes"] = stat.f_files - stat.f_favail;
+        new_values["FilesystemMainPathTotalBytes"] = { stat.f_blocks * stat.f_frsize,
+            "The size of the volume where the main ClickHouse path is mounted, in bytes." };
+        new_values["FilesystemMainPathAvailableBytes"] = { stat.f_bavail * stat.f_frsize,
+            "Available bytes on the volume where the main ClickHouse path is mounted." };
+        new_values["FilesystemMainPathUsedBytes"] = { (stat.f_blocks - stat.f_bavail) * stat.f_frsize,
+            "Used bytes on the volume where the main ClickHouse path is mounted." };
+        new_values["FilesystemMainPathTotalINodes"] = { stat.f_files,
+            "The total number of inodes on the volume where the main ClickHouse path is mounted. If it is less than 25 million, it indicates a misconfiguration." };
+        new_values["FilesystemMainPathAvailableINodes"] = { stat.f_favail,
+            "The number of available inodes on the volume where the main ClickHouse path is mounted. If it is close to zero, it indicates a misconfiguration, and you will get 'no space left on device' even when the disk is not full." };
+        new_values["FilesystemMainPathUsedINodes"] = { stat.f_files - stat.f_favail,
+            "The number of used inodes on the volume where the main ClickHouse path is mounted. This value mostly corresponds to the number of files." };
    }

    {
        /// Current working directory of the server is the directory with logs.
        auto stat = getStatVFS(".");

-        new_values["FilesystemLogsPathTotalBytes"] = stat.f_blocks * stat.f_frsize;
-        new_values["FilesystemLogsPathAvailableBytes"] = stat.f_bavail * stat.f_frsize;
-        new_values["FilesystemLogsPathUsedBytes"] = (stat.f_blocks - stat.f_bavail) * stat.f_frsize;
-        new_values["FilesystemLogsPathTotalINodes"] = stat.f_files;
-        new_values["FilesystemLogsPathAvailableINodes"] = stat.f_favail;
-        new_values["FilesystemLogsPathUsedINodes"] = stat.f_files - stat.f_favail;
+        new_values["FilesystemLogsPathTotalBytes"] = { stat.f_blocks * stat.f_frsize,
+            "The size of the volume where ClickHouse logs path is mounted, in bytes. It's recommended to have at least 10 GB for logs." };
+        new_values["FilesystemLogsPathAvailableBytes"] = { stat.f_bavail * stat.f_frsize,
+            "Available bytes on the volume where ClickHouse logs path is mounted. If this value approaches zero, you should tune the log rotation in the configuration file." };
+        new_values["FilesystemLogsPathUsedBytes"] = { (stat.f_blocks - stat.f_bavail) * stat.f_frsize,
+            "Used bytes on the volume where ClickHouse logs path is mounted." };
+        new_values["FilesystemLogsPathTotalINodes"] = { stat.f_files,
+            "The total number of inodes on the volume where ClickHouse logs path is mounted." };
+        new_values["FilesystemLogsPathAvailableINodes"] = { stat.f_favail,
+            "The number of available inodes on the volume where ClickHouse logs path is mounted." };
+        new_values["FilesystemLogsPathUsedINodes"] = { stat.f_files - stat.f_favail,
+            "The number of used inodes on the volume where ClickHouse logs path is mounted." };
    }

    /// Free and total space on every configured disk.
@ -1380,10 +1569,14 @@ void AsynchronousMetrics::update(TimePoint update_time)
            auto available = disk->getAvailableSpace();
            auto unreserved = disk->getUnreservedSpace();

-            new_values[fmt::format("DiskTotal_{}", name)] = total;
-            new_values[fmt::format("DiskUsed_{}", name)] = total - available;
-            new_values[fmt::format("DiskAvailable_{}", name)] = available;
-            new_values[fmt::format("DiskUnreserved_{}", name)] = unreserved;
+            new_values[fmt::format("DiskTotal_{}", name)] = { total,
+                "The total size in bytes of the disk (virtual filesystem). Remote filesystems can show a large value like 16 EiB." };
+            new_values[fmt::format("DiskUsed_{}", name)] = { total - available,
+                "Used bytes on the disk (virtual filesystem). Remote filesystems do not always provide this information." };
+            new_values[fmt::format("DiskAvailable_{}", name)] = { available,
+                "Available bytes on the disk (virtual filesystem). Remote filesystems can show a large value like 16 EiB." };
+            new_values[fmt::format("DiskUnreserved_{}", name)] = { unreserved,
+                "Available bytes on the disk (virtual filesystem) without the reservations for merges, fetches, and moves. Remote filesystems can show a large value like 16 EiB." };
        }
    }

@ -1463,44 +1656,46 @@ void AsynchronousMetrics::update(TimePoint update_time)
            }
        }

-        new_values["ReplicasMaxQueueSize"] = max_queue_size;
-        new_values["ReplicasMaxInsertsInQueue"] = max_inserts_in_queue;
-        new_values["ReplicasMaxMergesInQueue"] = max_merges_in_queue;
+        new_values["ReplicasMaxQueueSize"] = { max_queue_size, "Maximum queue size (in the number of operations like get, merge) across Replicated tables." };
+        new_values["ReplicasMaxInsertsInQueue"] = { max_inserts_in_queue, "Maximum number of INSERT operations in the queue (still to be replicated) across Replicated tables." };
+        new_values["ReplicasMaxMergesInQueue"] = { max_merges_in_queue, "Maximum number of merge operations in the queue (still to be applied) across Replicated tables." };

-        new_values["ReplicasSumQueueSize"] = sum_queue_size;
-        new_values["ReplicasSumInsertsInQueue"] = sum_inserts_in_queue;
-        new_values["ReplicasSumMergesInQueue"] = sum_merges_in_queue;
+        new_values["ReplicasSumQueueSize"] = { sum_queue_size, "Sum of queue sizes (in the number of operations like get, merge) across Replicated tables." };
+        new_values["ReplicasSumInsertsInQueue"] = { sum_inserts_in_queue, "Sum of INSERT operations in the queue (still to be replicated) across Replicated tables." };
+        new_values["ReplicasSumMergesInQueue"] = { sum_merges_in_queue, "Sum of merge operations in the queue (still to be applied) across Replicated tables." };

-        new_values["ReplicasMaxAbsoluteDelay"] = max_absolute_delay;
-        new_values["ReplicasMaxRelativeDelay"] = max_relative_delay;
+        new_values["ReplicasMaxAbsoluteDelay"] = { max_absolute_delay, "Maximum difference in seconds between the most fresh replicated part and the most fresh data part still to be replicated, across Replicated tables. A very high value indicates a replica with no data." };
+        new_values["ReplicasMaxRelativeDelay"] = { max_relative_delay, "Maximum difference between the replica delay and the delay of the most up-to-date replica of the same table, across Replicated tables." };

-        new_values["MaxPartCountForPartition"] = max_part_count_for_partition;
+        new_values["MaxPartCountForPartition"] = { max_part_count_for_partition, "Maximum number of parts per partition across all partitions of all tables of MergeTree family. Values larger than 300 indicate misconfiguration, overload, or massive data loading." };

-        new_values["NumberOfDatabases"] = number_of_databases;
-        new_values["NumberOfTables"] = total_number_of_tables;
+        new_values["NumberOfDatabases"] = { number_of_databases, "Total number of databases on the server." };
+        new_values["NumberOfTables"] = { total_number_of_tables, "Total number of tables summed across the databases on the server, excluding the databases that cannot contain MergeTree tables."
+            " The excluded database engines are those that generate the set of tables on the fly, like `Lazy`, `MySQL`, `PostgreSQL`, `SQLite`."};

-        new_values["TotalBytesOfMergeTreeTables"] = total_number_of_bytes;
-        new_values["TotalRowsOfMergeTreeTables"] = total_number_of_rows;
-        new_values["TotalPartsOfMergeTreeTables"] = total_number_of_parts;
+        new_values["TotalBytesOfMergeTreeTables"] = { total_number_of_bytes, "Total amount of bytes (compressed, including data and indices) stored in all tables of MergeTree family." };
+        new_values["TotalRowsOfMergeTreeTables"] = { total_number_of_rows, "Total amount of rows (records) stored in all tables of MergeTree family." };
+        new_values["TotalPartsOfMergeTreeTables"] = { total_number_of_parts, "Total amount of data parts in all tables of MergeTree family."
+            " Numbers larger than 10 000 will negatively affect the server startup time and it may indicate unreasonable choice of the partition key." };

-        auto get_metric_name = [](const String & name) -> const char *
+        auto get_metric_name_doc = [](const String & name) -> std::pair<const char *, const char *>
        {
-            static std::map<String, const char *> metric_map =
+            static std::map<String, std::pair<const char *, const char *>> metric_map =
            {
-                {"tcp_port", "TCPThreads"},
-                {"tcp_port_secure", "TCPSecureThreads"},
-                {"http_port", "HTTPThreads"},
-                {"https_port", "HTTPSecureThreads"},
-                {"interserver_http_port", "InterserverThreads"},
-                {"interserver_https_port", "InterserverSecureThreads"},
-                {"mysql_port", "MySQLThreads"},
-                {"postgresql_port", "PostgreSQLThreads"},
-                {"grpc_port", "GRPCThreads"},
-                {"prometheus.port", "PrometheusThreads"}
+                {"tcp_port", {"TCPThreads", "Number of threads in the server of the TCP protocol (without TLS)."}},
+                {"tcp_port_secure", {"TCPSecureThreads", "Number of threads in the server of the TCP protocol (with TLS)."}},
+                {"http_port", {"HTTPThreads", "Number of threads in the server of the HTTP interface (without TLS)."}},
+                {"https_port", {"HTTPSecureThreads", "Number of threads in the server of the HTTPS interface."}},
+                {"interserver_http_port", {"InterserverThreads", "Number of threads in the server of the replicas communication protocol (without TLS)."}},
+                {"interserver_https_port", {"InterserverSecureThreads", "Number of threads in the server of the replicas communication protocol (with TLS)."}},
+                {"mysql_port", {"MySQLThreads", "Number of threads in the server of the MySQL compatibility protocol."}},
+                {"postgresql_port", {"PostgreSQLThreads", "Number of threads in the server of the PostgreSQL compatibility protocol."}},
+                {"grpc_port", {"GRPCThreads", "Number of threads in the server of the GRPC protocol."}},
+                {"prometheus.port", {"PrometheusThreads", "Number of threads in the server of the Prometheus endpoint. Note: Prometheus endpoints can also be used via the usual HTTP/HTTPS ports."}}
            };
            auto it = metric_map.find(name);
            if (it == metric_map.end())
-                return nullptr;
+                return { nullptr, nullptr };
            else
                return it->second;
        };
@ -1508,8 +1703,8 @@ void AsynchronousMetrics::update(TimePoint update_time)
        const auto server_metrics = protocol_server_metrics_func();
        for (const auto & server_metric : server_metrics)
        {
-            if (const auto * name = get_metric_name(server_metric.port_name))
-                new_values[name] = server_metric.current_threads;
+            if (auto name_doc = get_metric_name_doc(server_metric.port_name); name_doc.first != nullptr)
+                new_values[name_doc.first] = { server_metric.current_threads, name_doc.second };
        }
    }
 #if USE_NURAFT
@ -1522,14 +1717,14 @@ void AsynchronousMetrics::update(TimePoint update_time)
            size_t is_observer = 0;
            size_t is_standalone = 0;
            size_t znode_count = 0;
-            size_t watch_count =0;
+            size_t watch_count = 0;
            size_t ephemerals_count = 0;
-            size_t approximate_data_size =0;
+            size_t approximate_data_size = 0;
            size_t key_arena_size = 0;
-            size_t latest_snapshot_size =0;
-            size_t open_file_descriptor_count =0;
-            size_t max_file_descriptor_count =0;
-            size_t followers =0;
+            size_t latest_snapshot_size = 0;
+            size_t open_file_descriptor_count = 0;
+            size_t max_file_descriptor_count = 0;
+            size_t followers = 0;
            size_t synced_followers = 0;
            size_t zxid = 0;
            size_t session_with_watches = 0;
@ -1570,29 +1765,29 @@ void AsynchronousMetrics::update(TimePoint update_time)
                }
            }

-            new_values["KeeperIsLeader"] = is_leader;
-            new_values["KeeperIsFollower"] = is_follower;
-            new_values["KeeperIsObserver"] = is_observer;
-            new_values["KeeperIsStandalone"] = is_standalone;
+            new_values["KeeperIsLeader"] = { is_leader, "1 if ClickHouse Keeper is a leader, 0 otherwise." };
+            new_values["KeeperIsFollower"] = { is_follower, "1 if ClickHouse Keeper is a follower, 0 otherwise." };
+            new_values["KeeperIsObserver"] = { is_observer, "1 if ClickHouse Keeper is an observer, 0 otherwise." };
+            new_values["KeeperIsStandalone"] = { is_standalone, "1 if ClickHouse Keeper is in a standalone mode, 0 otherwise." };

-            new_values["KeeperZnodeCount"] = znode_count;
-            new_values["KeeperWatchCount"] = watch_count;
-            new_values["KeeperEphemeralsCount"] = ephemerals_count;
+            new_values["KeeperZnodeCount"] = { znode_count, "The number of nodes (data entries) in ClickHouse Keeper." };
+            new_values["KeeperWatchCount"] = { watch_count, "The number of watches in ClickHouse Keeper." };
+            new_values["KeeperEphemeralsCount"] = { ephemerals_count, "The number of ephemeral nodes in ClickHouse Keeper." };

-            new_values["KeeperApproximateDataSize"] = approximate_data_size;
-            new_values["KeeperKeyArenaSize"] = key_arena_size;
-            new_values["KeeperLatestSnapshotSize"] = latest_snapshot_size;
+            new_values["KeeperApproximateDataSize"] = { approximate_data_size, "The approximate data size of ClickHouse Keeper, in bytes." };
+            new_values["KeeperKeyArenaSize"] = { key_arena_size, "The size in bytes of the memory arena for keys in ClickHouse Keeper." };
+            new_values["KeeperLatestSnapshotSize"] = { latest_snapshot_size, "The uncompressed size in bytes of the latest snapshot created by ClickHouse Keeper." };

-            new_values["KeeperOpenFileDescriptorCount"] = open_file_descriptor_count;
-            new_values["KeeperMaxFileDescriptorCount"] = max_file_descriptor_count;
+            new_values["KeeperOpenFileDescriptorCount"] = { open_file_descriptor_count, "The number of open file descriptors in ClickHouse Keeper." };
+            new_values["KeeperMaxFileDescriptorCount"] = { max_file_descriptor_count, "The maximum number of open file descriptors in ClickHouse Keeper." };

-            new_values["KeeperFollowers"] = followers;
-            new_values["KeeperSyncedFollowers"] = synced_followers;
-            new_values["KeeperZxid"] = zxid;
-            new_values["KeeperSessionWithWatches"] = session_with_watches;
-            new_values["KeeperPathsWatched"] = paths_watched;
-            new_values["KeeperSnapshotDirSize"] = snapshot_dir_size;
-            new_values["KeeperLogDirSize"] = log_dir_size;
+            new_values["KeeperFollowers"] = { followers, "The number of followers of ClickHouse Keeper." };
+            new_values["KeeperSyncedFollowers"] = { synced_followers, "The number of followers of ClickHouse Keeper who are also in-sync." };
+            new_values["KeeperZxid"] = { zxid, "The current transaction id number (zxid) in ClickHouse Keeper." };
+            new_values["KeeperSessionWithWatches"] = { session_with_watches, "The number of client sessions of ClickHouse Keeper having watches." };
+            new_values["KeeperPathsWatched"] = { paths_watched, "The number of different paths watched by the clients of ClickHouse Keeper." };
+            new_values["KeeperSnapshotDirSize"] = { snapshot_dir_size, "The size of the snapshots directory of ClickHouse Keeper, in bytes." };
+            new_values["KeeperLogDirSize"] = { log_dir_size, "The size of the logs directory of ClickHouse Keeper, in bytes." };
        }
    }
 #endif
@ -1601,7 +1796,7 @@ void AsynchronousMetrics::update(TimePoint update_time)

    /// Add more metrics as you wish.

-    new_values["AsynchronousMetricsCalculationTimeSpent"] = watch.elapsedSeconds();
+    new_values["AsynchronousMetricsCalculationTimeSpent"] = { watch.elapsedSeconds(), "Time in seconds spent for calculation of asynchronous metrics (this is the overhead of asynchronous metrics)." };

    /// Log the new metrics.
    if (auto asynchronous_metric_log = getContext()->getAsynchronousMetricLog())
@ -1681,11 +1876,10 @@ void AsynchronousMetrics::updateHeavyMetricsIfNeeded(TimePoint current_time, Tim
                 update_period.count(),
                 heavy_metric_update_period.count(),
                 watch.elapsedSeconds());
-
    }

-    new_values["NumberOfDetachedParts"] = detached_parts_stats.count;
-    new_values["NumberOfDetachedByUserParts"] = detached_parts_stats.detached_by_user;
+    new_values["NumberOfDetachedParts"] = { detached_parts_stats.count, "The total number of parts detached from MergeTree tables. A part can be detached by a user with the `ALTER TABLE DETACH` query or by the server itself if the part is broken, unexpected or unneeded. The server does not care about detached parts and they can be removed." };
+    new_values["NumberOfDetachedByUserParts"] = { detached_parts_stats.detached_by_user, "The total number of parts detached from MergeTree tables by users with the `ALTER TABLE DETACH` query (as opposed to unexpected, broken or ignored parts). The server does not care about detached parts and they can be removed." };
 }

 }
--- a/src/Interpreters/AsynchronousMetrics.h
+++ b/src/Interpreters/AsynchronousMetrics.h
@ -18,16 +18,25 @@

 namespace Poco
 {
-class Logger;
+    class Logger;
 }

 namespace DB
 {

-class ProtocolServerAdapter;
 class ReadBuffer;

-using AsynchronousMetricValue = double;
+struct AsynchronousMetricValue
+{
+    double value = 0.0;                 /// Numeric value of the metric.
+    const char * documentation = "";    /// Human-readable description; an empty string (not null) when default-constructed.
+
+    template <typename T>
+    AsynchronousMetricValue(T value_, const char * documentation_)
+        : value(static_cast<double>(value_)), documentation(documentation_) {}
+    AsynchronousMetricValue() = default; /// For std::unordered_map::operator[].
+};
+
 using AsynchronousMetricValues = std::unordered_map<std::string, AsynchronousMetricValue>;

 struct ProtocolServerMetrics
@ -42,6 +51,9 @@ struct ProtocolServerMetrics
  *
  * This includes both ClickHouse-related metrics (like memory usage of ClickHouse process)
  *  and common OS-related metrics (like total memory usage on the server).
+  *
+  * All the values are either of gauge type (like the total number of tables, the current memory usage),
+  * or delta-counters representing some accumulation during the interval of time.
  */
 class AsynchronousMetrics : WithContext
 {
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@ -79,6 +79,8 @@
 #include <Parsers/ASTCreateQuery.h>
 #include <Parsers/ParserCreateQuery.h>
 #include <Parsers/parseQuery.h>
+#include <Parsers/ASTAsterisk.h>
+#include <Parsers/ASTIdentifier.h>
 #include <Common/StackTrace.h>
 #include <Common/Config/ConfigHelper.h>
 #include <Common/Config/ConfigProcessor.h>
@ -1229,7 +1231,7 @@ void Context::addQueryFactoriesInfo(QueryLogFactories factory_type, const String
 }


-StoragePtr Context::executeTableFunction(const ASTPtr & table_expression)
+StoragePtr Context::executeTableFunction(const ASTPtr & table_expression, const ASTSelectQuery * select_query_hint)
 {
    auto hash = table_expression->getTreeHash();
    String key = toString(hash.first) + '_' + toString(hash.second);
@ -1239,15 +1241,61 @@ StoragePtr Context::executeTableFunction(const ASTPtr & table_expression)
    if (!res)
    {
        TableFunctionPtr table_function_ptr = TableFunctionFactory::instance().get(table_expression, shared_from_this());
-        if (getSettingsRef().use_structure_from_insertion_table_in_table_functions && table_function_ptr->needStructureHint())
+        if (getSettingsRef().use_structure_from_insertion_table_in_table_functions && table_function_ptr->needStructureHint() && hasInsertionTable())
        {
-            const auto & insertion_table = getInsertionTable();
-            if (!insertion_table.empty())
+            const auto structure_hint = DatabaseCatalog::instance().getTable(getInsertionTable(), shared_from_this())->getInMemoryMetadataPtr()->getColumns(); /// Copied by value: the storage and metadata pointers in this expression are temporaries.
+            bool use_columns_from_insert_query = true;
+
+            /// use_structure_from_insertion_table_in_table_functions=2 means `auto`
+            if (select_query_hint && getSettingsRef().use_structure_from_insertion_table_in_table_functions == 2)
            {
-                const auto & structure_hint
-                    = DatabaseCatalog::instance().getTable(insertion_table, shared_from_this())->getInMemoryMetadataPtr()->columns;
-                table_function_ptr->setStructureHint(structure_hint);
+                const auto * expression_list = select_query_hint->select()->as<ASTExpressionList>();
+                Names columns_names;
+                bool have_asterisk = false;
+                /// First, check that the select expression contains only identifiers, asterisks and literals;
+                /// if not, we cannot use the structure from the insertion table.
+                for (const auto & expression : expression_list->children)
+                {
+                    if (auto * identifier = expression->as<ASTIdentifier>())
+                    {
+                        columns_names.push_back(identifier->name());
+                    }
+                    else if (expression->as<ASTAsterisk>())
+                    {
+                        have_asterisk = true;
+                    }
+                    else if (!expression->as<ASTLiteral>())
+                    {
+                        use_columns_from_insert_query = false;
+                        break;
+                    }
+                }
+
+                /// Check that all identifiers are column names from insertion table.
+                for (const auto & column_name : columns_names)
+                {
+                    if (!structure_hint.has(column_name))
+                    {
+                        use_columns_from_insert_query = false;
+                        break;
+                    }
+                }
+
+                /// If we don't have asterisk but only subset of columns, we should use
+                /// structure from insertion table only in case when table function
+                /// supports reading subset of columns from data.
+                if (use_columns_from_insert_query && !have_asterisk && !columns_names.empty())
+                {
+                    /// For input function we should check if input format supports reading subset of columns.
+                    if (table_function_ptr->getName() == "input")
+                        use_columns_from_insert_query = FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(getInsertFormat());
+                    else
+                        use_columns_from_insert_query = table_function_ptr->supportsReadingSubsetOfColumns();
+                }
            }
+
+            if (use_columns_from_insert_query)
+                table_function_ptr->setStructureHint(structure_hint);
        }

        res = table_function_ptr->execute(table_expression, shared_from_this(), table_function_ptr->getName());
@ -1484,12 +1532,21 @@ String Context::getDefaultFormat() const
    return default_format.empty() ? "TabSeparated" : default_format;
 }

-
 void Context::setDefaultFormat(const String & name)
 {
    default_format = name;
 }

+String Context::getInsertFormat() const
+{
+    return insert_format;
+}
+
+void Context::setInsertFormat(const String & name)
+{
+    insert_format = name;
+}
+
 MultiVersion<Macros>::Version Context::getMacros() const
 {
    return shared->macros.get();
--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@ -9,6 +9,7 @@
 #include <Interpreters/DatabaseCatalog.h>
 #include <Interpreters/MergeTreeTransactionHolder.h>
 #include <Parsers/IAST_fwd.h>
+#include <Parsers/ASTSelectQuery.h>
 #include <Storages/IStorage_fwd.h>
 #include <Common/MultiVersion.h>
 #include <Common/OpenTelemetryTraceContext.h>
@ -239,6 +240,9 @@ private:

    String default_format;  /// Format, used when server formats data by itself and if query does not have FORMAT specification.
                            /// Thus, used in HTTP interface. If not specified - then some globally default format is used.
+
+    String insert_format; /// Format, used in insert query.
+
    TemporaryTablesMapping external_tables_mapping;
    Scalars scalars;
    /// Used to store constant values which are different on each instance during distributed plan, such as _shard_num.
@ -602,7 +606,9 @@ public:
    const QueryFactoriesInfo & getQueryFactoriesInfo() const { return query_factories_info; }
    void addQueryFactoriesInfo(QueryLogFactories factory_type, const String & created_object) const;

-    StoragePtr executeTableFunction(const ASTPtr & table_expression);
+    /// For table functions s3/file/url/hdfs/input we can use structure from
+    /// insertion table depending on select expression.
+    StoragePtr executeTableFunction(const ASTPtr & table_expression, const ASTSelectQuery * select_query_hint = nullptr);

    void addViewSource(const StoragePtr & storage);
    StoragePtr getViewSource() const;
@ -631,6 +637,9 @@ public:
    String getDefaultFormat() const;    /// If default_format is not specified, some global default format is returned.
    void setDefaultFormat(const String & name);

+    String getInsertFormat() const;
+    void setInsertFormat(const String & name);
+
    MultiVersion<Macros>::Version getMacros() const;
    void setMacros(std::unique_ptr<Macros> && macros);

--- a/src/Interpreters/JoinedTables.cpp
+++ b/src/Interpreters/JoinedTables.cpp
@ -173,12 +173,13 @@ using RenameQualifiedIdentifiersVisitor = InDepthNodeVisitor<RenameQualifiedIden

 }

-JoinedTables::JoinedTables(ContextPtr context_, const ASTSelectQuery & select_query, bool include_all_columns_)
+JoinedTables::JoinedTables(ContextPtr context_, const ASTSelectQuery & select_query_, bool include_all_columns_)
    : context(context_)
-    , table_expressions(getTableExpressions(select_query))
+    , table_expressions(getTableExpressions(select_query_))
    , include_all_columns(include_all_columns_)
-    , left_table_expression(extractTableExpression(select_query, 0))
-    , left_db_and_table(getDatabaseAndTable(select_query, 0))
+    , left_table_expression(extractTableExpression(select_query_, 0))
+    , left_db_and_table(getDatabaseAndTable(select_query_, 0))
+    , select_query(select_query_)
 {}

 bool JoinedTables::isLeftTableSubquery() const
@ -206,7 +207,7 @@ StoragePtr JoinedTables::getLeftTableStorage()
        return {};

    if (isLeftTableFunction())
-        return context->getQueryContext()->executeTableFunction(left_table_expression);
+        return context->getQueryContext()->executeTableFunction(left_table_expression, &select_query);

    StorageID table_id = StorageID::createEmpty();
    if (left_db_and_table)
--- a/src/Interpreters/JoinedTables.h
+++ b/src/Interpreters/JoinedTables.h
@ -22,7 +22,7 @@ using StorageMetadataPtr = std::shared_ptr<const StorageInMemoryMetadata>;
 class JoinedTables
 {
 public:
-    JoinedTables(ContextPtr context, const ASTSelectQuery & select_query, bool include_all_columns_ = false);
+    JoinedTables(ContextPtr context, const ASTSelectQuery & select_query_, bool include_all_columns_ = false);

    void reset(const ASTSelectQuery & select_query);

@ -52,6 +52,7 @@ private:
    /// Legacy (duplicated left table values)
    ASTPtr left_table_expression;
    std::optional<DatabaseAndTableWithAlias> left_db_and_table;
+    const ASTSelectQuery & select_query;
 };

 }
--- a/src/Interpreters/executeQuery.cpp
+++ b/src/Interpreters/executeQuery.cpp
@ -451,6 +451,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
        }
        else if (auto * insert_query = ast->as<ASTInsertQuery>())
        {
+            context->setInsertFormat(insert_query->format);
            if (insert_query->settings_ast)
                InterpreterSetQuery(insert_query->settings_ast, context).executeForCurrentContext();
            insert_query->tail = istr;
@ -530,7 +531,7 @@ static std::tuple<ASTPtr, BlockIO> executeQueryImpl(
                insert_query->tryFindInputFunction(input_function);
                if (input_function)
                {
-                    StoragePtr storage = context->executeTableFunction(input_function);
+                    StoragePtr storage = context->executeTableFunction(input_function, insert_query->select->as<ASTSelectQuery>());
                    auto & input_storage = dynamic_cast<StorageInput &>(*storage);
                    auto input_metadata_snapshot = input_storage.getInMemoryMetadataPtr();
                    auto pipe = getSourceFromASTInsertQuery(
--- a/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp
+++ b/src/Interpreters/maskSensitiveInfoInQueryForLogging.cpp
@ -164,7 +164,7 @@ namespace
                /// MongoDB('host:port', 'database', 'collection', 'user', 'password', ...)
                wipePasswordFromArgument(*storage.engine, data, 4);
            }
-            else if (engine_name == "S3" || engine_name == "COSN")
+            else if (engine_name == "S3" || engine_name == "COSN" || engine_name == "OSS")
            {
                /// S3('url', ['aws_access_key_id', 'aws_secret_access_key',] ...)
                wipePasswordFromS3TableEngineArguments(*storage.engine, data);
@ -222,7 +222,7 @@ namespace
                /// mongodb('host:port', 'database', 'collection', 'user', 'password', ...)
                wipePasswordFromArgument(function, data, 4);
            }
-            else if (function.name == "s3" || function.name == "cosn")
+            else if (function.name == "s3" || function.name == "cosn" || function.name == "oss")
            {
                /// s3('url', 'aws_access_key_id', 'aws_secret_access_key', ...)
                wipePasswordFromS3FunctionArguments(function, data, /* is_cluster_function= */ false);
--- a/src/Parsers/MySQL/ASTAlterCommand.cpp
+++ b/src/Parsers/MySQL/ASTAlterCommand.cpp
@ -267,7 +267,12 @@ static inline bool parseRenameCommand(IParser::Pos & pos, ASTPtr & node, Expecte
    }
    else
    {
-        return false;
+        if (!ParserCompoundIdentifier(true).parse(pos, new_name, expected))
+            return false;
+        auto new_table_id = new_name->as<ASTTableIdentifier>()->getTableId();
+        alter_command->type = ASTAlterCommand::RENAME_TABLE;
+        alter_command->new_table_name = new_table_id.table_name;
+        alter_command->new_database_name = new_table_id.database_name;
    }

    node = alter_command;
@ -306,6 +311,7 @@ static inline bool parseOtherCommand(IParser::Pos & pos, ASTPtr & node, Expected
                OptionDescribe("CONVERT TO CHARACTER SET", "charset", std::make_shared<ParserCharsetOrCollateName>()),
                OptionDescribe("CHARACTER SET", "charset", std::make_shared<ParserCharsetOrCollateName>()),
                OptionDescribe("DEFAULT CHARACTER SET", "charset", std::make_shared<ParserCharsetOrCollateName>()),
+                OptionDescribe("COMMENT", "", std::make_shared<ParserIdentifier>()),
                OptionDescribe("LOCK", "lock", std::make_shared<ParserIdentifier>())
            }
        };
--- a/src/Parsers/MySQL/ASTDeclareColumn.cpp
+++ b/src/Parsers/MySQL/ASTDeclareColumn.cpp
@ -52,6 +52,7 @@ static inline bool parseColumnDeclareOptions(IParser::Pos & pos, ASTPtr & node,
            OptionDescribe("KEY", "primary_key", std::make_unique<ParserAlwaysTrue>()),
            OptionDescribe("COMMENT", "comment", std::make_unique<ParserStringLiteral>()),
            OptionDescribe("CHARACTER SET", "charset_name", std::make_unique<ParserCharsetOrCollateName>()),
+            OptionDescribe("CHARSET", "charset", std::make_unique<ParserCharsetOrCollateName>()),
            OptionDescribe("COLLATE", "collate", std::make_unique<ParserCharsetOrCollateName>()),
            OptionDescribe("COLUMN_FORMAT", "column_format", std::make_unique<ParserIdentifier>()),
            OptionDescribe("STORAGE", "storage", std::make_unique<ParserIdentifier>()),
@ -59,6 +60,7 @@ static inline bool parseColumnDeclareOptions(IParser::Pos & pos, ASTPtr & node,
            OptionDescribe("GENERATED ALWAYS AS", "generated", std::make_unique<ParserExpression>()),
            OptionDescribe("STORED", "is_stored", std::make_unique<ParserAlwaysTrue>()),
            OptionDescribe("VIRTUAL", "is_stored", std::make_unique<ParserAlwaysFalse>()),
+            OptionDescribe("INVISIBLE", "", std::make_unique<ParserAlwaysTrue>()),
            OptionDescribe("", "reference", std::make_unique<ParserDeclareReference>()),
            OptionDescribe("", "constraint", std::make_unique<ParserDeclareConstraint>()),
        }
--- a/src/Planner/PlannerJoinTree.cpp
+++ b/src/Planner/PlannerJoinTree.cpp
@ -227,7 +227,11 @@ QueryPlan buildQueryPlanForJoinNode(QueryTreeNodePtr join_tree_node,
    JoinClausesAndActions join_clauses_and_actions;
    JoinKind join_kind = join_node.getKind();

-    auto join_constant = tryExtractConstantFromJoinNode(join_tree_node);
+    std::optional<bool> join_constant;
+
+    if (join_node.getStrictness() == JoinStrictness::All)
+        join_constant = tryExtractConstantFromJoinNode(join_tree_node);
+
    if (join_constant)
    {
        /** If there is JOIN with always true constant, we transform it to cross.
--- a/src/Planner/PlannerJoins.cpp
+++ b/src/Planner/PlannerJoins.cpp
@ -20,6 +20,7 @@
 #include <Functions/FunctionsConversion.h>
 #include <Functions/CastOverloadResolver.h>

+#include <Analyzer/FunctionNode.h>
 #include <Analyzer/TableNode.h>
 #include <Analyzer/TableFunctionNode.h>
 #include <Analyzer/JoinNode.h>
@ -76,6 +77,23 @@ void JoinClause::dump(WriteBuffer & buffer) const

    if (!right_filter_condition_nodes.empty())
        buffer << " right_condition_nodes: " + dump_dag_nodes(right_filter_condition_nodes);
+
+    if (!asof_conditions.empty())
+    {
+        buffer << " asof_conditions: ";
+        size_t asof_conditions_size = asof_conditions.size();
+
+        for (size_t i = 0; i < asof_conditions_size; ++i)
+        {
+            const auto & asof_condition = asof_conditions[i];
+
+            buffer << "key_index: " << asof_condition.key_index;
+            buffer << " inequality: " << toString(asof_condition.asof_inequality); /// Leading space separates the label from the key index value.
+
+            if (i + 1 != asof_conditions_size)
+                buffer << ',';
+        }
+    }
 }

 String JoinClause::dump() const
@ -249,9 +267,7 @@ void buildJoinClause(ActionsDAGPtr join_expression_dag,
        join_node);

    if (!expression_side_optional)
-        throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION,
-                "JOIN {} with constants is not supported",
-                join_node.formatASTForErrorMessage());
+        expression_side_optional = JoinTableSide::Right;

    auto expression_side = *expression_side_optional;
    join_clause.addCondition(expression_side, join_expressions_actions_node);
@ -277,6 +293,22 @@ JoinClausesAndActions buildJoinClausesAndActions(const ColumnsWithTypeAndName &
    for (const auto & node : join_expression_actions_nodes)
        join_expression_dag_input_nodes.insert(&node);

+    auto * function_node = join_node.getJoinExpression()->as<FunctionNode>();
+    if (!function_node)
+        throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION,
+            "JOIN {} join expression expected function",
+            join_node.formatASTForErrorMessage());
+
+    /** It is possible to have constant value in JOIN ON section, that we need to ignore during DAG construction.
+      * If we do not ignore it, this function will be replaced by underlying constant.
+      * For example ASOF JOIN does not support JOIN with constants, and we should process it like ordinary JOIN.
+      *
+      * Example: SELECT * FROM (SELECT 1 AS id, 1 AS value) AS t1 ASOF LEFT JOIN (SELECT 1 AS id, 1 AS value) AS t2
+      * ON (t1.id = t2.id) AND 1 != 1 AND (t1.value >= t1.value);
+      */
+    auto constant_value = function_node->getConstantValueOrNull();
+    function_node->performConstantFolding({});
+
    PlannerActionsVisitor join_expression_visitor(planner_context);
    auto join_expression_dag_node_raw_pointers = join_expression_visitor.visit(join_expression_actions, join_node.getJoinExpression());
    if (join_expression_dag_node_raw_pointers.size() != 1)
@ -284,6 +316,8 @@ JoinClausesAndActions buildJoinClausesAndActions(const ColumnsWithTypeAndName &
            "JOIN {} ON clause contains multiple expressions",
            join_node.formatASTForErrorMessage());

+    function_node->performConstantFolding(std::move(constant_value));
+
    const auto * join_expressions_actions_root_node = join_expression_dag_node_raw_pointers[0];
    if (!join_expressions_actions_root_node->function)
        throw Exception(ErrorCodes::INVALID_JOIN_ON_EXPRESSION,
--- a/src/Server/PrometheusMetricsWriter.cpp
+++ b/src/Server/PrometheusMetricsWriter.cpp
@ -108,11 +108,16 @@ void PrometheusMetricsWriter::write(WriteBuffer & wb) const

            if (!replaceInvalidChars(key))
                continue;
+
            auto value = name_value.second;

+            std::string metric_doc{value.documentation};
+            convertHelpToSingleLine(metric_doc);
+
            // TODO: add HELP section? asynchronous_metrics contains only key and value
+            writeOutLine(wb, "# HELP", key, metric_doc);
            writeOutLine(wb, "# TYPE", key, "gauge");
-            writeOutLine(wb, key, value);
+            writeOutLine(wb, key, value.value);
        }
    }

--- a/src/Server/PrometheusMetricsWriter.h
+++ b/src/Server/PrometheusMetricsWriter.h
@ -3,11 +3,11 @@
 #include <string>

 #include <Interpreters/AsynchronousMetrics.h>
-
 #include <IO/WriteBuffer.h>

 #include <Poco/Util/AbstractConfiguration.h>

+
 namespace DB
 {

--- a/src/Server/ProtocolServerAdapter.h
+++ b/src/Server/ProtocolServerAdapter.h
@ -6,8 +6,10 @@
 #include <memory>
 #include <string>

+
 namespace DB
 {
+
 class GRPCServer;
 class TCPServer;

--- a/src/Storages/MergeTree/EphemeralLockInZooKeeper.cpp
+++ b/src/Storages/MergeTree/EphemeralLockInZooKeeper.cpp
@ -2,6 +2,7 @@
 #include <Common/ZooKeeper/KeeperException.h>
 #include <Common/logger_useful.h>
 #include <base/types.h>
+#include <Storages/MergeTree/ZooKeeperWithFaultInjection.h>


 namespace DB
@ -12,22 +13,22 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

-EphemeralLockInZooKeeper::EphemeralLockInZooKeeper(const String & path_prefix_, zkutil::ZooKeeper & zookeeper_, const String & path_)
-    : zookeeper(&zookeeper_), path_prefix(path_prefix_), path(path_)
+EphemeralLockInZooKeeper::EphemeralLockInZooKeeper(const String & path_prefix_, const ZooKeeperWithFaultInjectionPtr & zookeeper_, const String & path_)
+    : zookeeper(zookeeper_), path_prefix(path_prefix_), path(path_)
 {
    if (path.size() <= path_prefix.size())
        throw Exception("Logical error: name of the main node is shorter than prefix.", ErrorCodes::LOGICAL_ERROR);
 }

 std::optional<EphemeralLockInZooKeeper> createEphemeralLockInZooKeeper(
-    const String & path_prefix_, const String & temp_path, zkutil::ZooKeeper & zookeeper_, const String & deduplication_path)
+    const String & path_prefix_, const String & temp_path, const ZooKeeperWithFaultInjectionPtr & zookeeper_, const String & deduplication_path)
 {
    String path;

    if (deduplication_path.empty())
    {
        String holder_path = temp_path + "/" + EphemeralLockInZooKeeper::LEGACY_LOCK_OTHER;
-        path = zookeeper_.create(path_prefix_, holder_path, zkutil::CreateMode::EphemeralSequential);
+        path = zookeeper_->create(path_prefix_, holder_path, zkutil::CreateMode::EphemeralSequential);
    }
    else
    {
@ -39,11 +40,15 @@ std::optional<EphemeralLockInZooKeeper> createEphemeralLockInZooKeeper(
        ops.emplace_back(zkutil::makeRemoveRequest(deduplication_path, -1));
        ops.emplace_back(zkutil::makeCreateRequest(path_prefix_, holder_path, zkutil::CreateMode::EphemeralSequential));
        Coordination::Responses responses;
-        Coordination::Error e = zookeeper_.tryMulti(ops, responses);
+        Coordination::Error e = zookeeper_->tryMulti(ops, responses);
        if (e != Coordination::Error::ZOK)
        {
            if (responses[0]->error == Coordination::Error::ZNODEEXISTS)
            {
+                LOG_DEBUG(
+                    &Poco::Logger::get("createEphemeralLockInZooKeeper"),
+                    "Deduplication path already exists: deduplication_path={}",
+                    deduplication_path);
                return {};
            }
            else
@ -82,9 +87,31 @@ EphemeralLockInZooKeeper::~EphemeralLockInZooKeeper()
    {
        unlock();
    }
+    catch (const zkutil::KeeperException & e)
+    {
+        if (Coordination::isHardwareError(e.code))
+            LOG_DEBUG(
+                &Poco::Logger::get("EphemeralLockInZooKeeper"),
+                "ZooKeeper communication error during unlock: code={} message='{}'",
+                e.code,
+                e.message());
+        else if (e.code == Coordination::Error::ZNONODE)
+            /// To avoid additional round-trip for unlocking,
+            /// ephemeral node can be deleted explicitly as part of another multi op request to ZK
+            /// and marked as such via assumeUnlocked() if we got successful response.
+            /// But it's possible that the multi op request can be executed on server side, and client will not get response due to network issue.
+            /// In such case, assumeUnlocked() will not be called, so we'll get ZNONODE error here since the node is already deleted.
+            LOG_DEBUG(
+                &Poco::Logger::get("EphemeralLockInZooKeeper"),
+                "ZooKeeper node was already deleted: code={} message={}",
+                e.code,
+                e.message());
+        else
+            tryLogCurrentException("EphemeralLockInZooKeeper");
+    }
    catch (...)
    {
-        tryLogCurrentException("~EphemeralLockInZooKeeper");
+        tryLogCurrentException("EphemeralLockInZooKeeper");
    }
 }

--- a/src/Storages/MergeTree/EphemeralLockInZooKeeper.h
+++ b/src/Storages/MergeTree/EphemeralLockInZooKeeper.h
@ -12,6 +12,8 @@

 namespace DB
 {
+class ZooKeeperWithFaultInjection;
+using ZooKeeperWithFaultInjectionPtr = std::shared_ptr<ZooKeeperWithFaultInjection>;

 namespace ErrorCodes
 {
@ -25,13 +27,14 @@ namespace ErrorCodes
 class EphemeralLockInZooKeeper : public boost::noncopyable
 {
    friend std::optional<EphemeralLockInZooKeeper> createEphemeralLockInZooKeeper(
-        const String & path_prefix_, const String & temp_path, zkutil::ZooKeeper & zookeeper_, const String & deduplication_path);
+        const String & path_prefix_, const String & temp_path, const ZooKeeperWithFaultInjectionPtr & zookeeper_, const String & deduplication_path);

 protected:
-    EphemeralLockInZooKeeper() = delete;
-    EphemeralLockInZooKeeper(const String & path_prefix_, zkutil::ZooKeeper & zookeeper_, const String & path_);
+    EphemeralLockInZooKeeper(const String & path_prefix_, const ZooKeeperWithFaultInjectionPtr & zookeeper_, const String & path_);

 public:
+    EphemeralLockInZooKeeper() = delete;
+
    /// Fake "secondary node" names for blocks with and without "deduplication_path"
    static constexpr const char * LEGACY_LOCK_INSERT = "abandonable_lock-insert";
    static constexpr const char * LEGACY_LOCK_OTHER = "abandonable_lock-other";
@ -53,7 +56,7 @@ public:

    bool isLocked() const
    {
-        return zookeeper;
+        return zookeeper.get();
    }

    String getPath() const
@ -91,13 +94,13 @@ public:
    ~EphemeralLockInZooKeeper();

 private:
-    zkutil::ZooKeeper * zookeeper = nullptr;
+    ZooKeeperWithFaultInjectionPtr zookeeper;
    String path_prefix;
    String path;
 };

 std::optional<EphemeralLockInZooKeeper> createEphemeralLockInZooKeeper(
-    const String & path_prefix_, const String & temp_path, zkutil::ZooKeeper & zookeeper_, const String & deduplication_path);
+    const String & path_prefix_, const String & temp_path, const ZooKeeperWithFaultInjectionPtr & zookeeper_, const String & deduplication_path);


 /// Acquires block number locks in all partitions.
--- a/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeMergeStrategyPicker.cpp
@ -112,6 +112,8 @@ void ReplicatedMergeTreeMergeStrategyPicker::refreshState()
        && now - last_refresh_time < REFRESH_STATE_MINIMUM_INTERVAL_SECONDS)
        return;

+    LOG_DEBUG(storage.log, "Updating strategy picker state");
+
    auto zookeeper = storage.getZooKeeper();
    auto all_replicas = zookeeper->getChildren(storage.zookeeper_path + "/replicas");

@ -154,6 +156,8 @@ void ReplicatedMergeTreeMergeStrategyPicker::refreshState()
    last_refresh_time = now;
    current_replica_index = current_replica_index_tmp;
    active_replicas = active_replicas_tmp;
+
+    LOG_DEBUG(storage.log, "Strategy picker state updated, current replica: {}, active replicas: [{}]", current_replica_index, fmt::join(active_replicas, ", "));
 }


--- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp
@ -1015,7 +1015,8 @@ bool ReplicatedMergeTreeQueue::checkReplaceRangeCanBeRemoved(const MergeTreePart
 void ReplicatedMergeTreeQueue::removePartProducingOpsInRange(
    zkutil::ZooKeeperPtr zookeeper,
    const MergeTreePartInfo & part_info,
-    const std::optional<ReplicatedMergeTreeLogEntryData> & covering_entry)
+    const std::optional<ReplicatedMergeTreeLogEntryData> & covering_entry,
+    const String & fetch_entry_znode)
 {
    /// TODO is it possible to simplify it?
    Queue to_wait;
@ -1029,22 +1030,40 @@ void ReplicatedMergeTreeQueue::removePartProducingOpsInRange(
    [[maybe_unused]] bool called_from_alter_query_directly = covering_entry && covering_entry->replace_range_entry
        && covering_entry->replace_range_entry->columns_version < 0;
    [[maybe_unused]] bool called_for_broken_part = !covering_entry;
-    assert(currently_executing_drop_replace_ranges.contains(part_info) || called_from_alter_query_directly || called_for_broken_part);
+    assert(currently_executing_drop_replace_ranges.contains(part_info) || called_from_alter_query_directly || called_for_broken_part || !fetch_entry_znode.empty());
+
+    auto is_simple_part_producing_op = [](const ReplicatedMergeTreeLogEntryData & data)
+    {
+        return data.type == LogEntry::GET_PART ||
+               data.type == LogEntry::ATTACH_PART ||
+               data.type == LogEntry::MERGE_PARTS ||
+               data.type == LogEntry::MUTATE_PART;
+    };

    for (Queue::iterator it = queue.begin(); it != queue.end();)
    {
-        auto type = (*it)->type;
-        bool is_simple_producing_op = type == LogEntry::GET_PART ||
-                                      type == LogEntry::ATTACH_PART ||
-                                      type == LogEntry::MERGE_PARTS ||
-                                      type == LogEntry::MUTATE_PART;
+        /// Skipping currently processing entry
+        if (!fetch_entry_znode.empty() && (*it)->znode_name == fetch_entry_znode)
+        {
+            ++it;
+            continue;
+        }
+
+        bool is_simple_producing_op = is_simple_part_producing_op(**it);

        bool simple_op_covered = is_simple_producing_op && part_info.contains(MergeTreePartInfo::fromPartName((*it)->new_part_name, format_version));
        bool replace_range_covered = covering_entry && checkReplaceRangeCanBeRemoved(part_info, *it, *covering_entry);
        if (simple_op_covered || replace_range_covered)
        {
            if ((*it)->currently_executing)
+            {
+                bool is_covered_by_simple_op = covering_entry && is_simple_part_producing_op(*covering_entry);
+                bool is_fetching_covering_part = !fetch_entry_znode.empty();
+                if (is_covered_by_simple_op || is_fetching_covering_part)
+                    throw Exception(ErrorCodes::LOGICAL_ERROR, "Cannot remove covered entry {} producing parts {}, it's a bug",
+                                    (*it)->znode_name, fmt::join((*it)->getVirtualPartNames(format_version), ", "));
                to_wait.push_back(*it);
+            }
            auto code = zookeeper->tryRemove(fs::path(replica_path) / "queue" / (*it)->znode_name);
            if (code != Coordination::Error::ZOK)
                LOG_INFO(log, "Couldn't remove {}: {}", (fs::path(replica_path) / "queue" / (*it)->znode_name).string(), Coordination::errorMessage(code));
@ -1110,7 +1129,12 @@ bool ReplicatedMergeTreeQueue::isCoveredByFuturePartsImpl(const LogEntry & entry
        /// Parts are not disjoint. They can be even intersecting and it's not a problem,
        /// because we may have two queue entries producing intersecting parts if there's DROP_RANGE between them (so virtual_parts are ok).

-        /// We cannot execute `entry` (or upgrade its actual_part_name to `new_part_name`)
+        /// Give priority to DROP_RANGEs and allow processing them even if covered entries are currently executing.
+        /// DROP_RANGE will cancel covered operations and will wait for them in removePartProducingOpsInRange.
+        if (result_part.isFakeDropRangePart() && result_part.contains(future_part))
+            continue;
+
+        /// In other cases we cannot execute `entry` (or upgrade its actual_part_name to `new_part_name`)
        /// while any covered or covering parts are processed.
        /// But we also cannot simply return true and postpone entry processing, because it may lead to kind of livelock.
        /// Since queue is processed in multiple threads, it's likely that there will be at least one thread
--- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.h
@ -336,8 +336,10 @@ public:
      * And also wait for the completion of their execution, if they are now being executed.
      * covering_entry is as an entry that caused removal of entries in range (usually, DROP_RANGE)
      */
-    void removePartProducingOpsInRange(zkutil::ZooKeeperPtr zookeeper, const MergeTreePartInfo & part_info,
-                                       const std::optional<ReplicatedMergeTreeLogEntryData> & covering_entry);
+    void removePartProducingOpsInRange(zkutil::ZooKeeperPtr zookeeper,
+                                       const MergeTreePartInfo & part_info,
+                                       const std::optional<ReplicatedMergeTreeLogEntryData> & covering_entry,
+                                       const String & fetch_entry_znode);

    /** In the case where there are not enough parts to perform the merge in part_name
      * - move actions with merged parts to the end of the queue
--- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.cpp
@ -9,7 +9,6 @@
 #include <Core/Block.h>
 #include <IO/Operators.h>

-
 namespace ProfileEvents
 {
    extern const Event DuplicatedInsertedBlocks;
@ -32,6 +31,7 @@ namespace ErrorCodes
    extern const int DUPLICATE_DATA_PART;
    extern const int PART_IS_TEMPORARILY_LOCKED;
    extern const int LOGICAL_ERROR;
+    extern const int TABLE_IS_READ_ONLY;
    extern const int QUERY_WAS_CANCELLED;
 }

@ -84,7 +84,7 @@ ReplicatedMergeTreeSink::ReplicatedMergeTreeSink(
 ReplicatedMergeTreeSink::~ReplicatedMergeTreeSink() = default;

 /// Allow to verify that the session in ZooKeeper is still alive.
-static void assertSessionIsNotExpired(zkutil::ZooKeeperPtr & zookeeper)
+static void assertSessionIsNotExpired(const zkutil::ZooKeeperPtr & zookeeper)
 {
    if (!zookeeper)
        throw Exception("No ZooKeeper session.", ErrorCodes::NO_ZOOKEEPER);
@ -93,7 +93,7 @@ static void assertSessionIsNotExpired(zkutil::ZooKeeperPtr & zookeeper)
        throw Exception("ZooKeeper session has been expired.", ErrorCodes::NO_ZOOKEEPER);
 }

-size_t ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper)
+size_t ReplicatedMergeTreeSink::checkQuorumPrecondition(const ZooKeeperWithFaultInjectionPtr & zookeeper)
 {
    if (!isQuorumEnabled())
        return 0;
@ -103,6 +103,7 @@ size_t ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & z
    Strings replicas = zookeeper->getChildren(fs::path(storage.zookeeper_path) / "replicas");

    Strings exists_paths;
+    exists_paths.reserve(replicas.size());
    for (const auto & replica : replicas)
        if (replica != storage.replica_name)
            exists_paths.emplace_back(fs::path(storage.zookeeper_path) / "replicas" / replica / "is_active");
@ -110,20 +111,28 @@ size_t ReplicatedMergeTreeSink::checkQuorumPrecondition(zkutil::ZooKeeperPtr & z
    auto exists_result = zookeeper->exists(exists_paths);
    auto get_results = zookeeper->get(Strings{storage.replica_path + "/is_active", storage.replica_path + "/host"});

+    Coordination::Error keeper_error = Coordination::Error::ZOK;
    size_t active_replicas = 1;     /// Assume current replica is active (will check below)
    for (size_t i = 0; i < exists_paths.size(); ++i)
    {
-        auto status = exists_result[i];
-        if (status.error == Coordination::Error::ZOK)
+        auto error = exists_result[i].error;
+        if (error == Coordination::Error::ZOK)
            ++active_replicas;
+        else if (Coordination::isHardwareError(error))
+            keeper_error = error;
    }

    size_t replicas_number = replicas.size();
    size_t quorum_size = getQuorumSize(replicas_number);

    if (active_replicas < quorum_size)
+    {
+        if (Coordination::isHardwareError(keeper_error))
+            throw Coordination::Exception("Failed to check number of alive replicas", keeper_error);
+
        throw Exception(ErrorCodes::TOO_FEW_LIVE_REPLICAS, "Number of alive replicas ({}) is less than requested quorum ({}/{}).",
                        active_replicas, quorum_size, replicas_number);
+    }

    /** Is there a quorum for the last part for which a quorum is needed?
        * Write of all the parts with the included quorum is linearly ordered.
@ -156,15 +165,34 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)
 {
    auto block = getHeader().cloneWithColumns(chunk.detachColumns());

-    auto zookeeper = storage.getZooKeeper();
-    assertSessionIsNotExpired(zookeeper);
+    const auto & settings = context->getSettingsRef();
+    zookeeper_retries_info = ZooKeeperRetriesInfo(
+        "ReplicatedMergeTreeSink::consume",
+        settings.insert_keeper_max_retries ? log : nullptr,
+        settings.insert_keeper_max_retries,
+        settings.insert_keeper_retry_initial_backoff_ms,
+        settings.insert_keeper_retry_max_backoff_ms);
+
+    ZooKeeperWithFaultInjectionPtr zookeeper = ZooKeeperWithFaultInjection::createInstance(
+        settings.insert_keeper_fault_injection_probability,
+        settings.insert_keeper_fault_injection_seed,
+        storage.getZooKeeper(),
+        "ReplicatedMergeTreeSink::consume",
+        log);

    /** If write is with quorum, then we check that the required number of replicas is now live,
      *  and also that for all previous parts for which quorum is required, this quorum is reached.
      * And also check that during the insertion, the replica was not reinitialized or disabled (by the value of `is_active` node).
      * TODO Too complex logic, you can do better.
      */
-    size_t replicas_num = checkQuorumPrecondition(zookeeper);
+    size_t replicas_num = 0;
+    ZooKeeperRetriesControl quorum_retries_ctl("checkQuorumPrecondition", zookeeper_retries_info);
+    quorum_retries_ctl.retryLoop(
+        [&]()
+        {
+            zookeeper->setKeeper(storage.getZooKeeper());
+            replicas_num = checkQuorumPrecondition(zookeeper);
+        });

    if (!storage_snapshot->object_columns.empty())
        convertDynamicColumnsToTuples(block, storage_snapshot);
@ -176,7 +204,6 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)

    size_t streams = 0;
    bool support_parallel_write = false;
-    const Settings & settings = context->getSettingsRef();

    for (auto & current_block : part_blocks)
    {
@ -256,7 +283,7 @@ void ReplicatedMergeTreeSink::consume(Chunk chunk)
        finishDelayedChunk(zookeeper);
 }

-void ReplicatedMergeTreeSink::finishDelayedChunk(zkutil::ZooKeeperPtr & zookeeper)
+void ReplicatedMergeTreeSink::finishDelayedChunk(const ZooKeeperWithFaultInjectionPtr & zookeeper)
 {
    if (!delayed_chunk)
        return;
@ -271,7 +298,7 @@ void ReplicatedMergeTreeSink::finishDelayedChunk(zkutil::ZooKeeperPtr & zookeepe

        try
        {
-            commitPart(zookeeper, part, partition.block_id, delayed_chunk->replicas_num);
+            commitPart(zookeeper, part, partition.block_id, delayed_chunk->replicas_num, false);

            last_block_is_duplicate = last_block_is_duplicate || part->is_duplicate;

@ -294,8 +321,9 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt
 {
    /// NOTE: No delay in this case. That's Ok.

-    auto zookeeper = storage.getZooKeeper();
-    assertSessionIsNotExpired(zookeeper);
+    auto origin_zookeeper = storage.getZooKeeper();
+    assertSessionIsNotExpired(origin_zookeeper);
+    auto zookeeper = std::make_shared<ZooKeeperWithFaultInjection>(origin_zookeeper);

    size_t replicas_num = checkQuorumPrecondition(zookeeper);

@ -304,7 +332,7 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt
    try
    {
        part->version.setCreationTID(Tx::PrehistoricTID, nullptr);
-        commitPart(zookeeper, part, "", replicas_num);
+        commitPart(zookeeper, part, "", replicas_num, true);
        PartLog::addNewPart(storage.getContext(), part, watch.elapsed());
    }
    catch (...)
@ -315,10 +343,11 @@ void ReplicatedMergeTreeSink::writeExistingPart(MergeTreeData::MutableDataPartPt
 }

 void ReplicatedMergeTreeSink::commitPart(
-    zkutil::ZooKeeperPtr & zookeeper,
+    const ZooKeeperWithFaultInjectionPtr & zookeeper,
    MergeTreeData::MutableDataPartPtr & part,
    const String & block_id,
-    size_t replicas_num)
+    size_t replicas_num,
+    bool writing_existing_part)
 {
    /// It is possible that we alter a part with different types of source columns.
    /// In this case, if column was not altered, the result type will be different with what we have in metadata.
@ -326,8 +355,6 @@ void ReplicatedMergeTreeSink::commitPart(
    ///
    /// metadata_snapshot->check(part->getColumns());

-    assertSessionIsNotExpired(zookeeper);
-
    String temporary_part_relative_path = part->getDataPartStorage().getPartDirectory();

    /// There is one case when we need to retry transaction in a loop.
@ -337,14 +364,65 @@ void ReplicatedMergeTreeSink::commitPart(

    bool is_already_existing_part = false;

-    while (true)
+    /// for retries due to keeper error
+    bool part_committed_locally_but_zookeeper = false;
+    Coordination::Error write_part_info_keeper_error = Coordination::Error::ZOK;
+
+    ZooKeeperRetriesControl retries_ctl("commitPart", zookeeper_retries_info);
+    retries_ctl.retryLoop([&]()
    {
+        zookeeper->setKeeper(storage.getZooKeeper());
+        if (storage.is_readonly)
+        {
+            /// stop retries if in shutdown
+            if (storage.shutdown_called)
+                throw Exception(
+                    ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode due to shutdown: replica_path={}", storage.replica_path);
+
+            /// When we attach existing parts it's okay to be in read-only mode
+            /// For example during RESTORE REPLICA.
+            if (!writing_existing_part)
+            {
+                retries_ctl.setUserError(ErrorCodes::TABLE_IS_READ_ONLY, "Table is in readonly mode: replica_path={}", storage.replica_path);
+                return;
+            }
+        }
+
+        if (retries_ctl.isRetry())
+        {
+            /// If we are retrying, check if last iteration was actually successful,
+            /// we could get network error on committing part to zk
+            /// but the operation could be completed by zk server
+
+            /// If this flag is true, then part is in Active state, and we'll not retry anymore
+            /// we only check if part was committed to zk and return success or failure correspondingly
+            /// Note: if commit to zk failed then cleanup thread will mark the part as Outdated later
+            if (part_committed_locally_but_zookeeper)
+            {
+                /// check that info about the part was actually written in zk
+                if (zookeeper->exists(fs::path(storage.replica_path) / "parts" / part->name))
+                {
+                    LOG_DEBUG(log, "Part was successfully committed on previous iteration: part_id={}", part->name);
+                }
+                else
+                {
+                    retries_ctl.setUserError(
+                        ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
+                        "Insert failed due to zookeeper error. Please retry. Reason: {}",
+                        Coordination::errorMessage(write_part_info_keeper_error));
+                }
+
+                retries_ctl.stopRetries();
+                return;
+            }
+        }
+
        /// Obtain incremental block number and lock it. The lock holds our intention to add the block to the filesystem.
        /// We remove the lock just after renaming the part. In case of exception, block number will be marked as abandoned.
        /// Also, make deduplication check. If a duplicate is detected, no nodes are created.

        /// Allocate new block number and check for duplicates
-        bool deduplicate_block = !block_id.empty();
+        const bool deduplicate_block = !block_id.empty();
        String block_id_path = deduplicate_block ? storage.zookeeper_path + "/blocks/" + block_id : "";
        auto block_number_lock = storage.allocateBlockNumber(part->info.partition_id, zookeeper, block_id_path);
        ThreadFuzzer::maybeInjectSleep();
@ -468,7 +546,13 @@ void ReplicatedMergeTreeSink::commitPart(
                    else
                        quorum_path = storage.zookeeper_path + "/quorum/status";

-                    waitForQuorum(zookeeper, existing_part_name, quorum_path, quorum_info.is_active_node_version, replicas_num);
+                    if (!retries_ctl.callAndCatchAll(
+                            [&]()
+                            {
+                                waitForQuorum(
+                                    zookeeper, existing_part_name, quorum_path, quorum_info.is_active_node_version, replicas_num);
+                            }))
+                        return;
                }
                else
                {
@ -477,6 +561,7 @@ void ReplicatedMergeTreeSink::commitPart(

                return;
            }
+
            LOG_INFO(log, "Block with ID {} already exists on other replicas as part {}; will write it locally with that name.",
                block_id, existing_part_name);

@ -508,8 +593,7 @@ void ReplicatedMergeTreeSink::commitPart(
        }
        catch (const Exception & e)
        {
-            if (e.code() != ErrorCodes::DUPLICATE_DATA_PART
-                && e.code() != ErrorCodes::PART_IS_TEMPORARILY_LOCKED)
+            if (e.code() != ErrorCodes::DUPLICATE_DATA_PART && e.code() != ErrorCodes::PART_IS_TEMPORARILY_LOCKED)
                throw;
        }

@ -526,15 +610,26 @@ void ReplicatedMergeTreeSink::commitPart(
                    part->name);
        }

-        ThreadFuzzer::maybeInjectSleep();
+        try
+        {
+            ThreadFuzzer::maybeInjectSleep();
+            storage.lockSharedData(*part, zookeeper, false, {});
+            ThreadFuzzer::maybeInjectSleep();
+        }
+        catch (const Exception &)
+        {
+            transaction.rollbackPartsToTemporaryState();

-        storage.lockSharedData(*part, false, {});
+            part->is_temp = true;
+            part->renameTo(temporary_part_relative_path, false);
+
+            throw;
+        }

        ThreadFuzzer::maybeInjectSleep();

        Coordination::Responses responses;
        Coordination::Error multi_code = zookeeper->tryMultiNoThrow(ops, responses); /// 1 RTT
-
        if (multi_code == Coordination::Error::ZOK)
        {
            transaction.commit();
@ -549,18 +644,32 @@ void ReplicatedMergeTreeSink::commitPart(
            throw Exception(ErrorCodes::QUERY_WAS_CANCELLED,
                            "Insert query (for block {}) was cancelled by concurrent ALTER PARTITION", block_number_lock->getPath());
        }
-        else if (multi_code == Coordination::Error::ZCONNECTIONLOSS
-            || multi_code == Coordination::Error::ZOPERATIONTIMEOUT)
+        else if (Coordination::isHardwareError(multi_code))
        {
+            write_part_info_keeper_error = multi_code;
            /** If the connection is lost, and we do not know if the changes were applied, we can not delete the local part
-              *  if the changes were applied, the inserted block appeared in `/blocks/`, and it can not be inserted again.
-              */
+             *  if the changes were applied, the inserted block appeared in `/blocks/`, and it can not be inserted again.
+             */
            transaction.commit();
-            storage.enqueuePartForCheck(part->name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER);
+
+            /// Setting this flag is the point of no return.
+            /// On the next retry, we'll just check whether the operation actually succeeded or failed
+            /// and return ok or error correspondingly
+            part_committed_locally_but_zookeeper = true;
+
+            /// If all retries are exhausted while accessing zookeeper on subsequent attempts, this action will enqueue the committed part for a check.
+            /// The lambda captures the part name by value; that's ok since we won't generate a new one for this insert,
+            /// see comments around 'part_committed_locally_but_zookeeper' flag
+            retries_ctl.actionAfterLastFailedRetry(
+                [&storage = storage, part_name = part->name]()
+                { storage.enqueuePartForCheck(part_name, MAX_AGE_OF_LOCAL_PART_THAT_WASNT_ADDED_TO_ZOOKEEPER); });

            /// We do not know whether or not data has been inserted.
-            throw Exception("Unknown status, client must retry. Reason: " + String(Coordination::errorMessage(multi_code)),
-                ErrorCodes::UNKNOWN_STATUS_OF_INSERT);
+            retries_ctl.setUserError(
+                ErrorCodes::UNKNOWN_STATUS_OF_INSERT,
+                "Unknown status, client must retry. Reason: {}",
+                Coordination::errorMessage(multi_code));
+            return;
        }
        else if (Coordination::isUserError(multi_code))
        {
@ -580,62 +689,72 @@ void ReplicatedMergeTreeSink::commitPart(
                part->renameTo(temporary_part_relative_path, false);

                /// If this part appeared on other replica than it's better to try to write it locally one more time. If it's our part
-                /// than it will be ignored on the next itration.
+                /// then it will be ignored on the next iteration.
                ++loop_counter;
                if (loop_counter == max_iterations)
                {
                    part->is_duplicate = true; /// Part is duplicate, just remove it from local FS
                    throw Exception("Too many transaction retries - it may indicate an error", ErrorCodes::DUPLICATE_DATA_PART);
                }
-                continue;
+                retries_ctl.requestUnconditionalRetry(); /// we want one more iteration w/o counting it as a try and timeout
+                return;
            }
            else if (multi_code == Coordination::Error::ZNODEEXISTS && failed_op_path == quorum_info.status_path)
            {
-                storage.unlockSharedData(*part);
+                storage.unlockSharedData(*part, zookeeper);
                transaction.rollback();
                throw Exception("Another quorum insert has been already started", ErrorCodes::UNSATISFIED_QUORUM_FOR_PREVIOUS_WRITE);
            }
            else
            {
-                storage.unlockSharedData(*part);
+                storage.unlockSharedData(*part, zookeeper);
                /// NOTE: We could be here if the node with the quorum existed, but was quickly removed.
                transaction.rollback();
-                throw Exception("Unexpected logical error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
-                                + Coordination::errorMessage(multi_code) + ", path " + failed_op_path,
-                                ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
+                throw Exception(
+                    ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
+                    "Unexpected logical error while adding block {} with ID '{}': {}, path {}",
+                    block_number,
+                    block_id,
+                    Coordination::errorMessage(multi_code),
+                    failed_op_path);
            }
        }
-        else if (Coordination::isHardwareError(multi_code))
-        {
-            storage.unlockSharedData(*part);
-            transaction.rollback();
-            throw Exception("Unrecoverable network error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
-                            + Coordination::errorMessage(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
-        }
        else
        {
-            storage.unlockSharedData(*part);
+            storage.unlockSharedData(*part, zookeeper);
            transaction.rollback();
-            throw Exception("Unexpected ZooKeeper error while adding block " + toString(block_number) + " with ID '" + block_id + "': "
-                            + Coordination::errorMessage(multi_code), ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR);
+            throw Exception(
+                ErrorCodes::UNEXPECTED_ZOOKEEPER_ERROR,
+                "Unexpected ZooKeeper error while adding block {} with ID '{}': {}",
+                block_number,
+                block_id,
+                Coordination::errorMessage(multi_code));
        }
-
-        break;
-    }
+    },
+    [&zookeeper]() { zookeeper->cleanupEphemeralNodes(); });

    if (isQuorumEnabled())
    {
-        if (is_already_existing_part)
+        ZooKeeperRetriesControl quorum_retries_ctl("waitForQuorum", zookeeper_retries_info);
+        quorum_retries_ctl.retryLoop([&]()
        {
-            /// We get duplicate part without fetch
-            /// Check if this quorum insert is parallel or not
-            if (zookeeper->exists(storage.zookeeper_path + "/quorum/parallel/" + part->name))
-                storage.updateQuorum(part->name, true);
-            else if (zookeeper->exists(storage.zookeeper_path + "/quorum/status"))
-                storage.updateQuorum(part->name, false);
-        }
+            zookeeper->setKeeper(storage.getZooKeeper());

-        waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_version, replicas_num);
+            if (is_already_existing_part)
+            {
+                /// We get duplicate part without fetch
+                /// Check if this quorum insert is parallel or not
+                if (zookeeper->exists(storage.zookeeper_path + "/quorum/parallel/" + part->name))
+                    storage.updateQuorum(part->name, true);
+                else if (zookeeper->exists(storage.zookeeper_path + "/quorum/status"))
+                    storage.updateQuorum(part->name, false);
+            }
+
+            if (!quorum_retries_ctl.callAndCatchAll(
+                    [&]()
+                    { waitForQuorum(zookeeper, part->name, quorum_info.status_path, quorum_info.is_active_node_version, replicas_num); }))
+                return;
+        });
    }
 }

@ -650,11 +769,11 @@ void ReplicatedMergeTreeSink::onFinish()
 {
    auto zookeeper = storage.getZooKeeper();
    assertSessionIsNotExpired(zookeeper);
-    finishDelayedChunk(zookeeper);
+    finishDelayedChunk(std::make_shared<ZooKeeperWithFaultInjection>(zookeeper));
 }

 void ReplicatedMergeTreeSink::waitForQuorum(
-    zkutil::ZooKeeperPtr & zookeeper,
+    const ZooKeeperWithFaultInjectionPtr & zookeeper,
    const std::string & part_name,
    const std::string & quorum_path,
    Int32 is_active_node_version,
--- a/src/Storages/MergeTree/ReplicatedMergeTreeSink.h
+++ b/src/Storages/MergeTree/ReplicatedMergeTreeSink.h
@ -3,6 +3,8 @@
 #include <Processors/Sinks/SinkToStorage.h>
 #include <Storages/MergeTree/MergeTreeData.h>
 #include <base/types.h>
+#include <Storages/MergeTree/ZooKeeperRetries.h>
+#include <Storages/MergeTree/ZooKeeperWithFaultInjection.h>


 namespace Poco { class Logger; }
@ -60,6 +62,7 @@ public:
    }

 private:
+    ZooKeeperRetriesInfo zookeeper_retries_info;
    struct QuorumInfo
    {
        String status_path;
@ -71,20 +74,24 @@ private:

    /// Checks active replicas.
    /// Returns total number of replicas.
-    size_t checkQuorumPrecondition(zkutil::ZooKeeperPtr & zookeeper);
+    size_t checkQuorumPrecondition(const ZooKeeperWithFaultInjectionPtr & zookeeper);

    /// Rename temporary part and commit to ZooKeeper.
    void commitPart(
-        zkutil::ZooKeeperPtr & zookeeper,
+        const ZooKeeperWithFaultInjectionPtr & zookeeper,
        MergeTreeData::MutableDataPartPtr & part,
        const String & block_id,
-        size_t replicas_num);
+        size_t replicas_num,
+        bool writing_existing_part);

    /// Wait for quorum to be satisfied on path (quorum_path) form part (part_name)
    /// Also checks that replica still alive.
    void waitForQuorum(
-        zkutil::ZooKeeperPtr & zookeeper, const std::string & part_name,
-        const std::string & quorum_path, int is_active_node_version, size_t replicas_num) const;
+        const ZooKeeperWithFaultInjectionPtr & zookeeper,
+        const std::string & part_name,
+        const std::string & quorum_path,
+        int is_active_node_version,
+        size_t replicas_num) const;

    StorageReplicatedMergeTree & storage;
    StorageMetadataPtr metadata_snapshot;
@ -116,7 +123,7 @@ private:
    struct DelayedChunk;
    std::unique_ptr<DelayedChunk> delayed_chunk;

-    void finishDelayedChunk(zkutil::ZooKeeperPtr & zookeeper);
+    void finishDelayedChunk(const ZooKeeperWithFaultInjectionPtr & zookeeper);
 };

 }
--- a/src/Storages/MergeTree/ZooKeeperRetries.h
+++ b/src/Storages/MergeTree/ZooKeeperRetries.h
@ -0,0 +1,265 @@
+#pragma once
+#include <base/sleep.h>
+#include <Common/Exception.h>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <Common/logger_useful.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int OK;
+}
+
+/// Retry budget and backoff state that ZooKeeperRetriesControl works on by reference.
+/// retry_count and curr_backoff_ms are updated by the control object, so every control
+/// built over the same info object draws from the same budget.
+struct ZooKeeperRetriesInfo
+{
+    ZooKeeperRetriesInfo() = default;
+
+    /// The starting backoff is capped by max_backoff_ms_.
+    ZooKeeperRetriesInfo(
+        std::string name_,
+        Poco::Logger * logger_,
+        UInt64 max_retries_,
+        UInt64 initial_backoff_ms_,
+        UInt64 max_backoff_ms_)
+        : name{std::move(name_)}
+        , logger{logger_}
+        , max_retries{max_retries_}
+        , curr_backoff_ms{initial_backoff_ms_ < max_backoff_ms_ ? initial_backoff_ms_ : max_backoff_ms_}
+        , max_backoff_ms{max_backoff_ms_}
+    {
+    }
+
+    /// label used as the first part of the prefix in log lines
+    std::string name;
+    /// nothing is logged when null
+    Poco::Logger * logger = nullptr;
+    /// upper bound for retry_count
+    UInt64 max_retries = 0;
+    /// sleep before the next retry; doubled after each retry up to max_backoff_ms
+    UInt64 curr_backoff_ms = 0;
+    UInt64 max_backoff_ms = 0;
+    /// number of retries consumed so far
+    UInt64 retry_count = 0;
+};
+
+class ZooKeeperRetriesControl
+{
+public:
+    /// name_ labels this control in log lines; retries_info_ is stored as a reference and is modified by the control.
+    ZooKeeperRetriesControl(std::string name_, ZooKeeperRetriesInfo & retries_info_)
+        : name(std::move(name_))
+        , retries_info(retries_info_)
+    {
+    }
+
+    /// Same as the two-argument overload, with a per-iteration cleanup that does nothing.
+    void retryLoop(auto && f)
+    {
+        auto no_cleanup = [] {};
+        retryLoop(f, no_cleanup);
+    }
+
+    /// Runs f repeatedly for as long as canTry() allows another iteration.
+    /// iteration_cleanup is invoked after every call of f, whether f returned or threw.
+    /// A KeeperException with a hardware error code is recorded via setKeeperError and the
+    /// decision about retrying is left to canTry(); every other exception is rethrown.
+    void retryLoop(auto && f, auto && iteration_cleanup)
+    {
+        while (canTry())
+        {
+            try
+            {
+                f();
+                /// NOTE(review): if this call throws, the matching handler below invokes
+                /// iteration_cleanup a second time for the same iteration - confirm cleanups tolerate that.
+                iteration_cleanup();
+            }
+            catch (const zkutil::KeeperException & e)
+            {
+                iteration_cleanup();
+
+                /// only hardware (connectivity) errors are candidates for a retry
+                if (!Coordination::isHardwareError(e.code))
+                    throw;
+
+                setKeeperError(e.code, e.message());
+            }
+            catch (...)
+            {
+                iteration_cleanup();
+                throw;
+            }
+        }
+    }
+
+    /// Invokes f and reports whether it finished without throwing.
+    /// zkutil::KeeperException and Exception are stored as the error of the current iteration
+    /// (setKeeperError / setUserError) instead of propagating; other exception types are not caught here.
+    bool callAndCatchAll(auto && f)
+    {
+        bool finished = false;
+        try
+        {
+            f();
+            finished = true;
+        }
+        catch (const zkutil::KeeperException & e)
+        {
+            setKeeperError(e.code, e.message());
+        }
+        catch (const Exception & e)
+        {
+            setUserError(e.code(), e.what());
+        }
+        return finished;
+    }
+
+    void setUserError(int code, std::string message)
+    {
+        if (retries_info.logger)
+            LOG_TRACE(
+                retries_info.logger, "ZooKeeperRetriesControl: {}/{}: setUserError: error={} message={}", retries_info.name, name, code, message);
+
+        /// if current iteration is already failed, keep initial error
+        if (!iteration_succeeded)
+            return;
+
+        iteration_succeeded = false;
+        user_error.code = code;
+        user_error.message = std::move(message);
+        keeper_error = KeeperError{};
+    }
+
+    /// Convenience overload: formats the message with fmt and forwards to setUserError(int, std::string).
+    template <typename... Args>
+    void setUserError(int code, fmt::format_string<Args...> fmt, Args &&... args)
+    {
+        setUserError(code, fmt::format(fmt, std::forward<Args>(args)...));
+    }
+
+    void setKeeperError(Coordination::Error code, std::string message)
+    {
+        if (retries_info.logger)
+            LOG_TRACE(
+                retries_info.logger, "ZooKeeperRetriesControl: {}/{}: setKeeperError: error={} message={}", retries_info.name, name, code, message);
+
+        /// if current iteration is already failed, keep initial error
+        if (!iteration_succeeded)
+            return;
+
+        iteration_succeeded = false;
+        keeper_error.code = code;
+        keeper_error.message = std::move(message);
+        user_error = UserError{};
+    }
+
+    /// After this call canTry() gives up on the next failed iteration instead of retrying.
+    void stopRetries() { stop_retries = true; }
+
+    /// Requests one more iteration; canTry() grants it before looking at success, limits or backoff.
+    void requestUnconditionalRetry() { unconditional_retry = true; }
+
+    /// True once the retry budget in retries_info is used up.
+    bool isLastRetry() const { return retries_info.retry_count >= retries_info.max_retries; }
+
+    /// True if at least one retry has been counted in retries_info.
+    bool isRetry() const { return retries_info.retry_count > 0; }
+
+    /// Code of the stored Keeper error (ZOK when none is stored).
+    Coordination::Error getLastKeeperErrorCode() const { return keeper_error.code; }
+
+    /// action will be called only once and only after latest failed retry
+    void actionAfterLastFailedRetry(std::function<void()> f) { action_after_last_failed_retry = std::move(f); }
+
+private:
+    /// Keeper error of the current iteration; code == ZOK means that no Keeper error is stored.
+    struct KeeperError
+    {
+        using Code = Coordination::Error;
+        Code code = Code::ZOK;
+        std::string message;
+    };
+
+    /// Non-Keeper error of the current iteration; code == ErrorCodes::OK means that none is stored.
+    struct UserError
+    {
+        int code = ErrorCodes::OK;
+        std::string message;
+    };
+
+    /// Decides whether retryLoop may run one more iteration; called before every iteration, including the first.
+    /// Returns true to run an iteration and false once the previous iteration succeeded.
+    /// When retries were stopped or the retry budget is used up, the last-failure action is run
+    /// and the stored error is rethrown.
+    bool canTry()
+    {
+        ++iteration_count;
+        /// first iteration is ordinary execution, no further checks needed
+        if (0 == iteration_count)
+            return true;
+
+        if (unconditional_retry)
+        {
+            unconditional_retry = false;
+            /// start the extra iteration from a clean state: otherwise an error recorded earlier would
+            /// keep the flag false and the new iteration would count as failed even if it completes cleanly
+            iteration_succeeded = true;
+            return true;
+        }
+
+        /// iteration succeeded -> no need to retry
+        if (iteration_succeeded)
+        {
+            /// avoid unnecessary logs, - print something only in case of retries
+            if (retries_info.logger && iteration_count > 1)
+                LOG_DEBUG(
+                    retries_info.logger,
+                    "ZooKeeperRetriesControl: {}/{}: succeeded after: iterations={} total_retries={}",
+                    retries_info.name,
+                    name,
+                    iteration_count,
+                    retries_info.retry_count);
+            return false;
+        }
+
+        if (stop_retries)
+        {
+            logLastError("stop retries on request");
+            /// an empty std::function would throw std::bad_function_call and hide the real error
+            if (action_after_last_failed_retry)
+                action_after_last_failed_retry();
+            throwIfError();
+            return false;
+        }
+
+        if (retries_info.retry_count >= retries_info.max_retries)
+        {
+            logLastError("retry limit is reached");
+            /// an empty std::function would throw std::bad_function_call and hide the real error
+            if (action_after_last_failed_retry)
+                action_after_last_failed_retry();
+            throwIfError();
+            return false;
+        }
+
+        /// retries
+        ++retries_info.retry_count;
+        logLastError("will retry due to error");
+        sleepForMilliseconds(retries_info.curr_backoff_ms);
+        /// exponential backoff, capped by max_backoff_ms
+        retries_info.curr_backoff_ms = std::min(retries_info.curr_backoff_ms * 2, retries_info.max_backoff_ms);
+
+        /// reset the flag, it will be set to false in case of error
+        iteration_succeeded = true;
+
+        return true;
+    }
+
+    /// Rethrows the stored error, if any: a user error takes precedence over a Keeper error.
+    /// Does nothing when neither is stored.
+    void throwIfError() const
+    {
+        if (user_error.code != ErrorCodes::OK)
+            throw Exception(user_error.code, user_error.message);
+
+        if (keeper_error.code != KeeperError::Code::ZOK)
+            throw zkutil::KeeperException(keeper_error.code, keeper_error.message);
+    }
+
+    void logLastError(std::string_view header)
+    {
+        if (user_error.code == ErrorCodes::OK)
+        {
+            if (retries_info.logger)
+                LOG_DEBUG(
+                    retries_info.logger,
+                    "ZooKeeperRetriesControl: {}/{}: {}: retry_count={} timeout={}ms error={} message={}",
+                    retries_info.name,
+                    name,
+                    header,
+                    retries_info.retry_count,
+                    retries_info.curr_backoff_ms,
+                    keeper_error.code,
+                    keeper_error.message);
+        }
+        else
+        {
+            if (retries_info.logger)
+                LOG_DEBUG(
+                    retries_info.logger,
+                    "ZooKeeperRetriesControl: {}/{}: {}: retry_count={} timeout={}ms error={} message={}",
+                    retries_info.name,
+                    name,
+                    header,
+                    retries_info.retry_count,
+                    retries_info.curr_backoff_ms,
+                    user_error.code,
+                    user_error.message);
+        }
+    }
+
+
+    /// label of this control, second part of the prefix in log lines
+    std::string name;
+    /// retry budget and backoff state, modified by canTry()
+    ZooKeeperRetriesInfo & retries_info;
+    /// starts at -1 so that the first canTry() call brings it to 0 (the ordinary first execution)
+    Int64 iteration_count = -1;
+    /// at most one of the two errors below is set at a time
+    UserError user_error;
+    KeeperError keeper_error;
+    std::function<void()> action_after_last_failed_retry = []() {};
+    bool unconditional_retry = false;
+    /// false once an error was recorded for the current iteration
+    bool iteration_succeeded = true;
+    bool stop_retries = false;
+};
+
+}
--- a/src/Storages/MergeTree/ZooKeeperWithFaultInjection.h
+++ b/src/Storages/MergeTree/ZooKeeperWithFaultInjection.h
@ -0,0 +1,527 @@
+#pragma once
+#include <random>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <Common/ZooKeeper/Types.h>
+#include <Common/ZooKeeper/ZooKeeper.h>
+#include <Common/ZooKeeper/ZooKeeperCommon.h>
+#include <Common/randomSeed.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+}
+
+/// Throws Keeper exceptions at random. Each hook makes exactly one draw from a Bernoulli
+/// distribution with the given probability, using a generator seeded at construction.
+class RandomFaultInjection
+{
+public:
+    RandomFaultInjection(double probability, UInt64 seed_) : rndgen(seed_), distribution(probability) { }
+
+    /// Hook for the point before a Keeper operation; an injected fault is reported as ZSESSIONEXPIRED.
+    void beforeOperation()
+    {
+        const bool inject = distribution(rndgen);
+        if (inject)
+            throw zkutil::KeeperException("Fault injection before operation", Coordination::Error::ZSESSIONEXPIRED);
+    }
+
+    /// Hook for the point after a Keeper operation; an injected fault is reported as ZOPERATIONTIMEOUT.
+    void afterOperation()
+    {
+        const bool inject = distribution(rndgen);
+        if (inject)
+            throw zkutil::KeeperException("Fault injection after operation", Coordination::Error::ZOPERATIONTIMEOUT);
+    }
+
+private:
+    std::mt19937_64 rndgen;
+    std::bernoulli_distribution distribution;
+};
+
+///
+/// ZooKeeperWithFaultInjection mimics ZooKeeper interface and inject failures according to failure policy if set
+///
+class ZooKeeperWithFaultInjection
+{
+    using zk = zkutil::ZooKeeper;
+
+    /// current session; access() resets it after any KeeperException
+    zk::Ptr keeper;
+    /// last non-null session saved by access() on failure, used by cleanupEphemeralNodes()
+    zk::Ptr keeper_prev;
+    /// null when fault injection is disabled
+    std::unique_ptr<RandomFaultInjection> fault_policy;
+    std::string name;
+    /// nothing is logged when null
+    Poco::Logger * logger = nullptr;
+    /// counters for the report printed by the destructor
+    UInt64 calls_total = 0;
+    UInt64 calls_without_fault_injection = 0;
+    const UInt64 seed = 0;
+
+    /// paths of ephemeral nodes created through this wrapper while fault injection is enabled
+    std::vector<std::string> ephemeral_nodes;
+
+    /// Constructor for an instance with fault injection enabled.
+    /// Private: it does not validate its arguments, createInstance() does that.
+    ZooKeeperWithFaultInjection(
+        zk::Ptr const & keeper_,
+        double fault_injection_probability,
+        UInt64 fault_injection_seed,
+        std::string name_,
+        Poco::Logger * logger_)
+        : keeper(keeper_), name(std::move(name_)), logger(logger_), seed(fault_injection_seed)
+    {
+        fault_policy = std::make_unique<RandomFaultInjection>(fault_injection_probability, fault_injection_seed);
+
+        /// the seed is logged so that a failing run can be reproduced
+        if (unlikely(logger))
+            LOG_TRACE(
+                logger,
+                "ZooKeeperWithFaultInjection created: name={} seed={} fault_probability={}",
+                name,
+                seed,
+                fault_injection_probability);
+    }
+
+public:
+    using Ptr = std::shared_ptr<ZooKeeperWithFaultInjection>;
+
+    /// Factory: validates the parameters and builds either a fault-injecting wrapper
+    /// or, when the probability is not positive, a plain pass-through wrapper.
+    static ZooKeeperWithFaultInjection::Ptr createInstance(
+        double fault_injection_probability, UInt64 fault_injection_seed, const zk::Ptr & zookeeper, std::string name, Poco::Logger * logger)
+    {
+        /// validate all parameters here, constructor just accept everything
+
+        /// keep the probability inside [0, 1]
+        if (fault_injection_probability < 0.0)
+            fault_injection_probability = .0;
+        else if (fault_injection_probability > 1.0)
+            fault_injection_probability = 1.0;
+
+        /// zero seed means "pick a random one"
+        if (0 == fault_injection_seed)
+            fault_injection_seed = randomSeed();
+
+        const bool injection_enabled = fault_injection_probability > 0.0;
+
+        /// if no fault injection provided, create instance which will not log anything
+        if (!injection_enabled)
+            return std::make_shared<ZooKeeperWithFaultInjection>(zookeeper);
+
+        /// the constructor used here is private, hence plain new instead of make_shared
+        return std::shared_ptr<ZooKeeperWithFaultInjection>(
+            new ZooKeeperWithFaultInjection(zookeeper, fault_injection_probability, fault_injection_seed, std::move(name), logger));
+    }
+
+    /// Pass-through wrapper: no fault policy and no logger are set.
+    explicit ZooKeeperWithFaultInjection(zk::Ptr const & keeper_) : keeper(keeper_) { }
+
+    /// Prints a summary of the calls made through this wrapper (only when a logger is set).
+    ~ZooKeeperWithFaultInjection()
+    {
+        if (unlikely(logger))
+        {
+            const UInt64 calls_failed = calls_total - calls_without_fault_injection;
+            /// an instance that never made a call reports rate 0 instead of dividing by zero (printed as nan)
+            const float failure_rate = calls_total ? float(calls_failed) / calls_total : 0.0f;
+            LOG_TRACE(
+                logger,
+                "ZooKeeperWithFaultInjection report: name={} seed={} calls_total={} calls_succeeded={} calls_failed={} failure_rate={}",
+                name,
+                seed,
+                calls_total,
+                calls_without_fault_injection,
+                calls_failed,
+                failure_rate);
+        }
+    }
+
+    /// Installs a new session, e.g. after access() dropped the previous one on a failure.
+    void setKeeper(zk::Ptr const & keeper_) { keeper = keeper_; }
+    /// True when there is no current session.
+    bool isNull() const { return keeper.get() == nullptr; }
+
+    ///
+    /// mirror ZooKeeper interface
+    ///
+
+    /// The methods below forward to the method of the same name on the current session,
+    /// wrapped in access() which handles fault injection, statistics and logging.
+
+    Strings getChildren(
+        const std::string & path,
+        Coordination::Stat * stat = nullptr,
+        const zkutil::EventPtr & watch = nullptr,
+        Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL)
+    {
+        return access("getChildren", path, [&]() { return keeper->getChildren(path, stat, watch, list_request_type); });
+    }
+
+    /// Returns Coordination::Error, so access() applies its special handling of that result type.
+    Coordination::Error tryGetChildren(
+        const std::string & path,
+        Strings & res,
+        Coordination::Stat * stat = nullptr,
+        const zkutil::EventPtr & watch = nullptr,
+        Coordination::ListRequestType list_request_type = Coordination::ListRequestType::ALL)
+    {
+        return access("tryGetChildren", path, [&]() { return keeper->tryGetChildren(path, res, stat, watch, list_request_type); });
+    }
+
+    /// Faults are injected around issuing the request; the returned future itself is handed back as is.
+    zk::FutureExists asyncExists(const std::string & path, Coordination::WatchCallback watch_callback = {})
+    {
+        return access("asyncExists", path, [&]() { return keeper->asyncExists(path, watch_callback); });
+    }
+
+    /// Faults are injected around issuing the request; the returned future itself is handed back as is.
+    zk::FutureGet asyncTryGet(const std::string & path)
+    {
+        return access("asyncTryGet", path, [&]() { return keeper->asyncTryGet(path); });
+    }
+
+    bool tryGet(
+        const std::string & path,
+        std::string & res,
+        Coordination::Stat * stat = nullptr,
+        const zkutil::EventPtr & watch = nullptr,
+        Coordination::Error * code = nullptr)
+    {
+        return access("tryGet", path, [&]() { return keeper->tryGet(path, res, stat, watch, code); });
+    }
+
+    /// Mirror of ZooKeeper::tryMulti with fault injection.
+    /// The path passed to access() (used for logging) is the path of the first request, or empty for an empty batch.
+    Coordination::Error tryMulti(const Coordination::Requests & requests, Coordination::Responses & responses)
+    {
+        constexpr auto method = "tryMulti";
+        auto error = access(
+            method,
+            !requests.empty() ? requests.front()->getPath() : "",
+            [&]() { return keeper->tryMulti(requests, responses); },
+            /// cleanup for a fault injected after the operation:
+            /// if the multi itself succeeded, remove the ephemeral nodes it created
+            [&](const Coordination::Error & original_error)
+            {
+                if (original_error == Coordination::Error::ZOK)
+                    faultInjectionPostAction(method, requests, responses);
+            },
+            /// cleanup for a fault injected before the operation:
+            /// fill responses with one ZooKeeperErrorResponse per request
+            [&]()
+            {
+                responses.clear();
+                for (size_t i = 0; i < requests.size(); ++i)
+                    responses.emplace_back(std::make_shared<Coordination::ZooKeeperErrorResponse>());
+            });
+
+
+        /// collect ephemeral nodes when no fault was injected (to clean up on demand)
+        if (unlikely(fault_policy) && Coordination::Error::ZOK == error)
+        {
+            doForEachCreatedEphemeralNode(
+                method, requests, responses, [&](const String & path_created) { ephemeral_nodes.push_back(path_created); });
+        }
+        return error;
+    }
+
+    /// Mirror of ZooKeeper::tryMultiNoThrow with fault injection.
+    /// access() is instantiated with no_throw_access, so a KeeperException is turned into its error code,
+    /// and with the before-operation injection point disabled.
+    Coordination::Error tryMultiNoThrow(const Coordination::Requests & requests, Coordination::Responses & responses)
+    {
+        constexpr auto method = "tryMultiNoThrow";
+        constexpr auto no_throw = true;
+        constexpr auto inject_failure_before_op = false;
+        auto error = access<no_throw, inject_failure_before_op>(
+            method,
+            !requests.empty() ? requests.front()->getPath() : "",
+            [&]() { return keeper->tryMultiNoThrow(requests, responses); },
+            /// cleanup for a fault injected after the operation:
+            /// if the multi itself succeeded, remove the ephemeral nodes it created
+            [&](const Coordination::Error & original_error)
+            {
+                if (original_error == Coordination::Error::ZOK)
+                    faultInjectionPostAction(method, requests, responses);
+            },
+            /// NOTE(review): with inject_failure_before_op == false access() does not call this cleanup
+            [&]()
+            {
+                responses.clear();
+                for (size_t i = 0; i < requests.size(); ++i)
+                    responses.emplace_back(std::make_shared<Coordination::ZooKeeperErrorResponse>());
+            });
+
+        /// collect ephemeral nodes when no fault was injected (to clean up later)
+        if (unlikely(fault_policy) && Coordination::Error::ZOK == error)
+        {
+            doForEachCreatedEphemeralNode(
+                method, requests, responses, [&](const String & path_created) { ephemeral_nodes.push_back(path_created); });
+        }
+        return error;
+    }
+
+    /// Mirror of ZooKeeper::get for a single path.
+    std::string get(const std::string & path, Coordination::Stat * stat = nullptr, const zkutil::EventPtr & watch = nullptr)
+    {
+        return access("get", path, [&]() { return keeper->get(path, stat, watch); });
+    }
+
+    /// Mirror of the multi-path ZooKeeper::get; the first path (if any) is the one reported to access().
+    zkutil::ZooKeeper::MultiGetResponse get(const std::vector<std::string> & paths)
+    {
+        return access("get", !paths.empty() ? paths.front() : "", [&]() { return keeper->get(paths); });
+    }
+
+    /// Mirror of ZooKeeper::exists for a single path.
+    bool exists(const std::string & path, Coordination::Stat * stat = nullptr, const zkutil::EventPtr & watch = nullptr)
+    {
+        return access("exists", path, [&]() { return keeper->exists(path, stat, watch); });
+    }
+
+    /// Mirror of the multi-path ZooKeeper::exists; the first path (if any) is the one reported to access().
+    zkutil::ZooKeeper::MultiExistsResponse exists(const std::vector<std::string> & paths)
+    {
+        return access("exists", !paths.empty() ? paths.front() : "", [&]() { return keeper->exists(paths); });
+    }
+
+    /// Mirror of ZooKeeper::create with fault injection.
+    /// If a fault is injected after an ephemeral node was created, the node is removed again
+    /// (best effort: a KeeperException from the removal is only logged).
+    std::string create(const std::string & path, const std::string & data, int32_t mode)
+    {
+        auto path_created = access(
+            "create",
+            path,
+            [&]() { return keeper->create(path, data, mode); },
+            /// cleanup for a fault injected after the operation
+            [&](std::string const & result_path)
+            {
+                try
+                {
+                    if (mode == zkutil::CreateMode::EphemeralSequential || mode == zkutil::CreateMode::Ephemeral)
+                    {
+                        keeper->remove(result_path);
+                        if (unlikely(logger))
+                            LOG_TRACE(logger, "ZooKeeperWithFaultInjection cleanup: seed={} func={} path={}", seed, "create", result_path);
+                    }
+                }
+                catch (const zkutil::KeeperException & e)
+                {
+                    if (unlikely(logger))
+                        LOG_TRACE(
+                            logger,
+                            "ZooKeeperWithFaultInjection cleanup FAILED: seed={} func={} path={} code={} message={} ",
+                            seed,
+                            "create",
+                            result_path,
+                            e.code,
+                            e.message());
+                }
+            });
+
+        /// collect ephemeral nodes when no fault was injected (to clean up later)
+        if (unlikely(fault_policy))
+        {
+            if (mode == zkutil::CreateMode::EphemeralSequential || mode == zkutil::CreateMode::Ephemeral)
+                ephemeral_nodes.push_back(path_created);
+        }
+
+        return path_created;
+    }
+
+    /// Mirror of ZooKeeper::multi with fault injection.
+    /// If a fault is injected after the operation, ephemeral nodes created by the batch are removed.
+    Coordination::Responses multi(const Coordination::Requests & requests)
+    {
+        constexpr auto method = "multi";
+        auto result = access(
+            method,
+            !requests.empty() ? requests.front()->getPath() : "",
+            [&]() { return keeper->multi(requests); },
+            [&](Coordination::Responses & responses) { faultInjectionPostAction(method, requests, responses); });
+
+        /// collect ephemeral nodes to clean up
+        if (unlikely(fault_policy))
+        {
+            doForEachCreatedEphemeralNode(
+                method, requests, result, [&](const String & path_created) { ephemeral_nodes.push_back(path_created); });
+        }
+        return result;
+    }
+
+    /// Mirror of ZooKeeper::createAncestors; goes through the void branch of access().
+    void createAncestors(const std::string & path)
+    {
+        access("createAncestors", path, [&]() { return keeper->createAncestors(path); });
+    }
+
+    /// Mirror of ZooKeeper::tryRemove.
+    Coordination::Error tryRemove(const std::string & path, int32_t version = -1)
+    {
+        return access("tryRemove", path, [&]() { return keeper->tryRemove(path, version); });
+    }
+
+    /// Best-effort removal of the collected ephemeral nodes through the session saved in keeper_prev.
+    /// Failures are only logged. The list is emptied afterwards, also when there was no saved session.
+    void cleanupEphemeralNodes()
+    {
+        if (keeper_prev)
+        {
+            for (const auto & node_path : ephemeral_nodes)
+            {
+                try
+                {
+                    keeper_prev->tryRemove(node_path);
+                }
+                catch (...)
+                {
+                    if (unlikely(logger))
+                        tryLogCurrentException(logger, "Exception during ephemeral nodes clean up");
+                }
+            }
+        }
+
+        ephemeral_nodes.clear();
+    }
+
+private:
+    /// Lets the fault policy (if any) fail the call before the operation runs.
+    /// fault_cleanup is invoked before the injected exception is rethrown.
+    void faultInjectionBefore(std::function<void()> fault_cleanup)
+    {
+        if (!fault_policy)
+            return;
+
+        try
+        {
+            fault_policy->beforeOperation();
+        }
+        catch (const zkutil::KeeperException &)
+        {
+            fault_cleanup();
+            throw;
+        }
+    }
+    /// Lets the fault policy (if any) fail the call after the operation has run.
+    /// fault_cleanup is invoked before the injected exception is rethrown.
+    void faultInjectionAfter(std::function<void()> fault_cleanup)
+    {
+        if (!fault_policy)
+            return;
+
+        try
+        {
+            fault_policy->afterOperation();
+        }
+        catch (const zkutil::KeeperException &)
+        {
+            fault_cleanup();
+            throw;
+        }
+    }
+
+    /// Calls action(path_created) for every request of the batch that creates an ephemeral node,
+    /// in request order. Nothing is done for an empty response list.
+    /// Throws LOGICAL_ERROR if responses do not line up with requests.
+    void doForEachCreatedEphemeralNode(
+        const char * method, const Coordination::Requests & requests, const Coordination::Responses & responses, auto && action)
+    {
+        if (responses.empty())
+            return;
+
+        const size_t requests_count = requests.size();
+        if (responses.size() != requests_count)
+            throw Exception(
+                ErrorCodes::LOGICAL_ERROR,
+                "Number of responses doesn't match number of requests: method={} requests={} responses={}",
+                method,
+                requests.size(),
+                responses.size());
+
+        for (size_t idx = 0; idx < requests_count; ++idx)
+        {
+            /// only create requests with ephemeral flag are of interest
+            const auto * create_request = dynamic_cast<const Coordination::CreateRequest *>(requests[idx].get());
+            if (!create_request || !create_request->is_ephemeral)
+                continue;
+
+            const auto * create_response = dynamic_cast<const Coordination::CreateResponse *>(responses.at(idx).get());
+            if (!create_response)
+                throw Exception(
+                    ErrorCodes::LOGICAL_ERROR,
+                    "Response should be CreateResponse: method={} index={} path={}",
+                    method,
+                    idx,
+                    create_request->path);
+
+            action(create_response->path_created);
+        }
+    }
+
+    /// Undo step for a fault injected after a multi request: removes, through the current session,
+    /// every ephemeral node the batch created. Exceptions from remove() are not handled here.
+    void faultInjectionPostAction(const char * method, const Coordination::Requests & requests, Coordination::Responses & responses)
+    {
+        doForEachCreatedEphemeralNode(method, requests, responses, [&](const String & path_created) { keeper->remove(path_created); });
+    }
+
+    /// Type of the cleanup callback for an operation returning T:
+    /// it receives the operation result by reference.
+    template <typename T>
+    struct FaultCleanupTypeImpl
+    {
+        using Type = std::function<void(T &)>;
+    };
+
+    /// An operation without a result (and the before-operation cleanup) gets a callback without arguments.
+    template <>
+    struct FaultCleanupTypeImpl<void>
+    {
+        using Type = std::function<void()>;
+    };
+
+    template <typename T>
+    using FaultCleanupType = typename FaultCleanupTypeImpl<T>::Type;
+
+    /// Common wrapper around every mirrored call.
+    /// Steps: count the call, fail with ZSESSIONEXPIRED if there is no current session,
+    /// optionally inject a fault before the operation, run it, optionally inject a fault after it.
+    /// On any KeeperException the current session is saved in keeper_prev and dropped,
+    /// so following calls fail until setKeeper() installs a new one.
+    ///
+    /// Template flags:
+    ///   no_throw_access          - return the exception code instead of throwing
+    ///                              (only meaningful when the operation returns Coordination::Error)
+    ///   inject_failure_before_op - enable the injection point before the operation
+    ///   inject_failure_after_op  - enable the injection point after the operation
+    /// Callbacks:
+    ///   fault_after_op_cleanup   - runs when the after-operation fault fires, gets the operation result (if any)
+    ///   fault_before_op_cleanup  - runs when the before-operation fault fires
+    template <
+        bool no_throw_access = false,
+        bool inject_failure_before_op = true,
+        bool inject_failure_after_op = true,
+        typename Operation,
+        typename Result = std::invoke_result_t<Operation>>
+    Result access(
+        const char * func_name,
+        const std::string & path,
+        Operation operation,
+        FaultCleanupType<Result> fault_after_op_cleanup = {},
+        FaultCleanupType<void> fault_before_op_cleanup = {})
+    {
+        try
+        {
+            ++calls_total;
+
+            if (!keeper)
+                throw zkutil::KeeperException(
+                    "Session is considered to be expired due to fault injection", Coordination::Error::ZSESSIONEXPIRED);
+
+            if constexpr (inject_failure_before_op)
+            {
+                faultInjectionBefore(
+                    [&]
+                    {
+                        if (fault_before_op_cleanup)
+                            fault_before_op_cleanup();
+                    });
+            }
+
+            if constexpr (!std::is_same_v<Result, void>)
+            {
+                Result res = operation();
+
+                /// if connectivity error occurred w/o fault injection -> just return it
+                if constexpr (std::is_same_v<Coordination::Error, Result>)
+                {
+                    if (Coordination::isHardwareError(res))
+                        return res;
+                }
+
+                if constexpr (inject_failure_after_op)
+                {
+                    faultInjectionAfter(
+                        [&]
+                        {
+                            if (fault_after_op_cleanup)
+                                fault_after_op_cleanup(res);
+                        });
+                }
+
+                ++calls_without_fault_injection;
+
+                if (unlikely(logger))
+                    LOG_TRACE(logger, "ZooKeeperWithFaultInjection call SUCCEEDED: seed={} func={} path={}", seed, func_name, path);
+
+                return res;
+            }
+            else
+            {
+                operation();
+
+                if constexpr (inject_failure_after_op)
+                {
+                    faultInjectionAfter(
+                        [&fault_after_op_cleanup]
+                        {
+                            if (fault_after_op_cleanup)
+                                fault_after_op_cleanup();
+                        });
+                }
+
+                ++calls_without_fault_injection;
+
+                if (unlikely(logger))
+                    LOG_TRACE(logger, "ZooKeeperWithFaultInjection call SUCCEEDED: seed={} func={} path={}", seed, func_name, path);
+            }
+        }
+        catch (const zkutil::KeeperException & e)
+        {
+            if (unlikely(logger))
+                LOG_TRACE(
+                    logger,
+                    "ZooKeeperWithFaultInjection call FAILED: seed={} func={} path={} code={} message={} ",
+                    seed,
+                    func_name,
+                    path,
+                    e.code,
+                    e.message());
+
+            /// save valid pointer to clean up ephemeral nodes later if necessary
+            if (keeper)
+                keeper_prev = keeper;
+            keeper.reset();
+
+            /// for try*NoThrow() methods
+            if constexpr (no_throw_access)
+                return e.code;
+
+            if constexpr (std::is_same_v<Coordination::Error, Result>)
+            {
+                /// try*() methods throws at least on hardware error and return only on user errors
+                /// todo: the methods return only on subset of user errors, and throw on another errors
+                ///       to mimic the methods exactly - we need to specify errors on which to return for each such method
+                if (Coordination::isHardwareError(e.code))
+                    throw;
+
+                return e.code;
+            }
+
+            throw;
+        }
+    }
+};
+
+using ZooKeeperWithFaultInjectionPtr = ZooKeeperWithFaultInjection::Ptr;
+}
--- a/src/Storages/NamedCollections.cpp
+++ b/src/Storages/NamedCollections.cpp
@ -0,0 +1,545 @@
+#include "NamedCollections.h"
+
+#include <base/find_symbols.h>
+#include <Common/assert_cast.h>
+#include <Common/FieldVisitorToString.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/evaluateConstantExpression.h>
+#include <Parsers/ASTIdentifier.h>
+#include <Parsers/ASTFunction.h>
+#include <Parsers/ASTLiteral.h>
+#include <Poco/Util/AbstractConfiguration.h>
+#include <Poco/Util/XMLConfiguration.h>
+#include <IO/WriteBufferFromString.h>
+#include <IO/Operators.h>
+#include <ranges>
+
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNKNOWN_NAMED_COLLECTION;
+    extern const int NAMED_COLLECTION_ALREADY_EXISTS;
+    extern const int BAD_ARGUMENTS;
+    extern const int NOT_IMPLEMENTED;
+    extern const int LOGICAL_ERROR;
+}
+
+namespace
+{
+    constexpr auto NAMED_COLLECTIONS_CONFIG_PREFIX = "named_collections";
+
+    /// Config path of a collection: "<NAMED_COLLECTIONS_CONFIG_PREFIX>.<collection_name>".
+    std::string getCollectionPrefix(const std::string & collection_name)
+    {
+        return fmt::format("{}.{}", NAMED_COLLECTIONS_CONFIG_PREFIX, collection_name);
+    }
+
+    /// Collect the paths of all leaf keys reachable from `enumerate_paths`,
+    /// walking the config one nesting level at a time.
+    /// E.g. if `enumerate_paths` = {"root.key1"} and config like
+    /// <root>
+    ///     <key0></key0>
+    ///     <key1>
+    ///         <key2></key2>
+    ///         <key3>
+    ///            <key4></key4>
+    ///         </key3>
+    ///     </key1>
+    /// </root>
+    /// the `result` will contain two strings: "root.key1.key2" and "root.key1.key3.key4"
+    void collectKeys(
+        const Poco::Util::AbstractConfiguration & config,
+        std::queue<std::string> enumerate_paths,
+        std::set<std::string> & result)
+    {
+        /// Every pass of the outer loop drains one level and queues the next one.
+        while (!enumerate_paths.empty())
+        {
+            std::queue<std::string> next_level;
+
+            for (; !enumerate_paths.empty(); enumerate_paths.pop())
+            {
+                const auto & current = enumerate_paths.front();
+
+                Poco::Util::AbstractConfiguration::Keys children;
+                config.keys(current, children);
+
+                /// a path without sub-keys is a leaf
+                if (children.empty())
+                {
+                    result.insert(current);
+                    continue;
+                }
+
+                for (const auto & child : children)
+                    next_level.emplace(current + '.' + child);
+            }
+
+            enumerate_paths = std::move(next_level);
+        }
+    }
+}
+
+/// Access to the single factory object (a function-local static).
+NamedCollectionFactory & NamedCollectionFactory::instance()
+{
+    static NamedCollectionFactory instance;
+    return instance;
+}
+
+/// Binds the factory to the config. May be called only once; a second call throws LOGICAL_ERROR.
+/// Only a pointer to `config_` is stored.
+void NamedCollectionFactory::initialize(const Poco::Util::AbstractConfiguration & config_)
+{
+    std::lock_guard lock(mutex);
+
+    if (is_initialized)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Named collection factory already initialized");
+
+    is_initialized = true;
+    config = &config_;
+}
+
+/// Switches the factory to another config and drops everything cached so far.
+/// NOTE(review): the cleared map also holds collections registered through add() - confirm that dropping them is intended.
+void NamedCollectionFactory::reload(const Poco::Util::AbstractConfiguration & config_)
+{
+    std::lock_guard lock(mutex);
+    config = &config_;
+    loaded_named_collections.clear();
+}
+
+/// Throws LOGICAL_ERROR unless initialize() has been called.
+/// The lock argument is unused; it only makes the caller show that the mutex is held.
+void NamedCollectionFactory::assertInitialized(
+    std::lock_guard<std::mutex> & /* lock */) const
+{
+    if (is_initialized)
+        return;
+
+    throw Exception(
+        ErrorCodes::LOGICAL_ERROR,
+        "Named collection factory must be initialized before being used");
+}
+
+/// Locking front end for existsUnlocked().
+bool NamedCollectionFactory::exists(const std::string & collection_name) const
+{
+    std::lock_guard lock(mutex);
+    return existsUnlocked(collection_name, lock);
+}
+
+/// Whether the collection is known, either already loaded or present in the config.
+/// The caller must hold the mutex.
+bool NamedCollectionFactory::existsUnlocked(
+    const std::string & collection_name,
+    std::lock_guard<std::mutex> & lock) const
+{
+    assertInitialized(lock);
+
+    /// Named collections can be added via SQL command or via config.
+    /// Named collections from config are loaded on first access,
+    /// therefore it might not be in `named_collections` map yet.
+    if (loaded_named_collections.contains(collection_name))
+        return true;
+
+    return config->has(getCollectionPrefix(collection_name));
+}
+
+/// Returns the collection, loading it from the config on first access.
+/// Throws UNKNOWN_NAMED_COLLECTION if there is no such collection.
+NamedCollectionPtr NamedCollectionFactory::get(const std::string & collection_name) const
+{
+    std::lock_guard lock(mutex);
+    assertInitialized(lock);
+
+    if (existsUnlocked(collection_name, lock))
+        return getImpl(collection_name, lock);
+
+    throw Exception(
+        ErrorCodes::UNKNOWN_NAMED_COLLECTION,
+        "There is no named collection `{}`",
+        collection_name);
+}
+
+/// Same as get(), but returns nullptr instead of throwing when there is no such collection.
+NamedCollectionPtr NamedCollectionFactory::tryGet(const std::string & collection_name) const
+{
+    std::lock_guard lock(mutex);
+    assertInitialized(lock);
+
+    if (existsUnlocked(collection_name, lock))
+        return getImpl(collection_name, lock);
+
+    return nullptr;
+}
+
+/// Returns the cached collection, or builds it from the config and caches it.
+/// The caller must hold the factory mutex (the lock argument is otherwise unused).
+NamedCollectionPtr NamedCollectionFactory::getImpl(
+    const std::string & collection_name,
+    std::lock_guard<std::mutex> & /* lock */) const
+{
+    if (auto found = loaded_named_collections.find(collection_name); found != loaded_named_collections.end())
+        return found->second;
+
+    /// Not loaded yet: create the collection first, then remember it for later calls.
+    auto created = NamedCollection::create(*config, collection_name);
+    return loaded_named_collections.emplace(collection_name, std::move(created)).first->second;
+}
+
+/// Registers a collection under the given name.
+/// Throws NAMED_COLLECTION_ALREADY_EXISTS when the name is already present among the loaded collections.
+/// NOTE(review): only the loaded collections are consulted here, not the config, and
+/// assertInitialized() is not called - confirm that this is intended.
+void NamedCollectionFactory::add(
+    const std::string & collection_name,
+    NamedCollectionPtr collection)
+{
+    std::lock_guard guard(mutex);
+
+    const bool inserted = loaded_named_collections.emplace(collection_name, collection).second;
+    if (inserted)
+        return;
+
+    throw Exception(
+        ErrorCodes::NAMED_COLLECTION_ALREADY_EXISTS,
+        "A named collection `{}` already exists",
+        collection_name);
+}
+
+/// Removes a collection from the set of loaded collections.
+///
+/// Throws UNKNOWN_NAMED_COLLECTION when no collection with this name exists and
+/// NOT_IMPLEMENTED when the collection is defined in the config.
+void NamedCollectionFactory::remove(const std::string & collection_name)
+{
+    std::lock_guard lock(mutex);
+    assertInitialized(lock);
+
+    if (!existsUnlocked(collection_name, lock))
+    {
+        throw Exception(
+            ErrorCodes::UNKNOWN_NAMED_COLLECTION,
+            "There is no named collection `{}`",
+            collection_name);
+    }
+
+    /// The config has to be queried with the full collection path, exactly as
+    /// existsUnlocked() does. Querying the bare name never matches a collection
+    /// section, so config-defined collections would slip through this check and
+    /// the erase() below could find nothing to remove.
+    if (config->has(getCollectionPrefix(collection_name)))
+    {
+        throw Exception(
+            ErrorCodes::NOT_IMPLEMENTED,
+            "Collection {} is defined in config and cannot be removed",
+            collection_name);
+    }
+
+    /// The collection exists and is not in the config, hence it must be among the loaded ones.
+    [[maybe_unused]] auto removed = loaded_named_collections.erase(collection_name);
+    assert(removed);
+}
+
+/// Returns every known collection: a copy of the loaded ones plus those that are
+/// listed in the config but not loaded yet. The latter are created for the result
+/// only and are not added to the factory cache.
+NamedCollectionFactory::NamedCollections NamedCollectionFactory::getAll() const
+{
+    std::lock_guard guard(mutex);
+    assertInitialized(guard);
+
+    Poco::Util::AbstractConfiguration::Keys names_in_config;
+    config->keys(NAMED_COLLECTIONS_CONFIG_PREFIX, names_in_config);
+
+    NamedCollections all_collections(loaded_named_collections);
+    for (const auto & name : names_in_config)
+    {
+        /// A loaded collection takes precedence over the config entry with the same name.
+        if (!all_collections.contains(name))
+            all_collections.emplace(name, NamedCollection::create(*config, name));
+    }
+
+    return all_collections;
+}
+
+/// Implementation of NamedCollection: a private configuration holding the
+/// collection values plus the set of key paths stored in it.
+class NamedCollection::Impl
+{
+private:
+    using ConfigurationPtr = Poco::AutoPtr<Poco::Util::AbstractConfiguration>;
+
+    ///  Named collection configuration
+    ///  <collection1>
+    ///      ...
+    ///  </collection1>
+    ConfigurationPtr config;
+    Keys keys;
+
+public:
+    /// Copies the values of `keys_` from `<collection prefix>.<key>` of `config_`
+    /// into a freshly created private configuration, where they are stored under `<key>`.
+    /// Throws BAD_ARGUMENTS if one of the keys is missing in `config_`.
+    Impl(const Poco::Util::AbstractConfiguration & config_,
+         const std::string & collection_name_,
+         const Keys & keys_)
+        : config(createEmptyConfiguration(collection_name_))
+        , keys(keys_)
+    {
+        auto collection_path = getCollectionPrefix(collection_name_);
+        for (const auto & key : keys)
+            copyConfigValue<String>(config_, collection_path + '.' + key, *config, key);
+    }
+
+    /// Deep copy. `config` is a reference-counted pointer, so a member-wise copy
+    /// would make both objects share one configuration and a set()/remove() on the
+    /// copy would silently change the original. Build a separate configuration and
+    /// copy the raw (unexpanded) values into it instead.
+    /// Keys are addressed relative to the root element, so its name is arbitrary here.
+    Impl(const Impl & other)
+        : config(createEmptyConfiguration("collection"))
+        , keys(other.keys)
+    {
+        for (const auto & key : keys)
+            config->setString(key, other.config->getRawString(key));
+    }
+
+    /// Returns the value stored under `key`; throws BAD_ARGUMENTS if there is no such key.
+    template <typename T> T get(const Key & key) const
+    {
+        return getConfigValue<T>(*config, key);
+    }
+
+    /// Returns the value stored under `key`, or `default_value` if there is no such key.
+    template <typename T> T getOrDefault(const Key & key, const T & default_value) const
+    {
+        return getConfigValueOrDefault<T>(*config, key, &default_value);
+    }
+
+    /// Stores `value` under `key`. Throws BAD_ARGUMENTS if the key already exists
+    /// and `update_if_exists` is false.
+    template <typename T> void set(const Key & key, const T & value, bool update_if_exists)
+    {
+        setConfigValue<T>(*config, key, value, update_if_exists);
+        /// std::set::insert is a no-op for a key that is already present.
+        keys.insert(key);
+    }
+
+    /// Removes `key`; throws BAD_ARGUMENTS if there is no such key.
+    void remove(const Key & key)
+    {
+        removeConfigValue(*config, key);
+        [[maybe_unused]] auto removed = keys.erase(key);
+        assert(removed);
+    }
+
+    Keys getKeys() const
+    {
+        return keys;
+    }
+
+    /// Returns an independent copy (see the copy constructor).
+    ImplPtr copy() const
+    {
+        return std::make_unique<Impl>(*this);
+    }
+
+    std::string dumpStructure() const
+    {
+        /// Convert a collection config like
+        /// <collection>
+        ///     <key0>value0</key0>
+        ///     <key1>
+        ///         <key2>value2</key2>
+        ///         <key3>
+        ///            <key4>value3</key4>
+        ///         </key3>
+        ///     </key1>
+        /// </collection>
+        /// to a string (nesting is shown with tabs, a tab also separates "key:" from the value):
+        /// "key0:  value0
+        ///  key1:
+        ///     key2:   value2
+        ///     key3:
+        ///        key4:    value3"
+        WriteBufferFromOwnString wb;
+        Strings prev_key_parts;
+        for (const auto & key : keys)
+        {
+            Strings key_parts;
+            splitInto<'.'>(key_parts, key);
+            size_t tab_cnt = 0;
+
+            /// Skip the leading path components shared with the previous key:
+            /// they have been printed already.
+            auto it = key_parts.begin();
+            auto prev_key_parts_it = prev_key_parts.begin();
+            while (it != key_parts.end()
+                   && prev_key_parts_it != prev_key_parts.end()
+                   && *it == *prev_key_parts_it)
+            {
+                ++it;
+                ++prev_key_parts_it;
+                ++tab_cnt;
+            }
+
+            /// Print the remaining components, one per line, each one level deeper.
+            auto start_it = it;
+            for (; it != key_parts.end(); ++it)
+            {
+                if (it != start_it)
+                    wb << '\n';
+                wb << std::string(tab_cnt++, '\t');
+                wb << *it << ':';
+            }
+            wb << '\t' << get<String>(key) << '\n';
+            prev_key_parts = key_parts;
+        }
+        return wb.str();
+    }
+
+private:
+    /// Reads the value at `path`; throws BAD_ARGUMENTS if the path does not exist.
+    template <typename T> static T getConfigValue(
+        const Poco::Util::AbstractConfiguration & config,
+        const std::string & path)
+    {
+        return getConfigValueOrDefault<T>(config, path);
+    }
+
+    /// Reads the value at `path`. If the path does not exist, returns `*default_value`,
+    /// or throws BAD_ARGUMENTS when no default is given.
+    template <typename T> static T getConfigValueOrDefault(
+        const Poco::Util::AbstractConfiguration & config,
+        const std::string & path,
+        const T * default_value = nullptr)
+    {
+        if (!config.has(path))
+        {
+            if (!default_value)
+                throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key `{}`", path);
+            return *default_value;
+        }
+
+        if constexpr (std::is_same_v<T, String>)
+            return config.getString(path);
+        else if constexpr (std::is_same_v<T, UInt64>)
+            return config.getUInt64(path);
+        else if constexpr (std::is_same_v<T, Int64>)
+            return config.getInt64(path);
+        else if constexpr (std::is_same_v<T, Float64>)
+            return config.getDouble(path);
+        else
+            throw Exception(
+                ErrorCodes::NOT_IMPLEMENTED,
+                "Unsupported type in getConfigValueOrDefault(). "
+                "Supported types are String, UInt64, Int64, Float64");
+    }
+
+    /// Writes `value` at `path`. Throws BAD_ARGUMENTS if the path already exists and `update` is false.
+    template<typename T> static void setConfigValue(
+        Poco::Util::AbstractConfiguration & config,
+        const std::string & path,
+        const T & value,
+        bool update = false)
+    {
+        if (!update && config.has(path))
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key `{}` already exists", path);
+
+        if constexpr (std::is_same_v<T, String>)
+            config.setString(path, value);
+        else if constexpr (std::is_same_v<T, UInt64>)
+            config.setUInt64(path, value);
+        else if constexpr (std::is_same_v<T, Int64>)
+            config.setInt64(path, value);
+        else if constexpr (std::is_same_v<T, Float64>)
+            config.setDouble(path, value);
+        else
+            throw Exception(
+                ErrorCodes::NOT_IMPLEMENTED,
+                "Unsupported type in setConfigValue(). "
+                "Supported types are String, UInt64, Int64, Float64");
+    }
+
+    /// Copies the value at `from_path` of `from_config` to `to_path` of `to_config`.
+    /// Throws BAD_ARGUMENTS if the source path is missing or the destination path exists.
+    template <typename T> static void copyConfigValue(
+        const Poco::Util::AbstractConfiguration & from_config,
+        const std::string & from_path,
+        Poco::Util::AbstractConfiguration & to_config,
+        const std::string & to_path)
+    {
+        if (!from_config.has(from_path))
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key `{}`", from_path);
+
+        if (to_config.has(to_path))
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "Key `{}` already exists", to_path);
+
+        if constexpr (std::is_same_v<T, String> || std::is_same_v<T, std::string>)
+            to_config.setString(to_path, from_config.getString(from_path));
+        else if constexpr (std::is_same_v<T, UInt64>)
+            to_config.setUInt64(to_path, from_config.getUInt64(from_path));
+        else if constexpr (std::is_same_v<T, Int64>)
+            to_config.setInt64(to_path, from_config.getInt64(from_path));
+        else if constexpr (std::is_same_v<T, Float64>)
+            to_config.setDouble(to_path, from_config.getDouble(from_path));
+        else
+            throw Exception(
+                ErrorCodes::NOT_IMPLEMENTED,
+                "Unsupported type in copyConfigValue(). "
+                "Supported types are String, UInt64, Int64, Float64");
+    }
+
+    /// Removes `path` from `config`; throws BAD_ARGUMENTS if the path does not exist.
+    static void removeConfigValue(
+        Poco::Util::AbstractConfiguration & config,
+        const std::string & path)
+    {
+        if (!config.has(path))
+            throw Exception(ErrorCodes::BAD_ARGUMENTS, "No such key `{}`", path);
+        config.remove(path);
+    }
+
+    /// Creates an XML configuration consisting of a single empty root element `root_name`.
+    static ConfigurationPtr createEmptyConfiguration(const std::string & root_name)
+    {
+        using DocumentPtr = Poco::AutoPtr<Poco::XML::Document>;
+        DocumentPtr xml_document(new Poco::XML::Document());
+        xml_document->appendChild(xml_document->createElement(root_name));
+        ConfigurationPtr config(new Poco::Util::XMLConfiguration(xml_document));
+        return config;
+    }
+};
+
+/// Creates a collection whose values for `keys` are copied out of `config`.
+/// `collection_path` is passed to Impl as the collection name.
+NamedCollection::NamedCollection(
+    const Poco::Util::AbstractConfiguration & config,
+    const std::string & collection_path,
+    const Keys & keys)
+    : pimpl(std::make_unique<Impl>(config, collection_path, keys))
+{
+}
+
+/// Creates a collection that takes ownership of an already built implementation object.
+NamedCollection::NamedCollection(ImplPtr pimpl_)
+    : pimpl(std::move(pimpl_))
+{
+}
+
+/// Builds a collection from the keys that collectKeys() finds under the
+/// collection's prefix in `config`.
+NamedCollectionPtr NamedCollection::create(
+    const Poco::Util::AbstractConfiguration & config,
+    const std::string & collection_name)
+{
+    const auto collection_prefix = getCollectionPrefix(collection_name);
+    std::queue<std::string> enumerate_input;
+    std::set<std::string> enumerate_result;
+
+    enumerate_input.push(collection_prefix);
+    collectKeys(config, std::move(enumerate_input), enumerate_result);
+
+    /// Collection does not have any keys
+    /// (`enumerate_result` == {<collection_prefix>}).
+    /// The size alone does not tell this: a collection with exactly one key
+    /// also produces a single path, and that key must not be dropped.
+    const bool collection_is_empty = enumerate_result.size() == 1
+        && *enumerate_result.begin() == collection_prefix;
+    std::set<std::string> keys;
+    if (!collection_is_empty)
+    {
+        /// Skip collection prefix and add +1 to avoid '.' in the beginning.
+        for (const auto & path : enumerate_result)
+            keys.emplace(path.substr(collection_prefix.size() + 1));
+    }
+    return std::make_unique<NamedCollection>(config, collection_name, keys);
+}
+
+/// Forwards to Impl::get(): returns the value stored under `key`.
+template <typename T> T NamedCollection::get(const Key & key) const
+{
+    const Impl & impl = *pimpl;
+    return impl.get<T>(key);
+}
+
+/// Forwards to Impl::getOrDefault(): returns the value stored under `key`,
+/// or `default_value` when the key is absent.
+template <typename T> T NamedCollection::getOrDefault(const Key & key, const T & default_value) const
+{
+    const Impl & impl = *pimpl;
+    return impl.getOrDefault<T>(key, default_value);
+}
+
+/// Forwards to Impl::set(): stores `value` under `key`.
+template <typename T> void NamedCollection::set(const Key & key, const T & value, bool update_if_exists)
+{
+    Impl & impl = *pimpl;
+    impl.set<T>(key, value, update_if_exists);
+}
+
+/// Forwards to Impl::remove(): deletes `key` from the collection.
+void NamedCollection::remove(const Key & key)
+{
+    Impl & impl = *pimpl;
+    impl.remove(key);
+}
+
+/// Returns a new, mutable NamedCollection built around the object returned by Impl::copy().
+std::shared_ptr<NamedCollection> NamedCollection::duplicate() const
+{
+    auto impl_copy = pimpl->copy();
+    return std::make_shared<NamedCollection>(std::move(impl_copy));
+}
+
+/// Forwards to Impl::getKeys(): returns a copy of the set of key paths.
+NamedCollection::Keys NamedCollection::getKeys() const
+{
+    const Impl & impl = *pimpl;
+    return impl.getKeys();
+}
+
+/// Forwards to Impl::dumpStructure(): returns a textual dump of the keys and values.
+std::string NamedCollection::dumpStructure() const
+{
+    const Impl & impl = *pimpl;
+    return impl.dumpStructure();
+}
+
+/// Explicit instantiations: the member templates are defined in this file, so the
+/// value types listed here (String, UInt64, Int64, Float64) are instantiated explicitly.
+template String NamedCollection::get<String>(const NamedCollection::Key & key) const;
+template UInt64 NamedCollection::get<UInt64>(const NamedCollection::Key & key) const;
+template Int64 NamedCollection::get<Int64>(const NamedCollection::Key & key) const;
+template Float64 NamedCollection::get<Float64>(const NamedCollection::Key & key) const;
+
+template String NamedCollection::getOrDefault<String>(const NamedCollection::Key & key, const String & default_value) const;
+template UInt64 NamedCollection::getOrDefault<UInt64>(const NamedCollection::Key & key, const UInt64 & default_value) const;
+template Int64 NamedCollection::getOrDefault<Int64>(const NamedCollection::Key & key, const Int64 & default_value) const;
+template Float64 NamedCollection::getOrDefault<Float64>(const NamedCollection::Key & key, const Float64 & default_value) const;
+
+template void NamedCollection::set<String>(const NamedCollection::Key & key, const String & value, bool update_if_exists);
+template void NamedCollection::set<UInt64>(const NamedCollection::Key & key, const UInt64 & value, bool update_if_exists);
+template void NamedCollection::set<Int64>(const NamedCollection::Key & key, const Int64 & value, bool update_if_exists);
+template void NamedCollection::set<Float64>(const NamedCollection::Key & key, const Float64 & value, bool update_if_exists);
+
+}
--- a/src/Storages/NamedCollections.h
+++ b/src/Storages/NamedCollections.h
@ -0,0 +1,107 @@
+#pragma once
+
+#include <Interpreters/Context.h>
+#include <Poco/Util/AbstractConfiguration.h>
+
+
+namespace DB
+{
+
+class NamedCollection;
+using NamedCollectionPtr = std::shared_ptr<const NamedCollection>;
+
+/**
+ * Class to represent arbitrary-structured named collection object.
+ * It can be defined via config or via SQL command.
+ * <named_collections>
+ *     <collection1>
+ *         ...
+ *     </collection1>
+ *     ...
+ * </named_collections>
+ */
+class NamedCollection
+{
+private:
+    /// The implementation is hidden behind a pointer; Impl is only forward-declared here.
+    class Impl;
+    using ImplPtr = std::unique_ptr<Impl>;
+
+    ImplPtr pimpl;
+
+public:
+    /// A key is a string path; nested keys are written with '.' as the separator.
+    using Key = std::string;
+    using Keys = std::set<Key>;
+
+    /// Creates a collection from the section of `config` that belongs to `collection_name`.
+    static NamedCollectionPtr create(
+        const Poco::Util::AbstractConfiguration & config,
+        const std::string & collection_name);
+
+    /// Creates a collection holding the values of `keys` taken from `config`.
+    NamedCollection(
+        const Poco::Util::AbstractConfiguration & config,
+        const std::string & collection_path,
+        const Keys & keys);
+
+    /// Creates a collection that takes ownership of the given implementation object.
+    explicit NamedCollection(ImplPtr pimpl_);
+
+    /// Returns the value stored under `key`.
+    /// Instantiated for String, UInt64, Int64 and Float64.
+    template <typename T> T get(const Key & key) const;
+
+    /// Returns the value stored under `key`, or `default_value` if the key is absent.
+    template <typename T> T getOrDefault(const Key & key, const T & default_value) const;
+
+    /// Stores `value` under `key`; `update_if_exists` allows overwriting an existing key.
+    template <typename T> void set(const Key & key, const T & value, bool update_if_exists = false);
+
+    /// Removes `key` from the collection.
+    void remove(const Key & key);
+
+    /// Returns a new, mutable collection created from a copy of the implementation object.
+    std::shared_ptr<NamedCollection> duplicate() const;
+
+    /// Returns the set of key paths stored in the collection.
+    Keys getKeys() const;
+
+    /// Returns a textual dump of the keys and values of the collection.
+    std::string dumpStructure() const;
+};
+
+/**
+ * A factory of immutable named collections.
+ */
+class NamedCollectionFactory : boost::noncopyable
+{
+public:
+    static NamedCollectionFactory & instance();
+
+    /// Binds the factory to `config_`. Throws LOGICAL_ERROR on a repeated call.
+    /// Only a pointer to `config_` is stored.
+    void initialize(const Poco::Util::AbstractConfiguration & config_);
+
+    /// Switches to another configuration and clears the cache of loaded collections.
+    void reload(const Poco::Util::AbstractConfiguration & config_);
+
+    /// Returns true if the collection is loaded or is present in the config.
+    bool exists(const std::string & collection_name) const;
+
+    /// Returns the collection; throws UNKNOWN_NAMED_COLLECTION if it does not exist.
+    NamedCollectionPtr get(const std::string & collection_name) const;
+
+    /// Returns the collection, or nullptr if it does not exist.
+    NamedCollectionPtr tryGet(const std::string & collection_name) const;
+
+    /// Registers a collection; throws NAMED_COLLECTION_ALREADY_EXISTS if the name
+    /// is already present among the loaded collections.
+    void add(
+        const std::string & collection_name,
+        NamedCollectionPtr collection);
+
+    /// Removes a collection; throws UNKNOWN_NAMED_COLLECTION if it does not exist.
+    void remove(const std::string & collection_name);
+
+    using NamedCollections = std::unordered_map<std::string, NamedCollectionPtr>;
+    /// Returns the loaded collections together with those defined in the config.
+    NamedCollections getAll() const;
+
+private:
+    /// The `lock` parameters below exist to make callers show that they hold `mutex`.
+    void assertInitialized(std::lock_guard<std::mutex> & lock) const;
+
+    NamedCollectionPtr getImpl(
+        const std::string & collection_name,
+        std::lock_guard<std::mutex> & lock) const;
+
+    bool existsUnlocked(
+        const std::string & collection_name,
+        std::lock_guard<std::mutex> & lock) const;
+
+    /// Cache of collections; mutable because const getters fill it on first access.
+    mutable NamedCollections loaded_named_collections;
+
+    /// Not owned. Null until initialize() or reload() is called, so that the pointer
+    /// never holds an indeterminate value.
+    const Poco::Util::AbstractConfiguration * config = nullptr;
+
+    bool is_initialized = false;
+    mutable std::mutex mutex;
+};
+
+}
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@ -1750,8 +1750,18 @@ bool StorageReplicatedMergeTree::executeFetch(LogEntry & entry, bool need_to_che
            if (!entry.actual_new_part_name.empty())
                LOG_DEBUG(log, "Will fetch part {} instead of {}", entry.actual_new_part_name, entry.new_part_name);

-            if (!fetchPart(part_name, metadata_snapshot, fs::path(zookeeper_path) / "replicas" / replica, false, entry.quorum))
+            String source_replica_path = fs::path(zookeeper_path) / "replicas" / replica;
+            if (!fetchPart(part_name,
+                metadata_snapshot,
+                source_replica_path,
+                /* to_detached= */ false,
+                entry.quorum,
+                /* zookeeper_ */ nullptr,
+                /* try_fetch_shared= */ true,
+                entry.znode_name))
+            {
                return false;
+            }
        }
        catch (Exception & e)
        {
@ -1834,7 +1844,7 @@ void StorageReplicatedMergeTree::executeDropRange(const LogEntry & entry)
    LOG_TRACE(log, "Executing DROP_RANGE {}", entry.new_part_name);
    auto drop_range_info = MergeTreePartInfo::fromPartName(entry.new_part_name, format_version);
    getContext()->getMergeList().cancelInPartition(getStorageID(), drop_range_info.partition_id, drop_range_info.max_block);
-    queue.removePartProducingOpsInRange(getZooKeeper(), drop_range_info, entry);
+    queue.removePartProducingOpsInRange(getZooKeeper(), drop_range_info, entry, /* fetch_entry_znode= */ {});
    part_check_thread.cancelRemovedPartsCheck(drop_range_info);

    /// Delete the parts contained in the range to be deleted.
@ -1906,7 +1916,7 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
    if (replace)
    {
        getContext()->getMergeList().cancelInPartition(getStorageID(), drop_range.partition_id, drop_range.max_block);
-        queue.removePartProducingOpsInRange(getZooKeeper(), drop_range, entry);
+        queue.removePartProducingOpsInRange(getZooKeeper(), drop_range, entry, /* fetch_entry_znode= */ {});
        part_check_thread.cancelRemovedPartsCheck(drop_range);
    }
    else
@ -3450,7 +3460,7 @@ void StorageReplicatedMergeTree::removePartAndEnqueueFetch(const String & part_n
    ///       so GET_PART all_1_42_5 (and all source parts) is useless. The only thing we can do is to fetch all_1_42_5_63.
    ///    2. If all_1_42_5_63 is lost, then replication may stuck waiting for all_1_42_5_63 to appear,
    ///       because we may have some covered parts (more precisely, parts with the same min and max blocks)
-    queue.removePartProducingOpsInRange(zookeeper, broken_part_info, {});
+    queue.removePartProducingOpsInRange(zookeeper, broken_part_info, /* covering_entry= */ {}, /* fetch_entry_znode= */ {});

    String part_path = fs::path(replica_path) / "parts" / part_name;

@ -3850,8 +3860,15 @@ bool StorageReplicatedMergeTree::partIsLastQuorumPart(const MergeTreePartInfo &
 }


-bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const StorageMetadataPtr & metadata_snapshot,
-    const String & source_replica_path, bool to_detached, size_t quorum, zkutil::ZooKeeper::Ptr zookeeper_, bool try_fetch_shared)
+bool StorageReplicatedMergeTree::fetchPart(
+    const String & part_name,
+    const StorageMetadataPtr & metadata_snapshot,
+    const String & source_replica_path,
+    bool to_detached,
+    size_t quorum,
+    zkutil::ZooKeeper::Ptr zookeeper_,
+    bool try_fetch_shared,
+    String entry_znode)
 {
    auto zookeeper = zookeeper_ ? zookeeper_ : getZooKeeper();
    const auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);
@ -4049,6 +4066,17 @@ bool StorageReplicatedMergeTree::fetchPart(const String & part_name, const Stora
                ProfileEvents::increment(ProfileEvents::ObsoleteReplicatedParts);
            }

+            /// The fetched part may cover other parts (see
+            /// findReplicaHavingCoveringPart()). If the queue entries producing
+            /// those covered parts cannot be executed right now (because a
+            /// MERGE_PARTS that covers them is in progress), the replica delay
+            /// keeps growing until those entries are executed, in other words
+            /// until the MERGE_PARTS finishes, and this can take a while.
+            ///
+            /// So let's just remove them from the queue.
+            queue.removePartProducingOpsInRange(zookeeper, part->info, /* covering_entry= */ {}, entry_znode);
+
            write_part_log({});
        }
        else
@ -4479,9 +4507,16 @@ void StorageReplicatedMergeTree::assertNotReadonly() const

 SinkToStoragePtr StorageReplicatedMergeTree::write(const ASTPtr & /*query*/, const StorageMetadataPtr & metadata_snapshot, ContextPtr local_context)
 {
-    const auto storage_settings_ptr = getSettings();
-    assertNotReadonly();
+    /// If table is read-only because it doesn't have metadata in zk yet, then it's not possible to insert into it
+    /// Without this check, we'll write data parts on disk, and afterwards will remove them since we'll fail to commit them into zk
+    /// In case of remote storage like s3, it'll generate unnecessary PUT requests
+    if (is_readonly && (!has_metadata_in_zookeeper.has_value() || false == has_metadata_in_zookeeper.value()))
+        throw Exception(
+            ErrorCodes::TABLE_IS_READ_ONLY,
+            "Table is in readonly mode since table metadata was not found in zookeeper: replica_path={}",
+            replica_path);

+    const auto storage_settings_ptr = getSettings();
    const Settings & query_settings = local_context->getSettingsRef();
    bool deduplicate = storage_settings_ptr->replicated_deduplication_window != 0 && query_settings.insert_deduplicate;

@ -4996,8 +5031,7 @@ bool StorageReplicatedMergeTree::getFakePartCoveringAllPartsInPartition(const St
    Int64 mutation_version;

    {
-        auto zookeeper = getZooKeeper();
-        delimiting_block_lock = allocateBlockNumber(partition_id, zookeeper);
+        delimiting_block_lock = allocateBlockNumber(partition_id, getZooKeeper());
        right = delimiting_block_lock->getNumber();
        /// Make sure we cover all parts in drop range.
        /// There might be parts with mutation version greater than current block number
@ -5278,7 +5312,7 @@ void StorageReplicatedMergeTree::rename(const String & new_path_to_table_data, c
 }


-bool StorageReplicatedMergeTree::existsNodeCached(const std::string & path) const
+bool StorageReplicatedMergeTree::existsNodeCached(const ZooKeeperWithFaultInjectionPtr & zookeeper, const std::string & path) const
 {
    {
        std::lock_guard lock(existing_nodes_cache_mutex);
@ -5286,7 +5320,7 @@ bool StorageReplicatedMergeTree::existsNodeCached(const std::string & path) cons
            return true;
    }

-    bool res = getZooKeeper()->exists(path);
+    bool res = zookeeper->exists(path);

    if (res)
    {
@ -5298,9 +5332,22 @@ bool StorageReplicatedMergeTree::existsNodeCached(const std::string & path) cons
 }


-std::optional<EphemeralLockInZooKeeper>
-StorageReplicatedMergeTree::allocateBlockNumber(
-    const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_block_id_path, const String & zookeeper_path_prefix) const
+/// Overload for callers that hold a plain ZooKeeper client: wraps it into
+/// ZooKeeperWithFaultInjection and forwards to the overload taking the wrapper.
+std::optional<EphemeralLockInZooKeeper> StorageReplicatedMergeTree::allocateBlockNumber(
+    const String & partition_id,
+    const zkutil::ZooKeeperPtr & zookeeper,
+    const String & zookeeper_block_id_path,
+    const String & zookeeper_path_prefix) const
+{
+    auto zookeeper_with_faults = std::make_shared<ZooKeeperWithFaultInjection>(zookeeper);
+    return allocateBlockNumber(partition_id, zookeeper_with_faults, zookeeper_block_id_path, zookeeper_path_prefix);
+}
+
+
+std::optional<EphemeralLockInZooKeeper> StorageReplicatedMergeTree::allocateBlockNumber(
+    const String & partition_id,
+    const ZooKeeperWithFaultInjectionPtr & zookeeper,
+    const String & zookeeper_block_id_path,
+    const String & zookeeper_path_prefix) const
 {
    String zookeeper_table_path;
    if (zookeeper_path_prefix.empty())
@ -5311,7 +5358,7 @@ StorageReplicatedMergeTree::allocateBlockNumber(
    String block_numbers_path = fs::path(zookeeper_table_path) / "block_numbers";
    String partition_path = fs::path(block_numbers_path) / partition_id;

-    if (!existsNodeCached(partition_path))
+    if (!existsNodeCached(zookeeper, partition_path))
    {
        Coordination::Requests ops;
        /// Check that table is not being dropped ("host" is the first node that is removed on replica drop)
@ -5329,10 +5376,9 @@ StorageReplicatedMergeTree::allocateBlockNumber(
    }

    return createEphemeralLockInZooKeeper(
-        fs::path(partition_path) / "block-", fs::path(zookeeper_table_path) / "temp", *zookeeper, zookeeper_block_id_path);
+        fs::path(partition_path) / "block-", fs::path(zookeeper_table_path) / "temp", zookeeper, zookeeper_block_id_path);
 }

-
 Strings StorageReplicatedMergeTree::tryWaitForAllReplicasToProcessLogEntry(
    const String & table_zookeeper_path, const ReplicatedMergeTreeLogEntryData & entry, Int64 wait_for_inactive_timeout)
 {
@ -7085,7 +7131,7 @@ CancellationCode StorageReplicatedMergeTree::killPartMoveToShard(const UUID & ta

 void StorageReplicatedMergeTree::getCommitPartOps(
    Coordination::Requests & ops,
-    MutableDataPartPtr & part,
+    const DataPartPtr & part,
    const String & block_id_path) const
 {
    const String & part_name = part->name;
@ -7667,11 +7713,28 @@ void StorageReplicatedMergeTree::lockSharedDataTemporary(const String & part_nam
        String zookeeper_node = fs::path(zc_zookeeper_path) / id / replica_name;

        LOG_TRACE(log, "Set zookeeper temporary ephemeral lock {}", zookeeper_node);
-        createZeroCopyLockNode(zookeeper, zookeeper_node, zkutil::CreateMode::Ephemeral, false);
+        createZeroCopyLockNode(
+            std::make_shared<ZooKeeperWithFaultInjection>(zookeeper), zookeeper_node, zkutil::CreateMode::Ephemeral, false);
    }
 }

-void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional<HardlinkedFiles> hardlinked_files) const
+/// Overload without an explicit ZooKeeper argument: obtains a client with
+/// tryGetZooKeeper(), wraps it into ZooKeeperWithFaultInjection and forwards to the
+/// overload taking the wrapper.
+void StorageReplicatedMergeTree::lockSharedData(
+    const IMergeTreeDataPart & part,
+    bool replace_existing_lock,
+    std::optional<HardlinkedFiles> hardlinked_files) const
+{
+    auto zookeeper = tryGetZooKeeper();
+    if (zookeeper)
+        return lockSharedData(part, std::make_shared<ZooKeeperWithFaultInjection>(zookeeper), replace_existing_lock, hardlinked_files);
+    else
+        /// No client available: pass a wrapper constructed from nullptr.
+        return lockSharedData(part, std::make_shared<ZooKeeperWithFaultInjection>(nullptr), replace_existing_lock, hardlinked_files);
+}
+
+void StorageReplicatedMergeTree::lockSharedData(
+    const IMergeTreeDataPart & part,
+    const ZooKeeperWithFaultInjectionPtr & zookeeper,
+    bool replace_existing_lock,
+    std::optional<HardlinkedFiles> hardlinked_files) const
 {
    auto settings = getSettings();

@ -7681,8 +7744,7 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
    if (!part.getDataPartStorage().supportZeroCopyReplication())
        return;

-    zkutil::ZooKeeperPtr zookeeper = tryGetZooKeeper();
-    if (!zookeeper)
+    if (zookeeper->isNull())
        return;

    String id = part.getUniqueId();
@ -7716,7 +7778,14 @@ void StorageReplicatedMergeTree::lockSharedData(const IMergeTreeDataPart & part,
    }
 }

-std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
+/// Overload without an explicit ZooKeeper argument: forwards to the overload taking a
+/// ZooKeeperWithFaultInjection, handing it a wrapper constructed from nullptr.
+std::pair<bool, NameSet>
+StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part) const
+{
+    auto zookeeper_with_faults = std::make_shared<ZooKeeperWithFaultInjection>(nullptr);
+    return unlockSharedData(part, zookeeper_with_faults);
+}
+
+std::pair<bool, NameSet>
+StorageReplicatedMergeTree::unlockSharedData(const IMergeTreeDataPart & part, const ZooKeeperWithFaultInjectionPtr & zookeeper) const
 {
    auto settings = getSettings();
    if (!settings->allow_remote_fs_zero_copy_replication)
@ -7762,11 +7831,10 @@ std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedData(const IMer
    /// We remove parts during table shutdown. If exception happen, restarting thread will be already turned
    /// off and nobody will reconnect our zookeeper connection. In this case we use zookeeper connection from
    /// context.
-    zkutil::ZooKeeperPtr zookeeper;
    if (shutdown_called.load())
-        zookeeper = getZooKeeperIfTableShutDown();
+        zookeeper->setKeeper(getZooKeeperIfTableShutDown());
    else
-        zookeeper = getZooKeeper();
+        zookeeper->setKeeper(getZooKeeper());

    /// It can happen that we didn't had the connection to zookeeper during table creation, but actually
    /// table is completely dropped, so we can drop it without any additional checks.
@ -7791,7 +7859,7 @@ namespace
 /// But sometimes we need an opposite. When we deleting all_0_0_0_1 it can be non replicated to other replicas, so we are the only owner of this part.
 /// In this case when we will drop all_0_0_0_1 we will drop blobs for all_0_0_0. But it will lead to dataloss. For such case we need to check that other replicas
 /// still need parent part.
-std::pair<bool, NameSet> getParentLockedBlobs(zkutil::ZooKeeperPtr zookeeper_ptr, const std::string & zero_copy_part_path_prefix, const std::string & part_info_str, MergeTreeDataFormatVersion format_version, Poco::Logger * log)
+std::pair<bool, NameSet> getParentLockedBlobs(const ZooKeeperWithFaultInjectionPtr & zookeeper_ptr, const std::string & zero_copy_part_path_prefix, const std::string & part_info_str, MergeTreeDataFormatVersion format_version, Poco::Logger * log)
 {
    NameSet files_not_to_remove;

@ -7853,7 +7921,7 @@ std::pair<bool, NameSet> getParentLockedBlobs(zkutil::ZooKeeperPtr zookeeper_ptr

 std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedDataByID(
        String part_id, const String & table_uuid, const String & part_name,
-        const String & replica_name_, const std::string & disk_type, zkutil::ZooKeeperPtr zookeeper_ptr, const MergeTreeSettings & settings,
+        const String & replica_name_, const std::string & disk_type, const ZooKeeperWithFaultInjectionPtr & zookeeper_ptr, const MergeTreeSettings & settings,
        Poco::Logger * logger, const String & zookeeper_path_old, MergeTreeDataFormatVersion data_format_version)
 {
    boost::replace_all(part_id, "/", "_");
@ -7872,7 +7940,8 @@ std::pair<bool, NameSet> StorageReplicatedMergeTree::unlockSharedDataByID(
        if (!files_not_to_remove_str.empty())
            boost::split(files_not_to_remove, files_not_to_remove_str, boost::is_any_of("\n "));

-        auto [has_parent, parent_not_to_remove] = getParentLockedBlobs(zookeeper_ptr, fs::path(zc_zookeeper_path).parent_path(), part_name, data_format_version, logger);
+        auto [has_parent, parent_not_to_remove] = getParentLockedBlobs(
+            zookeeper_ptr, fs::path(zc_zookeeper_path).parent_path(), part_name, data_format_version, logger);
        files_not_to_remove.insert(parent_not_to_remove.begin(), parent_not_to_remove.end());

        String zookeeper_part_uniq_node = fs::path(zc_zookeeper_path) / part_id;
@ -8387,7 +8456,7 @@ bool StorageReplicatedMergeTree::createEmptyPartInsteadOfLost(zkutil::ZooKeeperP


 void StorageReplicatedMergeTree::createZeroCopyLockNode(
-    const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node, int32_t mode,
+    const ZooKeeperWithFaultInjectionPtr & zookeeper, const String & zookeeper_node, int32_t mode,
    bool replace_existing_lock, const String & path_to_set_hardlinked_files, const NameSet & hardlinked_files)
 {
    /// In rare case other replica can remove path between createAncestors and createIfNotExists
@ -8504,7 +8573,7 @@ bool StorageReplicatedMergeTree::removeSharedDetachedPart(DiskPtr disk, const St
                id, table_uuid, part_name,
                detached_replica_name,
                toString(disk->getDataSourceDescription().type),
-                zookeeper, local_context->getReplicatedMergeTreeSettings(),
+                std::make_shared<ZooKeeperWithFaultInjection>(zookeeper), local_context->getReplicatedMergeTreeSettings(),
                &Poco::Logger::get("StorageReplicatedMergeTree"),
                detached_zookeeper_path,
                MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING);
--- a/src/Storages/StorageReplicatedMergeTree.h
+++ b/src/Storages/StorageReplicatedMergeTree.h
@ -83,6 +83,9 @@ namespace DB
  * as the time will take the time of creation the appropriate part on any of the replicas.
  */

+class ZooKeeperWithFaultInjection;
+using ZooKeeperWithFaultInjectionPtr = std::shared_ptr<ZooKeeperWithFaultInjection>;
+
 class StorageReplicatedMergeTree final : public MergeTreeData
 {
 public:
@ -267,6 +270,11 @@ public:

    /// Lock part in zookeeper for use shared data in several nodes
    void lockSharedData(const IMergeTreeDataPart & part, bool replace_existing_lock, std::optional<HardlinkedFiles> hardlinked_files) const override;
+    void lockSharedData(
+        const IMergeTreeDataPart & part,
+        const ZooKeeperWithFaultInjectionPtr & zookeeper,
+        bool replace_existing_lock,
+        std::optional<HardlinkedFiles> hardlinked_files) const;

    void lockSharedDataTemporary(const String & part_name, const String & part_id, const DiskPtr & disk) const;

@ -274,13 +282,23 @@ public:
    /// Return true if data unlocked
    /// Return false if data is still used by another node
    std::pair<bool, NameSet> unlockSharedData(const IMergeTreeDataPart & part) const override;
+    std::pair<bool, NameSet>
+    unlockSharedData(const IMergeTreeDataPart & part, const ZooKeeperWithFaultInjectionPtr & zookeeper) const;

    /// Unlock shared data part in zookeeper by part id
    /// Return true if data unlocked
    /// Return false if data is still used by another node
-    static std::pair<bool, NameSet> unlockSharedDataByID(String part_id, const String & table_uuid, const String & part_name, const String & replica_name_,
-        const std::string & disk_type, zkutil::ZooKeeperPtr zookeeper_, const MergeTreeSettings & settings, Poco::Logger * logger,
-        const String & zookeeper_path_old, MergeTreeDataFormatVersion data_format_version);
+    static std::pair<bool, NameSet> unlockSharedDataByID(
+        String part_id,
+        const String & table_uuid,
+        const String & part_name,
+        const String & replica_name_,
+        const std::string & disk_type,
+        const ZooKeeperWithFaultInjectionPtr & zookeeper_,
+        const MergeTreeSettings & settings,
+        Poco::Logger * logger,
+        const String & zookeeper_path_old,
+        MergeTreeDataFormatVersion data_format_version);

    /// Fetch part only if some replica has it on shared storage like S3
    MutableDataPartStoragePtr tryToFetchIfShared(const IMergeTreeDataPart & part, const DiskPtr & disk, const String & path) override;
@ -534,7 +552,7 @@ private:

    bool partIsAssignedToBackgroundOperation(const DataPartPtr & part) const override;

-    void getCommitPartOps(Coordination::Requests & ops, MutableDataPartPtr & part, const String & block_id_path = "") const;
+    void getCommitPartOps(Coordination::Requests & ops, const DataPartPtr & part, const String & block_id_path = "") const;

    /// Adds actions to `ops` that remove a part from ZooKeeper.
    /// Set has_children to true for "old-style" parts (those with /columns and /checksums child znodes).
@ -672,11 +690,12 @@ private:
    bool fetchPart(
        const String & part_name,
        const StorageMetadataPtr & metadata_snapshot,
-        const String & replica_path,
+        const String & source_replica_path,
        bool to_detached,
        size_t quorum,
        zkutil::ZooKeeper::Ptr zookeeper_ = nullptr,
-        bool try_fetch_shared = true);
+        bool try_fetch_shared = true,
+        String entry_znode = "");

    /** Download the specified part from the specified replica.
      * Used for replace local part on the same s3-shared part in hybrid storage.
@ -711,6 +730,11 @@ private:
    std::optional<EphemeralLockInZooKeeper> allocateBlockNumber(
        const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper,
        const String & zookeeper_block_id_path = "", const String & zookeeper_path_prefix = "") const;
+    std::optional<EphemeralLockInZooKeeper> allocateBlockNumber(
+        const String & partition_id,
+        const ZooKeeperWithFaultInjectionPtr & zookeeper,
+        const String & zookeeper_block_id_path = "",
+        const String & zookeeper_path_prefix = "") const;

    /** Wait until all replicas, including this, execute the specified action from the log.
      * If replicas are added at the same time, it can not wait the added replica.
@ -748,7 +772,7 @@ private:
    /// Check for a node in ZK. If it is, remember this information, and then immediately answer true.
    mutable std::unordered_set<std::string> existing_nodes_cache;
    mutable std::mutex existing_nodes_cache_mutex;
-    bool existsNodeCached(const std::string & path) const;
+    bool existsNodeCached(const ZooKeeperWithFaultInjectionPtr & zookeeper, const std::string & path) const;

    /// Cancels INSERTs in the block range by removing ephemeral block numbers
    void clearLockedBlockNumbersInPartition(zkutil::ZooKeeper & zookeeper, const String & partition_id, Int64 min_block_num, Int64 max_block_num);
@ -836,7 +860,7 @@ private:
        const String & part_name, const String & zookeeper_path_old);

    static void createZeroCopyLockNode(
-        const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_node,
+        const ZooKeeperWithFaultInjectionPtr & zookeeper, const String & zookeeper_node,
        int32_t mode = zkutil::CreateMode::Persistent, bool replace_existing_lock = false,
        const String & path_to_set_hardlinked_files = "", const NameSet & hardlinked_files = {});

--- a/src/Storages/StorageS3.cpp
+++ b/src/Storages/StorageS3.cpp
@ -1314,6 +1314,11 @@ void registerStorageCOS(StorageFactory & factory)
    return registerStorageS3Impl("COSN", factory);
 }

+void registerStorageOSS(StorageFactory & factory)
+{
+    return registerStorageS3Impl("OSS", factory);
+}
+
 NamesAndTypesList StorageS3::getVirtuals() const
 {
    return virtual_columns;
--- a/src/Storages/StorageS3Cluster.cpp
+++ b/src/Storages/StorageS3Cluster.cpp
@ -117,32 +117,24 @@ Pipe StorageS3Cluster::read(
        addColumnsStructureToQueryWithClusterEngine(
            query_to_send, StorageDictionary::generateNamesAndTypesDescription(storage_snapshot->metadata->getColumns().getAll()), 5, getName());

-    for (const auto & replicas : cluster->getShardsAddresses())
+    const auto & current_settings = context->getSettingsRef();
+    auto timeouts = ConnectionTimeouts::getTCPTimeoutsWithFailover(current_settings);
+    for (const auto & shard_info : cluster->getShardsInfo())
    {
-        /// There will be only one replica, because we consider each replica as a shard
-        for (const auto & node : replicas)
+        auto try_results = shard_info.pool->getMany(timeouts, &current_settings, PoolMode::GET_MANY);
+        for (auto & try_result : try_results)
        {
-            auto connection = std::make_shared<Connection>(
-                node.host_name, node.port, context->getGlobalContext()->getCurrentDatabase(),
-                node.user, node.password, node.quota_key, node.cluster, node.cluster_secret,
-                "S3ClusterInititiator",
-                node.compression,
-                node.secure
-            );
-
-
-            /// For unknown reason global context is passed to IStorage::read() method
-            /// So, task_identifier is passed as constructor argument. It is more obvious.
            auto remote_query_executor = std::make_shared<RemoteQueryExecutor>(
-                connection,
-                queryToString(query_to_send),
-                header,
-                context,
-                /*throttler=*/nullptr,
-                scalars,
-                Tables(),
-                processed_stage,
-                RemoteQueryExecutor::Extension{.task_iterator = callback});
+                    shard_info.pool,
+                    std::vector<IConnectionPool::Entry>{try_result},
+                    queryToString(query_to_send),
+                    header,
+                    context,
+                    /*throttler=*/nullptr,
+                    scalars,
+                    Tables(),
+                    processed_stage,
+                    RemoteQueryExecutor::Extension{.task_iterator = callback});

            pipes.emplace_back(std::make_shared<RemoteSource>(remote_query_executor, add_agg_info, false));
        }
--- a/src/Storages/System/StorageSystemAsynchronousMetrics.cpp
+++ b/src/Storages/System/StorageSystemAsynchronousMetrics.cpp
@ -12,6 +12,7 @@ NamesAndTypesList StorageSystemAsynchronousMetrics::getNamesAndTypes()
    return {
        {"metric", std::make_shared<DataTypeString>()},
        {"value", std::make_shared<DataTypeFloat64>()},
+        {"description", std::make_shared<DataTypeString>()},
    };
 }

@ -27,7 +28,8 @@ void StorageSystemAsynchronousMetrics::fillData(MutableColumns & res_columns, Co
    for (const auto & name_value : async_metrics_values)
    {
        res_columns[0]->insert(name_value.first);
-        res_columns[1]->insert(name_value.second);
+        res_columns[1]->insert(name_value.second.value);
+        res_columns[2]->insert(name_value.second.documentation);
    }
 }

--- a/src/Storages/System/StorageSystemNamedCollections.cpp
+++ b/src/Storages/System/StorageSystemNamedCollections.cpp
@ -0,0 +1,58 @@
+#include "StorageSystemNamedCollections.h"
+
+#include <Common/FieldVisitorToString.h>
+#include <DataTypes/DataTypeString.h>
+#include <DataTypes/DataTypeMap.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/ProfileEventsExt.h>
+#include <Access/Common/AccessType.h>
+#include <Access/Common/AccessFlags.h>
+#include <Columns/ColumnMap.h>
+#include <Storages/NamedCollections.h>
+
+
+namespace DB
+{
+
+NamesAndTypesList StorageSystemNamedCollections::getNamesAndTypes()
+{
+    return {
+        {"name", std::make_shared<DataTypeString>()},
+        {"collection", std::make_shared<DataTypeMap>(std::make_shared<DataTypeString>(), std::make_shared<DataTypeString>())},
+    };
+}
+
+StorageSystemNamedCollections::StorageSystemNamedCollections(const StorageID & table_id_)
+    : IStorageSystemOneBlock(table_id_)
+{
+}
+
+void StorageSystemNamedCollections::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const
+{
+    context->checkAccess(AccessType::SHOW_NAMED_COLLECTIONS);
+
+    auto collections = NamedCollectionFactory::instance().getAll();
+    for (const auto & [name, collection] : collections)
+    {
+        res_columns[0]->insert(name);
+
+        auto * column_map = typeid_cast<ColumnMap *>(res_columns[1].get());
+
+        auto & offsets = column_map->getNestedColumn().getOffsets();
+        auto & tuple_column = column_map->getNestedData();
+        auto & key_column = tuple_column.getColumn(0);
+        auto & value_column = tuple_column.getColumn(1);
+
+        size_t size = 0;
+        for (const auto & key : collection->getKeys())
+        {
+            key_column.insertData(key.data(), key.size());
+            value_column.insert(collection->get<String>(key));
+            size++;
+        }
+
+        offsets.push_back(offsets.back() + size);
+    }
+}
+
+}
--- a/src/Storages/System/StorageSystemNamedCollections.h
+++ b/src/Storages/System/StorageSystemNamedCollections.h
@ -0,0 +1,21 @@
+#pragma once
+
+#include <Storages/System/IStorageSystemOneBlock.h>
+
+namespace DB
+{
+
+class StorageSystemNamedCollections final : public IStorageSystemOneBlock<StorageSystemNamedCollections>
+{
+public:
+    explicit StorageSystemNamedCollections(const StorageID & table_id_);
+
+    std::string getName() const override { return "SystemNamedCollections"; }
+
+    static NamesAndTypesList getNamesAndTypes();
+
+protected:
+    void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override;
+};
+
+}
--- a/src/Storages/System/attachSystemTables.cpp
+++ b/src/Storages/System/attachSystemTables.cpp
@ -72,6 +72,7 @@
 #include <Storages/System/StorageSystemAsynchronousInserts.h>
 #include <Storages/System/StorageSystemTransactions.h>
 #include <Storages/System/StorageSystemFilesystemCache.h>
+#include <Storages/System/StorageSystemNamedCollections.h>
 #include <Storages/System/StorageSystemRemoteDataPaths.h>
 #include <Storages/System/StorageSystemCertificates.h>
 #include <Storages/System/StorageSystemSchemaInferenceCache.h>
@ -174,6 +175,7 @@ void attachSystemTablesServer(ContextPtr context, IDatabase & system_database, b
    attach<StorageSystemFilesystemCache>(context, system_database, "filesystem_cache");
    attach<StorageSystemRemoteDataPaths>(context, system_database, "remote_data_paths");
    attach<StorageSystemCertificates>(context, system_database, "certificates");
+    attach<StorageSystemNamedCollections>(context, system_database, "named_collections");

    if (has_zookeeper)
        attach<StorageSystemZooKeeper>(context, system_database, "zookeeper");
--- a/src/Storages/registerStorages.cpp
+++ b/src/Storages/registerStorages.cpp
@ -32,6 +32,7 @@ void registerStorageMeiliSearch(StorageFactory& factory);
 #if USE_AWS_S3
 void registerStorageS3(StorageFactory & factory);
 void registerStorageCOS(StorageFactory & factory);
+void registerStorageOSS(StorageFactory & factory);
 void registerStorageHudi(StorageFactory & factory);
 void registerStorageDelta(StorageFactory & factory);
 #endif
@ -120,6 +121,7 @@ void registerStorages()
    #if USE_AWS_S3
    registerStorageS3(factory);
    registerStorageCOS(factory);
+    registerStorageOSS(factory);
    registerStorageHudi(factory);
    registerStorageDelta(factory);
    #endif
--- a/src/Storages/tests/gtest_named_collections.cpp
+++ b/src/Storages/tests/gtest_named_collections.cpp
@ -0,0 +1,143 @@
+#include <Common/tests/gtest_global_context.h>
+#include <Storages/NamedCollections.h>
+#include <Poco/Util/XMLConfiguration.h>
+#include <Poco/DOM/DOMParser.h>
+#include <gtest/gtest.h>
+
+using namespace DB;
+
+TEST(NamedCollections, SimpleConfig)
+{
+    std::string xml(R"CONFIG(<clickhouse>
+    <named_collections>
+        <collection1>
+            <key1>value1</key1>
+            <key2>2</key2>
+            <key3>3.3</key3>
+            <key4>-4</key4>
+        </collection1>
+        <collection2>
+            <key4>value4</key4>
+            <key5>5</key5>
+            <key6>6.6</key6>
+        </collection2>
+    </named_collections>
+</clickhouse>)CONFIG");
+
+    Poco::XML::DOMParser dom_parser;
+    Poco::AutoPtr<Poco::XML::Document> document = dom_parser.parseString(xml);
+    Poco::AutoPtr<Poco::Util::XMLConfiguration> config = new Poco::Util::XMLConfiguration(document);
+
+    NamedCollectionFactory::instance().initialize(*config);
+
+    ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection1"));
+    ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection2"));
+    ASSERT_TRUE(NamedCollectionFactory::instance().tryGet("collection3") == nullptr);
+
+    auto collections = NamedCollectionFactory::instance().getAll();
+    ASSERT_EQ(collections.size(), 2);
+    ASSERT_TRUE(collections.contains("collection1"));
+    ASSERT_TRUE(collections.contains("collection2"));
+
+    ASSERT_EQ(collections["collection1"]->dumpStructure(),
+              R"CONFIG(key1:	value1
+key2:	2
+key3:	3.3
+key4:	-4
+)CONFIG");
+
+    auto collection1 = NamedCollectionFactory::instance().get("collection1");
+    ASSERT_TRUE(collection1 != nullptr);
+
+    ASSERT_TRUE(collection1->get<String>("key1") == "value1");
+    ASSERT_TRUE(collection1->get<UInt64>("key2") == 2);
+    ASSERT_TRUE(collection1->get<Float64>("key3") == 3.3);
+    ASSERT_TRUE(collection1->get<Int64>("key4") == -4);
+
+    ASSERT_EQ(collections["collection2"]->dumpStructure(),
+              R"CONFIG(key4:	value4
+key5:	5
+key6:	6.6
+)CONFIG");
+
+    auto collection2 = NamedCollectionFactory::instance().get("collection2");
+    ASSERT_TRUE(collection2 != nullptr);
+
+    ASSERT_TRUE(collection2->get<String>("key4") == "value4");
+    ASSERT_TRUE(collection2->get<UInt64>("key5") == 5);
+    ASSERT_TRUE(collection2->get<Float64>("key6") == 6.6);
+
+    auto collection2_copy = collections["collection2"]->duplicate();
+    NamedCollectionFactory::instance().add("collection2_copy", collection2_copy);
+    ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection2_copy"));
+    ASSERT_EQ(NamedCollectionFactory::instance().get("collection2_copy")->dumpStructure(),
+              R"CONFIG(key4:	value4
+key5:	5
+key6:	6.6
+)CONFIG");
+
+    collection2_copy->set<String>("key4", "value44", true);
+    ASSERT_TRUE(collection2_copy->get<String>("key4") == "value44");
+    ASSERT_TRUE(collection2->get<String>("key4") == "value4");
+
+    collection2_copy->remove("key4");
+    ASSERT_TRUE(collection2_copy->getOrDefault<String>("key4", "N") == "N");
+    ASSERT_TRUE(collection2->getOrDefault<String>("key4", "N") == "value4");
+
+    collection2_copy->set<String>("key4", "value45");
+    ASSERT_TRUE(collection2_copy->getOrDefault<String>("key4", "N") == "value45");
+
+    NamedCollectionFactory::instance().remove("collection2_copy");
+    ASSERT_FALSE(NamedCollectionFactory::instance().exists("collection2_copy"));
+
+    config.reset();
+}
+
+TEST(NamedCollections, NestedConfig)
+{
+    std::string xml(R"CONFIG(<clickhouse>
+    <named_collections>
+        <collection1>
+            <key1>
+                <key1_1>value1</key1_1>
+            </key1>
+            <key2>
+                <key2_1>value2_1</key2_1>
+                <key2_2>
+                    <key2_3>
+                        <key2_4>4</key2_4>
+                        <key2_5>5</key2_5>
+                    </key2_3>
+                </key2_2>
+            </key2>
+        </collection1>
+    </named_collections>
+</clickhouse>)CONFIG");
+
+    Poco::XML::DOMParser dom_parser;
+    Poco::AutoPtr<Poco::XML::Document> document = dom_parser.parseString(xml);
+    Poco::AutoPtr<Poco::Util::XMLConfiguration> config = new Poco::Util::XMLConfiguration(document);
+    NamedCollectionFactory::instance().reload(*config);
+
+    ASSERT_TRUE(NamedCollectionFactory::instance().exists("collection1"));
+
+    auto collection1 = NamedCollectionFactory::instance().get("collection1");
+    ASSERT_TRUE(collection1 != nullptr);
+
+    ASSERT_EQ(collection1->dumpStructure(),
+              R"CONFIG(key1:
+	key1_1:	value1
+key2:
+	key2_1:	value2_1
+	key2_2:
+		key2_3:
+			key2_4:	4
+			key2_5:	5
+)CONFIG");
+
+    ASSERT_EQ(collection1->get<String>("key1.key1_1"), "value1");
+    ASSERT_EQ(collection1->get<String>("key2.key2_1"), "value2_1");
+    ASSERT_EQ(collection1->get<Int64>("key2.key2_2.key2_3.key2_4"), 4);
+    ASSERT_EQ(collection1->get<Int64>("key2.key2_2.key2_3.key2_5"), 5);
+
+}
--- a/src/TableFunctions/ITableFunction.h
+++ b/src/TableFunctions/ITableFunction.h
@ -55,15 +55,17 @@ public:
    virtual ColumnsDescription getActualTableStructure(ContextPtr /*context*/) const = 0;

    /// Check if table function needs a structure hint from SELECT query in case of
-    /// INSERT INTO FUNCTION ... SELECT ...
+    /// INSERT INTO FUNCTION ... SELECT ... and INSERT INTO ... SELECT ... FROM table_function(...)
    /// It's used for schema inference.
    virtual bool needStructureHint() const { return false; }

    /// Set a structure hint from SELECT query in case of
-    /// INSERT INTO FUNCTION ... SELECT ...
+    /// INSERT INTO FUNCTION ... SELECT ... and INSERT INTO ... SELECT ... FROM table_function(...)
    /// This hint could be used not to repeat schema in function arguments.
    virtual void setStructureHint(const ColumnsDescription &) {}

+    virtual bool supportsReadingSubsetOfColumns() { return true; }
+
    /// Create storage according to the query.
    StoragePtr
    execute(const ASTPtr & ast_function, ContextPtr context, const std::string & table_name, ColumnsDescription cached_columns_ = {}, bool use_global_context = false) const;
--- a/src/TableFunctions/ITableFunctionFileLike.cpp
+++ b/src/TableFunctions/ITableFunctionFileLike.cpp
@ -34,6 +34,11 @@ String ITableFunctionFileLike::getFormatFromFirstArgument()
    return FormatFactory::instance().getFormatFromFileName(filename, true);
 }

+bool ITableFunctionFileLike::supportsReadingSubsetOfColumns()
+{
+    return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(format);
+}
+
 void ITableFunctionFileLike::parseArguments(const ASTPtr & ast_function, ContextPtr context)
 {
    /// Parse args
--- a/src/TableFunctions/ITableFunctionFileLike.h
+++ b/src/TableFunctions/ITableFunctionFileLike.h
@ -18,6 +18,8 @@ public:

    void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; }

+    bool supportsReadingSubsetOfColumns() override;
+
 protected:
    void parseArguments(const ASTPtr & ast_function, ContextPtr context) override;
    virtual void parseFirstArguments(const ASTPtr & arg, const ContextPtr & context);
--- a/src/TableFunctions/TableFunctionS3.cpp
+++ b/src/TableFunctions/TableFunctionS3.cpp
@ -146,6 +146,11 @@ ColumnsDescription TableFunctionS3::getActualTableStructure(ContextPtr context)
    return parseColumnsListFromString(configuration.structure, context);
 }

+bool TableFunctionS3::supportsReadingSubsetOfColumns()
+{
+    return FormatFactory::instance().checkIfFormatSupportsSubsetOfColumns(configuration.format);
+}
+
 StoragePtr TableFunctionS3::executeImpl(const ASTPtr & /*ast_function*/, ContextPtr context, const std::string & table_name, ColumnsDescription /*cached_columns*/) const
 {
    Poco::URI uri (configuration.url);
@ -183,6 +188,11 @@ void registerTableFunctionCOS(TableFunctionFactory & factory)
    factory.registerFunction<TableFunctionCOS>();
 }

+void registerTableFunctionOSS(TableFunctionFactory & factory)
+{
+    factory.registerFunction<TableFunctionOSS>();
+}
+
 }

 #endif
--- a/src/TableFunctions/TableFunctionS3.h
+++ b/src/TableFunctions/TableFunctionS3.h
@ -30,6 +30,8 @@ public:

    void setStructureHint(const ColumnsDescription & structure_hint_) override { structure_hint = structure_hint_; }

+    bool supportsReadingSubsetOfColumns() override;
+
 protected:
    friend class TableFunctionS3Cluster;

@ -62,6 +64,18 @@ private:
    const char * getStorageTypeName() const override { return "COSN"; }
 };

+class TableFunctionOSS : public TableFunctionS3
+{
+public:
+    static constexpr auto name = "oss";
+    std::string getName() const override
+    {
+        return name;
+    }
+private:
+    const char * getStorageTypeName() const override { return "OSS"; }
+};
+
 }

 #endif
--- a/src/TableFunctions/registerTableFunctions.cpp
+++ b/src/TableFunctions/registerTableFunctions.cpp
@ -27,6 +27,7 @@ void registerTableFunctions()
    registerTableFunctionS3(factory);
    registerTableFunctionS3Cluster(factory);
    registerTableFunctionCOS(factory);
+    registerTableFunctionOSS(factory);
 #endif

 #if USE_HDFS
--- a/src/TableFunctions/registerTableFunctions.h
+++ b/src/TableFunctions/registerTableFunctions.h
@ -24,6 +24,7 @@ void registerTableFunctionMeiliSearch(TableFunctionFactory & factory);
 void registerTableFunctionS3(TableFunctionFactory & factory);
 void registerTableFunctionS3Cluster(TableFunctionFactory & factory);
 void registerTableFunctionCOS(TableFunctionFactory & factory);
+void registerTableFunctionOSS(TableFunctionFactory & factory);
 #endif

 #if USE_HDFS
--- a/tests/ci/commit_status_helper.py
+++ b/tests/ci/commit_status_helper.py
@ -3,19 +3,21 @@
 import csv
 import os
 import time
-from typing import Optional
+from typing import List
 import logging

 from ci_config import CI_CONFIG, REQUIRED_CHECKS
 from env_helper import GITHUB_REPOSITORY, GITHUB_RUN_URL
 from github import Github
 from github.Commit import Commit
-from pr_info import SKIP_MERGEABLE_CHECK_LABEL
+from github.CommitStatus import CommitStatus
+from pr_info import PRInfo, SKIP_MERGEABLE_CHECK_LABEL

 RETRY = 5
+CommitStatuses = List[CommitStatus]


-def override_status(status, check_name, invert=False):
+def override_status(status: str, check_name: str, invert=False) -> str:
    if CI_CONFIG["tests_config"].get(check_name, {}).get("force_tests", False):
        return "success"

@ -27,24 +29,23 @@ def override_status(status, check_name, invert=False):
    return status


-def get_commit(
-    gh: Github, commit_sha: str, retry_count: int = RETRY
-) -> Optional[Commit]:
+def get_commit(gh: Github, commit_sha: str, retry_count: int = RETRY) -> Commit:
    for i in range(retry_count):
        try:
            repo = gh.get_repo(GITHUB_REPOSITORY)
            commit = repo.get_commit(commit_sha)
-            return commit
+            break
        except Exception as ex:
            if i == retry_count - 1:
                raise ex
            time.sleep(i)

-    # just suppress warning
-    return None
+    return commit


-def post_commit_status(gh, sha, check_name, description, state, report_url):
+def post_commit_status(
+    gh: Github, sha: str, check_name: str, description: str, state: str, report_url: str
+):
    for i in range(RETRY):
        try:
            commit = get_commit(gh, sha, 1)
@ -61,7 +62,9 @@ def post_commit_status(gh, sha, check_name, description, state, report_url):
            time.sleep(i)


-def post_commit_status_to_file(file_path, description, state, report_url):
+def post_commit_status_to_file(
+    file_path: str, description: str, state: str, report_url: str
+):
    if os.path.exists(file_path):
        raise Exception(f'File "{file_path}" already exists!')
    with open(file_path, "w", encoding="utf-8") as f:
@ -69,21 +72,37 @@ def post_commit_status_to_file(file_path, description, state, report_url):
        out.writerow([state, report_url, description])


-def remove_labels(gh, pr_info, labels_names):
+def get_commit_filtered_statuses(commit: Commit) -> CommitStatuses:
+    """
+    Squash statuses to latest state
+    1. context="first", state="success", update_time=1
+    2. context="second", state="success", update_time=2
+    3. context="first", state="failure", update_time=3
+    =========>
+    1. context="second", state="success"
+    2. context="first", state="failure"
+    """
+    filtered = {}
+    for status in sorted(commit.get_statuses(), key=lambda x: x.updated_at):
+        filtered[status.context] = status
+    return list(filtered.values())
+
+
+def remove_labels(gh: Github, pr_info: PRInfo, labels_names: List[str]):
    repo = gh.get_repo(GITHUB_REPOSITORY)
    pull_request = repo.get_pull(pr_info.number)
    for label in labels_names:
        pull_request.remove_from_labels(label)


-def post_labels(gh, pr_info, labels_names):
+def post_labels(gh: Github, pr_info: PRInfo, labels_names: List[str]):
    repo = gh.get_repo(GITHUB_REPOSITORY)
    pull_request = repo.get_pull(pr_info.number)
    for label in labels_names:
        pull_request.add_to_labels(label)


-def fail_mergeable_check(commit, description):
+def fail_mergeable_check(commit: Commit, description: str):
    commit.create_status(
        context="Mergeable Check",
        description=description,
@ -92,7 +111,7 @@ def fail_mergeable_check(commit, description):
    )


-def reset_mergeable_check(commit, description=""):
+def reset_mergeable_check(commit: Commit, description: str = ""):
    commit.create_status(
        context="Mergeable Check",
        description=description,
@ -101,7 +120,7 @@ def reset_mergeable_check(commit, description=""):
    )


-def update_mergeable_check(gh, pr_info, check_name):
+def update_mergeable_check(gh: Github, pr_info: PRInfo, check_name: str):
    if SKIP_MERGEABLE_CHECK_LABEL in pr_info.labels:
        return

--- a/tests/ci/rerun_helper.py
+++ b/tests/ci/rerun_helper.py
@ -1,14 +1,13 @@
 #!/usr/bin/env python3
-from typing import List, Optional
+from typing import Optional

-from commit_status_helper import get_commit
+from commit_status_helper import get_commit, get_commit_filtered_statuses
 from github import Github
 from github.CommitStatus import CommitStatus
 from pr_info import PRInfo

-CommitStatuses = List[CommitStatus]
-

+# TODO: move it to commit_status_helper
 class RerunHelper:
    def __init__(self, gh: Github, pr_info: PRInfo, check_name: str):
        self.gh = gh
@ -18,7 +17,7 @@ class RerunHelper:
        if commit is None:
            raise ValueError(f"unable to receive commit for {pr_info.sha}")
        self.pygh_commit = commit
-        self.statuses = self.ger_filtered_statuses()
+        self.statuses = get_commit_filtered_statuses(commit)

    def is_already_finished_by_status(self) -> bool:
        # currently we agree even for failed statuses
@ -35,20 +34,3 @@ class RerunHelper:
            if self.check_name in status.context:
                return status
        return None
-
-    def ger_filtered_statuses(self) -> CommitStatuses:
-        """
-        Squash statuses to latest state
-        1. context="first", state="success", update_time=1
-        2. context="second", state="success", update_time=2
-        3. context="first", stat="failure", update_time=3
-        =========>
-        1. context="second", state="success"
-        2. context="first", stat="failure"
-        """
-        filt = {}
-        for status in sorted(
-            self.pygh_commit.get_statuses(), key=lambda x: x.updated_at
-        ):
-            filt[status.context] = status
-        return list(filt.values())
--- a/tests/ci/sqlancer_check.py
+++ b/tests/ci/sqlancer_check.py
@ -150,7 +150,7 @@ if __name__ == "__main__":
            os.path.join(workspace_path, "summary.tsv"), "r", encoding="utf-8"
        ) as summary_f:
            for line in summary_f:
-                l = line.split("\t")
+                l = line.rstrip("\n").split("\t")
                test_results.append((l[0], l[1]))

        with open(
--- a/tests/config/install.sh
+++ b/tests/config/install.sh
@ -71,6 +71,7 @@ ln -sf $SRC_PATH/users.d/no_fsync_metadata.xml $DEST_SERVER_PATH/users.d/
 ln -sf $SRC_PATH/users.d/filelog.xml $DEST_SERVER_PATH/users.d/
 ln -sf $SRC_PATH/users.d/enable_blobs_check.xml $DEST_SERVER_PATH/users.d/
 ln -sf $SRC_PATH/users.d/marks.xml $DEST_SERVER_PATH/users.d/
+ln -sf $SRC_PATH/users.d/insert_keeper_retries.xml $DEST_SERVER_PATH/users.d/

 # FIXME DataPartsExchange may hang for http_send_timeout seconds
 # when nobody is going to read from the other side of socket (due to "Fetching of part was cancelled"),
--- a/tests/integration/test_global_overcommit_tracker/configs/global_overcommit_tracker.xml
+++ b/tests/integration/test_global_overcommit_tracker/configs/global_overcommit_tracker.xml
@ -1,3 +1,3 @@
 <clickhouse>
-    <max_server_memory_usage>50000000</max_server_memory_usage>
+    <max_server_memory_usage>2000000000</max_server_memory_usage>
 </clickhouse>
--- a/tests/integration/test_global_overcommit_tracker/test.py
+++ b/tests/integration/test_global_overcommit_tracker/test.py
@ -18,21 +18,31 @@ def start_cluster():
        cluster.shutdown()


-TEST_QUERY_A = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator_for_user=1, memory_usage_overcommit_max_wait_microseconds=500"
-TEST_QUERY_B = "SELECT number FROM numbers(1000) GROUP BY number SETTINGS memory_overcommit_ratio_denominator_for_user=2, memory_usage_overcommit_max_wait_microseconds=500"
+GLOBAL_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=1"
+GLOBAL_TEST_QUERY_B = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS memory_overcommit_ratio_denominator_for_user=80000000"


-def test_overcommited_is_killed():
-    node.query("CREATE USER A")
+def test_global_overcommit():
+    # NOTE: another option is to increase waiting time.
+    if (
+        node.is_built_with_thread_sanitizer()
+        or node.is_built_with_address_sanitizer()
+        or node.is_built_with_memory_sanitizer()
+    ):
+        pytest.skip("doesn't fit in memory limits")
+
+    node.query("CREATE USER IF NOT EXISTS A")
    node.query("GRANT ALL ON *.* TO A")
-    node.query("CREATE USER B")
+    node.query("CREATE USER IF NOT EXISTS B")
    node.query("GRANT ALL ON *.* TO B")

    responses_A = list()
    responses_B = list()
-    for _ in range(500):
-        responses_A.append(node.get_query_request(TEST_QUERY_A, user="A"))
-        responses_B.append(node.get_query_request(TEST_QUERY_B, user="B"))
+    for i in range(100):
+        if i % 2 == 0:
+            responses_A.append(node.get_query_request(GLOBAL_TEST_QUERY_A, user="A"))
+        else:
+            responses_B.append(node.get_query_request(GLOBAL_TEST_QUERY_B, user="B"))

    overcommited_killed = False
    for response in responses_A:
--- a/tests/integration/test_inserts_with_keeper_retries/init.py
+++ b/tests/integration/test_inserts_with_keeper_retries/init.py
--- a/tests/integration/test_inserts_with_keeper_retries/test.py
+++ b/tests/integration/test_inserts_with_keeper_retries/test.py
@ -0,0 +1,100 @@
+#!/usr/bin/env python3
+
+import pytest
+import time
+import threading
+from helpers.cluster import ClickHouseCluster
+from multiprocessing.dummy import Pool
+from helpers.network import PartitionManager
+from helpers.client import QueryRuntimeException
+from helpers.test_tools import assert_eq_with_retry
+
+cluster = ClickHouseCluster(__file__)
+
+node1 = cluster.add_instance("node1", with_zookeeper=True)
+
+
+@pytest.fixture(scope="module")
+def started_cluster():
+    global cluster
+    try:
+        cluster.start()
+        yield cluster
+
+    finally:
+        cluster.shutdown()
+
+
+def test_replica_inserts_with_keeper_restart(started_cluster):
+    try:
+        node1.query(
+            "CREATE TABLE r (a UInt64, b String) ENGINE=ReplicatedMergeTree('/test/r', '0') ORDER BY tuple()"
+        )
+
+        p = Pool(1)
+        zk_stopped_event = threading.Event()
+
+        def zoo_restart(zk_stopped_event):
+            cluster.stop_zookeeper_nodes(["zoo1", "zoo2", "zoo3"])
+            zk_stopped_event.set()
+            cluster.start_zookeeper_nodes(["zoo1", "zoo2", "zoo3"])
+
+        job = p.apply_async(zoo_restart, (zk_stopped_event,))
+
+        zk_stopped_event.wait(90)
+
+        node1.query(
+            "INSERT INTO r SELECT number, toString(number) FROM numbers(10) SETTINGS insert_keeper_max_retries=20"
+        )
+        node1.query(
+            "INSERT INTO r SELECT number, toString(number) FROM numbers(10, 10) SETTINGS insert_keeper_max_retries=20"
+        )
+
+        job.wait()
+        p.close()
+        p.join()
+
+        assert node1.query("SELECT COUNT() FROM r") == "20\n"
+
+    finally:
+        node1.query("DROP TABLE IF EXISTS r SYNC")
+
+
+def test_replica_inserts_with_keeper_disconnect(started_cluster):
+    try:
+        node1.query(
+            "CREATE TABLE r (a UInt64, b String) ENGINE=ReplicatedMergeTree('/test/r', '0') ORDER BY tuple()"
+        )
+
+        p = Pool(1)
+        disconnect_event = threading.Event()
+
+        def keeper_disconnect(node, event):
+            with PartitionManager() as pm:
+                pm.drop_instance_zk_connections(node)
+                event.set()
+
+        job = p.apply_async(
+            keeper_disconnect,
+            (
+                node1,
+                disconnect_event,
+            ),
+        )
+        disconnect_event.wait(90)
+
+        node1.query(
+            "INSERT INTO r SELECT number, toString(number) FROM numbers(10) SETTINGS insert_keeper_max_retries=20"
+        )
+        node1.query(
+            "INSERT INTO r SELECT number, toString(number) FROM numbers(10, 10) SETTINGS insert_keeper_max_retries=20"
+        )
+
+        job.wait()
+        p.close()
+        p.join()
+
+        assert node1.query("SELECT COUNT() FROM r") == "20\n"
+
+    finally:
+        node1.query("DROP TABLE IF EXISTS r SYNC")
--- a/tests/integration/test_keeper_four_word_command/test.py
+++ b/tests/integration/test_keeper_four_word_command/test.py
@ -367,7 +367,7 @@ def test_cmd_stat(started_cluster):
        assert result["Received"] == "10"
        assert result["Sent"] == "10"
        assert int(result["Connections"]) == 1
-        assert int(result["Zxid"]) > 14
+        assert int(result["Zxid"]) >= 10
        assert result["Mode"] == "leader"
        assert result["Node count"] == "13"

--- a/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py
+++ b/tests/integration/test_materialized_mysql_database/materialize_with_ddl.py
@ -875,6 +875,22 @@ def alter_rename_table_with_materialized_mysql_database(
        "1\n2\n3\n4\n5\n",
    )

+    mysql_node.query(
+        "ALTER TABLE test_database_rename_table.test_table_4 RENAME test_database_rename_table.test_table_5"
+    )
+    mysql_node.query(
+        "ALTER TABLE test_database_rename_table.test_table_5 RENAME TO test_database_rename_table.test_table_6"
+    )
+    mysql_node.query(
+        "ALTER TABLE test_database_rename_table.test_table_6 RENAME AS test_database_rename_table.test_table_7"
+    )
+
+    check_query(
+        clickhouse_node,
+        "SELECT * FROM test_database_rename_table.test_table_7 ORDER BY id FORMAT TSV",
+        "1\n2\n3\n4\n5\n",
+    )
+
    clickhouse_node.query("DROP DATABASE test_database_rename_table")
    mysql_node.query("DROP DATABASE test_database_rename_table")

--- a/tests/integration/test_overcommit_tracker/init.py
+++ b/tests/integration/test_overcommit_tracker/init.py
--- a/tests/integration/test_overcommit_tracker/test.py
+++ b/tests/integration/test_overcommit_tracker/test.py
@ -0,0 +1,49 @@
+import pytest
+
+from helpers.cluster import ClickHouseCluster
+
+cluster = ClickHouseCluster(__file__)
+
+node = cluster.add_instance("node")
+
+
+@pytest.fixture(scope="module", autouse=True)
+def start_cluster():
+    try:
+        cluster.start()
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+USER_TEST_QUERY_A = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS max_memory_usage_for_user=2000000000,memory_overcommit_ratio_denominator=1"
+USER_TEST_QUERY_B = "SELECT groupArray(number) FROM numbers(2500000) SETTINGS max_memory_usage_for_user=2000000000,memory_overcommit_ratio_denominator=80000000"
+
+
+def test_user_overcommit():
+    node.query("CREATE USER IF NOT EXISTS A")
+    node.query("GRANT ALL ON *.* TO A")
+
+    responses_A = list()
+    responses_B = list()
+    for i in range(100):
+        if i % 2 == 0:
+            responses_A.append(node.get_query_request(USER_TEST_QUERY_A, user="A"))
+        else:
+            responses_B.append(node.get_query_request(USER_TEST_QUERY_B, user="A"))
+
+    overcommited_killed = False
+    for response in responses_A:
+        _, err = response.get_answer_and_error()
+        if "MEMORY_LIMIT_EXCEEDED" in err:
+            overcommited_killed = True
+    finished = False
+    for response in responses_B:
+        _, err = response.get_answer_and_error()
+        if err == "":
+            finished = True
+
+    assert overcommited_killed, "no overcommited task was killed"
+    assert finished, "all tasks are killed"
+
+    node.query("DROP USER IF EXISTS A")
--- a/tests/integration/test_restore_replica/test.py
+++ b/tests/integration/test_restore_replica/test.py
@ -7,6 +7,9 @@ from helpers.test_tools import assert_eq_with_retry


 def fill_nodes(nodes):
+    for node in nodes:
+        node.query("DROP TABLE IF EXISTS test SYNC")
+
    for node in nodes:
        node.query(
            """
@ -29,11 +32,7 @@ nodes = [node_1, node_2, node_3]


 def fill_table():
-    node_1.query("TRUNCATE TABLE test")
-
-    for node in nodes:
-        node.query("SYSTEM SYNC REPLICA test")
-
+    fill_nodes(nodes)
    check_data(0, 0)

    # it will create multiple parts in each partition and probably cause merges
--- a/tests/integration/test_s3_cluster/configs/cluster.xml
+++ b/tests/integration/test_s3_cluster/configs/cluster.xml
@ -20,6 +20,21 @@
        </shard>
    </cluster_simple>

+    <cluster_non_existent_port>
+        <shard>
+            <replica>
+                <host>s0_0_0</host>
+                <port>9000</port>
+            </replica>
+        </shard>
+        <shard>
+            <replica>
+                <host>s0_0_0</host>
+                <port>19000</port>
+            </replica>
+        </shard>
+    </cluster_non_existent_port>
+
    </remote_servers>
    <macros>
        <default_cluster_macro>cluster_simple</default_cluster_macro>
--- a/tests/integration/test_s3_cluster/test.py
+++ b/tests/integration/test_s3_cluster/test.py
@ -195,3 +195,32 @@ def test_ambiguous_join(started_cluster):
    """
    )
    assert "AMBIGUOUS_COLUMN_NAME" not in result
+
+
+def test_skip_unavailable_shards(started_cluster):
+    node = started_cluster.instances["s0_0_0"]
+    result = node.query(
+        """
+    SELECT count(*) from s3Cluster(
+        'cluster_non_existent_port',
+        'http://minio1:9001/root/data/clickhouse/part1.csv', 
+        'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
+    SETTINGS skip_unavailable_shards = 1
+    """
+    )
+
+    assert result == "10\n"
+
+
+def test_unskip_unavailable_shards(started_cluster):
+    node = started_cluster.instances["s0_0_0"]
+    error = node.query_and_get_error(
+        """
+    SELECT count(*) from s3Cluster(
+        'cluster_non_existent_port',
+        'http://minio1:9001/root/data/clickhouse/part1.csv', 
+        'minio', 'minio123', 'CSV', 'name String, value UInt32, polygon Array(Array(Tuple(Float64, Float64)))')
+    """
+    )
+
+    assert "NETWORK_ERROR" in error
--- a/tests/queries/0_stateless/00121_drop_column_zookeeper.sql
+++ b/tests/queries/0_stateless/00121_drop_column_zookeeper.sql
@ -1,14 +1,14 @@
 -- Tags: zookeeper, no-replicated-database
 -- Tag no-replicated-database: Old syntax is not allowed

-DROP TABLE IF EXISTS alter_00121;
+DROP TABLE IF EXISTS alter_00121 SYNC;
 set allow_deprecated_syntax_for_merge_tree=1;
 CREATE TABLE alter_00121 (d Date, x UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/alter_00121/t1', 'r1', d, (d), 8192);

 INSERT INTO alter_00121 VALUES ('2014-01-01', 1);
 ALTER TABLE alter_00121 DROP COLUMN x;

-DROP TABLE alter_00121;
+DROP TABLE alter_00121 SYNC;

 CREATE TABLE alter_00121 (d Date) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/alter_00121/t2', 'r1', d, (d), 8192);

@ -23,4 +23,4 @@ SELECT * FROM alter_00121 ORDER BY d;
 ALTER TABLE alter_00121 DROP COLUMN x;
 SELECT * FROM alter_00121 ORDER BY d;

-DROP TABLE alter_00121;
+DROP TABLE alter_00121 SYNC;
--- a/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper_long.sql
+++ b/tests/queries/0_stateless/00502_custom_partitioning_replicated_zookeeper_long.sql
@ -1,11 +1,12 @@
 -- Tags: long, replica

 SET replication_alter_partitions_sync = 2;
+SET insert_keeper_fault_injection_probability=0;

 SELECT '*** Not partitioned ***';

-DROP TABLE IF EXISTS not_partitioned_replica1_00502;
-DROP TABLE IF EXISTS not_partitioned_replica2_00502;
+DROP TABLE IF EXISTS not_partitioned_replica1_00502 SYNC;
+DROP TABLE IF EXISTS not_partitioned_replica2_00502 SYNC;
 CREATE TABLE not_partitioned_replica1_00502(x UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/not_partitioned_00502', '1') ORDER BY x;
 CREATE TABLE not_partitioned_replica2_00502(x UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/not_partitioned_00502', '2') ORDER BY x;

@ -14,7 +15,7 @@ INSERT INTO not_partitioned_replica1_00502 VALUES (4), (5);

 SELECT 'Parts before OPTIMIZE:';
 SELECT partition, name FROM system.parts WHERE database = currentDatabase() AND table = 'not_partitioned_replica1_00502' AND active ORDER BY name;
-SYSTEM SYNC REPLICA not_partitioned_replica1_00502;
+SYSTEM SYNC REPLICA not_partitioned_replica2_00502;
 OPTIMIZE TABLE not_partitioned_replica1_00502 PARTITION tuple() FINAL;
 SELECT 'Parts after OPTIMIZE:';
 SELECT partition, name FROM system.parts WHERE database = currentDatabase() AND table = 'not_partitioned_replica2_00502' AND active ORDER BY name;
@ -25,13 +26,13 @@ ALTER TABLE not_partitioned_replica1_00502 DETACH PARTITION ID 'all';
 SELECT 'Sum after DETACH PARTITION:';
 SELECT sum(x) FROM not_partitioned_replica2_00502;

-DROP TABLE not_partitioned_replica1_00502;
-DROP TABLE not_partitioned_replica2_00502;
+DROP TABLE not_partitioned_replica1_00502 SYNC;
+DROP TABLE not_partitioned_replica2_00502 SYNC;

 SELECT '*** Partitioned by week ***';

-DROP TABLE IF EXISTS partitioned_by_week_replica1;
-DROP TABLE IF EXISTS partitioned_by_week_replica2;
+DROP TABLE IF EXISTS partitioned_by_week_replica1 SYNC;
+DROP TABLE IF EXISTS partitioned_by_week_replica2 SYNC;
 CREATE TABLE partitioned_by_week_replica1(d Date, x UInt8) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test/partitioned_by_week_00502', '1') PARTITION BY toMonday(d) ORDER BY x;
 CREATE TABLE partitioned_by_week_replica2(d Date, x UInt8) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test/partitioned_by_week_00502', '2') PARTITION BY toMonday(d) ORDER BY x;

@ -41,7 +42,7 @@ INSERT INTO partitioned_by_week_replica1 VALUES ('2000-01-03', 4), ('2000-01-03'

 SELECT 'Parts before OPTIMIZE:'; -- Select parts on the first replica to avoid waiting for replication.
 SELECT partition, name FROM system.parts WHERE database = currentDatabase() AND table = 'partitioned_by_week_replica1' AND active ORDER BY name;
-SYSTEM SYNC REPLICA partitioned_by_week_replica1;
+SYSTEM SYNC REPLICA partitioned_by_week_replica2;
 OPTIMIZE TABLE partitioned_by_week_replica1 PARTITION '2000-01-03' FINAL;
 SELECT 'Parts after OPTIMIZE:'; -- After OPTIMIZE with replication_alter_partitions_sync=2 replicas must be in sync.
 SELECT partition, name FROM system.parts WHERE database = currentDatabase() AND table = 'partitioned_by_week_replica2' AND active ORDER BY name;
@ -52,13 +53,13 @@ ALTER TABLE partitioned_by_week_replica1 DROP PARTITION '1999-12-27';
 SELECT 'Sum after DROP PARTITION:';
 SELECT sum(x) FROM partitioned_by_week_replica2;

-DROP TABLE partitioned_by_week_replica1;
-DROP TABLE partitioned_by_week_replica2;
+DROP TABLE partitioned_by_week_replica1 SYNC;
+DROP TABLE partitioned_by_week_replica2 SYNC;

 SELECT '*** Partitioned by a (Date, UInt8) tuple ***';

-DROP TABLE IF EXISTS partitioned_by_tuple_replica1_00502;
-DROP TABLE IF EXISTS partitioned_by_tuple_replica2_00502;
+DROP TABLE IF EXISTS partitioned_by_tuple_replica1_00502 SYNC;
+DROP TABLE IF EXISTS partitioned_by_tuple_replica2_00502 SYNC;
 CREATE TABLE partitioned_by_tuple_replica1_00502(d Date, x UInt8, y UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/partitioned_by_tuple_00502', '1') ORDER BY x PARTITION BY (d, x);
 CREATE TABLE partitioned_by_tuple_replica2_00502(d Date, x UInt8, y UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test/partitioned_by_tuple_00502', '2') ORDER BY x PARTITION BY (d, x);

@ -67,7 +68,7 @@ INSERT INTO partitioned_by_tuple_replica1_00502 VALUES ('2000-01-02', 1, 4), ('2

 SELECT 'Parts before OPTIMIZE:';
 SELECT partition, name FROM system.parts WHERE database = currentDatabase() AND table = 'partitioned_by_tuple_replica1_00502' AND active ORDER BY name;
-SYSTEM SYNC REPLICA partitioned_by_tuple_replica1_00502;
+SYSTEM SYNC REPLICA partitioned_by_tuple_replica2_00502;
 OPTIMIZE TABLE partitioned_by_tuple_replica1_00502 PARTITION ('2000-01-01', 1) FINAL;
 OPTIMIZE TABLE partitioned_by_tuple_replica1_00502 PARTITION ('2000-01-02', 1) FINAL;
 SELECT 'Parts after OPTIMIZE:';
@ -79,13 +80,13 @@ ALTER TABLE partitioned_by_tuple_replica1_00502 DETACH PARTITION ID '20000101-1'
 SELECT 'Sum after DETACH PARTITION:';
 SELECT sum(y) FROM partitioned_by_tuple_replica2_00502;

-DROP TABLE partitioned_by_tuple_replica1_00502;
-DROP TABLE partitioned_by_tuple_replica2_00502;
+DROP TABLE partitioned_by_tuple_replica1_00502 SYNC;
+DROP TABLE partitioned_by_tuple_replica2_00502 SYNC;

 SELECT '*** Partitioned by String ***';

-DROP TABLE IF EXISTS partitioned_by_string_replica1;
-DROP TABLE IF EXISTS partitioned_by_string_replica2;
+DROP TABLE IF EXISTS partitioned_by_string_replica1 SYNC;
+DROP TABLE IF EXISTS partitioned_by_string_replica2 SYNC;
 CREATE TABLE partitioned_by_string_replica1(s String, x UInt8) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test/partitioned_by_string_00502', '1') PARTITION BY s ORDER BY x;
 CREATE TABLE partitioned_by_string_replica2(s String, x UInt8) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test/partitioned_by_string_00502', '2') PARTITION BY s ORDER BY x;

@ -105,13 +106,13 @@ ALTER TABLE partitioned_by_string_replica1 DROP PARTITION 'bbb';
 SELECT 'Sum after DROP PARTITION:';
 SELECT sum(x) FROM partitioned_by_string_replica2;

-DROP TABLE partitioned_by_string_replica1;
-DROP TABLE partitioned_by_string_replica2;
+DROP TABLE partitioned_by_string_replica1 SYNC;
+DROP TABLE partitioned_by_string_replica2 SYNC;

 SELECT '*** Table without columns with fixed size ***';

-DROP TABLE IF EXISTS without_fixed_size_columns_replica1;
-DROP TABLE IF EXISTS without_fixed_size_columns_replica2;
+DROP TABLE IF EXISTS without_fixed_size_columns_replica1 SYNC;
+DROP TABLE IF EXISTS without_fixed_size_columns_replica2 SYNC;
 CREATE TABLE without_fixed_size_columns_replica1(s String) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test/without_fixed_size_columns_00502', '1') PARTITION BY length(s) ORDER BY s;
 CREATE TABLE without_fixed_size_columns_replica2(s String) ENGINE ReplicatedMergeTree('/clickhouse/tables/{database}/test/without_fixed_size_columns_00502', '2') PARTITION BY length(s) ORDER BY s;

@ -130,5 +131,5 @@ ALTER TABLE without_fixed_size_columns_replica1 DROP PARTITION 1;
 SELECT 'After DROP PARTITION:';
 SELECT * FROM without_fixed_size_columns_replica2 ORDER BY s;

-DROP TABLE without_fixed_size_columns_replica1;
-DROP TABLE without_fixed_size_columns_replica2;
+DROP TABLE without_fixed_size_columns_replica1 SYNC;
+DROP TABLE without_fixed_size_columns_replica2 SYNC;
--- a/tests/queries/0_stateless/00652_replicated_mutations_zookeeper.sh
+++ b/tests/queries/0_stateless/00652_replicated_mutations_zookeeper.sh
@ -9,8 +9,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # shellcheck source=./mergetree_mutations.lib
 . "$CURDIR"/mergetree_mutations.lib

-${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS mutations_r1"
-${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS mutations_r2"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS mutations_r1 SYNC"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS mutations_r2 SYNC"

 ${CLICKHOUSE_CLIENT} --allow_deprecated_syntax_for_merge_tree=1 --query="CREATE TABLE mutations_r1(d Date, x UInt32, s String, m MATERIALIZED x + 2) ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/mutations', 'r1', d, intDiv(x, 10), 8192)"
 ${CLICKHOUSE_CLIENT} --allow_deprecated_syntax_for_merge_tree=1 --query="CREATE TABLE mutations_r2(d Date, x UInt32, s String, m MATERIALIZED x + 2) ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/mutations', 'r2', d, intDiv(x, 10), 8192)"
@ -19,9 +19,9 @@ ${CLICKHOUSE_CLIENT} --allow_deprecated_syntax_for_merge_tree=1 --query="CREATE
 ${CLICKHOUSE_CLIENT} --query="ALTER TABLE mutations_r1 DELETE WHERE x = 1 SETTINGS mutations_sync = 2"

 # Insert some data
-${CLICKHOUSE_CLIENT} --query="INSERT INTO mutations_r1(d, x, s) VALUES \
+${CLICKHOUSE_CLIENT} --insert_keeper_fault_injection_probability=0 --query="INSERT INTO mutations_r1(d, x, s) VALUES \
    ('2000-01-01', 1, 'a')"
-${CLICKHOUSE_CLIENT} --query="INSERT INTO mutations_r1(d, x, s) VALUES \
+${CLICKHOUSE_CLIENT} --insert_keeper_fault_injection_probability=0 --query="INSERT INTO mutations_r1(d, x, s) VALUES \
    ('2000-01-01', 2, 'b'), ('2000-01-01', 3, 'c'), ('2000-01-01', 4, 'd') \
    ('2000-02-01', 2, 'b'), ('2000-02-01', 3, 'c'), ('2000-02-01', 4, 'd')"

@ -35,7 +35,7 @@ ${CLICKHOUSE_CLIENT} --query="ALTER TABLE mutations_r1 DELETE WHERE s = 'd' SETT
 ${CLICKHOUSE_CLIENT} --query="ALTER TABLE mutations_r1 DELETE WHERE m = 3 SETTINGS mutations_sync = 2"

 # Insert more data
-${CLICKHOUSE_CLIENT} --query="INSERT INTO mutations_r1(d, x, s) VALUES \
+${CLICKHOUSE_CLIENT} --insert_keeper_fault_injection_probability=0 --query="INSERT INTO mutations_r1(d, x, s) VALUES \
    ('2000-01-01', 5, 'e'), ('2000-02-01', 5, 'e')"

 ${CLICKHOUSE_CLIENT} --query "SYSTEM SYNC REPLICA mutations_r2"
@ -49,8 +49,8 @@ ${CLICKHOUSE_CLIENT} --query="SELECT mutation_id, command, block_numbers.partiti

 ${CLICKHOUSE_CLIENT} --query="SELECT '*** Test mutations cleaner ***'"

-${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS mutations_cleaner_r1"
-${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS mutations_cleaner_r2"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS mutations_cleaner_r1 SYNC"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE IF EXISTS mutations_cleaner_r2 SYNC"

 # Create 2 replicas with finished_mutations_to_keep = 2
 ${CLICKHOUSE_CLIENT} --query="CREATE TABLE mutations_cleaner_r1(x UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/mutations_cleaner', 'r1') ORDER BY x SETTINGS \
@ -63,7 +63,7 @@ ${CLICKHOUSE_CLIENT} --query="CREATE TABLE mutations_cleaner_r2(x UInt32) ENGINE
    cleanup_delay_period_random_add = 0"

 # Insert some data
-${CLICKHOUSE_CLIENT} --query="INSERT INTO mutations_cleaner_r1(x) VALUES (1), (2), (3), (4), (5)"
+${CLICKHOUSE_CLIENT} --insert_keeper_fault_injection_probability=0 --query="INSERT INTO mutations_cleaner_r1(x) VALUES (1), (2), (3), (4), (5)"

 # Add some mutations and wait for their execution
 ${CLICKHOUSE_CLIENT} --query="ALTER TABLE mutations_cleaner_r1 DELETE WHERE x = 1 SETTINGS mutations_sync = 2"
@ -80,8 +80,8 @@ sleep 1.5
 # Check that the first mutation is cleaned
 ${CLICKHOUSE_CLIENT} --query="SELECT mutation_id, command, is_done FROM system.mutations WHERE database = '$CLICKHOUSE_DATABASE' and table = 'mutations_cleaner_r2' ORDER BY mutation_id"

-${CLICKHOUSE_CLIENT} --query="DROP TABLE mutations_r1"
-${CLICKHOUSE_CLIENT} --query="DROP TABLE mutations_r2"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE mutations_r1 SYNC"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE mutations_r2 SYNC"

-${CLICKHOUSE_CLIENT} --query="DROP TABLE mutations_cleaner_r1"
-${CLICKHOUSE_CLIENT} --query="DROP TABLE mutations_cleaner_r2"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE mutations_cleaner_r1 SYNC"
+${CLICKHOUSE_CLIENT} --query="DROP TABLE mutations_cleaner_r2 SYNC"
--- a/tests/queries/0_stateless/00661_optimize_final_replicated_without_partition_zookeeper.sql
+++ b/tests/queries/0_stateless/00661_optimize_final_replicated_without_partition_zookeeper.sql
@ -2,8 +2,8 @@

 SET optimize_on_insert = 0;

-DROP TABLE IF EXISTS partitioned_by_tuple_replica1_00661;
-DROP TABLE IF EXISTS partitioned_by_tuple_replica2_00661;
+DROP TABLE IF EXISTS partitioned_by_tuple_replica1_00661 SYNC;
+DROP TABLE IF EXISTS partitioned_by_tuple_replica2_00661 SYNC;
 CREATE TABLE partitioned_by_tuple_replica1_00661(d Date, x UInt8, w String, y UInt8) ENGINE = ReplicatedSummingMergeTree('/clickhouse/tables/{database}/test/partitioned_by_tuple_00661', '1') PARTITION BY (d, x) ORDER BY (d, x, w);
 CREATE TABLE partitioned_by_tuple_replica2_00661(d Date, x UInt8, w String, y UInt8) ENGINE = ReplicatedSummingMergeTree('/clickhouse/tables/{database}/test/partitioned_by_tuple_00661', '2') PARTITION BY (d, x) ORDER BY (d, x, w);

@ -21,5 +21,5 @@ OPTIMIZE TABLE partitioned_by_tuple_replica1_00661 FINAL;
 SYSTEM SYNC REPLICA partitioned_by_tuple_replica2_00661;
 SELECT * FROM partitioned_by_tuple_replica2_00661 ORDER BY d, x, w, y;

-DROP TABLE partitioned_by_tuple_replica1_00661;
-DROP TABLE partitioned_by_tuple_replica2_00661;
+DROP TABLE partitioned_by_tuple_replica1_00661 SYNC;
+DROP TABLE partitioned_by_tuple_replica2_00661 SYNC;
--- a/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sh
+++ b/tests/queries/0_stateless/00715_fetch_merged_or_mutated_part_zookeeper.sh
@ -9,8 +9,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)


 ${CLICKHOUSE_CLIENT} -n --query="
-    DROP TABLE IF EXISTS fetches_r1;
-    DROP TABLE IF EXISTS fetches_r2"
+    DROP TABLE IF EXISTS fetches_r1 SYNC;
+    DROP TABLE IF EXISTS fetches_r2 SYNC"

 ${CLICKHOUSE_CLIENT} --query="CREATE TABLE fetches_r1(x UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/fetches', 'r1') ORDER BY x"
 ${CLICKHOUSE_CLIENT} --query="CREATE TABLE fetches_r2(x UInt32) ENGINE ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/fetches', 'r2') ORDER BY x \
@ -18,6 +18,7 @@ ${CLICKHOUSE_CLIENT} --query="CREATE TABLE fetches_r2(x UInt32) ENGINE Replicate
             prefer_fetch_merged_part_size_threshold=0"

 ${CLICKHOUSE_CLIENT} -n --query="
+    SET insert_keeper_fault_injection_probability=0;
    INSERT INTO fetches_r1 VALUES (1);
    INSERT INTO fetches_r1 VALUES (2);
    INSERT INTO fetches_r1 VALUES (3)"
@ -51,5 +52,5 @@ ${CLICKHOUSE_CLIENT} --query="SELECT '*** Check data after fetch/clone of mutate
 ${CLICKHOUSE_CLIENT} --query="SELECT _part, * FROM fetches_r2 ORDER BY x"

 ${CLICKHOUSE_CLIENT} -n --query="
-    DROP TABLE fetches_r1;
-    DROP TABLE fetches_r2"
+    DROP TABLE fetches_r1 SYNC;
+    DROP TABLE fetches_r2 SYNC"
--- a/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh
+++ b/tests/queries/0_stateless/00814_replicated_minimalistic_part_header_zookeeper.sh
@ -12,6 +12,7 @@ $CLICKHOUSE_CLIENT -nm -q "
 DROP TABLE IF EXISTS part_header_r1;
 DROP TABLE IF EXISTS part_header_r2;

+SET insert_keeper_fault_injection_probability=0; -- disable fault injection; part ids are non-deterministic in case of insert retries
 SET replication_alter_partitions_sync = 2;

 CREATE TABLE part_header_r1(x UInt32, y UInt32)
--- a/tests/queries/0_stateless/01037_zookeeper_check_table_empty_pk.sql
+++ b/tests/queries/0_stateless/01037_zookeeper_check_table_empty_pk.sql
@ -1,9 +1,10 @@
 -- Tags: zookeeper

+SET insert_keeper_fault_injection_probability=0; -- disable fault injection; part ids are non-deterministic in case of insert retries
 SET check_query_single_value_result = 0;
 SET send_logs_level = 'fatal';

-DROP TABLE IF EXISTS mt_without_pk;
+DROP TABLE IF EXISTS mt_without_pk SYNC;

 CREATE TABLE mt_without_pk (SomeField1 Int64, SomeField2 Double) ENGINE = MergeTree() ORDER BY tuple();

@ -11,9 +12,9 @@ INSERT INTO mt_without_pk VALUES (1, 2);

 CHECK TABLE mt_without_pk;

-DROP TABLE IF EXISTS mt_without_pk;
+DROP TABLE IF EXISTS mt_without_pk SYNC;

-DROP TABLE IF EXISTS replicated_mt_without_pk;
+DROP TABLE IF EXISTS replicated_mt_without_pk SYNC;

 CREATE TABLE replicated_mt_without_pk (SomeField1 Int64, SomeField2 Double) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test_01037/replicated_mt_without_pk', '1') ORDER BY tuple();

@ -21,4 +22,4 @@ INSERT INTO replicated_mt_without_pk VALUES (1, 2);

 CHECK TABLE replicated_mt_without_pk;

-DROP TABLE IF EXISTS replicated_mt_without_pk;
+DROP TABLE IF EXISTS replicated_mt_without_pk SYNC;
--- a/tests/queries/0_stateless/01045_zookeeper_system_mutations_with_parts_names.sh
+++ b/tests/queries/0_stateless/01045_zookeeper_system_mutations_with_parts_names.sh
@ -52,7 +52,8 @@ ${CLICKHOUSE_CLIENT} --query="CREATE TABLE replicated_table_for_mutations(k UInt

 ${CLICKHOUSE_CLIENT} --query="SYSTEM STOP MERGES replicated_table_for_mutations"

-${CLICKHOUSE_CLIENT} --query="INSERT INTO replicated_table_for_mutations select number, number from numbers(100000)"
+# test relies on part ids, which are non-deterministic with keeper fault injection, so disable it
+${CLICKHOUSE_CLIENT} --insert_keeper_fault_injection_probability=0 --query="INSERT INTO replicated_table_for_mutations select number, number from numbers(100000)"

 ${CLICKHOUSE_CLIENT} --query="SELECT sum(v1) FROM replicated_table_for_mutations"

--- a/tests/queries/0_stateless/01090_zookeeper_mutations_and_insert_quorum_long.sql
+++ b/tests/queries/0_stateless/01090_zookeeper_mutations_and_insert_quorum_long.sql
@ -1,8 +1,8 @@
 -- Tags: long, zookeeper, no-replicated-database
 -- Tag no-replicated-database: Fails due to additional replicas or shards

-DROP TABLE IF EXISTS mutations_and_quorum1;
-DROP TABLE IF EXISTS mutations_and_quorum2;
+DROP TABLE IF EXISTS mutations_and_quorum1 SYNC;
+DROP TABLE IF EXISTS mutations_and_quorum2 SYNC;

 CREATE TABLE mutations_and_quorum1 (`server_date` Date, `something` String) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test_01090/mutations_and_quorum', '1') PARTITION BY toYYYYMM(server_date) ORDER BY (server_date, something);
 CREATE TABLE mutations_and_quorum2 (`server_date` Date, `something` String) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{database}/test_01090/mutations_and_quorum', '2') PARTITION BY toYYYYMM(server_date) ORDER BY (server_date, something);
@ -10,6 +10,9 @@ CREATE TABLE mutations_and_quorum2 (`server_date` Date, `something` String) ENGI
 -- Should not be larger then 600e6 (default timeout in clickhouse-test)
 SET insert_quorum=2, insert_quorum_parallel=0, insert_quorum_timeout=300e3;

+SET insert_keeper_max_retries=100;
+SET insert_keeper_retry_max_backoff_ms=10;
+
 INSERT INTO mutations_and_quorum1 VALUES ('2019-01-01', 'test1'), ('2019-02-01', 'test2'), ('2019-03-01', 'test3'), ('2019-04-01', 'test4'), ('2019-05-01', 'test1'), ('2019-06-01', 'test2'), ('2019-07-01', 'test3'), ('2019-08-01', 'test4'), ('2019-09-01', 'test1'), ('2019-10-01', 'test2'), ('2019-11-01', 'test3'), ('2019-12-01', 'test4');

 ALTER TABLE mutations_and_quorum1 DELETE WHERE something = 'test1' SETTINGS mutations_sync=2;
@ -19,5 +22,5 @@ SELECT COUNT() FROM mutations_and_quorum2;

 SELECT COUNT() FROM system.mutations WHERE database = currentDatabase() AND table like 'mutations_and_quorum%' and is_done = 0;

-DROP TABLE IF EXISTS mutations_and_quorum1;
-DROP TABLE IF EXISTS mutations_and_quorum2;
+DROP TABLE IF EXISTS mutations_and_quorum1 SYNC;
+DROP TABLE IF EXISTS mutations_and_quorum2 SYNC;
--- a/tests/queries/0_stateless/01135_default_and_alter_zookeeper.sql
+++ b/tests/queries/0_stateless/01135_default_and_alter_zookeeper.sql
@ -1,6 +1,6 @@
 -- Tags: zookeeper

-DROP TABLE IF EXISTS default_table;
+DROP TABLE IF EXISTS default_table SYNC;

 CREATE TABLE default_table
 (
@ -26,4 +26,4 @@ ALTER TABLE default_table MODIFY COLUMN enum_column Enum8('undefined' = 0, 'fox'

 SHOW CREATE TABLE default_table;

-DROP TABLE IF EXISTS default_table;
+DROP TABLE IF EXISTS default_table SYNC;
--- a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql
+++ b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql
@ -1,5 +1,7 @@
 -- Tags: zookeeper

+SET insert_keeper_fault_injection_probability=0; -- disable fault injection; part ids are non-deterministic in case of insert retries
+
 set send_logs_level='error';
 drop table if exists mt;
 drop table if exists rmt sync;
--- a/tests/queries/0_stateless/01158_zookeeper_log_long.sql
+++ b/tests/queries/0_stateless/01158_zookeeper_log_long.sql
@ -1,13 +1,16 @@
 -- Tags: long, zookeeper, no-replicated-database, no-polymorphic-parts
 -- Tag no-replicated-database: Fails due to additional replicas or shards

-drop table if exists rmt;
+SET insert_keeper_fault_injection_probability=0; -- disable fault injection; part ids are non-deterministic in case of insert retries
+
+drop table if exists rmt sync;
 -- cleanup code will perform extra Exists
 -- (so the .reference will not match)
 create table rmt (n int) engine=ReplicatedMergeTree('/test/01158/{database}/rmt', '1') order by n settings cleanup_delay_period=86400, replicated_can_become_leader=0;
 system sync replica rmt;
 insert into rmt values (1);
 insert into rmt values (1);
+system sync replica rmt;
 system flush logs;

 select 'log';
@ -30,7 +33,7 @@ from system.zookeeper_log
 where (session_id, xid) in (select session_id, xid from system.zookeeper_log where path like '/test/01158/' || currentDatabase() || '/rmt/blocks/%' and op_num not in (1, 12, 500))
 order by xid, type, request_idx;

-drop table rmt;
+drop table rmt sync;

 system flush logs;
 select 'duration_ms';
--- a/tests/queries/0_stateless/01201_drop_column_compact_part_replicated_zookeeper_long.sql
+++ b/tests/queries/0_stateless/01201_drop_column_compact_part_replicated_zookeeper_long.sql
@ -1,5 +1,9 @@
 -- Tags: long, replica

+-- In case of keeper fault injection on insert, set a bigger number of retries because of partitions
+set insert_keeper_max_retries=100;
+set insert_keeper_retry_max_backoff_ms=10;
+
 -- Testing basic functionality with compact parts
 set replication_alter_partitions_sync = 2;
 drop table if exists mt_compact;
--- a/Show More
+++ b/Show More