Merge branch 'master' into clang-tidy-11

2024-11-24 16:42:05 +00:00 · 2020-09-19 00:02:44 +03:00 · 2020-09-19 00:02:44 +03:00 · cac65b38f2
commit cac65b38f2
parent 8cf4f0fdb3 7aa6af5a13
187 changed files with 3138 additions and 1007 deletions
--- a/.gitmodules
+++ b/.gitmodules
@ -37,7 +37,7 @@
 	url = https://github.com/ClickHouse-Extras/mariadb-connector-c.git
 [submodule "contrib/jemalloc"]
 	path = contrib/jemalloc
-	url = https://github.com/jemalloc/jemalloc.git
+	url = https://github.com/ClickHouse-Extras/jemalloc.git
 [submodule "contrib/unixodbc"]
 	path = contrib/unixodbc
 	url = https://github.com/ClickHouse-Extras/UnixODBC.git
--- a/base/common/CMakeLists.txt
+++ b/base/common/CMakeLists.txt
@ -18,6 +18,7 @@ set (SRCS
    terminalColors.cpp
    errnoToString.cpp
    getResource.cpp
+    StringRef.cpp
 )

 if (ENABLE_REPLXX)
--- a/base/common/StringRef.cpp
+++ b/base/common/StringRef.cpp
@ -0,0 +1,13 @@
+#include <ostream>
+
+#include "StringRef.h"
+
+
+std::ostream & operator<<(std::ostream & os, const StringRef & str)
+{
+    if (str.data)
+        os.write(str.data, str.size);
+
+    return os;
+}
+
--- a/base/common/StringRef.h
+++ b/base/common/StringRef.h
@ -4,7 +4,7 @@
 #include <string>
 #include <vector>
 #include <functional>
-#include <ostream>
+#include <iosfwd>

 #include <common/types.h>
 #include <common/unaligned.h>
@ -322,10 +322,4 @@ inline bool operator==(StringRef lhs, const char * rhs)
    return true;
 }

-inline std::ostream & operator<<(std::ostream & os, const StringRef & str)
-{
-    if (str.data)
-        os.write(str.data, str.size);
-
-    return os;
-}
+std::ostream & operator<<(std::ostream & os, const StringRef & str);
--- a/base/common/wide_integer.h
+++ b/base/common/wide_integer.h
@ -54,8 +54,8 @@ template <size_t Bits, typename Signed>
 class integer
 {
 public:
-    using base_type = uint8_t;
-    using signed_base_type = int8_t;
+    using base_type = uint64_t;
+    using signed_base_type = int64_t;

    // ctors
    integer() = default;
@ -127,7 +127,7 @@ private:
    friend class std::numeric_limits<integer<Bits, signed>>;
    friend class std::numeric_limits<integer<Bits, unsigned>>;

-    base_type m_arr[_impl::arr_size];
+    base_type items[_impl::item_count];
 };

 template <typename T>
--- a/base/common/wide_integer_impl.h
+++ b/base/common/wide_integer_impl.h
--- a/base/common/ya.make
+++ b/base/common/ya.make
@ -53,6 +53,7 @@ SRCS(
    setTerminalEcho.cpp
    shift10.cpp
    sleep.cpp
+    StringRef.cpp
    terminalColors.cpp

 )
--- a/cmake/autogenerated_versions.txt
+++ b/cmake/autogenerated_versions.txt
@ -1,5 +1,5 @@
 # This strings autochanged from release_lib.sh:
-SET(VERSION_REVISION 54440)
+SET(VERSION_REVISION 54441)
 SET(VERSION_MAJOR 20)
 SET(VERSION_MINOR 10)
 SET(VERSION_PATCH 1)
--- a/contrib/jemalloc
+++ b/contrib/jemalloc
@ -1 +1 @@
-Subproject commit ea6b3e973b477b8061e0076bb257dbd7f3faa756
+Subproject commit 026764f19995c53583ab25a3b9c06a2fd74e4689
--- a/debian/rules
+++ b/debian/rules
@ -18,7 +18,7 @@ ifeq ($(CCACHE_PREFIX),distcc)
    THREADS_COUNT=$(shell distcc -j)
 endif
 ifeq ($(THREADS_COUNT),)
-    THREADS_COUNT=$(shell echo $$(( $$(nproc || grep -c ^processor /proc/cpuinfo || sysctl -n hw.ncpu || echo 8) / 2 )) )
+    THREADS_COUNT=$(shell nproc || grep -c ^processor /proc/cpuinfo || sysctl -n hw.ncpu || echo 4)
 endif
 DEB_BUILD_OPTIONS+=parallel=$(THREADS_COUNT)

--- a/docker/packager/binary/build.sh
+++ b/docker/packager/binary/build.sh
@ -18,9 +18,9 @@ ccache --zero-stats ||:
 ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so ||:
 rm -f CMakeCache.txt
 cmake --debug-trycompile --verbose=1 -DCMAKE_VERBOSE_MAKEFILE=1 -LA -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DSANITIZE=$SANITIZER $CMAKE_FLAGS ..
-ninja -j $(($(nproc) / 2)) $NINJA_FLAGS clickhouse-bundle
+ninja $NINJA_FLAGS clickhouse-bundle
 mv ./programs/clickhouse* /output
-mv ./src/unit_tests_dbms /output
+mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds
 find . -name '*.so' -print -exec mv '{}' /output \;
 find . -name '*.so.*' -print -exec mv '{}' /output \;

--- a/docker/packager/packager
+++ b/docker/packager/packager
@ -105,6 +105,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ
    # Create combined output archive for split build and for performance tests.
    if package_type == "performance":
        result.append("COMBINED_OUTPUT=performance")
+        cmake_flags.append("-DENABLE_TESTS=0")
    elif split_binary:
        result.append("COMBINED_OUTPUT=shared_build")

--- a/docker/test/performance-comparison/README.md
+++ b/docker/test/performance-comparison/README.md
@ -16,7 +16,7 @@ We also consider the test to be unstable, if the observed difference is less tha
 performance differences above 5% more often than in 5% runs, so the test is likely
 to have false positives.

-### How to read the report
+### How to Read the Report

 The check status summarizes the report in a short text message like `1 faster, 10 unstable`:
 * `1 faster` -- how many queries became faster,
@ -27,28 +27,50 @@ The check status summarizes the report in a short text message like `1 faster, 1

 The report page itself constists of a several tables. Some of them always signify errors, e.g. "Run errors" -- the very presence of this table indicates that there were errors during the test, that are not normal and must be fixed. Some tables are mostly informational, e.g. "Test times" -- they reflect normal test results. But if a cell in such table is marked in red, this also means an error, e.g., a test is taking too long to run.

-#### Tested commits
+#### Tested Commits
 Informational, no action required. Log messages for the commits that are tested. Note that for the right commit, we show nominal tested commit `pull/*/head` and real tested commit `pull/*/merge`, which is generated by GitHub by merging latest master to the `pull/*/head` and which we actually build and test in CI.

-#### Run errors
-Action required for every item -- these are errors that must be fixed. The errors that ocurred when running some test queries. For more information about the error, download test output archive and see `test-name-err.log`. To reproduce, see 'How to run' below.
+#### Error Summary
+Action required for every item.

-#### Slow on client
-Action required for every item -- these are errors that must be fixed. This table shows queries that take significantly longer to process on the client than on the server. A possible reason might be sending too much data to the client, e.g., a forgotten `format Null`.
+This table summarizes all errors that occurred during the test. Click the links to go to the description of a particular error.

-#### Short queries not marked as short
-Action required for every item -- these are errors that must be fixed. This table shows queries that are "short" but not explicitly marked as such. "Short" queries are too fast to meaningfully compare performance, because the changes are drowned by the noise. We consider all queries that run faster than 0.02 s to be "short", and only check the performance if they became slower than this threshold. Probably this mode is not what you want, so you have to increase the query run time to be between 1 and 0.1 s, so that the performance can be compared. You do want this "short" mode for queries that complete "immediately", such as some varieties of `select count(*)`. You have to mark them as "short" explicitly by writing `<query short="1">...`. The value of "short" attribute is evaluated as a python expression, and substitutions are performed, so you can write something like `<query short="{column1} = {column2}">select count(*) from table where {column1} > {column2}</query>`, to mark only a particular combination of variables as short.
+#### Run Errors
+Action required for every item -- these are errors that must be fixed.

-#### Partial queries
-Action required for the cells marked in red. Shows the queries we are unable to run on an old server -- probably because they contain a new function. You should see this table when you add a new function and a performance test for it. Check that the run time and variance are acceptable (run time between 0.1 and 1 seconds, variance below 10%). If not, they will be highlighted in red.
+The errors that occurred when running some test queries. For more information about the error, download the test output archive and see `test-name-err.log`. To reproduce, see 'How to Run' below.

-#### Changes in performance
-Action required for the cells marked in red, and some cheering is appropriate for the cells marked in green. These are the queries for which we observe a statistically significant change in performance. Note that there will always be some false positives -- we try to filter by p < 0.001, and have 2000 queries, so two false positives per run are expected. In practice we have more -- e.g. code layout changed because of some unknowable jitter in compiler internals, so the change we observe is real, but it is a 'false positive' in the sense that it is not directly caused by your changes. If, based on your knowledge of ClickHouse internals, you can decide that the observed test changes are not relevant to the changes made in the tested PR, you can ignore them.
+#### Slow on Client
+Action required for every item -- these are errors that must be fixed.
+
+This table shows queries that take significantly longer to process on the client than on the server. A possible reason might be sending too much data to the client, e.g., a forgotten `format Null`.
+
+#### Inconsistent Short Marking
+Action required for every item -- these are errors that must be fixed.
+
+Queries that have "short" duration (on the order of 0.1 s) can't be reliably tested in the normal way, where we perform a small number (about ten) of measurements for each server, because the signal-to-noise ratio is much smaller. There is a special mode for such queries that instead runs them for a fixed amount of time, normally with a much higher number of measurements (up to thousands). This mode must be explicitly enabled by the test author to avoid accidental errors. It must be used only for queries that are meant to complete "immediately", such as `select count(*)`. If your query is not supposed to be "immediate", try to make it run longer, e.g. by processing more data.
+
+This table shows queries for which the "short" marking is not consistent with the actual query run time -- i.e., a query runs for a long time but is marked as short, or it runs very fast but is not marked as short.
+
+If your query is really supposed to complete "immediately" and can't be made to run longer, you have to mark it as "short". To do so, write `<query short="1">...` in the test file. The value of "short" attribute is evaluated as a python expression, and substitutions are performed, so you can write something like `<query short="{column1} = {column2}">select count(*) from table where {column1} > {column2}</query>`, to mark only a particular combination of variables as short.
+
+
+#### Partial Queries
+Action required for the cells marked in red.
+
+Shows the queries we are unable to run on an old server -- probably because they contain a new function. You should see this table when you add a new function and a performance test for it. Check that the run time and variance are acceptable (run time between 0.1 and 1 seconds, variance below 10%). If not, they will be highlighted in red.
+
+#### Changes in Performance
+Action required for the cells marked in red, and some cheering is appropriate for the cells marked in green.
+
+These are the queries for which we observe a statistically significant change in performance. Note that there will always be some false positives -- we try to filter by p < 0.001, and have 2000 queries, so two false positives per run are expected. In practice we have more -- e.g. code layout changed because of some unknowable jitter in compiler internals, so the change we observe is real, but it is a 'false positive' in the sense that it is not directly caused by your changes. If, based on your knowledge of ClickHouse internals, you can decide that the observed test changes are not relevant to the changes made in the tested PR, you can ignore them.

 You can find flame graphs for queries with performance changes in the test output archive, in files named as 'my_test_0_Cpu_SELECT 1 FROM....FORMAT Null.left.svg'. First goes the test name, then the query number in the test, then the trace type (same as in `system.trace_log`), and then the server version (left is old and right is new).

-#### Unstable queries
-Action required for the cells marked in red. These are queries for which we did not observe a statistically significant change in performance, but for which the variance in query performance is very high. This means that we are likely to observe big changes in performance even in the absence of real changes, e.g. when comparing the server to itself. Such queries are going to have bad sensitivity as performance tests -- if a query has, say, 50% expected variability, this means we are going to see changes in performance up to 50%, even when there were no real changes in the code. And because of this, we won't be able to detect changes less than 50% with such a query, which is pretty bad. The reasons for the high variability must be investigated and fixed; ideally, the variability should be brought under 5-10%. 
+#### Unstable Queries
+Action required for the cells marked in red.
+
+These are the queries for which we did not observe a statistically significant change in performance, but for which the variance in query performance is very high. This means that we are likely to observe big changes in performance even in the absence of real changes, e.g. when comparing the server to itself. Such queries are going to have bad sensitivity as performance tests -- if a query has, say, 50% expected variability, this means we are going to see changes in performance up to 50%, even when there were no real changes in the code. And because of this, we won't be able to detect changes less than 50% with such a query, which is pretty bad. The reasons for the high variability must be investigated and fixed; ideally, the variability should be brought under 5-10%. 

 The most frequent reason for instability is that the query is just too short -- e.g. below 0.1 seconds. Bringing query time to 0.2 seconds or above usually helps.
 Other reasons may include:
@ -57,24 +79,33 @@ Other reasons may include:

 Investigating the instablility is the hardest problem in performance testing, and we still have not been able to understand the reasons behind the instability of some queries. There are some data that can help you in the performance test output archive. Look for files named 'my_unstable_test_0_SELECT 1...FORMAT Null.{left,right}.metrics.rep'. They contain metrics from `system.query_log.ProfileEvents` and functions from stack traces from `system.trace_log`, that vary significantly between query runs. The second column is array of \[min, med, max] values for the metric. Say, if you see `PerfCacheMisses` there, it may mean that the code being tested has not-so-cache-local memory access pattern that is sensitive to memory layout.

-#### Skipped tests
-Informational, no action required. Shows the tests that were skipped, and the reason for it. Normally it is because the data set required for the test was not loaded, or the test is marked as 'long' -- both cases mean that the test is too big to be ran per-commit.
+#### Skipped Tests
+Informational, no action required.

-#### Test performance changes
-Informational, no action required. This table summarizes the changes in performance of queries in each test -- how many queries have changed, how many are unstable, and what is the magnitude of the changes.
+Shows the tests that were skipped, and the reason for it. Normally it is because the data set required for the test was not loaded, or the test is marked as 'long' -- both cases mean that the test is too big to be run per-commit.

-#### Test times
-Action required for the cells marked in red. This table shows the run times for all the tests. You may have to fix two kinds of errors in this table:
+#### Test Performance Changes
+Informational, no action required.
+
+This table summarizes the changes in performance of queries in each test -- how many queries have changed, how many are unstable, and what is the magnitude of the changes.
+
+#### Test Times
+Action required for the cells marked in red.
+
+This table shows the run times for all the tests. You may have to fix two kinds of errors in this table:
 1) Average query run time is too long -- probalby means that the preparatory steps such as creating the table and filling them with data are taking too long. Try to make them faster.
 2) Longest query run time is too long -- some particular queries are taking too long, try to make them faster. The ideal query run time is between 0.1 and 1 s.

-#### Concurrent benchmarks
-No action required. This table shows the results of a concurrent behcmark where queries from `website` are ran in parallel using `clickhouse-benchmark`, and requests per second values are compared for old and new servers. It shows variability up to 20% for no apparent reason, so it's probably safe to disregard it. We have it for special cases like investigating concurrency effects in memory allocators, where it may be important.
+#### Metric Changes
+No action required.

-#### Metric changes
-No action required. These are changes in median values of metrics from `system.asynchronous_metrics_log`. Again, they are prone to unexplained variation and you can safely ignore this table unless it's interesting to you for some particular reason (e.g. you want to compare memory usage). There are also graphs of these metrics in the performance test output archive, in the `metrics` folder.
+These are changes in median values of metrics from `system.asynchronous_metrics_log`. These metrics are prone to unexplained variation and you can safely ignore this table unless it's interesting to you for some particular reason (e.g. you want to compare memory usage). There are also graphs of these metrics in the performance test output archive, in the `metrics` folder.

-### How to run
+#### Errors while Building the Report
+Ask a maintainer for help. These errors normally indicate a problem with testing infrastructure.
+
+
+### How to Run
 Run the entire docker container, specifying PR number (0 for master)
 and SHA of the commit to test. The reference revision is determined as a nearest
 ancestor testing release tag. It is possible to specify the reference revision and
--- a/docker/test/performance-comparison/compare.sh
+++ b/docker/test/performance-comparison/compare.sh
@ -121,7 +121,7 @@ function run_tests
    then
        # Use the explicitly set path to directory with test files.
        test_prefix="$CHPC_TEST_PATH"
-    elif [ "$PR_TO_TEST" = "0" ]
+    elif [ "$PR_TO_TEST" == "0" ]
    then
        # When testing commits from master, use the older test files. This
        # allows the tests to pass even when we add new functions and tests for
@ -155,6 +155,20 @@ function run_tests
        test_files=$(ls "$test_prefix"/*.xml)
    fi

+    # For PRs, test only a subset of queries, and run them fewer times.
+    # If the corresponding environment variables are already set, keep
+    # those values.
+    if [ "$PR_TO_TEST" == "0" ]
+    then
+        CHPC_RUNS=${CHPC_RUNS:-13}
+        CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-0}
+    else
+        CHPC_RUNS=${CHPC_RUNS:-7}
+        CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-20}
+    fi
+    export CHPC_RUNS
+    export CHPC_MAX_QUERIES
+
    # Determine which concurrent benchmarks to run. For now, the only test
    # we run as a concurrent benchmark is 'website'. Run it as benchmark if we
    # are also going to run it as a normal test.
@ -184,11 +198,13 @@ function run_tests
        echo test "$test_name"

        TIMEFORMAT=$(printf "$test_name\t%%3R\t%%3U\t%%3S\n")
-        # the grep is to filter out set -x output and keep only time output
+        # The grep is to filter out set -x output and keep only time output.
+        # The '2>&1 >/dev/null' redirects stderr to stdout, and discards stdout.
        { \
            time "$script_dir/perf.py" --host localhost localhost --port 9001 9002 \
+                --runs "$CHPC_RUNS" --max-queries "$CHPC_MAX_QUERIES" \
                -- "$test" > "$test_name-raw.tsv" 2> "$test_name-err.log" ; \
-        } 2>&1 >/dev/null | grep -v ^+ >> "wall-clock-times.tsv" \
+        } 2>&1 >/dev/null | tee >(grep -v ^+ >> "wall-clock-times.tsv") \
            || echo "Test $test_name failed with error code $?" >> "$test_name-err.log"
    done

@ -197,33 +213,9 @@ function run_tests
    wait
 }

-# Run some queries concurrently and report the resulting TPS. This additional
-# (relatively) short test helps detect concurrency-related effects, because the
-# main performance comparison testing is done query-by-query.
-function run_benchmark
-{
-    rm -rf benchmark ||:
-    mkdir benchmark ||:
-
-    # The list is built by run_tests.
-    while IFS= read -r file
-    do
-        name=$(basename "$file" ".xml")
-
-        "$script_dir/perf.py" --print-queries "$file" > "benchmark/$name-queries.txt"
-        "$script_dir/perf.py" --print-settings "$file" > "benchmark/$name-settings.txt"
-
-        readarray -t settings < "benchmark/$name-settings.txt"
-        command=(clickhouse-benchmark --concurrency 6 --cumulative --iterations 1000 --randomize 1 --delay 0 --continue_on_errors "${settings[@]}")
-
-        "${command[@]}" --port 9001 --json "benchmark/$name-left.json" < "benchmark/$name-queries.txt"
-        "${command[@]}" --port 9002 --json "benchmark/$name-right.json" < "benchmark/$name-queries.txt"
-    done < benchmarks-to-run.txt
-}
-
 function get_profiles_watchdog
 {
-    sleep 6000
+    sleep 600

    echo "The trace collection did not finish in time." >> profile-errors.log

@ -490,8 +482,6 @@ build_log_column_definitions
 cat analyze/errors.log >> report/errors.log ||:
 cat profile-errors.log >> report/errors.log ||:

-short_query_threshold="0.02"
-
 clickhouse-local --query "
 create view query_display_names as select * from
    file('analyze/query-display-names.tsv', TSV,
@ -524,18 +514,11 @@ create view query_metric_stats as
 -- Main statistics for queries -- query time as reported in query log.
 create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
    as select
-        -- Comparison mode doesn't make sense for queries that complete
-        -- immediately (on the same order of time as noise). If query duration is
-        -- less that some threshold, we just skip it. If there is a significant
-        -- regression in such query, the time will exceed the threshold, and we
-        -- well process it normally and detect the regression.
-        right < $short_query_threshold as short,
+        abs(diff) > report_threshold        and abs(diff) > stat_threshold as changed_fail,
+        abs(diff) > report_threshold - 0.05 and abs(diff) > stat_threshold as changed_show,
        
-        not short and abs(diff) > report_threshold        and abs(diff) > stat_threshold as changed_fail,
-        not short and abs(diff) > report_threshold - 0.05 and abs(diff) > stat_threshold as changed_show,
-        
-        not short and not changed_fail and stat_threshold > report_threshold + 0.10 as unstable_fail,
-        not short and not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,
+        not changed_fail and stat_threshold > report_threshold + 0.10 as unstable_fail,
+        not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,
        
        left, right, diff, stat_threshold,
        if(report_threshold > 0, report_threshold, 0.10) as report_threshold,
@ -640,9 +623,9 @@ create table wall_clock_time_per_test engine Memory as select *

 create table test_time engine Memory as
    select test, sum(client) total_client_time,
-        maxIf(client, not short) query_max,
-        minIf(client, not short) query_min,
-        count(*) queries, sum(short) short_queries
+        max(client) query_max,
+        min(client) query_min,
+        count(*) queries
    from total_client_time_per_query full join queries using (test, query_index)
    group by test;

@ -650,7 +633,6 @@ create table test_times_report engine File(TSV, 'report/test-times.tsv') as
    select wall_clock_time_per_test.test, real,
        toDecimal64(total_client_time, 3),
        queries,
-        short_queries,
        toDecimal64(query_max, 3),
        toDecimal64(real / queries, 3) avg_real_per_query,
        toDecimal64(query_min, 3)
@ -685,32 +667,47 @@ create table queries_for_flamegraph engine File(TSVWithNamesAndTypes,
    select test, query_index from queries where unstable_show or changed_show
    ;

-- List of queries that have 'short' duration, but are not marked as 'short' by
-- the test author (we report them).
-create table unmarked_short_queries_report
-    engine File(TSV, 'report/unmarked-short-queries.tsv')
-    as select time, test, query_index, query_display_name
+
+create view shortness
+    as select 
+        (test, query_index) in
+            (select * from file('analyze/marked-short-queries.tsv', TSV,
+            'test text, query_index int'))
+            as marked_short,
+        time, test, query_index, query_display_name
    from (
-            select right time, test, query_index from queries where short
+            select right time, test, query_index from queries
            union all
            select time_median, test, query_index from partial_query_times
-                where time_median < $short_query_threshold
        ) times
        left join query_display_names
            on times.test = query_display_names.test
                and times.query_index = query_display_names.query_index
-    where (test, query_index) not in
-        (select * from file('analyze/marked-short-queries.tsv', TSV,
-            'test text, query_index int'))
-    order by test, query_index
    ;

+-- Report of queries that have inconsistent 'short' markings:
+-- 1) have short duration, but are not marked as 'short'
+-- 2) the reverse -- marked 'short' but take too long.
+-- The threshold for 2) is five times the threshold for 1) (0.1 s vs 0.02 s), to avoid jitter.
+create table inconsistent_short_marking_report
+    engine File(TSV, 'report/inconsistent-short-marking.tsv')
+    as select
+        multiIf(marked_short and time > 0.1, 'marked as short but is too long',
+                not marked_short and time < 0.02, 'is short but not marked as such',
+                '') problem,
+        marked_short, time,
+        test, query_index, query_display_name
+    from shortness
+    where problem != ''
+    ;
+
+
 --------------------------------------------------------------------------------
 -- various compatibility data formats follow, not related to the main report

 -- keep the table in old format so that we can analyze new and old data together
 create table queries_old_format engine File(TSVWithNamesAndTypes, 'queries.rep')
-    as select short, changed_fail, unstable_fail, left, right, diff,
+    as select 0 short, changed_fail, unstable_fail, left, right, diff,
        stat_threshold, test, query_display_name query
    from queries
    ;
@ -1008,9 +1005,6 @@ case "$stage" in
    # Ignore the errors to collect the log and build at least some report, anyway
    time run_tests ||:
    ;&
-"run_benchmark")
-    time run_benchmark 2> >(tee -a run-errors.tsv 1>&2) ||:
-    ;&
 "get_profiles")
    # Check for huge pages.
    cat /sys/kernel/mm/transparent_hugepage/enabled > thp-enabled.txt ||:
--- a/docker/test/performance-comparison/perf.py
+++ b/docker/test/performance-comparison/perf.py
@ -1,16 +1,20 @@
 #!/usr/bin/python3

-import os
-import sys
-import itertools
-import clickhouse_driver
-import xml.etree.ElementTree as et
 import argparse
+import clickhouse_driver
+import itertools
+import functools
+import math
+import os
 import pprint
+import random
 import re
+import statistics
 import string
+import sys
 import time
 import traceback
+import xml.etree.ElementTree as et

 def tsv_escape(s):
    return s.replace('\\', '\\\\').replace('\t', '\\t').replace('\n', '\\n').replace('\r','')
@ -20,7 +24,8 @@ parser = argparse.ArgumentParser(description='Run performance test.')
 parser.add_argument('file', metavar='FILE', type=argparse.FileType('r', encoding='utf-8'), nargs=1, help='test description file')
 parser.add_argument('--host', nargs='*', default=['localhost'], help="Server hostname(s). Corresponds to '--port' options.")
 parser.add_argument('--port', nargs='*', default=[9000], help="Server port(s). Corresponds to '--host' options.")
-parser.add_argument('--runs', type=int, default=int(os.environ.get('CHPC_RUNS', 7)), help='Number of query runs per server. Defaults to CHPC_RUNS environment variable.')
+parser.add_argument('--runs', type=int, default=1, help='Number of query runs per server.')
+parser.add_argument('--max-queries', type=int, default=None, help='Test no more than this number of queries, chosen at random.')
 parser.add_argument('--long', action='store_true', help='Do not skip the tests tagged as long.')
 parser.add_argument('--print-queries', action='store_true', help='Print test queries and exit.')
 parser.add_argument('--print-settings', action='store_true', help='Print test settings and exit.')
@ -62,18 +67,13 @@ def substitute_parameters(query_templates, other_templates = []):
 # Build a list of test queries, substituting parameters to query templates,
 # and reporting the queries marked as short.
 test_queries = []
+is_short = []
 for e in root.findall('query'):
-    new_queries = []
-    if 'short' in e.attrib:
-        new_queries, [is_short] = substitute_parameters([e.text], [[e.attrib['short']]])
-        for i, s in enumerate(is_short):
-            # Don't print this if we only need to print the queries.
-            if eval(s) and not args.print_queries:
-                print(f'short\t{i + len(test_queries)}')
-    else:
-        new_queries = substitute_parameters([e.text])
-
+    new_queries, [new_is_short] = substitute_parameters([e.text], [[e.attrib.get('short', '0')]])
    test_queries += new_queries
+    is_short += [eval(s) for s in new_is_short]
+
+assert(len(test_queries) == len(is_short))


 # If we're only asked to print the queries, do that and exit
@ -82,6 +82,11 @@ if args.print_queries:
        print(q)
    exit(0)

+# Print short queries
+for i, s in enumerate(is_short):
+    if s:
+        print(f'short\t{i}')
+
 # If we're only asked to print the settings, do that and exit. These are settings
 # for clickhouse-benchmark, so we print them as command line arguments, e.g.
 # '--max_memory_usage=10000000'.
@ -116,7 +121,7 @@ if 'max_ignored_relative_change' in root.attrib:

 # Open connections
 servers = [{'host': host, 'port': port} for (host, port) in zip(args.host, args.port)]
-connections = [clickhouse_driver.Client(**server) for server in servers]
+all_connections = [clickhouse_driver.Client(**server) for server in servers]

 for s in servers:
    print('server\t{}\t{}'.format(s['host'], s['port']))
@ -126,7 +131,7 @@ for s in servers:
 # connection loses the changes in settings.
 drop_query_templates = [q.text for q in root.findall('drop_query')]
 drop_queries = substitute_parameters(drop_query_templates)
-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for q in drop_queries:
        try:
            c.execute(q)
@ -142,7 +147,7 @@ for conn_index, c in enumerate(connections):
 # configurable). So the end result is uncertain, but hopefully we'll be able to
 # run at least some queries.
 settings = root.findall('settings/*')
-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for s in settings:
        try:
            q = f"set {s.tag} = '{s.text}'"
@ -154,7 +159,7 @@ for conn_index, c in enumerate(connections):
 # Check tables that should exist. If they don't exist, just skip this test.
 tables = [e.text for e in root.findall('preconditions/table_exists')]
 for t in tables:
-    for c in connections:
+    for c in all_connections:
        try:
            res = c.execute("select 1 from {} limit 1".format(t))
        except:
@ -176,7 +181,7 @@ for q in create_queries:
            file = sys.stderr)
        sys.exit(1)

-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for q in create_queries:
        c.execute(q)
        print(f'create\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}')
@ -184,13 +189,19 @@ for conn_index, c in enumerate(connections):
 # Run fill queries
 fill_query_templates = [q.text for q in root.findall('fill_query')]
 fill_queries = substitute_parameters(fill_query_templates)
-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for q in fill_queries:
        c.execute(q)
        print(f'fill\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}')

+# Run the queries in randomized order, but preserve their indexes as specified
+# in the test XML. To avoid using too much time, limit the number of queries
+# we run per test.
+queries_to_run = random.sample(range(0, len(test_queries)), min(len(test_queries), args.max_queries or len(test_queries)))
+
 # Run test queries.
-for query_index, q in enumerate(test_queries):
+for query_index in queries_to_run:
+    q = test_queries[query_index]
    query_prefix = f'{test_name}.query{query_index}'

    # We have some crazy long queries (about 100kB), so trim them to a sane
@ -208,8 +219,8 @@ for query_index, q in enumerate(test_queries):
    # new one. We want to run them on the new server only, so that the PR author
    # can ensure that the test works properly. Remember the errors we had on
    # each server.
-    query_error_on_connection = [None] * len(connections);
-    for conn_index, c in enumerate(connections):
+    query_error_on_connection = [None] * len(all_connections);
+    for conn_index, c in enumerate(all_connections):
        try:
            prewarm_id = f'{query_prefix}.prewarm0'
            res = c.execute(q, query_id = prewarm_id)
@ -236,21 +247,22 @@ for query_index, q in enumerate(test_queries):

    if len(no_errors) == 0:
        continue
-    elif len(no_errors) < len(connections):
+    elif len(no_errors) < len(all_connections):
        print(f'partial\t{query_index}\t{no_errors}')

+    this_query_connections = [all_connections[index] for index in no_errors]
+
    # Now, perform measured runs.
    # Track the time spent by the client to process this query, so that we can
    # notice the queries that take long to process on the client side, e.g. by
    # sending excessive data.
    start_seconds = time.perf_counter()
    server_seconds = 0
-    for run in range(0, args.runs):
+    run = 0
+    while True:
        run_id = f'{query_prefix}.run{run}'
-        for conn_index, c in enumerate(connections):
-            if query_error_on_connection[conn_index]:
-                continue

+        for conn_index, c in enumerate(this_query_connections):
            try:
                res = c.execute(q, query_id = run_id)
            except Exception as e:
@ -259,8 +271,8 @@ for query_index, q in enumerate(test_queries):
                e.message = run_id + ': ' + e.message
                raise

-            print(f'query\t{query_index}\t{run_id}\t{conn_index}\t{c.last_query.elapsed}')
            server_seconds += c.last_query.elapsed
+            print(f'query\t{query_index}\t{run_id}\t{conn_index}\t{c.last_query.elapsed}')

            if c.last_query.elapsed > 10:
                # Stop processing pathologically slow queries, to avoid timing out
@ -269,12 +281,37 @@ for query_index, q in enumerate(test_queries):
                print(f'The query no. {query_index} is taking too long to run ({c.last_query.elapsed} s)', file=sys.stderr)
                exit(2)

+        # Be careful with the counter, after this line it's the next iteration
+        # already.
+        run += 1
+
+        # Try to run any query for at least the specified number of times,
+        # before considering other stop conditions.
+        if run < args.runs:
+            continue
+
+        # For very short queries we have a special mode where we run them for at
+        # least some time. The recommended lower bound of run time for "normal"
+        # queries is about 0.1 s, and we run them about 10 times, giving the
+        # time per query per server of about one second. Use this value as a
+        # reference for "short" queries.
+        if is_short[query_index]:
+            if server_seconds >= 2 * len(this_query_connections):
+                break
+            # Also limit the number of runs, so that we don't go crazy processing
+            # the results -- 'eqmed.sql' is really suboptimal.
+            if run >= 500:
+                break
+        else:
+            if run >= args.runs:
+                break
+
    client_seconds = time.perf_counter() - start_seconds
    print(f'client-time\t{query_index}\t{client_seconds}\t{server_seconds}')

 # Run drop queries
 drop_queries = substitute_parameters(drop_query_templates)
-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for q in drop_queries:
        c.execute(q)
        print(f'drop\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}')
--- a/docker/test/performance-comparison/report.py
+++ b/docker/test/performance-comparison/report.py
@ -98,6 +98,9 @@ th {{

 tr:nth-child(odd) td {{filter: brightness(90%);}}

+.inconsistent-short-marking tr :nth-child(2),
+.inconsistent-short-marking tr :nth-child(3),
+.inconsistent-short-marking tr :nth-child(5),
 .all-query-times tr :nth-child(1),
 .all-query-times tr :nth-child(2),
 .all-query-times tr :nth-child(3),
@ -126,7 +129,6 @@ tr:nth-child(odd) td {{filter: brightness(90%);}}
 .test-times tr :nth-child(5),
 .test-times tr :nth-child(6),
 .test-times tr :nth-child(7),
-.test-times tr :nth-child(8),
 .concurrent-benchmarks tr :nth-child(2),
 .concurrent-benchmarks tr :nth-child(3),
 .concurrent-benchmarks tr :nth-child(4),
@ -205,9 +207,11 @@ def tableStart(title):
    global table_anchor
    table_anchor = cls
    anchor = currentTableAnchor()
+    help_anchor = '-'.join(title.lower().split(' '));
    return f"""
        <h2 id="{anchor}">
            <a class="cancela" href="#{anchor}">{title}</a>
+            <a class="cancela" href="https://github.com/ClickHouse/ClickHouse/tree/master/docker/test/performance-comparison#{help_anchor}"><sup style="color: #888">?</sup></a>
        </h2>
        <table class="{cls}">
    """
@ -250,7 +254,7 @@ def addSimpleTable(caption, columns, rows, pos=None):
 def add_tested_commits():
    global report_errors
    try:
-        addSimpleTable('Tested commits', ['Old', 'New'],
+        addSimpleTable('Tested Commits', ['Old', 'New'],
            [['<pre>{}</pre>'.format(x) for x in
                [open('left-commit.txt').read(),
                 open('right-commit.txt').read()]]])
@ -276,7 +280,7 @@ def add_report_errors():
    if not report_errors:
        return

-    text = tableStart('Errors while building the report')
+    text = tableStart('Errors while Building the Report')
    text += tableHeader(['Error'])
    for x in report_errors:
        text += tableRow([x])
@ -290,7 +294,7 @@ def add_errors_explained():
        return

    text = '<a name="fail1"/>'
-    text += tableStart('Error summary')
+    text += tableStart('Error Summary')
    text += tableHeader(['Description'])
    for row in errors_explained:
        text += tableRow(row)
@ -308,26 +312,26 @@ if args.report == 'main':

    run_error_rows = tsvRows('run-errors.tsv')
    error_tests += len(run_error_rows)
-    addSimpleTable('Run errors', ['Test', 'Error'], run_error_rows)
+    addSimpleTable('Run Errors', ['Test', 'Error'], run_error_rows)
    if run_error_rows:
        errors_explained.append([f'<a href="#{currentTableAnchor()}">There were some errors while running the tests</a>']);


    slow_on_client_rows = tsvRows('report/slow-on-client.tsv')
    error_tests += len(slow_on_client_rows)
-    addSimpleTable('Slow on client',
+    addSimpleTable('Slow on Client',
                     ['Client time,&nbsp;s', 'Server time,&nbsp;s', 'Ratio', 'Test', 'Query'],
                     slow_on_client_rows)
    if slow_on_client_rows:
        errors_explained.append([f'<a href="#{currentTableAnchor()}">Some queries are taking noticeable time client-side (missing `FORMAT Null`?)</a>']);

-    unmarked_short_rows = tsvRows('report/unmarked-short-queries.tsv')
+    unmarked_short_rows = tsvRows('report/inconsistent-short-marking.tsv')
    error_tests += len(unmarked_short_rows)
-    addSimpleTable('Short queries not marked as short',
-        ['New client time, s', 'Test', '#', 'Query'],
+    addSimpleTable('Inconsistent Short Marking',
+        ['Problem', 'Is marked as short', 'New client time, s', 'Test', '#', 'Query'],
        unmarked_short_rows)
    if unmarked_short_rows:
-        errors_explained.append([f'<a href="#{currentTableAnchor()}">Some queries have short duration but are not explicitly marked as "short"</a>']);
+        errors_explained.append([f'<a href="#{currentTableAnchor()}">Some queries have inconsistent short marking</a>']);

    def add_partial():
        rows = tsvRows('report/partial-queries-report.tsv')
@ -335,7 +339,7 @@ if args.report == 'main':
            return

        global unstable_partial_queries, slow_average_tests, tables
-        text = tableStart('Partial queries')
+        text = tableStart('Partial Queries')
        columns = ['Median time, s', 'Relative time variance', 'Test', '#', 'Query']
        text += tableHeader(columns)
        attrs = ['' for c in columns]
@ -366,7 +370,7 @@ if args.report == 'main':

        global faster_queries, slower_queries, tables

-        text = tableStart('Changes in performance')
+        text = tableStart('Changes in Performance')
        columns = [
            'Old,&nbsp;s',                                          # 0
            'New,&nbsp;s',                                          # 1
@ -423,7 +427,7 @@ if args.report == 'main':
            'Query' #7
        ]

-        text = tableStart('Unstable queries')
+        text = tableStart('Unstable Queries')
        text += tableHeader(columns)

        attrs = ['' for c in columns]
@ -444,9 +448,9 @@ if args.report == 'main':
    add_unstable_queries()

    skipped_tests_rows = tsvRows('analyze/skipped-tests.tsv')
-    addSimpleTable('Skipped tests', ['Test', 'Reason'], skipped_tests_rows)
+    addSimpleTable('Skipped Tests', ['Test', 'Reason'], skipped_tests_rows)

-    addSimpleTable('Test performance changes',
+    addSimpleTable('Test Performance Changes',
        ['Test', 'Ratio of speedup&nbsp;(-) or slowdown&nbsp;(+)', 'Queries', 'Total not OK', 'Changed perf', 'Unstable'],
        tsvRows('report/test-perf-changes.tsv'))

@ -461,13 +465,12 @@ if args.report == 'main':
            'Wall clock time,&nbsp;s',                            #1
            'Total client time,&nbsp;s',                          #2
            'Total queries',                                 #3
-            'Ignored short queries',                         #4
-            'Longest query<br>(sum for all runs),&nbsp;s',        #5
-            'Avg wall clock time<br>(sum for all runs),&nbsp;s',  #6
-            'Shortest query<br>(sum for all runs),&nbsp;s',       #7
+            'Longest query<br>(sum for all runs),&nbsp;s',        #4
+            'Avg wall clock time<br>(sum for all runs),&nbsp;s',  #5
+            'Shortest query<br>(sum for all runs),&nbsp;s',       #6
            ]

-        text = tableStart('Test times')
+        text = tableStart('Test Times')
        text += tableHeader(columns)

        nominal_runs = 7  # FIXME pass this as an argument
@ -476,20 +479,20 @@ if args.report == 'main':
        attrs = ['' for c in columns]
        for r in rows:
            anchor = f'{currentTableAnchor()}.{r[0]}'
-            if float(r[6]) > allowed_average_run_time * total_runs:
+            if float(r[5]) > allowed_average_run_time * total_runs:
                # FIXME should be 15s max -- investigate parallel_insert
                slow_average_tests += 1
-                attrs[6] = f'style="background: {color_bad}"'
+                attrs[5] = f'style="background: {color_bad}"'
                errors_explained.append([f'<a href="#{anchor}">The test \'{r[0]}\' is too slow to run as a whole. Investigate whether the create and fill queries can be sped up'])
            else:
-                attrs[6] = ''
+                attrs[5] = ''

-            if float(r[5]) > allowed_single_run_time * total_runs:
+            if float(r[4]) > allowed_single_run_time * total_runs:
                slow_average_tests += 1
-                attrs[5] = f'style="background: {color_bad}"'
+                attrs[4] = f'style="background: {color_bad}"'
                errors_explained.append([f'<a href="./all-queries.html#all-query-times.{r[0]}.0">Some query of the test \'{r[0]}\' is too slow to run. See the all queries report'])
            else:
-                attrs[5] = ''
+                attrs[4] = ''

            text += tableRow(r, attrs, anchor)

@ -498,74 +501,7 @@ if args.report == 'main':

    add_test_times()

-    def add_benchmark_results():
-        if not os.path.isfile('benchmark/website-left.json'):
-            return
-
-        json_reports = [json.load(open(f'benchmark/website-{x}.json')) for x in ['left', 'right']]
-        stats = [next(iter(x.values()))["statistics"] for x in json_reports]
-        qps = [x["QPS"] for x in stats]
-        queries = [x["num_queries"] for x in stats]
-        errors = [x["num_errors"] for x in stats]
-        relative_diff = (qps[1] - qps[0]) / max(0.01, qps[0]);
-        times_diff = max(qps) / max(0.01, min(qps))
-
-        all_rows = []
-        header = ['Benchmark', 'Metric', 'Old', 'New', 'Relative difference', 'Times difference'];
-
-        attrs = ['' for x in header]
-        row = ['website', 'queries', f'{queries[0]:d}', f'{queries[1]:d}', '--', '--']
-        attrs[0] = 'rowspan=2'
-        all_rows.append([row, attrs])
-
-        attrs = ['' for x in header]
-        row = [None, 'queries/s', f'{qps[0]:.3f}', f'{qps[1]:.3f}', f'{relative_diff:.3f}', f'x{times_diff:.3f}']
-        if abs(relative_diff) > 0.1:
-            # More queries per second is better.
-            if relative_diff > 0.:
-                attrs[4] = f'style="background: {color_good}"'
-            else:
-                attrs[4] = f'style="background: {color_bad}"'
-        else:
-            attrs[4] = ''
-        all_rows.append([row, attrs]);
-
-        if max(errors):
-            all_rows[0][1][0] = "rowspan=3"
-            row = [''] * (len(header))
-            attrs = ['' for x in header]
-
-            attrs[0] = None
-            row[1] = 'errors'
-            row[2] = f'{errors[0]:d}'
-            row[3] = f'{errors[1]:d}'
-            row[4] = '--'
-            row[5] = '--'
-            if errors[0]:
-                attrs[2] += f' style="background: {color_bad}" '
-            if errors[1]:
-                attrs[3] += f' style="background: {color_bad}" '
-
-            all_rows.append([row, attrs])
-
-        text = tableStart('Concurrent benchmarks')
-        text += tableHeader(header)
-        for row, attrs in all_rows:
-            text += tableRow(row, attrs)
-        text += tableEnd()
-
-        global tables
-        tables.append(text)
-
-    try:
-        add_benchmark_results()
-    except:
-        report_errors.append(
-            traceback.format_exception_only(
-                *sys.exc_info()[:2])[-1])
-        pass
-
-    addSimpleTable('Metric changes',
+    addSimpleTable('Metric Changes',
        ['Metric', 'Old median value', 'New median value',
            'Relative difference', 'Times difference'],
        tsvRows('metrics/changes.tsv'))
@ -656,7 +592,7 @@ elif args.report == 'all-queries':
            'Query',                                  #9
            ]

-        text = tableStart('All query times')
+        text = tableStart('All Query Times')
        text += tableHeader(columns)

        attrs = ['' for c in columns]
--- a/docs/en/engines/table-engines/special/distributed.md
+++ b/docs/en/engines/table-engines/special/distributed.md
@ -45,6 +45,18 @@ Clusters are set like this:
 <remote_servers>
    <logs>
        <shard>
+            <!-- Inter-server per-cluster secret for Distributed queries
+                 default: no secret (no authentication will be performed)
+
+                 If set, then Distributed queries will be validated on shards, so at least:
+                 - such cluster should exist on the shard,
+                 - such cluster should have the same secret.
+
+                 And also (and which is more important), the initial_user will
+                 be used as current user for the query.
+            -->
+            <!-- <secret></secret> -->
+
            <!-- Optional. Shard weight when writing data. Default: 1. -->
            <weight>1</weight>
            <!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
--- a/docs/en/operations/settings/query-complexity.md
+++ b/docs/en/operations/settings/query-complexity.md
@ -60,6 +60,31 @@ A maximum number of bytes (uncompressed data) that can be read from a table when

 What to do when the volume of data read exceeds one of the limits: ‘throw’ or ‘break’. By default, throw.

+## max\_rows\_to\_read_leaf {#max-rows-to-read-leaf}
+
+The following restrictions can be checked on each block (instead of on each row). That is, the restrictions can be broken a little.
+
+A maximum number of rows that can be read from a local table on a leaf node when running a distributed query. While
+distributed queries can issue multiple sub-queries to each shard (leaf), this limit will be checked only on the read
+stage on the leaf nodes and ignored on the results merging stage on the root node. For example, a cluster consists of 2 shards
+and each shard contains a table with 100 rows. Then a distributed query which is supposed to read all the data from both
+tables with setting `max_rows_to_read=150` will fail, as in total it will be 200 rows. While a query
+with `max_rows_to_read_leaf=150` will succeed, since leaf nodes will read 100 rows at max.
+
+## max\_bytes\_to\_read_leaf {#max-bytes-to-read-leaf}
+
+A maximum number of bytes (uncompressed data) that can be read from a local table on a leaf node when running
+a distributed query. While distributed queries can issue multiple sub-queries to each shard (leaf), this limit will
+be checked only on the read stage on the leaf nodes and ignored on the results merging stage on the root node.
+For example, a cluster consists of 2 shards and each shard contains a table with 100 bytes of data.
+Then a distributed query which is supposed to read all the data from both tables with setting `max_bytes_to_read=150` will fail,
+as in total it will be 200 bytes. While a query with `max_bytes_to_read_leaf=150` will succeed, since leaf nodes will read
+100 bytes at max.
+
+## read\_overflow\_mode_leaf {#read-overflow-mode-leaf}
+
+What to do when the volume of data read exceeds one of the leaf limits: ‘throw’ or ‘break’. By default, throw.
+
 ## max\_rows\_to\_group\_by {#settings-max-rows-to-group-by}

 A maximum number of unique keys received from aggregation. This setting lets you limit memory consumption when aggregating.
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -940,6 +940,8 @@ This algorithm chooses the first replica in the set or a random replica if the f

 The `first_or_random` algorithm solves the problem of the `in_order` algorithm. With `in_order`, if one replica goes down, the next one gets a double load while the remaining replicas handle the usual amount of traffic. When using the `first_or_random` algorithm, the load is evenly distributed among replicas that are still available.

+It's possible to explicitly define what the first replica is by using the setting `load_balancing_first_offset`. This gives more control to rebalance query workloads among replicas.
+
 ### Round Robin {#load_balancing-round_robin}

 ``` sql
--- a/docs/ru/operations/settings/query-complexity.md
+++ b/docs/ru/operations/settings/query-complexity.md
@ -56,6 +56,32 @@

 Что делать, когда количество прочитанных данных превысило одно из ограничений: throw или break. По умолчанию: throw.

+## max\_rows\_to\_read_leaf {#max-rows-to-read-leaf}
+
+Следующие ограничения могут проверяться на каждый блок (а не на каждую строку). То есть, ограничения могут быть немного нарушены.
+
+Максимальное количество строчек, которое можно прочитать из таблицы на удалённом сервере при выполнении
+распределенного запроса. Распределенные запросы могут создавать несколько подзапросов к каждому из шардов в кластере и 
+тогда этот лимит будет применен при выполнении чтения на удаленных серверах (включая и сервер-инициатор) и проигнорирован 
+на сервере-инициаторе запроса во время объединения полученных результатов. Например, кластер состоит из 2 шардов и каждый 
+из них хранит таблицу со 100 строками. Тогда распределенный запрос для получения всех данных из этих таблиц с установленной 
+настройкой `max_rows_to_read=150` выбросит исключение, т.к. в общем он прочитает 200 строк. Но запрос 
+с настройкой  `max_rows_to_read_leaf=150` завершится успешно, потому что каждый из шардов прочитает максимум 100 строк.
+
+## max\_bytes\_to\_read_leaf {#max-bytes-to-read-leaf}
+
+Максимальное количество байт (несжатых данных), которое можно прочитать из таблицы на удалённом сервере при 
+выполнении распределенного запроса. Распределенные запросы могут создавать несколько подзапросов к каждому из шардов в 
+кластере и тогда этот лимит будет применен при выполнении чтения на удаленных серверах (включая и сервер-инициатор) 
+и проигнорирован на сервере-инициаторе запроса во время объединения полученных результатов. Например, кластер состоит 
+из 2 шардов и каждый из них хранит таблицу со 100 байтами. Тогда распределенный запрос для получения всех данных из этих таблиц 
+с установленной настройкой `max_bytes_to_read=150` выбросит исключение, т.к. в общем он прочитает 200 байт. Но запрос 
+с настройкой  `max_bytes_to_read_leaf=150` завершится успешно, потому что каждый из шардов прочитает максимум 100 байт.
+
+## read\_overflow\_mode_leaf {#read-overflow-mode-leaf}
+
+Что делать, когда количество прочитанных данных на удаленном сервере превысило одно из ограничений: throw или break. По умолчанию: throw.
+
 ## max\_rows\_to\_group\_by {#settings-max-rows-to-group-by}

 Максимальное количество уникальных ключей, получаемых в процессе агрегации. Позволяет ограничить потребление оперативки при агрегации.
--- a/docs/zh/getting-started/tutorial.md
+++ b/docs/zh/getting-started/tutorial.md
@ -80,7 +80,7 @@ clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv

 ## 导入示例数据集 {#import-sample-dataset}

-现在是时候用一些示例数据填充我们的ClickHouse服务器。 在本教程中，我们将使用Yandex的匿名数据。Metrica，在成为开源之前以生产方式运行ClickHouse的第一个服务（更多关于这一点 [历史科](../introduction/history.md)). 有 [多种导入Yandex的方式。梅里卡数据集](example-datasets/metrica.md)，为了本教程，我们将使用最现实的一个。
+现在是时候用一些示例数据填充我们的ClickHouse服务端。 在本教程中，我们将使用Yandex.Metrica的匿名数据，它是在ClickHouse成为开源之前作为生产环境运行的第一个服务（关于这一点的更多内容请参阅[ClickHouse历史](../introduction/history.md)）。有 [多种导入Yandex.Metrica数据集的方法](example-datasets/metrica.md)，为了本教程，我们将使用最现实的一个。

 ### 下载并提取表数据 {#download-and-extract-table-data}

@ -93,22 +93,22 @@ curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unx

 ### 创建表 {#create-tables}

-与大多数数据库管理系统一样，ClickHouse在逻辑上将表分组为 “databases”. 有一个 `default` 数据库，但我们将创建一个名为新的 `tutorial`:
+与大多数数据库管理系统一样，ClickHouse在逻辑上将表分组为数据库。包含一个 `default` 数据库，但我们将创建一个新的数据库 `tutorial`:

 ``` bash
 clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial"
 ```

-与数据库相比，创建表的语法要复杂得多（请参阅 [参考资料](../sql-reference/statements/create.md). 一般 `CREATE TABLE` 声明必须指定三个关键的事情:
+与创建数据库相比，创建表的语法要复杂得多（请参阅 [参考资料](../sql-reference/statements/create.md)）。一般 `CREATE TABLE` 声明必须指定三个关键的事情:

 1.  要创建的表的名称。
-2.  Table schema, i.e. list of columns and their [数据类型](../sql-reference/data-types/index.md).
-3.  [表引擎](../engines/table-engines/index.md) 及其设置，这决定了如何物理执行对此表的查询的所有细节。
+2.  表结构，例如：列名和对应的[数据类型](../sql-reference/data-types/index.md)。
+3.  [表引擎](../engines/table-engines/index.md) 及其设置，这决定了对此表的查询操作是如何在物理层面执行的所有细节。

-YandexMetrica是一个网络分析服务，样本数据集不包括其全部功能，因此只有两个表可以创建:
+Yandex.Metrica是一个网络分析服务，样本数据集不包括其全部功能，因此只有两个表可以创建:

-   `hits` 是一个表格，其中包含所有用户在服务所涵盖的所有网站上完成的每个操作。
-   `visits` 是一个包含预先构建的会话而不是单个操作的表。
+-   `hits` 表包含所有用户在服务所涵盖的所有网站上完成的每个操作。
+-   `visits` 表包含预先构建的会话，而不是单个操作。

 让我们看看并执行这些表的实际创建表查询:

@ -453,9 +453,9 @@ SAMPLE BY intHash32(UserID)
 SETTINGS index_granularity = 8192
 ```

-您可以使用以下交互模式执行这些查询 `clickhouse-client` （只需在终端中启动它，而不需要提前指定查询）或尝试一些 [替代接口](../interfaces/index.md) 如果你愿意的话
+您可以使用`clickhouse-client`的交互模式执行这些查询（只需在终端中启动它，而不需要提前指定查询）。或者如果你愿意，可以尝试一些[替代接口](../interfaces/index.md)。

-正如我们所看到的, `hits_v1` 使用 [基本MergeTree引擎](../engines/table-engines/mergetree-family/mergetree.md)，而 `visits_v1` 使用 [崩溃](../engines/table-engines/mergetree-family/collapsingmergetree.md) 变体。
+正如我们所看到的, `hits_v1` 使用 [基本的MergeTree引擎](../engines/table-engines/mergetree-family/mergetree.md)，而 `visits_v1` 使用 [折叠树](../engines/table-engines/mergetree-family/collapsingmergetree.md) 变体。

 ### 导入数据 {#import-data}

--- a/docs/zh/sql-reference/aggregate-functions/index.md
+++ b/docs/zh/sql-reference/aggregate-functions/index.md
@ -1,6 +1,6 @@
 ---
 toc_priority: 33
-toc_title: 简介
+toc_title: 聚合函数
 ---

 # 聚合函数 {#aggregate-functions}
--- a/docs/zh/sql-reference/functions/conditional-functions.md
+++ b/docs/zh/sql-reference/functions/conditional-functions.md
@ -34,7 +34,7 @@
    │ 2 │    3 │
    └───┴──────┘

-执行查询 `SELECT multiIf(isNull(y) x, y < 3, y, NULL) FROM t_null`。结果：
+执行查询 `SELECT multiIf(isNull(y), x, y < 3, y, NULL) FROM t_null`。结果：

    ┌─multiIf(isNull(y), x, less(y, 3), y, NULL)─┐
    │                                          1 │
--- a/programs/benchmark/Benchmark.cpp
+++ b/programs/benchmark/Benchmark.cpp
@ -85,7 +85,12 @@ public:
            std::string cur_host = i >= hosts_.size() ? "localhost" : hosts_[i];

            connections.emplace_back(std::make_unique<ConnectionPool>(
-                concurrency, cur_host, cur_port, default_database_, user_, password_, "benchmark", Protocol::Compression::Enable, secure));
+                concurrency,
+                cur_host, cur_port,
+                default_database_, user_, password_,
+                "", /* cluster */
+                "", /* cluster_secret */
+                "benchmark", Protocol::Compression::Enable, secure));
            comparison_info_per_interval.emplace_back(std::make_shared<Stats>());
            comparison_info_total.emplace_back(std::make_shared<Stats>());
        }
--- a/programs/client/Client.cpp
+++ b/programs/client/Client.cpp
@ -701,6 +701,8 @@ private:
            connection_parameters.default_database,
            connection_parameters.user,
            connection_parameters.password,
+            "", /* cluster */
+            "", /* cluster_secret */
            "client",
            connection_parameters.compression,
            connection_parameters.security);
@ -1502,7 +1504,18 @@ private:
        {
            /// Send data contained in the query.
            ReadBufferFromMemory data_in(parsed_insert_query->data, parsed_insert_query->end - parsed_insert_query->data);
+            try
+            {
                sendDataFrom(data_in, sample, columns_description);
+            }
+            catch (Exception & e)
+            {
+                /// The following query will use data from input
+                //      "INSERT INTO data FORMAT TSV\n " < data.csv
+                //  And may be pretty hard to debug, so add information about data source to make it easier.
+                e.addMessage("data for INSERT was parsed from query");
+                throw;
+            }
            // Remember where the data ended. We use this info later to determine
            // where the next query begins.
            parsed_insert_query->end = data_in.buffer().begin() + data_in.count();
@ -1510,8 +1523,16 @@ private:
        else if (!is_interactive)
        {
            /// Send data read from stdin.
+            try
+            {
                sendDataFrom(std_in, sample, columns_description);
            }
+            catch (Exception & e)
+            {
+                e.addMessage("data for INSERT was parsed from stdin");
+                throw;
+            }
+        }
        else
            throw Exception("No data to insert", ErrorCodes::NO_DATA_TO_INSERT);
    }
--- a/programs/client/Suggest.cpp
+++ b/programs/client/Suggest.cpp
@ -26,6 +26,8 @@ void Suggest::load(const ConnectionParameters & connection_parameters, size_t su
                    connection_parameters.default_database,
                    connection_parameters.user,
                    connection_parameters.password,
+                    "" /* cluster */,
+                    "" /* cluster_secret */,
                    "client",
                    connection_parameters.compression,
                    connection_parameters.security);
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -538,6 +538,9 @@ int Server::main(const std::vector<std::string> & /*args*/)
            if (config->has("max_partition_size_to_drop"))
                global_context->setMaxPartitionSizeToDrop(config->getUInt64("max_partition_size_to_drop"));

+            if (config->has("zookeeper"))
+                global_context->reloadZooKeeperIfChanged(config);
+
            global_context->updateStorageConfiguration(*config);
        },
        /* already_loaded = */ true);
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@ -311,6 +311,28 @@
    <remote_servers incl="clickhouse_remote_servers" >
        <!-- Test only shard config for testing distributed storage -->
        <test_shard_localhost>
+            <!-- Inter-server per-cluster secret for Distributed queries
+                 default: no secret (no authentication will be performed)
+
+                 If set, then Distributed queries will be validated on shards, so at least:
+                 - such cluster should exist on the shard,
+                 - such cluster should have the same secret.
+
+                 And also (and which is more important), the initial_user will
+                 be used as current user for the query.
+
+                 Right now the protocol is pretty simple and it only takes into account:
+                 - cluster name
+                 - query
+
+                 It would also be nice to implement the following:
+                 - source hostname (see interserver_http_host), but then it will depend on DNS,
+                   it can use the IP address instead, but then you need to get the correct one on the initiator node.
+                 - target hostname / ip address (same notes as for source hostname)
+                 - time-based security tokens
+            -->
+            <!-- <secret></secret> -->
+
            <shard>
                <!-- Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas). -->
                <!-- <internal_replication>false</internal_replication> -->
--- a/src/Client/Connection.cpp
+++ b/src/Client/Connection.cpp
@ -17,12 +17,15 @@
 #include <Common/CurrentMetrics.h>
 #include <Common/DNSResolver.h>
 #include <Common/StringUtils/StringUtils.h>
+#include <Common/OpenSSLHelpers.h>
+#include <Common/randomSeed.h>
 #include <Interpreters/ClientInfo.h>
 #include <Compression/CompressionFactory.h>
 #include <Processors/Pipe.h>
 #include <Processors/ISink.h>
 #include <Processors/Executors/PipelineExecutor.h>
 #include <Processors/ConcatProcessor.h>
+#include <pcg_random.hpp>

 #if !defined(ARCADIA_BUILD)
 #    include <Common/config_version.h>
@ -171,8 +174,26 @@ void Connection::sendHello()
    // NOTE For backward compatibility of the protocol, client cannot send its version_patch.
    writeVarUInt(client_revision, *out);
    writeStringBinary(default_database, *out);
+    /// If the inter-server secret is used, the password is not needed
+    /// (NOTE we do not check for DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET, since we cannot ignore inter-server secret if it was requested)
+    if (!cluster_secret.empty())
+    {
+        writeStringBinary(USER_INTERSERVER_MARKER, *out);
+        writeStringBinary("" /* password */, *out);
+
+#if USE_SSL
+        sendClusterNameAndSalt();
+#else
+        throw Exception(
+            "Inter-server secret support is disabled, because ClickHouse was built without SSL library",
+            ErrorCodes::SUPPORT_IS_DISABLED);
+#endif
+    }
+    else
+    {
        writeStringBinary(user, *out);
        writeStringBinary(password, *out);
+    }

    out->next();
 }
@ -288,6 +309,19 @@ void Connection::forceConnected(const ConnectionTimeouts & timeouts)
    }
 }

+#if USE_SSL
+void Connection::sendClusterNameAndSalt()
+{
+    pcg64_fast rng(randomSeed());
+    UInt64 rand = rng();
+
+    salt = encodeSHA256(&rand, sizeof(rand));
+
+    writeStringBinary(cluster, *out);
+    writeStringBinary(salt, *out);
+}
+#endif
+
 bool Connection::ping()
 {
    // LOG_TRACE(log_wrapper.get(), "Ping");
@ -406,6 +440,37 @@ void Connection::sendQuery(
    else
        writeStringBinary("" /* empty string is a marker of the end of settings */, *out);

+    /// Interserver secret
+    if (server_revision >= DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET)
+    {
+        /// Hash
+        ///
+        /// Send correct hash only for !INITIAL_QUERY, due to:
+        /// - this will avoid extra protocol complexity for simplest cases
+        /// - there is no need in hash for the INITIAL_QUERY anyway
+        ///   (since there are no secure/insecure changes)
+        if (client_info && !cluster_secret.empty() && client_info->query_kind != ClientInfo::QueryKind::INITIAL_QUERY)
+        {
+#if USE_SSL
+            std::string data(salt);
+            data += cluster_secret;
+            data += query;
+            data += query_id;
+            data += client_info->initial_user;
+            /// TODO: add source/target host/ip-address
+
+            std::string hash = encodeSHA256(data);
+            writeStringBinary(hash, *out);
+#else
+        throw Exception(
+            "Inter-server secret support is disabled, because ClickHouse was built without SSL library",
+            ErrorCodes::SUPPORT_IS_DISABLED);
+#endif
+        }
+        else
+            writeStringBinary("", *out);
+    }
+
    writeVarUInt(stage, *out);
    writeVarUInt(static_cast<bool>(compression), *out);

--- a/src/Client/Connection.h
+++ b/src/Client/Connection.h
@ -83,6 +83,8 @@ public:
    Connection(const String & host_, UInt16 port_,
        const String & default_database_,
        const String & user_, const String & password_,
+        const String & cluster_,
+        const String & cluster_secret_,
        const String & client_name_ = "client",
        Protocol::Compression compression_ = Protocol::Compression::Enable,
        Protocol::Secure secure_ = Protocol::Secure::Disable,
@ -90,6 +92,8 @@ public:
        :
        host(host_), port(port_), default_database(default_database_),
        user(user_), password(password_),
+        cluster(cluster_),
+        cluster_secret(cluster_secret_),
        client_name(client_name_),
        compression(compression_),
        secure(secure_),
@ -191,6 +195,11 @@ private:
    String user;
    String password;

+    /// For inter-server authorization
+    String cluster;
+    String cluster_secret;
+    String salt;
+
    /// Address is resolved during the first connection (or the following reconnects)
    /// Use it only for logging purposes
    std::optional<Poco::Net::SocketAddress> current_resolved_address;
@ -269,6 +278,10 @@ private:
    void connect(const ConnectionTimeouts & timeouts);
    void sendHello();
    void receiveHello();
+
+#if USE_SSL
+    void sendClusterNameAndSalt();
+#endif
    bool ping();

    Block receiveData();
--- a/src/Client/ConnectionPool.h
+++ b/src/Client/ConnectionPool.h
@ -54,6 +54,8 @@ public:
            const String & default_database_,
            const String & user_,
            const String & password_,
+            const String & cluster_,
+            const String & cluster_secret_,
            const String & client_name_ = "client",
            Protocol::Compression compression_ = Protocol::Compression::Enable,
            Protocol::Secure secure_ = Protocol::Secure::Disable,
@ -65,6 +67,8 @@ public:
        default_database(default_database_),
        user(user_),
        password(password_),
+        cluster(cluster_),
+        cluster_secret(cluster_secret_),
        client_name(client_name_),
        compression(compression_),
        secure(secure_),
@ -109,6 +113,7 @@ protected:
        return std::make_shared<Connection>(
            host, port,
            default_database, user, password,
+            cluster, cluster_secret,
            client_name, compression, secure);
    }

@ -119,6 +124,10 @@ private:
    String user;
    String password;

+    /// For inter-server authorization
+    String cluster;
+    String cluster_secret;
+
    String client_name;
    Protocol::Compression compression; /// Whether to compress data when interacting with the server.
    Protocol::Secure secure;           /// Whether to encrypt data when interacting with the server.
--- a/src/Client/ConnectionPoolWithFailover.cpp
+++ b/src/Client/ConnectionPoolWithFailover.cpp
@ -56,6 +56,9 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts
        return tryGetEntry(pool, timeouts, fail_message, settings);
    };

+    size_t offset = 0;
+    if (settings)
+        offset = settings->load_balancing_first_offset % nested_pools.size();
    GetPriorityFunc get_priority;
    switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing)
    {
@ -68,7 +71,7 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts
    case LoadBalancing::RANDOM:
        break;
    case LoadBalancing::FIRST_OR_RANDOM:
-        get_priority = [](size_t i) -> size_t { return i >= 1; };
+        get_priority = [offset](size_t i) -> size_t { return i != offset; };
        break;
    case LoadBalancing::ROUND_ROBIN:
        if (last_used >= nested_pools.size())
@ -190,6 +193,9 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
    else
        throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR);

+    size_t offset = 0;
+    if (settings)
+        offset = settings->load_balancing_first_offset % nested_pools.size();
    GetPriorityFunc get_priority;
    switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing)
    {
@ -202,7 +208,7 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
    case LoadBalancing::RANDOM:
        break;
    case LoadBalancing::FIRST_OR_RANDOM:
-        get_priority = [](size_t i) -> size_t { return i >= 1; };
+        get_priority = [offset](size_t i) -> size_t { return i != offset; };
        break;
    case LoadBalancing::ROUND_ROBIN:
        if (last_used >= nested_pools.size())
--- a/src/Columns/ya.make
+++ b/src/Columns/ya.make
@ -2,6 +2,8 @@
 LIBRARY()

 ADDINCL(
+    contrib/libs/icu/common
+    contrib/libs/icu/i18n
    contrib/libs/pdqsort
 )

--- a/src/Common/OpenSSLHelpers.cpp
+++ b/src/Common/OpenSSLHelpers.cpp
@ -12,11 +12,26 @@ namespace DB
 {
 #pragma GCC diagnostic warning "-Wold-style-cast"

+std::string encodeSHA256(const std::string_view & text)
+{
+    return encodeSHA256(text.data(), text.size());
+}
+std::string encodeSHA256(const void * text, size_t size)
+{
+    std::string out;
+    out.resize(32);
+    encodeSHA256(text, size, reinterpret_cast<unsigned char *>(out.data()));
+    return out;
+}
 void encodeSHA256(const std::string_view & text, unsigned char * out)
+{
+    encodeSHA256(text.data(), text.size(), out);
+}
+void encodeSHA256(const void * text, size_t size, unsigned char * out)
 {
    SHA256_CTX ctx;
    SHA256_Init(&ctx);
-    SHA256_Update(&ctx, reinterpret_cast<const UInt8 *>(text.data()), text.size());
+    SHA256_Update(&ctx, reinterpret_cast<const UInt8 *>(text), size);
    SHA256_Final(out, &ctx);
 }

--- a/src/Common/OpenSSLHelpers.h
+++ b/src/Common/OpenSSLHelpers.h
@ -10,8 +10,13 @@

 namespace DB
 {
-/// Encodes `text` and puts the result to `out` which must be at least 32 bytes long.
+
+/// Encodes `text` and returns it.
+std::string encodeSHA256(const std::string_view & text);
+std::string encodeSHA256(const void * text, size_t size);
+/// `out` must be at least 32 bytes long.
 void encodeSHA256(const std::string_view & text, unsigned char * out);
+void encodeSHA256(const void * text, size_t size, unsigned char * out);

 /// Returns concatenation of error strings for all errors that OpenSSL has recorded, emptying the error queue.
 String getOpenSSLErrors();
--- a/src/Common/PODArray.h
+++ b/src/Common/PODArray.h
@ -214,6 +214,9 @@ public:
    void clear() { c_end = c_start; }

    template <typename ... TAllocatorParams>
+#if defined(__clang__)
+    ALWAYS_INLINE /// Better performance in clang build, worse performance in gcc build.
+#endif
    void reserve(size_t n, TAllocatorParams &&... allocator_params)
    {
        if (n > capacity())
--- a/src/Common/ThreadFuzzer.h
+++ b/src/Common/ThreadFuzzer.h
@ -31,10 +31,8 @@ namespace DB
  *
  * Notes:
  * - it can be also implemented with instrumentation (example: LLVM Xray) instead of signals.
-  * - it's also reasonable to insert glitches around interesting functions (example: mutex lock/unlock, starting of threads, etc.),
-  *   it is doable with wrapping these functions (todo).
  * - we should also make the sleep time random.
-  * - sleep obviously helps, but the effect of yield and migration is unclear.
+  * - sleep and migration obviously help, but the effect of yield is unclear.
  *
  * In addition, we allow to inject glitches around thread synchronization functions.
  * Example:
--- a/src/Common/Visitor.h
+++ b/src/Common/Visitor.h
@ -66,7 +66,8 @@ class Visitor<>
 public:
    using List = TypeList<>;

-    virtual ~Visitor() = default;
+protected:
+    ~Visitor() = default;
 };

 template <typename Type>
@ -76,6 +77,9 @@ public:
    using List = TypeList<Type>;

    virtual void visit(Type &) = 0;
+
+protected:
+    ~Visitor() = default;
 };

 template <typename Type, typename ... Types>
@ -86,6 +90,9 @@ public:
    using Visitor<Types ...>::visit;

    virtual void visit(Type &) = 0;
+
+protected:
+    ~Visitor() = default;
 };


@ -95,6 +102,8 @@ class VisitorImplHelper;
 template <typename Derived, typename VisitorBase>
 class VisitorImplHelper<Derived, VisitorBase> : public VisitorBase
 {
+protected:
+    ~VisitorImplHelper() = default;
 };

 template <typename Derived, typename VisitorBase, typename Type>
@ -111,6 +120,8 @@ protected:
        throw Exception("visitImpl(" + demangle(typeid(T).name()) + " &)" + " is not implemented for class"
                        + demangle(typeid(Derived).name()), ErrorCodes::LOGICAL_ERROR);
    }
+
+    ~VisitorImplHelper() = default;
 };

 template <typename Derived, typename VisitorBase, typename Type, typename ... Types>
@ -128,6 +139,8 @@ protected:
        throw Exception("visitImpl(" + demangle(typeid(T).name()) + " &)" + " is not implemented for class"
                        + demangle(typeid(Derived).name()), ErrorCodes::LOGICAL_ERROR);
    }
+
+    ~VisitorImplHelper() = default;
 };

 template <typename Derived, typename VisitorBase>
@ -140,6 +153,8 @@ class VisitorImpl : public
                >::Type
        >::Type
 {
+protected:
+    ~VisitorImpl() = default;
 };

 template <typename Derived, typename Base, typename Visitor>
--- a/src/Common/ZooKeeper/ZooKeeper.cpp
+++ b/src/Common/ZooKeeper/ZooKeeper.cpp
@ -200,6 +200,18 @@ ZooKeeper::ZooKeeper(const Poco::Util::AbstractConfiguration & config, const std
    init(args.implementation, args.hosts, args.identity, args.session_timeout_ms, args.operation_timeout_ms, args.chroot);
 }

+bool ZooKeeper::configChanged(const Poco::Util::AbstractConfiguration & config, const std::string & config_name) const
+{
+    ZooKeeperArgs args(config, config_name);
+
+    // Skip reloading testkeeper: it is only for tests and keeps its data in memory.
+    if (args.implementation == implementation && implementation == "testkeeper")
+        return false;
+
+    return std::tie(args.implementation, args.hosts, args.identity, args.session_timeout_ms, args.operation_timeout_ms, args.chroot)
+        != std::tie(implementation, hosts, identity, session_timeout_ms, operation_timeout_ms, chroot);
+}
+

 static Coordination::WatchCallback callbackForEvent(const EventPtr & watch)
 {
--- a/src/Common/ZooKeeper/ZooKeeper.h
+++ b/src/Common/ZooKeeper/ZooKeeper.h
@ -56,7 +56,7 @@ public:
              int32_t session_timeout_ms_ = DEFAULT_SESSION_TIMEOUT,
              int32_t operation_timeout_ms_ = DEFAULT_OPERATION_TIMEOUT,
              const std::string & chroot_ = "",
-              const std::string & implementation = "zookeeper");
+              const std::string & implementation_ = "zookeeper");

    /** Config of the form:
        <zookeeper>
@ -87,6 +87,8 @@ public:
    /// This object remains unchanged, and the new session is returned.
    Ptr startNewSession() const;

+    bool configChanged(const Poco::Util::AbstractConfiguration & config, const std::string & config_name) const;
+
    /// Returns true, if the session has expired.
    bool expired();

--- a/src/Core/Defines.h
+++ b/src/Core/Defines.h
@ -67,8 +67,11 @@
 /// Minimum revision supporting SettingsBinaryFormat::STRINGS.
 #define DBMS_MIN_REVISION_WITH_SETTINGS_SERIALIZED_AS_STRINGS 54429

+/// Minimum revision supporting interserver secret.
+#define DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET 54441
+
 /// Version of ClickHouse TCP protocol. Set to git tag with latest protocol change.
-#define DBMS_TCP_PROTOCOL_VERSION 54226
+#define DBMS_TCP_PROTOCOL_VERSION 54441

 /// The boundary on which the blocks for asynchronous file operations should be aligned.
 #define DEFAULT_AIO_FILE_BLOCK_SIZE 4096
--- a/src/Core/MySQL/MySQLReplication.cpp
+++ b/src/Core/MySQL/MySQLReplication.cpp
@ -2,6 +2,7 @@

 #include <DataTypes/DataTypeString.h>
 #include <IO/ReadBufferFromString.h>
+#include <IO/MySQLBinlogEventReadBuffer.h>
 #include <IO/ReadHelpers.h>
 #include <common/DateLUT.h>
 #include <Common/FieldVisitors.h>
@ -14,6 +15,7 @@ namespace ErrorCodes
 {
    extern const int UNKNOWN_EXCEPTION;
    extern const int LOGICAL_ERROR;
+    extern const int ATTEMPT_TO_READ_AFTER_EOF;
 }

 namespace MySQLReplication
@ -49,14 +51,13 @@ namespace MySQLReplication
    {
        payload.readStrict(reinterpret_cast<char *>(&binlog_version), 2);
        assert(binlog_version == EVENT_VERSION_V4);
+        server_version.resize(50);
        payload.readStrict(reinterpret_cast<char *>(server_version.data()), 50);
        payload.readStrict(reinterpret_cast<char *>(&create_timestamp), 4);
        payload.readStrict(reinterpret_cast<char *>(&event_header_length), 1);
        assert(event_header_length == EVENT_HEADER_LENGTH);

-        size_t len = header.event_size - (2 + 50 + 4 + 1 + EVENT_HEADER_LENGTH) - 1;
-        event_type_header_length.resize(len);
-        payload.readStrict(reinterpret_cast<char *>(event_type_header_length.data()), len);
+        readStringUntilEOF(event_type_header_length, payload);
    }

    void FormatDescriptionEvent::dump(std::ostream & out) const
@ -72,9 +73,7 @@ namespace MySQLReplication
    void RotateEvent::parseImpl(ReadBuffer & payload)
    {
        payload.readStrict(reinterpret_cast<char *>(&position), 8);
-        size_t len = header.event_size - EVENT_HEADER_LENGTH - 8 - CHECKSUM_CRC32_SIGNATURE_LENGTH;
-        next_binlog.resize(len);
-        payload.readStrict(reinterpret_cast<char *>(next_binlog.data()), len);
+        readStringUntilEOF(next_binlog, payload);
    }

    void RotateEvent::dump(std::ostream & out) const
@ -100,9 +99,7 @@ namespace MySQLReplication
        payload.readStrict(reinterpret_cast<char *>(schema.data()), schema_len);
        payload.ignore(1);

-        size_t len = payload.available() - CHECKSUM_CRC32_SIGNATURE_LENGTH;
-        query.resize(len);
-        payload.readStrict(reinterpret_cast<char *>(query.data()), len);
+        readStringUntilEOF(query, payload);
        if (query.starts_with("BEGIN") || query.starts_with("COMMIT"))
        {
            typ = QUERY_EVENT_MULTI_TXN_FLAG;
@ -285,7 +282,7 @@ namespace MySQLReplication
                break;
        }

-        while (payload.available() > CHECKSUM_CRC32_SIGNATURE_LENGTH)
+        while (!payload.eof())
        {
            parseRow(payload, columns_present_bitmap1);
            if (header.type == UPDATE_ROWS_EVENT_V1 || header.type == UPDATE_ROWS_EVENT_V2)
@ -738,7 +735,7 @@ namespace MySQLReplication
        payload.readStrict(reinterpret_cast<char *>(&gtid.seq_no), 8);

        /// Skip others.
-        payload.ignore(payload.available() - CHECKSUM_CRC32_SIGNATURE_LENGTH);
+        payload.ignoreAll();
    }

    void GTIDEvent::dump(std::ostream & out) const
@ -751,7 +748,7 @@ namespace MySQLReplication
        out << "GTID Next: " << gtid_next << std::endl;
    }

-    void DryRunEvent::parseImpl(ReadBuffer & payload) { payload.ignore(header.event_size - EVENT_HEADER_LENGTH); }
+    void DryRunEvent::parseImpl(ReadBuffer & payload) { payload.ignoreAll(); }

    void DryRunEvent::dump(std::ostream & out) const
    {
@ -804,6 +801,9 @@ namespace MySQLReplication

    void MySQLFlavor::readPayloadImpl(ReadBuffer & payload)
    {
+        if (payload.eof())
+            throw Exception("Attempt to read after EOF.", ErrorCodes::ATTEMPT_TO_READ_AFTER_EOF);
+
        UInt16 header = static_cast<unsigned char>(*payload.position());
        switch (header)
        {
@ -814,37 +814,42 @@ namespace MySQLReplication
                err.readPayloadWithUnpacked(payload);
                throw ReplicationError(err.error_message, ErrorCodes::UNKNOWN_EXCEPTION);
        }
-        // skip the header flag.
+        // skip the generic response packets header flag.
        payload.ignore(1);

-        EventType event_type = static_cast<EventType>(*(payload.position() + 4));
-        switch (event_type)
+        MySQLBinlogEventReadBuffer event_payload(payload);
+
+        EventHeader event_header;
+        event_header.parse(event_payload);
+
+        switch (event_header.type)
        {
-            case FORMAT_DESCRIPTION_EVENT: {
-                event = std::make_shared<FormatDescriptionEvent>();
-                event->parseHeader(payload);
-                event->parseEvent(payload);
+            case FORMAT_DESCRIPTION_EVENT:
+            {
+                event = std::make_shared<FormatDescriptionEvent>(std::move(event_header));
+                event->parseEvent(event_payload);
                position.update(event);
                break;
            }
-            case ROTATE_EVENT: {
-                event = std::make_shared<RotateEvent>();
-                event->parseHeader(payload);
-                event->parseEvent(payload);
+            case ROTATE_EVENT:
+            {
+                event = std::make_shared<RotateEvent>(std::move(event_header));
+                event->parseEvent(event_payload);
                position.update(event);
                break;
            }
-            case QUERY_EVENT: {
-                event = std::make_shared<QueryEvent>();
-                event->parseHeader(payload);
-                event->parseEvent(payload);
+            case QUERY_EVENT:
+            {
+                event = std::make_shared<QueryEvent>(std::move(event_header));
+                event->parseEvent(event_payload);

                auto query = std::static_pointer_cast<QueryEvent>(event);
                switch (query->typ)
                {
                    case QUERY_EVENT_MULTI_TXN_FLAG:
-                    case QUERY_EVENT_XA: {
-                        event = std::make_shared<DryRunEvent>();
+                    case QUERY_EVENT_XA:
+                    {
+                        event = std::make_shared<DryRunEvent>(std::move(query->header));
                        break;
                    }
                    default:
@ -852,68 +857,67 @@ namespace MySQLReplication
                }
                break;
            }
-            case XID_EVENT: {
-                event = std::make_shared<XIDEvent>();
-                event->parseHeader(payload);
-                event->parseEvent(payload);
+            case XID_EVENT:
+            {
+                event = std::make_shared<XIDEvent>(std::move(event_header));
+                event->parseEvent(event_payload);
                position.update(event);
                break;
            }
-            case TABLE_MAP_EVENT: {
-                event = std::make_shared<TableMapEvent>();
-                event->parseHeader(payload);
-                event->parseEvent(payload);
+            case TABLE_MAP_EVENT:
+            {
+                event = std::make_shared<TableMapEvent>(std::move(event_header));
+                event->parseEvent(event_payload);
                table_map = std::static_pointer_cast<TableMapEvent>(event);
                break;
            }
            case WRITE_ROWS_EVENT_V1:
-            case WRITE_ROWS_EVENT_V2: {
+            case WRITE_ROWS_EVENT_V2:
+            {
                if (do_replicate())
-                    event = std::make_shared<WriteRowsEvent>(table_map);
+                    event = std::make_shared<WriteRowsEvent>(table_map, std::move(event_header));
                else
-                    event = std::make_shared<DryRunEvent>();
+                    event = std::make_shared<DryRunEvent>(std::move(event_header));

-                event->parseHeader(payload);
-                event->parseEvent(payload);
+                event->parseEvent(event_payload);
                break;
            }
            case DELETE_ROWS_EVENT_V1:
-            case DELETE_ROWS_EVENT_V2: {
+            case DELETE_ROWS_EVENT_V2:
+            {
                if (do_replicate())
-                    event = std::make_shared<DeleteRowsEvent>(table_map);
+                    event = std::make_shared<DeleteRowsEvent>(table_map, std::move(event_header));
                else
-                    event = std::make_shared<DryRunEvent>();
+                    event = std::make_shared<DryRunEvent>(std::move(event_header));

-                event->parseHeader(payload);
-                event->parseEvent(payload);
+                event->parseEvent(event_payload);
                break;
            }
            case UPDATE_ROWS_EVENT_V1:
-            case UPDATE_ROWS_EVENT_V2: {
+            case UPDATE_ROWS_EVENT_V2:
+            {
                if (do_replicate())
-                    event = std::make_shared<UpdateRowsEvent>(table_map);
+                    event = std::make_shared<UpdateRowsEvent>(table_map, std::move(event_header));
                else
-                    event = std::make_shared<DryRunEvent>();
+                    event = std::make_shared<DryRunEvent>(std::move(event_header));

-                event->parseHeader(payload);
-                event->parseEvent(payload);
+                event->parseEvent(event_payload);
                break;
            }
-            case GTID_EVENT: {
-                event = std::make_shared<GTIDEvent>();
-                event->parseHeader(payload);
-                event->parseEvent(payload);
+            case GTID_EVENT:
+            {
+                event = std::make_shared<GTIDEvent>(std::move(event_header));
+                event->parseEvent(event_payload);
                position.update(event);
                break;
            }
-            default: {
-                event = std::make_shared<DryRunEvent>();
-                event->parseHeader(payload);
-                event->parseEvent(payload);
+            default:
+            {
+                event = std::make_shared<DryRunEvent>(std::move(event_header));
+                event->parseEvent(event_payload);
                break;
            }
        }
-        payload.ignoreAll();
    }
 }

--- a/src/Core/MySQL/MySQLReplication.h
+++ b/src/Core/MySQL/MySQLReplication.h
@ -19,7 +19,6 @@ namespace MySQLReplication
 {
    static const int EVENT_VERSION_V4 = 4;
    static const int EVENT_HEADER_LENGTH = 19;
-    static const int CHECKSUM_CRC32_SIGNATURE_LENGTH = 4;

    using Bitmap = boost::dynamic_bitset<>;

@ -301,9 +300,10 @@ namespace MySQLReplication
    public:
        EventHeader header;

+        EventBase(EventHeader && header_) : header(std::move(header_)) {}
+
        virtual ~EventBase() = default;
        virtual void dump(std::ostream & out) const = 0;
-        virtual void parseHeader(ReadBuffer & payload) { header.parse(payload); }
        virtual void parseEvent(ReadBuffer & payload) { parseImpl(payload); }
        virtual MySQLEventType type() const { return MYSQL_UNHANDLED_EVENT; }

@ -314,7 +314,10 @@ namespace MySQLReplication
    class FormatDescriptionEvent : public EventBase
    {
    public:
-        FormatDescriptionEvent() : binlog_version(0), create_timestamp(0), event_header_length(0) { }
+        FormatDescriptionEvent(EventHeader && header_)
+            : EventBase(std::move(header_)), binlog_version(0), create_timestamp(0), event_header_length(0)
+        {
+        }

    protected:
        UInt16 binlog_version;
@ -336,7 +339,7 @@ namespace MySQLReplication
        UInt64 position;
        String next_binlog;

-        RotateEvent() : position(0) { }
+        RotateEvent(EventHeader && header_) : EventBase(std::move(header_)), position(0) {}
        void dump(std::ostream & out) const override;

    protected:
@ -363,7 +366,11 @@ namespace MySQLReplication
        String query;
        QueryType typ = QUERY_EVENT_DDL;

-        QueryEvent() : thread_id(0), exec_time(0), schema_len(0), error_code(0), status_len(0) { }
+        QueryEvent(EventHeader && header_)
+            : EventBase(std::move(header_)), thread_id(0), exec_time(0), schema_len(0), error_code(0), status_len(0)
+        {
+        }
+
        void dump(std::ostream & out) const override;
        MySQLEventType type() const override { return MYSQL_QUERY_EVENT; }

@ -374,7 +381,7 @@ namespace MySQLReplication
    class XIDEvent : public EventBase
    {
    public:
-        XIDEvent() : xid(0) { }
+        XIDEvent(EventHeader && header_) : EventBase(std::move(header_)), xid(0) {}

    protected:
        UInt64 xid;
@ -397,7 +404,7 @@ namespace MySQLReplication
        std::vector<UInt16> column_meta;
        Bitmap null_bitmap;

-        TableMapEvent() : table_id(0), flags(0), schema_len(0), table_len(0), column_count(0) { }
+        TableMapEvent(EventHeader && header_) : EventBase(std::move(header_)), table_id(0), flags(0), schema_len(0), table_len(0), column_count(0) {}
        void dump(std::ostream & out) const override;

    protected:
@ -413,8 +420,8 @@ namespace MySQLReplication
        String table;
        std::vector<Field> rows;

-        RowsEvent(std::shared_ptr<TableMapEvent> table_map_)
-            : number_columns(0), table_id(0), flags(0), extra_data_len(0), table_map(table_map_)
+        RowsEvent(std::shared_ptr<TableMapEvent> table_map_, EventHeader && header_)
+            : EventBase(std::move(header_)), number_columns(0), table_id(0), flags(0), extra_data_len(0), table_map(table_map_)
        {
            schema = table_map->schema;
            table = table_map->table;
@ -439,21 +446,21 @@ namespace MySQLReplication
    class WriteRowsEvent : public RowsEvent
    {
    public:
-        WriteRowsEvent(std::shared_ptr<TableMapEvent> table_map_) : RowsEvent(table_map_) { }
+        WriteRowsEvent(std::shared_ptr<TableMapEvent> table_map_, EventHeader && header_) : RowsEvent(table_map_, std::move(header_)) {}
        MySQLEventType type() const override { return MYSQL_WRITE_ROWS_EVENT; }
    };

    class DeleteRowsEvent : public RowsEvent
    {
    public:
-        DeleteRowsEvent(std::shared_ptr<TableMapEvent> table_map_) : RowsEvent(table_map_) { }
+        DeleteRowsEvent(std::shared_ptr<TableMapEvent> table_map_, EventHeader && header_) : RowsEvent(table_map_, std::move(header_)) {}
        MySQLEventType type() const override { return MYSQL_DELETE_ROWS_EVENT; }
    };

    class UpdateRowsEvent : public RowsEvent
    {
    public:
-        UpdateRowsEvent(std::shared_ptr<TableMapEvent> table_map_) : RowsEvent(table_map_) { }
+        UpdateRowsEvent(std::shared_ptr<TableMapEvent> table_map_, EventHeader && header_) : RowsEvent(table_map_, std::move(header_)) {}
        MySQLEventType type() const override { return MYSQL_UPDATE_ROWS_EVENT; }
    };

@ -463,7 +470,7 @@ namespace MySQLReplication
        UInt8 commit_flag;
        GTID gtid;

-        GTIDEvent() : commit_flag(0) { }
+        GTIDEvent(EventHeader && header_) : EventBase(std::move(header_)), commit_flag(0) {}
        void dump(std::ostream & out) const override;

    protected:
@ -472,6 +479,8 @@ namespace MySQLReplication

    class DryRunEvent : public EventBase
    {
+    public:
+        DryRunEvent(EventHeader && header_) : EventBase(std::move(header_)) {}
        void dump(std::ostream & out) const override;

    protected:
--- a/src/Core/Protocol.h
+++ b/src/Core/Protocol.h
@ -52,6 +52,10 @@ namespace DB
 /// Using this block the client can initialize the output formatter and display the prefix of resulting table
 /// beforehand.

+/// Marker of the inter-server secret (passed in the user name)
+/// (a user name cannot start with whitespace anyway)
+const char USER_INTERSERVER_MARKER[] = " INTERSERVER SECRET ";
+
 namespace Protocol
 {
    /// Packet types that server transmits.
@ -71,6 +75,8 @@ namespace Protocol
            TablesStatusResponse = 9, /// A response to TablesStatus request.
            Log = 10,                 /// System logs of the query execution
            TableColumns = 11,        /// Columns' description for default values calculation
+
+            MAX = TableColumns,
        };

        /// NOTE: If the type of packet argument would be Enum, the comparison packet >= 0 && packet < 10
@ -79,9 +85,21 @@ namespace Protocol
        /// See https://www.securecoding.cert.org/confluence/display/cplusplus/INT36-CPP.+Do+not+use+out-of-range+enumeration+values
        inline const char * toString(UInt64 packet)
        {
-            static const char * data[] = { "Hello", "Data", "Exception", "Progress", "Pong", "EndOfStream", "ProfileInfo", "Totals",
-                "Extremes", "TablesStatusResponse", "Log", "TableColumns" };
-            return packet < 12
+            static const char * data[] = {
+                "Hello",
+                "Data",
+                "Exception",
+                "Progress",
+                "Pong",
+                "EndOfStream",
+                "ProfileInfo",
+                "Totals",
+                "Extremes",
+                "TablesStatusResponse",
+                "Log",
+                "TableColumns",
+            };
+            return packet <= MAX
                ? data[packet]
                : "Unknown packet";
        }
@ -113,13 +131,27 @@ namespace Protocol
            Ping = 4,                /// Check that connection to the server is alive.
            TablesStatusRequest = 5, /// Check status of tables on the server.
            KeepAlive = 6,           /// Keep the connection alive
-            Scalar = 7               /// A block of data (compressed or not).
+            Scalar = 7,              /// A block of data (compressed or not).
+
+            MAX = Scalar,
        };

+        /// Returns the name of a client packet type, or "Unknown packet" for values above MAX.
+        /// The table must hold exactly one name per Enum value, in numeric order (MAX + 1 entries).
        inline const char * toString(UInt64 packet)
        {
-            static const char * data[] = { "Hello", "Query", "Data", "Cancel", "Ping", "TablesStatusRequest", "KeepAlive" };
-            return packet < 7
+            static const char * data[] = {
+                "Hello",
+                "Query",
+                "Data",
+                "Cancel",
+                "Ping",
+                "TablesStatusRequest",
+                "KeepAlive",
+                "Scalar",
+            };
+            static_assert(sizeof(data) / sizeof(data[0]) == static_cast<size_t>(MAX) + 1, "Every Client packet type must have a name");
+            return packet <= MAX
                ? data[packet]
                : "Unknown packet";
        }
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -88,6 +88,7 @@ class IColumn;
    M(UInt64, replication_alter_columns_timeout, 60, "Wait for actions to change the table structure within the specified number of seconds. 0 - wait unlimited time.", 0) \
    \
    M(LoadBalancing, load_balancing, LoadBalancing::RANDOM, "Which replicas (among healthy replicas) to preferably send a query to (on the first attempt) for distributed processing.", 0) \
+    M(UInt64, load_balancing_first_offset, 0, "Which replica to preferably send a query when FIRST_OR_RANDOM load balancing strategy is used.", 0) \
    \
    M(TotalsMode, totals_mode, TotalsMode::AFTER_HAVING_EXCLUSIVE, "How to calculate TOTALS when HAVING is present, as well as when max_rows_to_group_by and group_by_overflow_mode = ‘any’ are present.", IMPORTANT) \
    M(Float, totals_auto_threshold, 0.5, "The threshold for totals_mode = 'auto'.", 0) \
@ -232,6 +233,10 @@ class IColumn;
    M(UInt64, max_bytes_to_read, 0, "Limit on read bytes (after decompression) from the most 'deep' sources. That is, only in the deepest subquery. When reading from a remote server, it is only checked on a remote server.", 0) \
    M(OverflowMode, read_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \
    \
+    M(UInt64, max_rows_to_read_leaf, 0, "Limit on read rows on the leaf nodes for distributed queries. Limit is applied for local reads only excluding the final merge stage on the root node.", 0) \
+    M(UInt64, max_bytes_to_read_leaf, 0, "Limit on read bytes (after decompression) on the leaf nodes for distributed queries. Limit is applied for local reads only excluding the final merge stage on the root node.", 0) \
+    M(OverflowMode, read_overflow_mode_leaf, OverflowMode::THROW, "What to do when the leaf limit is exceeded.", 0) \
+    \
    M(UInt64, max_rows_to_group_by, 0, "", 0) \
    M(OverflowModeGroupBy, group_by_overflow_mode, OverflowMode::THROW, "What to do when the limit is exceeded.", 0) \
    M(UInt64, max_bytes_before_external_group_by, 0, "", 0) \
--- a/src/Databases/DatabaseAtomic.cpp
+++ b/src/Databases/DatabaseAtomic.cpp
@ -302,7 +302,7 @@ void DatabaseAtomic::assertDetachedTableNotInUse(const UUID & uuid)
    /// To avoid it, we remember UUIDs of detached tables and does not allow ATTACH table with such UUID until detached instance still in use.
    if (detached_tables.count(uuid))
        throw Exception("Cannot attach table with UUID " + toString(uuid) +
-              ", because it was detached but still used by come query. Retry later.", ErrorCodes::TABLE_ALREADY_EXISTS);
+              ", because it was detached but still used by some query. Retry later.", ErrorCodes::TABLE_ALREADY_EXISTS);
 }

 DatabaseAtomic::DetachedTables DatabaseAtomic::cleenupDetachedTables()
--- a/src/Databases/MySQL/MaterializeMetadata.cpp
+++ b/src/Databases/MySQL/MaterializeMetadata.cpp
@ -145,7 +145,7 @@ void MaterializeMetadata::transaction(const MySQLReplication::Position & positio
    String persistent_tmp_path = persistent_path + ".tmp";

    {
-        WriteBufferFromFile out(persistent_tmp_path, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_TRUNC | O_CREAT | O_EXCL);
+        WriteBufferFromFile out(persistent_tmp_path, DBMS_DEFAULT_BUFFER_SIZE, O_WRONLY | O_TRUNC | O_CREAT);

        /// TSV format metadata file.
        writeString("Version:\t" + toString(meta_version), out);
--- a/src/Dictionaries/ClickHouseDictionarySource.cpp
+++ b/src/Dictionaries/ClickHouseDictionarySource.cpp
@ -40,6 +40,8 @@ static ConnectionPoolWithFailoverPtr createPool(
        db,
        user,
        password,
+        "", /* cluster */
+        "", /* cluster_secret */
        "ClickHouseDictionarySource",
        Protocol::Compression::Enable,
        secure ? Protocol::Secure::Enable : Protocol::Secure::Disable));
--- a/src/Functions/GatherUtils/Algorithms.h
+++ b/src/Functions/GatherUtils/Algorithms.h
@ -21,6 +21,7 @@ namespace DB::GatherUtils

 inline constexpr size_t MAX_ARRAY_SIZE = 1 << 30;

+
 /// Methods to copy Slice to Sink, overloaded for various combinations of types.

 template <typename T>
@ -781,3 +782,4 @@ void resizeConstantSize(ArraySource && array_source, ValueSource && value_source
 }

 }
+
--- a/src/Functions/GatherUtils/ArraySinkVisitor.h
+++ b/src/Functions/GatherUtils/ArraySinkVisitor.h
@ -4,6 +4,7 @@

 namespace DB::GatherUtils
 {
+#pragma GCC visibility push(hidden)

 template <typename T>
 struct NumericArraySink;
@ -18,9 +19,18 @@ using BasicArraySinks = typename AppendToTypeList<GenericArraySink, NumericArray
 using NullableArraySinks = typename TypeListMap<NullableArraySink, BasicArraySinks>::Type;
 using TypeListArraySinks = typename TypeListConcat<BasicArraySinks, NullableArraySinks>::Type;

-class ArraySinkVisitor : public ApplyTypeListForClass<Visitor, TypeListArraySinks>::Type {};
+class ArraySinkVisitor : public ApplyTypeListForClass<Visitor, TypeListArraySinks>::Type
+{
+protected:
+    ~ArraySinkVisitor() = default;
+};

 template <typename Derived>
-class ArraySinkVisitorImpl : public VisitorImpl<Derived, ArraySinkVisitor> {};
+class ArraySinkVisitorImpl : public VisitorImpl<Derived, ArraySinkVisitor>
+{
+protected:
+    ~ArraySinkVisitorImpl() = default;
+};

+#pragma GCC visibility pop
 }
--- a/src/Functions/GatherUtils/ArraySourceVisitor.h
+++ b/src/Functions/GatherUtils/ArraySourceVisitor.h
@ -4,6 +4,7 @@

 namespace DB::GatherUtils
 {
+#pragma GCC visibility push(hidden)

 template <typename T>
 struct NumericArraySource;
@ -23,9 +24,18 @@ using BasicAndNullableArraySources = typename TypeListConcat<BasicArraySources,
 using ConstArraySources = typename TypeListMap<ConstSource, BasicAndNullableArraySources>::Type;
 using TypeListArraySources = typename TypeListConcat<BasicAndNullableArraySources, ConstArraySources>::Type;

-class ArraySourceVisitor : public ApplyTypeListForClass<Visitor, TypeListArraySources>::Type {};
+class ArraySourceVisitor : public ApplyTypeListForClass<Visitor, TypeListArraySources>::Type
+{
+protected:
+    ~ArraySourceVisitor() = default;
+};

 template <typename Derived>
-class ArraySourceVisitorImpl : public VisitorImpl<Derived, ArraySourceVisitor> {};
+class ArraySourceVisitorImpl : public VisitorImpl<Derived, ArraySourceVisitor>
+{
+protected:
+    ~ArraySourceVisitorImpl() = default;
+};

+#pragma GCC visibility pop
 }
--- a/src/Functions/GatherUtils/IArraySink.h
+++ b/src/Functions/GatherUtils/IArraySink.h
@ -13,6 +13,7 @@ namespace ErrorCodes

 namespace GatherUtils
 {
+#pragma GCC visibility push(hidden)

 struct IArraySink
 {
@ -27,6 +28,7 @@ struct IArraySink
 template <typename Derived>
 class ArraySinkImpl : public Visitable<Derived, IArraySink, ArraySinkVisitor> {};

+#pragma GCC visibility pop
 }

 }
--- a/src/Functions/GatherUtils/IArraySource.h
+++ b/src/Functions/GatherUtils/IArraySource.h
@ -13,6 +13,7 @@ namespace ErrorCodes

 namespace GatherUtils
 {
+#pragma GCC visibility push(hidden)

 struct IArraySource
 {
@ -33,6 +34,7 @@ struct IArraySource
 template <typename Derived>
 class ArraySourceImpl : public Visitable<Derived, IArraySource, ArraySourceVisitor> {};

+#pragma GCC visibility pop
 }

 }
--- a/src/Functions/GatherUtils/IValueSource.h
+++ b/src/Functions/GatherUtils/IValueSource.h
@ -13,6 +13,7 @@ namespace ErrorCodes

 namespace GatherUtils
 {
+#pragma GCC visibility push(hidden)

 struct IValueSource
 {
@ -29,6 +30,7 @@ struct IValueSource
 template <typename Derived>
 class ValueSourceImpl : public Visitable<Derived, IValueSource, ValueSourceVisitor> {};

+#pragma GCC visibility pop
 }

 }
--- a/src/Functions/GatherUtils/Selectors.h
+++ b/src/Functions/GatherUtils/Selectors.h
@ -17,6 +17,7 @@ namespace ErrorCodes

 namespace GatherUtils
 {
+#pragma GCC visibility push(hidden)

 /// Base classes which selects template function implementation with concrete ArraySource or ArraySink
 /// Derived classes should implement selectImpl for ArraySourceSelector and ArraySinkSelector,
@ -32,7 +33,7 @@ void callSelectMemberFunctionWithTupleArgument(Tuple & tuple, Args && ... args)
 }

 template <typename Base, typename ... Args>
-struct ArraySourceSelectorVisitor : public ArraySourceVisitorImpl<ArraySourceSelectorVisitor<Base, Args ...>>
+struct ArraySourceSelectorVisitor final : public ArraySourceVisitorImpl<ArraySourceSelectorVisitor<Base, Args ...>>
 {
    explicit ArraySourceSelectorVisitor(Args && ... args) : packed_args(args ...) {}

@ -60,7 +61,7 @@ struct ArraySourceSelector


 template <typename Base, typename ... Args>
-struct ArraySinkSelectorVisitor : public ArraySinkVisitorImpl<ArraySinkSelectorVisitor<Base, Args ...>>
+struct ArraySinkSelectorVisitor final : public ArraySinkVisitorImpl<ArraySinkSelectorVisitor<Base, Args ...>>
 {
    explicit ArraySinkSelectorVisitor(Args && ... args) : packed_args(args ...) {}

@ -88,7 +89,7 @@ struct ArraySinkSelector


 template <typename Base, typename ... Args>
-struct ValueSourceSelectorVisitor : public ValueSourceVisitorImpl<ValueSourceSelectorVisitor<Base, Args ...>>
+struct ValueSourceSelectorVisitor final : public ValueSourceVisitorImpl<ValueSourceSelectorVisitor<Base, Args ...>>
 {
    explicit ValueSourceSelectorVisitor(Args && ... args) : packed_args(args ...) {}

@ -201,6 +202,7 @@ struct ArrayAndValueSourceSelectorBySink : public ArraySinkSelector<ArrayAndValu
    }
 };

+#pragma GCC visibility pop
 }

 }
--- a/src/Functions/GatherUtils/Sinks.h
+++ b/src/Functions/GatherUtils/Sinks.h
@ -15,6 +15,7 @@

 namespace DB::GatherUtils
 {
+#pragma GCC visibility push(hidden)

 template <typename T>
 struct NumericArraySource;
@ -214,5 +215,5 @@ struct NullableArraySink : public ArraySink
    }
 };

-
+#pragma GCC visibility pop
 }
--- a/src/Functions/GatherUtils/Slices.h
+++ b/src/Functions/GatherUtils/Slices.h
@ -4,6 +4,7 @@

 namespace DB::GatherUtils
 {
+#pragma GCC visibility push(hidden)

 template <typename T>
 struct NumericArraySlice
@ -42,5 +43,6 @@ struct GenericValueSlice
    static constexpr size_t size = 1;
 };

+#pragma GCC visibility pop
 }

--- a/src/Functions/GatherUtils/Sources.h
+++ b/src/Functions/GatherUtils/Sources.h
@ -28,6 +28,7 @@ namespace ErrorCodes

 namespace GatherUtils
 {
+#pragma GCC visibility push(hidden)

 template <typename T> struct NumericArraySink;
 struct StringSink;
@ -819,4 +820,5 @@ struct NullableValueSource : public ValueSource

 }

+#pragma GCC visibility pop
 }
--- a/src/Functions/GatherUtils/ValueSourceVisitor.h
+++ b/src/Functions/GatherUtils/ValueSourceVisitor.h
@ -4,6 +4,7 @@

 namespace DB::GatherUtils
 {
+#pragma GCC visibility push(hidden)

 template <typename T>
 struct NumericValueSource;
@ -23,9 +24,18 @@ using BasicAndNullableValueSources = typename TypeListConcat<BasicValueSources,
 using ConstValueSources = typename TypeListMap<ConstSource, BasicAndNullableValueSources>::Type;
 using TypeListValueSources = typename TypeListConcat<BasicAndNullableValueSources, ConstValueSources>::Type;

-class ValueSourceVisitor : public ApplyTypeListForClass<Visitor, TypeListValueSources>::Type {};
+class ValueSourceVisitor : public ApplyTypeListForClass<Visitor, TypeListValueSources>::Type
+{
+protected:
+    ~ValueSourceVisitor() = default;
+};

 template <typename Derived>
-class ValueSourceVisitorImpl : public VisitorImpl<Derived, ValueSourceVisitor> {};
+class ValueSourceVisitorImpl : public VisitorImpl<Derived, ValueSourceVisitor>
+{
+protected:
+    ~ValueSourceVisitorImpl() = default;
+};

+#pragma GCC visibility pop
 }
--- a/src/Functions/GatherUtils/concat.cpp
+++ b/src/Functions/GatherUtils/concat.cpp
@ -16,6 +16,9 @@ namespace ErrorCodes
 namespace GatherUtils
 {

+namespace
+{
+
 struct ArrayConcat : public ArraySourceSelector<ArrayConcat>
 {
    using Sources = std::vector<std::unique_ptr<IArraySource>>;
@ -54,6 +57,8 @@ struct ArrayConcat : public ArraySourceSelector<ArrayConcat>
    }
 };

+}
+
 ColumnArray::MutablePtr concat(const std::vector<std::unique_ptr<IArraySource>> & sources)
 {
    if (sources.empty())
--- a/src/Functions/GatherUtils/createArraySink.cpp
+++ b/src/Functions/GatherUtils/createArraySink.cpp
@ -7,6 +7,9 @@ namespace DB::GatherUtils
 {
 /// Creates IArraySink from ColumnArray

+namespace
+{
+
 template <typename... Types>
 struct ArraySinkCreator;

@ -48,6 +51,8 @@ struct ArraySinkCreator<>
    }
 };

+}
+
 std::unique_ptr<IArraySink> createArraySink(ColumnArray & col, size_t column_size)
 {
    using Creator = ApplyTypeListForClass<ArraySinkCreator, TypeListNumbersAndUInt128>::Type;
--- a/src/Functions/GatherUtils/createArraySource.cpp
+++ b/src/Functions/GatherUtils/createArraySource.cpp
@ -7,6 +7,9 @@ namespace DB::GatherUtils
 {
 /// Creates IArraySource from ColumnArray

+namespace
+{
+
 template <typename... Types>
 struct ArraySourceCreator;

@ -51,6 +54,8 @@ struct ArraySourceCreator<>
    }
 };

+}
+
 std::unique_ptr<IArraySource> createArraySource(const ColumnArray & col, bool is_const, size_t total_rows)
 {
    using Creator = typename ApplyTypeListForClass<ArraySourceCreator, TypeListNumbersAndUInt128>::Type;
--- a/src/Functions/GatherUtils/createValueSource.cpp
+++ b/src/Functions/GatherUtils/createValueSource.cpp
@ -7,6 +7,9 @@ namespace DB::GatherUtils
 {
 /// Creates IValueSource from Column

+namespace
+{
+
 template <typename... Types>
 struct ValueSourceCreator;

@ -51,6 +54,8 @@ struct ValueSourceCreator<>
    }
 };

+}
+
 std::unique_ptr<IValueSource> createValueSource(const IColumn & col, bool is_const, size_t total_rows)
 {
    using Creator = typename ApplyTypeListForClass<ValueSourceCreator, TypeListNumbersAndUInt128>::Type;
--- a/src/Functions/GatherUtils/has_all.cpp
+++ b/src/Functions/GatherUtils/has_all.cpp
@ -5,6 +5,9 @@
 namespace DB::GatherUtils
 {

+namespace
+{
+
 struct ArrayHasAllSelectArraySourcePair : public ArraySourcePairSelector<ArrayHasAllSelectArraySourcePair>
 {
    template <typename FirstSource, typename SecondSource>
@ -14,6 +17,7 @@ struct ArrayHasAllSelectArraySourcePair : public ArraySourcePairSelector<ArrayHa
    }
 };

+}

 void sliceHasAll(IArraySource & first, IArraySource & second, ColumnUInt8 & result)
 {
--- a/src/Functions/GatherUtils/has_any.cpp
+++ b/src/Functions/GatherUtils/has_any.cpp
@ -5,6 +5,9 @@
 namespace DB::GatherUtils
 {

+namespace
+{
+
 struct ArrayHasAnySelectArraySourcePair : public ArraySourcePairSelector<ArrayHasAnySelectArraySourcePair>
 {
    template <typename FirstSource, typename SecondSource>
@ -14,6 +17,7 @@ struct ArrayHasAnySelectArraySourcePair : public ArraySourcePairSelector<ArrayHa
    }
 };

+}

 void sliceHasAny(IArraySource & first, IArraySource & second, ColumnUInt8 & result)
 {
--- a/src/Functions/GatherUtils/has_substr.cpp
+++ b/src/Functions/GatherUtils/has_substr.cpp
@ -5,6 +5,9 @@
 namespace DB::GatherUtils
 {

+namespace
+{
+
 struct ArrayHasSubstrSelectArraySourcePair : public ArraySourcePairSelector<ArrayHasSubstrSelectArraySourcePair>
 {
    template <typename FirstSource, typename SecondSource>
@ -14,6 +17,7 @@ struct ArrayHasSubstrSelectArraySourcePair : public ArraySourcePairSelector<Arra
    }
 };

+}

 void sliceHasSubstr(IArraySource & first, IArraySource & second, ColumnUInt8 & result)
 {
--- a/src/Functions/GatherUtils/push.cpp
+++ b/src/Functions/GatherUtils/push.cpp
@ -5,6 +5,9 @@
 namespace DB::GatherUtils
 {

+namespace
+{
+
 struct ArrayPush : public ArrayAndValueSourceSelectorBySink<ArrayPush>
 {
    template <typename ArraySource, typename ValueSource, typename Sink>
@ -18,6 +21,7 @@ struct ArrayPush : public ArrayAndValueSourceSelectorBySink<ArrayPush>
    }
 };

+}

 void push(IArraySource & array_source, IValueSource & value_source, IArraySink & sink, bool push_front)
 {
--- a/src/Functions/GatherUtils/resizeConstantSize.cpp
+++ b/src/Functions/GatherUtils/resizeConstantSize.cpp
@ -7,6 +7,9 @@
 namespace DB::GatherUtils
 {

+namespace
+{
+
 struct ArrayResizeConstant : public ArrayAndValueSourceSelectorBySink<ArrayResizeConstant>
 {
    template <typename ArraySource, typename ValueSource, typename Sink>
@ -17,6 +20,7 @@ struct ArrayResizeConstant : public ArrayAndValueSourceSelectorBySink<ArrayResiz
    }
 };

+}

 void resizeConstantSize(IArraySource & array_source, IValueSource & value_source, IArraySink & sink, ssize_t size)
 {
--- a/src/Functions/GatherUtils/resizeDynamicSize.cpp
+++ b/src/Functions/GatherUtils/resizeDynamicSize.cpp
@ -7,6 +7,9 @@
 namespace DB::GatherUtils
 {

+namespace
+{
+
 struct ArrayResizeDynamic : public ArrayAndValueSourceSelectorBySink<ArrayResizeDynamic>
 {
    template <typename ArraySource, typename ValueSource, typename Sink>
@ -17,6 +20,7 @@ struct ArrayResizeDynamic : public ArrayAndValueSourceSelectorBySink<ArrayResize
    }
 };

+}

 void resizeDynamicSize(IArraySource & array_source, IValueSource & value_source, IArraySink & sink, const IColumn & size_column)
 {
--- a/src/Functions/GatherUtils/sliceDynamicOffsetBounded.cpp
+++ b/src/Functions/GatherUtils/sliceDynamicOffsetBounded.cpp
@ -6,6 +6,10 @@

 namespace DB::GatherUtils
 {
+
+namespace
+{
+
 struct SliceDynamicOffsetBoundedSelectArraySource : public ArraySourceSelector<SliceDynamicOffsetBoundedSelectArraySource>
 {
    template <typename Source>
@ -19,6 +23,8 @@ struct SliceDynamicOffsetBoundedSelectArraySource : public ArraySourceSelector<S
    }
 };

+}
+
 ColumnArray::MutablePtr sliceDynamicOffsetBounded(IArraySource & src, const IColumn & offset_column, const IColumn & length_column)
 {
    ColumnArray::MutablePtr res;
--- a/src/Functions/GatherUtils/sliceDynamicOffsetUnbounded.cpp
+++ b/src/Functions/GatherUtils/sliceDynamicOffsetUnbounded.cpp
@ -6,7 +6,12 @@

 namespace DB::GatherUtils
 {
-struct SliceDynamicOffsetUnboundedSelectArraySource : public ArraySourceSelector<SliceDynamicOffsetUnboundedSelectArraySource>
+
+namespace
+{
+
+struct SliceDynamicOffsetUnboundedSelectArraySource
+        : public ArraySourceSelector<SliceDynamicOffsetUnboundedSelectArraySource>
 {
    template <typename Source>
    static void selectImpl(Source && source, const IColumn & offset_column, ColumnArray::MutablePtr & result)
@ -19,6 +24,7 @@ struct SliceDynamicOffsetUnboundedSelectArraySource : public ArraySourceSelector
    }
 };

+}

 ColumnArray::MutablePtr sliceDynamicOffsetUnbounded(IArraySource & src, const IColumn & offset_column)
 {
--- a/src/Functions/GatherUtils/sliceFromLeftConstantOffsetBounded.cpp
+++ b/src/Functions/GatherUtils/sliceFromLeftConstantOffsetBounded.cpp
@ -6,6 +6,10 @@

 namespace DB::GatherUtils
 {
+
+namespace
+{
+
 struct SliceFromLeftConstantOffsetBoundedSelectArraySource
    : public ArraySourceSelector<SliceFromLeftConstantOffsetBoundedSelectArraySource>
 {
@ -20,6 +24,8 @@ struct SliceFromLeftConstantOffsetBoundedSelectArraySource
    }
 };

+}
+
 ColumnArray::MutablePtr sliceFromLeftConstantOffsetBounded(IArraySource & src, size_t offset, ssize_t length)
 {
    ColumnArray::MutablePtr res;
--- a/src/Functions/GatherUtils/sliceFromLeftConstantOffsetUnbounded.cpp
+++ b/src/Functions/GatherUtils/sliceFromLeftConstantOffsetUnbounded.cpp
@ -6,6 +6,10 @@

 namespace DB::GatherUtils
 {
+
+namespace
+{
+
 struct SliceFromLeftConstantOffsetUnboundedSelectArraySource
    : public ArraySourceSelector<SliceFromLeftConstantOffsetUnboundedSelectArraySource>
 {
@ -20,6 +24,8 @@ struct SliceFromLeftConstantOffsetUnboundedSelectArraySource
    }
 };

+}
+
 ColumnArray::MutablePtr sliceFromLeftConstantOffsetUnbounded(IArraySource & src, size_t offset)
 {
    ColumnArray::MutablePtr res;
--- a/src/Functions/GatherUtils/sliceFromRightConstantOffsetBounded.cpp
+++ b/src/Functions/GatherUtils/sliceFromRightConstantOffsetBounded.cpp
@ -6,6 +6,10 @@

 namespace DB::GatherUtils
 {
+
+namespace
+{
+
 struct SliceFromRightConstantOffsetBoundedSelectArraySource
    : public ArraySourceSelector<SliceFromRightConstantOffsetBoundedSelectArraySource>
 {
@ -20,6 +24,8 @@ struct SliceFromRightConstantOffsetBoundedSelectArraySource
    }
 };

+}
+
 ColumnArray::MutablePtr sliceFromRightConstantOffsetBounded(IArraySource & src, size_t offset, ssize_t length)
 {
    ColumnArray::MutablePtr res;
--- a/src/Functions/GatherUtils/sliceFromRightConstantOffsetUnbounded.cpp
+++ b/src/Functions/GatherUtils/sliceFromRightConstantOffsetUnbounded.cpp
@ -6,6 +6,10 @@

 namespace DB::GatherUtils
 {
+
+namespace
+{
+
 struct SliceFromRightConstantOffsetUnboundedSelectArraySource
    : public ArraySourceSelector<SliceFromRightConstantOffsetUnboundedSelectArraySource>
 {
@ -20,6 +24,8 @@ struct SliceFromRightConstantOffsetUnboundedSelectArraySource
    }
 };

+}
+
 ColumnArray::MutablePtr sliceFromRightConstantOffsetUnbounded(IArraySource & src, size_t offset)
 {
    ColumnArray::MutablePtr res;
--- a/src/Functions/extractAllGroups.h
+++ b/src/Functions/extractAllGroups.h
@ -129,7 +129,9 @@ public:
                    for (size_t group = 1; group <= groups_count; ++group)
                        data_col->insertData(matched_groups[group].data(), matched_groups[group].size());

-                    pos = matched_groups[0].data() + matched_groups[0].size();
+                    /// If match is empty - it's technically Ok but we have to shift one character nevertheless
+                    /// to avoid infinite loop.
+                    pos = matched_groups[0].data() + std::max<size_t>(1, matched_groups[0].size());

                    current_nested_offset += groups_count;
                    nested_offsets_data.push_back(current_nested_offset);
@ -167,7 +169,7 @@ public:
                    for (size_t group = 1; group <= groups_count; ++group)
                        all_matches.push_back(matched_groups[group]);

-                    pos = matched_groups[0].data() + matched_groups[0].size();
+                    pos = matched_groups[0].data() + std::max<size_t>(1, matched_groups[0].size());

                    ++matches_per_row;
                }
--- a/src/Functions/finalizeAggregation.cpp
+++ b/src/Functions/finalizeAggregation.cpp
@ -34,11 +34,6 @@ public:
        return name;
    }

-    bool isStateful() const override
-    {
-        return true;
-    }
-
    size_t getNumberOfArguments() const override
    {
        return 1;
--- a/src/IO/MySQLBinlogEventReadBuffer.cpp
+++ b/src/IO/MySQLBinlogEventReadBuffer.cpp
@ -0,0 +1,82 @@
+#include <IO/MySQLBinlogEventReadBuffer.h>
+
+
+namespace DB
+{
+
+/// Starts with an empty working buffer over `in_` and immediately tries to fetch the first chunk.
+MySQLBinlogEventReadBuffer::MySQLBinlogEventReadBuffer(ReadBuffer & in_)
+    : ReadBuffer(nullptr, 0, 0), in(in_)
+{
+    nextIfAtEnd();
+}
+
+/// Exposes the next chunk of `in` while always withholding the last CHECKSUM_CRC32_SIGNATURE_LENGTH bytes
+/// of the stream (presumably the CRC32 checksum of the event), so they are never handed to the reader.
+/// Returns false when `in` is exhausted, either on entry or right after staging bytes in checksum_buf.
+bool MySQLBinlogEventReadBuffer::nextImpl()
+{
+    if (hasPendingData())
+        return true;
+
+    if (in.eof())
+        return false;
+
+    /// Every byte staged in checksum_buf so far has been exposed (or nothing has been staged yet).
+    if (checksum_buff_size == checksum_buff_limit)
+    {
+        /// Fast path: point directly into the nested buffer, leaving its last bytes unconsumed.
+        if (likely(in.available() > CHECKSUM_CRC32_SIGNATURE_LENGTH))
+        {
+            working_buffer = ReadBuffer::Buffer(in.position(), in.buffer().end() - CHECKSUM_CRC32_SIGNATURE_LENGTH);
+            in.ignore(working_buffer.size());
+            return true;
+        }
+
+        /// Not enough bytes available at once: stage the next CHECKSUM_CRC32_SIGNATURE_LENGTH bytes locally.
+        in.readStrict(checksum_buf, CHECKSUM_CRC32_SIGNATURE_LENGTH);
+        checksum_buff_size = checksum_buff_limit = CHECKSUM_CRC32_SIGNATURE_LENGTH;
+    }
+    else
+    {
+        /// Move the staged bytes that were not exposed yet to the front, then refill the rest from `in`.
+        for (size_t index = 0; index < checksum_buff_size - checksum_buff_limit; ++index)
+            checksum_buf[index] = checksum_buf[checksum_buff_limit + index];
+
+        checksum_buff_size -= checksum_buff_limit;
+        size_t read_bytes = CHECKSUM_CRC32_SIGNATURE_LENGTH - checksum_buff_size;
+        in.readStrict(checksum_buf + checksum_buff_size, read_bytes);   /// Minimum CHECKSUM_CRC32_SIGNATURE_LENGTH bytes
+        checksum_buff_size = checksum_buff_limit = CHECKSUM_CRC32_SIGNATURE_LENGTH;
+    }
+
+    /// Nothing follows the staged bytes, so they are the withheld tail and must not be exposed.
+    if (in.eof())
+        return false;
+
+    /// Expose only as many staged bytes as `in` currently has available: the staged remainder plus
+    /// those available bytes still add up to CHECKSUM_CRC32_SIGNATURE_LENGTH bytes following the exposed ones.
+    if (in.available() < CHECKSUM_CRC32_SIGNATURE_LENGTH)
+    {
+        size_t left_move_size = CHECKSUM_CRC32_SIGNATURE_LENGTH - in.available();
+        checksum_buff_limit = checksum_buff_size - left_move_size;
+    }
+
+    working_buffer = ReadBuffer::Buffer(checksum_buf, checksum_buf + checksum_buff_limit);
+    return true;
+}
+
+/// Makes one more read attempt on destruction; any exception is logged rather than propagated.
+MySQLBinlogEventReadBuffer::~MySQLBinlogEventReadBuffer()
+{
+    try
+    {
+        /// ignore last 4 bytes
+        nextIfAtEnd();
+    }
+    catch (...)
+    {
+        tryLogCurrentException(__PRETTY_FUNCTION__);
+    }
+}
+
+}
--- a/src/IO/MySQLBinlogEventReadBuffer.h
+++ b/src/IO/MySQLBinlogEventReadBuffer.h
@ -0,0 +1,34 @@
+#pragma once
+
+#include <IO/ReadBuffer.h>
+
+namespace DB
+{
+
+/// ReadBuffer wrapper over another ReadBuffer that withholds the last
+/// CHECKSUM_CRC32_SIGNATURE_LENGTH bytes of the underlying stream from the reader.
+class MySQLBinlogEventReadBuffer : public ReadBuffer
+{
+protected:
+    static const size_t CHECKSUM_CRC32_SIGNATURE_LENGTH = 4;
+    ReadBuffer & in;    /// Underlying stream, held by reference.
+
+    /// Staging area for bytes copied out of `in`: checksum_buff_size bytes are staged,
+    /// the first checksum_buff_limit of them are exposed through the working buffer.
+    size_t checksum_buff_size = 0;
+    size_t checksum_buff_limit = 0;
+    char checksum_buf[CHECKSUM_CRC32_SIGNATURE_LENGTH];
+
+    bool nextImpl() override;
+
+public:
+    /// Makes one more read attempt; exceptions are logged instead of propagated.
+    ~MySQLBinlogEventReadBuffer() override;
+
+    /// NOTE(review): not `explicit`, so a ReadBuffer & converts implicitly - confirm this is intended.
+    MySQLBinlogEventReadBuffer(ReadBuffer & in_);
+
+};
+
+
+}
--- a/src/IO/tests/gtest_mysql_binlog_event_read_buffer.cpp
+++ b/src/IO/tests/gtest_mysql_binlog_event_read_buffer.cpp
@ -0,0 +1,86 @@
+#include <gtest/gtest.h>
+#include <Common/Exception.h>
+#include <IO/ConcatReadBuffer.h>
+#include <IO/ReadBufferFromMemory.h>
+#include <IO/MySQLBinlogEventReadBuffer.h>
+
+using namespace DB;
+
+/// Inputs of 1..3 bytes are shorter than the 4-byte withheld tail, so construction is expected to throw.
+TEST(MySQLBinlogEventReadBuffer, CheckBoundary)
+{
+    for (size_t index = 1; index < 4; ++index)
+    {
+        std::vector<char> memory_data(index, 0x01);
+        ReadBufferFromMemory nested_in(memory_data.data(), index);
+
+        EXPECT_THROW({ MySQLBinlogEventReadBuffer binlog_in(nested_in); }, Exception);
+    }
+}
+
+/// One contiguous 6-byte input: 2 bytes are readable, the last 4 are withheld, then EOF is reported.
+TEST(MySQLBinlogEventReadBuffer, NiceBufferSize)
+{
+    char res[2];
+    std::vector<char> memory_data(6, 0x01);
+    ReadBufferFromMemory nested_in(memory_data.data(), 6);
+
+    MySQLBinlogEventReadBuffer binlog_in(nested_in);
+    binlog_in.readStrict(res, 2);
+    ASSERT_EQ(res[0], 0x01);
+    ASSERT_EQ(res[1], 0x01);
+    ASSERT_TRUE(binlog_in.eof());
+}
+
+/// Input split into chunks of 2, 1, 2 and 3 bytes (8 in total, each shorter than the 4-byte tail): 4 bytes are readable, then EOF.
+TEST(MySQLBinlogEventReadBuffer, BadBufferSizes)
+{
+    char res[4];
+    std::vector<ReadBufferPtr> buffers;
+    std::vector<ReadBuffer *> nested_buffers;
+    std::vector<std::shared_ptr<std::vector<char>>> memory_buffers_data;
+    std::vector<size_t> bad_buffers_size = {2, 1, 2, 3};
+
+    for (const auto & bad_buffer_size : bad_buffers_size)
+    {
+        memory_buffers_data.emplace_back(std::make_shared<std::vector<char>>(bad_buffer_size, 0x01));
+        buffers.emplace_back(std::make_shared<ReadBufferFromMemory>(memory_buffers_data.back()->data(), bad_buffer_size));
+        nested_buffers.emplace_back(buffers.back().get());
+    }
+
+    ConcatReadBuffer concat_buffer(nested_buffers);
+    MySQLBinlogEventReadBuffer binlog_in(concat_buffer);
+    binlog_in.readStrict(res, 4);
+
+    for (const auto & res_byte : res)
+        ASSERT_EQ(res_byte, 0x01);
+
+    ASSERT_TRUE(binlog_in.eof());
+}
+
+/// Mix of chunks of 6, 1, 3 and 6 bytes (16 in total): 12 bytes are readable, the last 4 are withheld, then EOF.
+TEST(MySQLBinlogEventReadBuffer, NiceAndBadBufferSizes)
+{
+    char res[12];
+    std::vector<ReadBufferPtr> buffers;
+    std::vector<ReadBuffer *> nested_buffers;
+    std::vector<std::shared_ptr<std::vector<char>>> memory_buffers_data;
+    std::vector<size_t> buffers_size = {6, 1, 3, 6};
+
+    for (const auto & bad_buffer_size : buffers_size)
+    {
+        memory_buffers_data.emplace_back(std::make_shared<std::vector<char>>(bad_buffer_size, 0x01));
+        buffers.emplace_back(std::make_shared<ReadBufferFromMemory>(memory_buffers_data.back()->data(), bad_buffer_size));
+        nested_buffers.emplace_back(buffers.back().get());
+    }
+
+    ConcatReadBuffer concat_buffer(nested_buffers);
+    MySQLBinlogEventReadBuffer binlog_in(concat_buffer);
+    binlog_in.readStrict(res, 12);
+
+    for (const auto & res_byte : res)
+        ASSERT_EQ(res_byte, 0x01);
+
+    ASSERT_TRUE(binlog_in.eof());
+}
+
--- a/src/IO/ya.make
+++ b/src/IO/ya.make
@ -28,6 +28,7 @@ SRCS(
    MemoryReadWriteBuffer.cpp
    MMapReadBufferFromFile.cpp
    MMapReadBufferFromFileDescriptor.cpp
+    MySQLBinlogEventReadBuffer.cpp
    MySQLPacketPayloadReadBuffer.cpp
    MySQLPacketPayloadWriteBuffer.cpp
    NullWriteBuffer.cpp
--- a/src/Interpreters/AsynchronousMetrics.cpp
+++ b/src/Interpreters/AsynchronousMetrics.cpp
@ -332,7 +332,7 @@ void AsynchronousMetrics::update()
        ReadBufferFromFile buf("/proc/cpuinfo", 32768 /* buf_size */);

        // We need the following lines:
-        // core id : 4
+        // processor : 4
        // cpu MHz : 4052.941
        // They contain tabs and are interspersed with other info.
        int core_id = 0;
@ -346,7 +346,7 @@ void AsynchronousMetrics::update()
            // It doesn't read the EOL itself.
            ++buf.position();

-            if (s.rfind("core id", 0) == 0)
+            if (s.rfind("processor", 0) == 0)
            {
                if (auto colon = s.find_first_of(':'))
                {
--- a/src/Interpreters/Cluster.cpp
+++ b/src/Interpreters/Cluster.cpp
@ -11,6 +11,7 @@
 #include <Poco/Util/AbstractConfiguration.h>
 #include <Poco/Util/Application.h>
 #include <ext/range.h>
+#include <boost/range/algorithm_ext/erase.hpp>

 namespace DB
 {
@ -73,8 +74,16 @@ bool Cluster::Address::isLocal(UInt16 clickhouse_port) const


 Cluster::Address::Address(
-    const Poco::Util::AbstractConfiguration & config, const String & config_prefix, UInt32 shard_index_, UInt32 replica_index_)
-    : shard_index(shard_index_), replica_index(replica_index_)
+        const Poco::Util::AbstractConfiguration & config,
+        const String & config_prefix,
+        const String & cluster_,
+        const String & cluster_secret_,
+        UInt32 shard_index_,
+        UInt32 replica_index_)
+    : cluster(cluster_)
+    , cluster_secret(cluster_secret_)
+    , shard_index(shard_index_)
+    , replica_index(replica_index_)
 {
    host_name = config.getString(config_prefix + ".host");
    port = static_cast<UInt16>(config.getInt(config_prefix + ".port"));
@ -92,8 +101,15 @@ Cluster::Address::Address(
 }


-Cluster::Address::Address(const String & host_port_, const String & user_, const String & password_, UInt16 clickhouse_port, bool secure_, Int64 priority_)
-    : user(user_), password(password_)
+Cluster::Address::Address(
+        const String & host_port_,
+        const String & user_,
+        const String & password_,
+        UInt16 clickhouse_port,
+        bool secure_,
+        Int64 priority_)
+    : user(user_)
+    , password(password_)
 {
    auto parsed_host_port = parseAddress(host_port_, clickhouse_port);
    host_name = parsed_host_port.first;
@ -219,9 +235,9 @@ Cluster::Address Cluster::Address::fromFullString(const String & full_string)

 /// Implementation of Clusters class

-Clusters::Clusters(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & config_name)
+Clusters::Clusters(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & config_prefix)
 {
-    updateClusters(config, settings, config_name);
+    updateClusters(config, settings, config_prefix);
 }


@ -241,10 +257,10 @@ void Clusters::setCluster(const String & cluster_name, const std::shared_ptr<Clu
 }


-void Clusters::updateClusters(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & config_name)
+void Clusters::updateClusters(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & config_prefix)
 {
    Poco::Util::AbstractConfiguration::Keys config_keys;
-    config.keys(config_name, config_keys);
+    config.keys(config_prefix, config_keys);

    std::lock_guard lock(mutex);

@ -254,7 +270,7 @@ void Clusters::updateClusters(const Poco::Util::AbstractConfiguration & config,
        if (key.find('.') != String::npos)
            throw Exception("Cluster names with dots are not supported: '" + key + "'", ErrorCodes::SYNTAX_ERROR);

-        impl.emplace(key, std::make_shared<Cluster>(config, settings, config_name + "." + key));
+        impl.emplace(key, std::make_shared<Cluster>(config, settings, config_prefix, key));
    }
 }

@ -268,18 +284,25 @@ Clusters::Impl Clusters::getContainer() const

 /// Implementation of `Cluster` class

-Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & cluster_name)
+Cluster::Cluster(const Poco::Util::AbstractConfiguration & config,
+                 const Settings & settings,
+                 const String & config_prefix_,
+                 const String & cluster_name)
 {
+    auto config_prefix = config_prefix_ + "." + cluster_name;
+
    Poco::Util::AbstractConfiguration::Keys config_keys;
-    config.keys(cluster_name, config_keys);
+    config.keys(config_prefix, config_keys);
+
+    config_prefix += ".";
+
+    secret = config.getString(config_prefix + "secret", "");
+    boost::range::remove_erase(config_keys, "secret");

    if (config_keys.empty())
-        throw Exception("No cluster elements (shard, node) specified in config at path " + cluster_name, ErrorCodes::SHARD_HAS_NO_CONNECTIONS);
-
-    const auto & config_prefix = cluster_name + ".";
+        throw Exception("No cluster elements (shard, node) specified in config at path " + config_prefix, ErrorCodes::SHARD_HAS_NO_CONNECTIONS);

    UInt32 current_shard_num = 1;
-
    for (const auto & key : config_keys)
    {
        if (startsWith(key, "node"))
@ -291,7 +314,7 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, const Setting
            const auto & prefix = config_prefix + key;
            const auto weight = config.getInt(prefix + ".weight", default_weight);

-            addresses.emplace_back(config, prefix, current_shard_num, 1);
+            addresses.emplace_back(config, prefix, cluster_name, secret, current_shard_num, 1);
            const auto & address = addresses.back();

            ShardInfo info;
@ -305,6 +328,7 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, const Setting
                settings.distributed_connections_pool_size,
                address.host_name, address.port,
                address.default_database, address.user, address.password,
+                address.cluster, address.cluster_secret,
                "server", address.compression,
                address.secure, address.priority);

@ -345,7 +369,12 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, const Setting

                if (startsWith(replica_key, "replica"))
                {
-                    replica_addresses.emplace_back(config, partial_prefix + replica_key, current_shard_num, current_replica_num);
+                    replica_addresses.emplace_back(config,
+                        partial_prefix + replica_key,
+                        cluster_name,
+                        secret,
+                        current_shard_num,
+                        current_replica_num);
                    ++current_replica_num;

                    if (internal_replication)
@ -379,6 +408,7 @@ Cluster::Cluster(const Poco::Util::AbstractConfiguration & config, const Setting
                    settings.distributed_connections_pool_size,
                    replica.host_name, replica.port,
                    replica.default_database, replica.user, replica.password,
+                    replica.cluster, replica.cluster_secret,
                    "server", replica.compression,
                    replica.secure, replica.priority);

@ -442,6 +472,7 @@ Cluster::Cluster(const Settings & settings, const std::vector<std::vector<String
                        settings.distributed_connections_pool_size,
                        replica.host_name, replica.port,
                        replica.default_database, replica.user, replica.password,
+                        replica.cluster, replica.cluster_secret,
                        "server", replica.compression, replica.secure, replica.priority);
            all_replicas.emplace_back(replica_pool);
            if (replica.is_local && !treat_local_as_remote)
@ -546,6 +577,8 @@ Cluster::Cluster(Cluster::ReplicasAsShardsTag, const Cluster & from, const Setti
                address.default_database,
                address.user,
                address.password,
+                address.cluster,
+                address.cluster_secret,
                "server",
                address.compression,
                address.secure,
--- a/src/Interpreters/Cluster.h
+++ b/src/Interpreters/Cluster.h
@ -20,12 +20,17 @@ namespace ErrorCodes
 class Cluster
 {
 public:
-    Cluster(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & cluster_name);
+    Cluster(const Poco::Util::AbstractConfiguration & config,
+            const Settings & settings,
+            const String & config_prefix_,
+            const String & cluster_name);

    /// Construct a cluster by the names of shards and replicas.
    /// Local are treated as well as remote ones if treat_local_as_remote is true.
    /// 'clickhouse_port' - port that this server instance listen for queries.
    /// This parameter is needed only to check that some address is local (points to ourself).
+    ///
+    /// Used for remote() function.
    Cluster(const Settings & settings, const std::vector<std::vector<String>> & names,
            const String & username, const String & password,
            UInt16 clickhouse_port, bool treat_local_as_remote,
@ -62,6 +67,11 @@ public:
        UInt16 port;
        String user;
        String password;
+
+        /// For inter-server authorization
+        String cluster;
+        String cluster_secret;
+
        UInt32 shard_index{}; /// shard serial number in configuration file, starting from 1.
        UInt32 replica_index{}; /// replica serial number in this shard, starting from 1; zero means no replicas.

@ -80,6 +90,8 @@ public:
        Address(
            const Poco::Util::AbstractConfiguration & config,
            const String & config_prefix,
+            const String & cluster_,
+            const String & cluster_secret_,
            UInt32 shard_index_ = 0,
            UInt32 replica_index_ = 0);
        Address(
@ -170,6 +182,8 @@ public:
    /// The number of all shards.
    size_t getShardCount() const { return shards_info.size(); }

+    const String & getSecret() const { return secret; }
+
    /// Get a subcluster consisting of one shard - index by count (from 0) of the shard of this cluster.
    std::unique_ptr<Cluster> getClusterWithSingleShard(size_t index) const;

@ -197,6 +211,9 @@ private:
    struct ReplicasAsShardsTag {};
    Cluster(ReplicasAsShardsTag, const Cluster & from, const Settings & settings);

+    /// Inter-server secret
+    String secret;
+
    String hash_of_addresses;
    /// Description of the cluster shards.
    ShardsInfo shards_info;
@ -219,7 +236,7 @@ using ClusterPtr = std::shared_ptr<Cluster>;
 class Clusters
 {
 public:
-    Clusters(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & config_name = "remote_servers");
+    Clusters(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & config_prefix = "remote_servers");

    Clusters(const Clusters &) = delete;
    Clusters & operator=(const Clusters &) = delete;
@ -227,7 +244,7 @@ public:
    ClusterPtr getCluster(const std::string & cluster_name) const;
    void setCluster(const String & cluster_name, const ClusterPtr & cluster);

-    void updateClusters(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & config_name);
+    void updateClusters(const Poco::Util::AbstractConfiguration & config, const Settings & settings, const String & config_prefix);

 public:
    using Impl = std::map<String, ClusterPtr>;
@ -239,6 +256,4 @@ protected:
    mutable std::mutex mutex;
 };

-using ClustersPtr = std::shared_ptr<Clusters>;
-
 }
--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@ -677,7 +677,7 @@ ConfigurationPtr Context::getUsersConfig()
 }


-void Context::setUser(const String & name, const String & password, const Poco::Net::SocketAddress & address)
+void Context::setUserImpl(const String & name, const std::optional<String> & password, const Poco::Net::SocketAddress & address)
 {
    auto lock = getLock();

@ -686,7 +686,7 @@ void Context::setUser(const String & name, const String & password, const Poco::

 #if defined(ARCADIA_BUILD)
    /// This is harmful field that is used only in foreign "Arcadia" build.
-    client_info.current_password = password;
+    client_info.current_password = password.value_or("");
 #endif

    auto new_user_id = getAccessControlManager().find<User>(name);
@ -694,7 +694,9 @@ void Context::setUser(const String & name, const String & password, const Poco::
    if (new_user_id)
    {
        new_access = getAccessControlManager().getContextAccess(*new_user_id, {}, true, settings, current_database, client_info);
-        if (!new_access->isClientHostAllowed() || !new_access->isCorrectPassword(password))
+        /// Access w/o password is done under interserver-secret (remote_servers.secret)
+        /// So it is okay not to check client's host (since there is trust).
+        if (password && (!new_access->isClientHostAllowed() || !new_access->isCorrectPassword(*password)))
        {
            new_user_id = {};
            new_access = nullptr;
@ -712,6 +714,16 @@ void Context::setUser(const String & name, const String & password, const Poco::
    setSettings(*access->getDefaultSettings());
 }

+void Context::setUser(const String & name, const String & password, const Poco::Net::SocketAddress & address)
+{ /// Password-checking entry point: forwards to setUserImpl() with the password present.
+    setUserImpl(name, std::make_optional(password), address); /// An engaged optional makes setUserImpl() verify both the client host and the password.
+}
+
+void Context::setUserWithoutCheckingPassword(const String & name, const Poco::Net::SocketAddress & address)
+{ /// Trusted entry point: forwards to setUserImpl() without a password.
+    setUserImpl(name, {} /* no password */, address); /// With an empty optional setUserImpl() skips the password check and the client-host check.
+}
+
 std::shared_ptr<const User> Context::getUser() const
 {
    return getAccess()->getUser();
@ -1498,6 +1510,15 @@ void Context::resetZooKeeper() const
    shared->zookeeper.reset();
 }

+void Context::reloadZooKeeperIfChanged(const ConfigurationPtr & config) const
+{ /// (Re)creates the shared ZooKeeper object from the "zookeeper" section of `config` when needed.
+    std::lock_guard lock(shared->zookeeper_mutex); /// Held for the whole check-and-replace below.
+    if (!shared->zookeeper || shared->zookeeper->configChanged(*config, "zookeeper")) /// No object yet, or configChanged() returned true.
+    {
+        shared->zookeeper = std::make_shared<zkutil::ZooKeeper>(*config, "zookeeper"); /// The previous object (if any) is replaced, not modified in place.
+    }
+}
+
 bool Context::hasZooKeeper() const
 {
    return getConfigRef().has("zookeeper");
--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@ -258,6 +258,11 @@ public:
    /// Sets the current user, checks the password and that the specified host is allowed.
    /// Must be called before getClientInfo.
    void setUser(const String & name, const String & password, const Poco::Net::SocketAddress & address);
+    /// Sets the current user *without checking the password*; the client host is not checked either (see setUserImpl).
+    /// Must be called before getClientInfo.
+    ///
+    /// (Used only internally in cluster, if the secret matches)
+    void setUserWithoutCheckingPassword(const String & name, const Poco::Net::SocketAddress & address);
    void setQuotaKey(String quota_key_);

    UserPtr getUser() const;
@ -476,6 +481,8 @@ public:
    bool hasZooKeeper() const;
    /// Reset current zookeeper session. Do not create a new one.
    void resetZooKeeper() const;
+    /// Create the ZooKeeper object anew if there is none yet or if its configuration has changed.
+    void reloadZooKeeperIfChanged(const ConfigurationPtr & config) const;

    /// Create a cache of uncompressed blocks of specified size. This can be done only once.
    void setUncompressedCache(size_t max_size_in_bytes);
@ -638,6 +645,9 @@ private:
    StoragePolicySelectorPtr getStoragePolicySelector(std::lock_guard<std::mutex> & lock) const;

    DiskSelectorPtr getDiskSelector(std::lock_guard<std::mutex> & /* lock */) const;
+
+    /// If the password is not set, neither the password nor the client host is checked.
+    void setUserImpl(const String & name, const std::optional<String> & password, const Poco::Net::SocketAddress & address);
 };


--- a/src/Interpreters/InterpreterSelectQuery.cpp
+++ b/src/Interpreters/InterpreterSelectQuery.cpp
@ -1441,16 +1441,22 @@ void InterpreterSelectQuery::executeFetchColumns(
        }

        StreamLocalLimits limits;
+        SizeLimits leaf_limits;
        std::shared_ptr<const EnabledQuota> quota;

+
        /// Set the limits and quota for reading data, the speed and time of the query.
        if (!options.ignore_limits)
+        {
            limits = getLimitsForStorage(settings, options);
+            leaf_limits = SizeLimits(settings.max_rows_to_read_leaf, settings.max_bytes_to_read_leaf,
+                                          settings.read_overflow_mode_leaf);
+        }

        if (!options.ignore_quota && (options.to_stage == QueryProcessingStage::Complete))
            quota = context->getQuota();

-        storage->read(query_plan, table_lock, metadata_snapshot, limits, std::move(quota),
+        storage->read(query_plan, table_lock, metadata_snapshot, limits, leaf_limits, std::move(quota),
                      required_columns, query_info, context, processing_stage, max_block_size, max_streams);
    }
    else
--- a/src/Parsers/formatSettingName.cpp
+++ b/src/Parsers/formatSettingName.cpp
@ -2,6 +2,7 @@
 #include <Common/StringUtils/StringUtils.h>
 #include <Common/quoteString.h>
 #include <common/find_symbols.h>
+#include <ostream>


 namespace DB
--- a/src/Parsers/formatSettingName.h
+++ b/src/Parsers/formatSettingName.h
@ -1,5 +1,6 @@
 #pragma once

+#include <iosfwd>
 #include <common/types.h>


--- a/src/Processors/Pipe.cpp
+++ b/src/Processors/Pipe.cpp
@ -788,6 +788,15 @@ void Pipe::setLimits(const StreamLocalLimits & limits)
    }
 }

+void Pipe::setLeafLimits(const SizeLimits & leaf_limits)
+{ /// Passes `leaf_limits` to every processor of this pipe that is an ISourceWithProgress.
+    for (auto & processor : processors)
+    {
+        if (auto * source_with_progress = dynamic_cast<ISourceWithProgress *>(processor.get())) /// Processors of any other type are left untouched.
+            source_with_progress->setLeafLimits(leaf_limits);
+    }
+}
+
 void Pipe::setQuota(const std::shared_ptr<const EnabledQuota> & quota)
 {
    for (auto & processor : processors)
--- a/src/Processors/Pipe.h
+++ b/src/Processors/Pipe.h
@ -97,6 +97,7 @@ public:

    /// Specify quotas and limits for every ISourceWithProgress.
    void setLimits(const StreamLocalLimits & limits);
+    void setLeafLimits(const SizeLimits & leaf_limits);
    void setQuota(const std::shared_ptr<const EnabledQuota> & quota);

    /// Do not allow to change the table while the processors of pipe are alive.
--- a/src/Processors/QueryPlan/ReadFromStorageStep.cpp
+++ b/src/Processors/QueryPlan/ReadFromStorageStep.cpp
@ -15,6 +15,7 @@ ReadFromStorageStep::ReadFromStorageStep(
    TableLockHolder table_lock_,
    StorageMetadataPtr metadata_snapshot_,
    StreamLocalLimits & limits_,
+    SizeLimits & leaf_limits_,
    std::shared_ptr<const EnabledQuota> quota_,
    StoragePtr storage_,
    const Names & required_columns_,
@ -26,6 +27,7 @@ ReadFromStorageStep::ReadFromStorageStep(
    : table_lock(std::move(table_lock_))
    , metadata_snapshot(std::move(metadata_snapshot_))
    , limits(limits_)
+    , leaf_limits(leaf_limits_)
    , quota(std::move(quota_))
    , storage(std::move(storage_))
    , required_columns(required_columns_)
@ -86,6 +88,16 @@ ReadFromStorageStep::ReadFromStorageStep(

    pipe.setLimits(limits);

+    /**
+      * Leaf size limits should be applied only for local processing of distributed queries.
+      * Such limits allow to control the read stage on leaf nodes and exclude the merging stage.
+      * Consider the case when distributed query needs to read from multiple shards. Then leaf
+      * limits will be applied on the shards only (including the root node) but will be ignored
+      * on the results merging stage.
+      */
+    if (!storage->isRemote())
+        pipe.setLeafLimits(leaf_limits);
+
    if (quota)
        pipe.setQuota(quota);

--- a/src/Processors/QueryPlan/ReadFromStorageStep.h
+++ b/src/Processors/QueryPlan/ReadFromStorageStep.h
@ -26,6 +26,7 @@ public:
        TableLockHolder table_lock,
        StorageMetadataPtr metadata_snapshot,
        StreamLocalLimits & limits,
+        SizeLimits & leaf_limits,
        std::shared_ptr<const EnabledQuota> quota,
        StoragePtr storage,
        const Names & required_columns,
@ -47,6 +48,7 @@ private:
    TableLockHolder table_lock;
    StorageMetadataPtr metadata_snapshot;
    StreamLocalLimits limits;
+    SizeLimits leaf_limits;
    std::shared_ptr<const EnabledQuota> quota;

    StoragePtr storage;
--- a/src/Processors/Sources/SourceFromInputStream.h
+++ b/src/Processors/Sources/SourceFromInputStream.h
@ -33,6 +33,7 @@ public:

    /// Implementation for methods from ISourceWithProgress.
    void setLimits(const StreamLocalLimits & limits_) final { stream->setLimits(limits_); }
+    void setLeafLimits(const SizeLimits &) final { }
    void setQuota(const std::shared_ptr<const EnabledQuota> & quota_) final { stream->setQuota(quota_); }
    void setProcessListElement(QueryStatus * elem) final { stream->setProcessListElement(elem); }
    void setProgressCallback(const ProgressCallback & callback) final { stream->setProgressCallback(callback); }
--- a/src/Processors/Sources/SourceWithProgress.cpp
+++ b/src/Processors/Sources/SourceWithProgress.cpp
@ -93,6 +93,12 @@ void SourceWithProgress::progress(const Progress & value)
            }
        }

+        if (!leaf_limits.check(rows_to_check_limit, progress.read_bytes, "rows or bytes to read on leaf node",
+                                          ErrorCodes::TOO_MANY_ROWS, ErrorCodes::TOO_MANY_BYTES))
+        {
+            cancel();
+        }
+
        size_t total_rows = progress.total_rows_to_read;

        constexpr UInt64 profile_events_update_period_microseconds = 10 * 1000; // 10 milliseconds
--- a/src/Processors/Sources/SourceWithProgress.h
+++ b/src/Processors/Sources/SourceWithProgress.h
@ -17,6 +17,9 @@ public:
    /// Set limitations that checked on each chunk.
    virtual void setLimits(const StreamLocalLimits & limits_) = 0;

+    /// Set limitations that are checked on each chunk for distributed queries on leaf nodes.
+    virtual void setLeafLimits(const SizeLimits & leaf_limits_) = 0;
+
    /// Set the quota. If you set a quota on the amount of raw data,
    /// then you should also set mode = LIMITS_TOTAL to LocalLimits with setLimits.
    virtual void setQuota(const std::shared_ptr<const EnabledQuota> & quota_) = 0;
@ -46,6 +49,7 @@ public:
    SourceWithProgress(Block header, bool enable_auto_progress);

    void setLimits(const StreamLocalLimits & limits_) final { limits = limits_; }
+    void setLeafLimits(const SizeLimits & leaf_limits_) final {leaf_limits = leaf_limits_; }
    void setQuota(const std::shared_ptr<const EnabledQuota> & quota_) final { quota = quota_; }
    void setProcessListElement(QueryStatus * elem) final { process_list_elem = elem; }
    void setProgressCallback(const ProgressCallback & callback) final { progress_callback = callback; }
@ -59,6 +63,7 @@ protected:

 private:
    StreamLocalLimits limits;
+    SizeLimits leaf_limits;
    std::shared_ptr<const EnabledQuota> quota;
    ProgressCallback progress_callback;
    QueryStatus * process_list_elem = nullptr;
--- a/src/Server/TCPHandler.cpp
+++ b/src/Server/TCPHandler.cpp
@ -6,6 +6,7 @@
 #include <Common/Stopwatch.h>
 #include <Common/NetException.h>
 #include <Common/setThreadName.h>
+#include <Common/OpenSSLHelpers.h>
 #include <IO/Progress.h>
 #include <Compression/CompressedReadBuffer.h>
 #include <Compression/CompressedWriteBuffer.h>
@ -51,6 +52,7 @@ namespace ErrorCodes
    extern const int POCO_EXCEPTION;
    extern const int SOCKET_TIMEOUT;
    extern const int UNEXPECTED_PACKET_FROM_CLIENT;
+    extern const int SUPPORT_IS_DISABLED;
 }


@ -293,6 +295,12 @@ void TCPHandler::runImpl()
            if (e.code() == ErrorCodes::UNKNOWN_PACKET_FROM_CLIENT)
                throw;

+            /// On UNEXPECTED_PACKET_FROM_CLIENT, emulate a network error to break
+            /// the loop, but do not rethrow, so that the exception can still be
+            /// sent to the client.
+            if (e.code() == ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT)
+                network_error = true;
+
            /// If a timeout occurred, try to inform client about it and close the session
            if (e.code() == ErrorCodes::SOCKET_TIMEOUT)
                network_error = true;
@ -351,6 +359,8 @@ void TCPHandler::runImpl()
                    tryLogCurrentException(log, "Can't send logs to client");
                }

+                const auto & e = *exception;
+                LOG_ERROR(log, "Code: {}, e.displayText() = {}, Stack trace:\n\n{}", e.code(), e.displayText(), e.getStackTraceString());
                sendException(*exception, send_exception_with_stack_trace);
            }
        }
@ -716,7 +726,7 @@ void TCPHandler::receiveHello()
 {
    /// Receive `hello` packet.
    UInt64 packet_type = 0;
-    String user = "default";
+    String user;
    String password;

    readVarUInt(packet_type, *in);
@ -747,14 +757,25 @@ void TCPHandler::receiveHello()
    readStringBinary(user, *in);
    readStringBinary(password, *in);

+    if (user.empty())
+        throw NetException("Unexpected packet from client (no user in Hello package)", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT);
+
    LOG_DEBUG(log, "Connected {} version {}.{}.{}, revision: {}{}{}.",
        client_name,
        client_version_major, client_version_minor, client_version_patch,
        client_revision,
        (!default_database.empty() ? ", database: " + default_database : ""),
-        (!user.empty() ? ", user: " + user : ""));
+        (!user.empty() ? ", user: " + user : "")
+    );

+    if (user != USER_INTERSERVER_MARKER)
+    {
        connection_context.setUser(user, password, socket().peerAddress());
+    }
+    else
+    {
+        receiveClusterNameAndSalt();
+    }
 }


@ -836,6 +857,30 @@ bool TCPHandler::receivePacket()
    }
 }

+void TCPHandler::receiveClusterNameAndSalt()
+{ /// Reads the cluster name and the salt from the input stream and looks up the secret of that cluster.
+    readStringBinary(cluster, *in);
+    readStringBinary(salt, *in, 32); /// Presumably 32 is the maximum accepted length - confirm against readStringBinary().
+
+    try
+    {
+        if (salt.empty())
+            throw NetException("Empty salt is not allowed", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT);
+
+        cluster_secret = query_context->getCluster(cluster)->getSecret(); /// NOTE(review): called from receiveHello(); confirm query_context is already initialized at that point.
+    }
+    catch (const Exception & e)
+    {
+        try
+        {
+            /// We try to send error information to the client.
+            sendException(e, connection_context.getSettingsRef().calculate_text_stack_trace);
+        }
+        catch (...) {} /// Deliberately ignored: a failure to notify the client must not replace the original exception.
+
+        throw; /// Rethrow the original exception after the notification attempt.
+    }
+}

 void TCPHandler::receiveQuery()
 {
@ -873,10 +918,6 @@ void TCPHandler::receiveQuery()
        client_info.initial_query_id = client_info.current_query_id;
        client_info.initial_address = client_info.current_address;
    }
-    else
-    {
-        query_context->setInitialRowPolicy();
-    }

    /// Per query settings are also passed via TCP.
    /// We need to check them before applying due to they can violate the settings constraints.
@ -884,6 +925,64 @@ void TCPHandler::receiveQuery()
                                                                                                      : SettingsWriteFormat::BINARY;
    Settings passed_settings;
    passed_settings.read(*in, settings_format);
+
+    /// Interserver secret.
+    std::string received_hash;
+    if (client_revision >= DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET)
+    {
+        readStringBinary(received_hash, *in, 32);
+    }
+
+    readVarUInt(stage, *in);
+    state.stage = QueryProcessingStage::Enum(stage);
+
+    readVarUInt(compression, *in);
+    state.compression = static_cast<Protocol::Compression>(compression);
+
+    readStringBinary(state.query, *in);
+
+    /// It is OK to check only when query != INITIAL_QUERY,
+    /// since only in that case the actions will be done.
+    if (!cluster.empty() && client_info.query_kind != ClientInfo::QueryKind::INITIAL_QUERY)
+    {
+#if USE_SSL
+        std::string data(salt);
+        data += cluster_secret;
+        data += state.query;
+        data += state.query_id;
+        data += client_info.initial_user;
+
+        if (received_hash.size() != 32)
+            throw NetException("Unexpected hash received from client", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT);
+
+        std::string calculated_hash = encodeSHA256(data);
+
+        if (calculated_hash != received_hash)
+            throw NetException("Hash mismatch", ErrorCodes::UNEXPECTED_PACKET_FROM_CLIENT);
+        /// TODO: change error code?
+
+        /// initial_user can be empty in case of Distributed INSERT via Buffer/Kafka,
+        /// i.e. when the INSERT is done with the global context (w/o user).
+        if (!client_info.initial_user.empty())
+        {
+            query_context->setUserWithoutCheckingPassword(client_info.initial_user, socket().peerAddress());
+            LOG_DEBUG(log, "User (initial): {}", query_context->getUserName());
+        }
+        /// No need to update connection_context, since it does not require a user (it will not be used for query execution)
+#else
+        throw Exception(
+            "Inter-server secret support is disabled, because ClickHouse was built without SSL library",
+            ErrorCodes::SUPPORT_IS_DISABLED);
+#endif
+    }
+    else
+    {
+        query_context->setInitialRowPolicy();
+    }
+
+    ///
+    /// Settings
+    ///
    auto settings_changes = passed_settings.changes();
    if (client_info.query_kind == ClientInfo::QueryKind::INITIAL_QUERY)
    {
@ -897,20 +996,11 @@ void TCPHandler::receiveQuery()
    }
    query_context->applySettingsChanges(settings_changes);
    const Settings & settings = query_context->getSettingsRef();
-
    /// Sync timeouts on client and server during current query to avoid dangling queries on server
    /// NOTE: We use settings.send_timeout for the receive timeout and vice versa (change arguments ordering in TimeoutSetter),
    ///  because settings.send_timeout is client-side setting which has opposite meaning on the server side.
    /// NOTE: these settings are applied only for current connection (not for distributed tables' connections)
    state.timeout_setter = std::make_unique<TimeoutSetter>(socket(), settings.receive_timeout, settings.send_timeout);
-
-    readVarUInt(stage, *in);
-    state.stage = QueryProcessingStage::Enum(stage);
-
-    readVarUInt(compression, *in);
-    state.compression = static_cast<Protocol::Compression>(compression);
-
-    readStringBinary(state.query, *in);
 }

 void TCPHandler::receiveUnexpectedQuery()
@ -929,6 +1019,11 @@ void TCPHandler::receiveUnexpectedQuery()
                                                                                                      : SettingsWriteFormat::BINARY;
    skip_settings.read(*in, settings_format);

+    std::string skip_hash;
+    bool interserver_secret = client_revision >= DBMS_MIN_REVISION_WITH_INTERSERVER_SECRET;
+    if (interserver_secret)
+        readStringBinary(skip_hash, *in, 32);
+
    readVarUInt(skip_uint_64, *in);
    readVarUInt(skip_uint_64, *in);
    readStringBinary(skip_string, *in);
--- a/src/Server/TCPHandler.h
+++ b/src/Server/TCPHandler.h
@ -97,7 +97,6 @@ struct LastBlockInputParameters
    Block header;
 };

-
 class TCPHandler : public Poco::Net::TCPServerConnection
 {
 public:
@ -139,6 +138,12 @@ private:

    String default_database;

+    /// For inter-server secret (remote_servers.*.secret)
+    String salt;
+    String cluster;
+    String cluster_secret;
+
+
    /// At the moment, only one ongoing query in the connection is supported at a time.
    QueryState state;

@ -187,6 +192,8 @@ private:
    void sendTotals(const Block & totals);
    void sendExtremes(const Block & extremes);

+    void receiveClusterNameAndSalt();
+
    /// Creates state.block_in/block_out for blocks read/write, depending on whether compression is enabled.
    void initBlockInput();
    void initBlockOutput(const Block & block);
--- a/src/Storages/Distributed/DirectoryMonitor.cpp
+++ b/src/Storages/Distributed/DirectoryMonitor.cpp
@ -236,8 +236,17 @@ ConnectionPoolPtr StorageDistributedDirectoryMonitor::createPool(const std::stri
        }

        return std::make_shared<ConnectionPool>(
-            1, address.host_name, address.port, address.default_database, address.user, address.password,
-            storage.getName() + '_' + address.user, Protocol::Compression::Enable, address.secure);
+            1, /* max_connections */
+            address.host_name,
+            address.port,
+            address.default_database,
+            address.user,
+            address.password,
+            address.cluster,
+            address.cluster_secret,
+            storage.getName() + '_' + address.user, /* client */
+            Protocol::Compression::Enable,
+            address.secure);
    };

    auto pools = createPoolsForAddresses(name, pool_factory);
--- a/src/Storages/IStorage.cpp
+++ b/src/Storages/IStorage.cpp
@ -97,6 +97,7 @@ void IStorage::read(
        TableLockHolder table_lock,
        StorageMetadataPtr metadata_snapshot,
        StreamLocalLimits & limits,
+        SizeLimits & leaf_limits,
        std::shared_ptr<const EnabledQuota> quota,
        const Names & column_names,
        const SelectQueryInfo & query_info,
@ -106,7 +107,7 @@ void IStorage::read(
        unsigned num_streams)
 {
    auto read_step = std::make_unique<ReadFromStorageStep>(
-            std::move(table_lock), std::move(metadata_snapshot), limits, std::move(quota), shared_from_this(),
+            std::move(table_lock), std::move(metadata_snapshot), limits, leaf_limits, std::move(quota), shared_from_this(),
            column_names, query_info, std::move(context), processed_stage, max_block_size, num_streams);

    read_step->setStepDescription("Read from " + getName());
--- a/Show More
+++ b/Show More