Commit 7b64ca33b1: Merge remote-tracking branch 'origin/master' into tmp

.gitmodules (vendored, 2 changed lines)
@@ -37,7 +37,7 @@
url = https://github.com/ClickHouse-Extras/mariadb-connector-c.git
[submodule "contrib/jemalloc"]
path = contrib/jemalloc
-url = https://github.com/jemalloc/jemalloc.git
+url = https://github.com/ClickHouse-Extras/jemalloc.git
[submodule "contrib/unixodbc"]
path = contrib/unixodbc
url = https://github.com/ClickHouse-Extras/UnixODBC.git
CMakeLists.txt:
@@ -80,6 +80,11 @@ endif ()

include (cmake/find/ccache.cmake)

+option(ENABLE_CHECK_HEAVY_BUILDS "Don't allow C++ translation units to compile too long or to take too much memory while compiling" OFF)
+if (ENABLE_CHECK_HEAVY_BUILDS)
+    set (CMAKE_CXX_COMPILER_LAUNCHER prlimit --rss=10000000 --cpu=600)
+endif ()

if (NOT CMAKE_BUILD_TYPE OR CMAKE_BUILD_TYPE STREQUAL "None")
    set (CMAKE_BUILD_TYPE "RelWithDebInfo")
    message (STATUS "CMAKE_BUILD_TYPE is not set, set to default = ${CMAKE_BUILD_TYPE}")

@@ -404,7 +409,6 @@ include (cmake/find/amqpcpp.cmake)
include (cmake/find/capnp.cmake)
include (cmake/find/llvm.cmake)
include (cmake/find/termcap.cmake) # for external static llvm
-include (cmake/find/opencl.cmake)
include (cmake/find/h3.cmake)
include (cmake/find/libxml2.cmake)
include (cmake/find/brotli.cmake)

@@ -450,13 +454,6 @@ include (cmake/find/mysqlclient.cmake)

# When testing for memory leaks with Valgrind, don't link tcmalloc or jemalloc.

-if (USE_OPENCL)
-    if (OS_DARWIN)
-        set(OPENCL_LINKER_FLAGS "-framework OpenCL")
-        set(CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OPENCL_LINKER_FLAGS}")
-    endif ()
-endif ()

include (cmake/print_flags.cmake)

if (TARGET global-group)
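The new ENABLE_CHECK_HEAVY_BUILDS option wraps every compiler invocation in prlimit, so a single translation unit that blows past a memory or CPU-time budget gets killed instead of stalling the whole build. A rough Python sketch of the same idea, limiting a child process with the standard resource module (an illustration only, not code from the repository; the limits and the compiler command are made up, and an address-space limit is used here instead of the RSS limit from the CMake flag):

    import resource
    import subprocess

    def run_limited(cmd, max_address_space=10_000_000_000, max_cpu_seconds=600):
        """Run cmd with per-process CPU and address-space limits, similar to `prlimit`."""
        def set_limits():
            # Applied in the child process between fork() and exec().
            resource.setrlimit(resource.RLIMIT_CPU, (max_cpu_seconds, max_cpu_seconds))
            resource.setrlimit(resource.RLIMIT_AS, (max_address_space, max_address_space))
        return subprocess.run(cmd, preexec_fn=set_limits, check=False).returncode

    # Hypothetical usage: a pathologically heavy translation unit fails fast.
    # run_limited(["c++", "-c", "HeavyFile.cpp", "-o", "HeavyFile.o"])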
base/common/CMakeLists.txt:
@@ -18,6 +18,7 @@ set (SRCS
    terminalColors.cpp
    errnoToString.cpp
    getResource.cpp
+    StringRef.cpp
)

if (ENABLE_REPLXX)
base/common/StringRef.cpp (new file, 13 lines)
@@ -0,0 +1,13 @@
#include <ostream>

#include "StringRef.h"


std::ostream & operator<<(std::ostream & os, const StringRef & str)
{
    if (str.data)
        os.write(str.data, str.size);

    return os;
}
base/common/StringRef.h:
@@ -4,7 +4,7 @@
#include <string>
#include <vector>
#include <functional>
-#include <ostream>
+#include <iosfwd>

#include <common/types.h>
#include <common/unaligned.h>

@@ -322,10 +322,4 @@ inline bool operator==(StringRef lhs, const char * rhs)
    return true;
}

-inline std::ostream & operator<<(std::ostream & os, const StringRef & str)
-{
-    if (str.data)
-        os.write(str.data, str.size);
-
-    return os;
-}
+std::ostream & operator<<(std::ostream & os, const StringRef & str);
base/common/coverage.cpp:
@@ -3,12 +3,11 @@
#if WITH_COVERAGE

# include <mutex>

# include <unistd.h>

-
# if defined(__clang__)
-extern "C" void __llvm_profile_dump();
+extern "C" void __llvm_profile_dump(); // NOLINT
# elif defined(__GNUC__) || defined(__GNUG__)
extern "C" void __gcov_exit();
# endif

@@ -23,7 +22,7 @@ void dumpCoverageReportIfPossible()
    std::lock_guard lock(mutex);

# if defined(__clang__)
-    __llvm_profile_dump();
+    __llvm_profile_dump(); // NOLINT
# elif defined(__GNUC__) || defined(__GNUG__)
    __gcov_exit();
# endif
@@ -14,7 +14,7 @@
# pragma clang diagnostic ignored "-Wunused-macros"
#endif

-#define __msan_unpoison(X, Y)
+#define __msan_unpoison(X, Y) // NOLINT
#if defined(__has_feature)
# if __has_feature(memory_sanitizer)
# undef __msan_unpoison

@@ -84,7 +84,7 @@ extern "C"
#ifdef ADDRESS_SANITIZER
void __lsan_ignore_object(const void *);
#else
-void __lsan_ignore_object(const void *) {}
+void __lsan_ignore_object(const void *) {} // NOLINT
#endif
}
base/common/wide_integer.h:
@@ -54,8 +54,8 @@ template <size_t Bits, typename Signed>
class integer
{
public:
-    using base_type = uint8_t;
-    using signed_base_type = int8_t;
+    using base_type = uint64_t;
+    using signed_base_type = int64_t;

    // ctors
    integer() = default;

@@ -127,7 +127,7 @@ private:
    friend class std::numeric_limits<integer<Bits, signed>>;
    friend class std::numeric_limits<integer<Bits, unsigned>>;

-    base_type m_arr[_impl::arr_size];
+    base_type items[_impl::item_count];
};

template <typename T>
(The diff of one file was suppressed because it is too large.)
base/common/ya.make:
@@ -53,6 +53,7 @@ SRCS(
    setTerminalEcho.cpp
    shift10.cpp
    sleep.cpp
+    StringRef.cpp
    terminalColors.cpp

)
benchmark/hardware.sh (new executable file, 120 lines)
@@ -0,0 +1,120 @@
#!/bin/bash -e

if [[ -n $1 ]]; then
    SCALE=$1
else
    SCALE=100
fi

TABLE="hits_${SCALE}m_obfuscated"
DATASET="${TABLE}_v1.tar.xz"
QUERIES_FILE="queries.sql"
TRIES=3

AMD64_BIN_URL="https://clickhouse-builds.s3.yandex.net/0/e29c4c3cc47ab2a6c4516486c1b77d57e7d42643/clickhouse_build_check/gcc-10_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"
AARCH64_BIN_URL="https://clickhouse-builds.s3.yandex.net/0/e29c4c3cc47ab2a6c4516486c1b77d57e7d42643/clickhouse_special_build_check/clang-10-aarch64_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"

FASTER_DOWNLOAD=wget
if command -v axel >/dev/null; then
    FASTER_DOWNLOAD=axel
else
    echo "It's recommended to install 'axel' for faster downloads."
fi

if command -v pixz >/dev/null; then
    TAR_PARAMS='-Ipixz'
else
    echo "It's recommended to install 'pixz' for faster decompression of the dataset."
fi

mkdir -p clickhouse-benchmark-$SCALE
pushd clickhouse-benchmark-$SCALE

if [[ ! -f clickhouse ]]; then
    CPU=$(uname -m)
    if [[ ($CPU == x86_64) || ($CPU == amd64) ]]; then
        $FASTER_DOWNLOAD "$AMD64_BIN_URL"
    elif [[ $CPU == aarch64 ]]; then
        $FASTER_DOWNLOAD "$AARCH64_BIN_URL"
    else
        echo "Unsupported CPU type: $CPU"
        exit 1
    fi
fi

chmod a+x clickhouse

if [[ ! -f $QUERIES_FILE ]]; then
    wget "https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/$QUERIES_FILE"
fi

if [[ ! -d data ]]; then
    if [[ ! -f $DATASET ]]; then
        $FASTER_DOWNLOAD "https://clickhouse-datasets.s3.yandex.net/hits/partitions/$DATASET"
    fi

    tar $TAR_PARAMS --strip-components=1 --directory=. -x -v -f $DATASET
fi

echo "Starting clickhouse-server"

./clickhouse server > server.log 2>&1 &
PID=$!

function finish {
    kill $PID
    wait
}
trap finish EXIT

echo "Waiting for clickhouse-server to start"

for i in {1..30}; do
    sleep 1
    ./clickhouse client --query "SELECT 'The dataset size is: ', count() FROM $TABLE" 2>/dev/null && break || echo '.'
    if [[ $i == 30 ]]; then exit 1; fi
done

echo
echo "Will perform benchmark. Results:"
echo

cat "$QUERIES_FILE" | sed "s/{table}/${TABLE}/g" | while read query; do
    sync
    echo 3 | sudo tee /proc/sys/vm/drop_caches >/dev/null

    echo -n "["
    for i in $(seq 1 $TRIES); do
        RES=$(./clickhouse client --max_memory_usage 100000000000 --time --format=Null --query="$query" 2>&1)
        [[ "$?" == "0" ]] && echo -n "${RES}" || echo -n "null"
        [[ "$i" != $TRIES ]] && echo -n ", "
    done
    echo "],"
done


echo
echo "Benchmark complete. System info:"
echo

echo '----Version and build id--------'
./clickhouse local --query "SELECT version(), buildId()"
echo '----CPU-------------------------'
lscpu
echo '----Block Devices---------------'
lsblk
echo '----Disk Free and Total--------'
df -h .
echo '----Memory Free and Total-------'
free -h
echo '----Physical Memory Amount------'
cat /proc/meminfo | grep MemTotal
echo '----RAID Info-------------------'
cat /proc/mdstat
#echo '----PCI-------------------------'
#lspci
#echo '----All Hardware Info-----------'
#lshw
echo '--------------------------------'

echo
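For each benchmark query the loop above prints one line of the form `[0.123, 0.120, null],` with one entry per try (null when a try failed). A small hypothetical helper, not part of the repository, that turns such output into per-query medians, just to make the format concrete:

    import json
    import statistics

    def parse_benchmark_output(lines):
        """Parse lines like '[0.123, 0.120, null],' into medians (None if every try failed)."""
        medians = []
        for line in lines:
            line = line.strip().rstrip(',')
            if not (line.startswith('[') and line.endswith(']')):
                continue  # skip the surrounding echo output
            tries = [t for t in json.loads(line) if t is not None]
            medians.append(statistics.median(tries) if tries else None)
        return medians

    # Example: parse_benchmark_output(open('results.txt')) -> [0.120, 0.085, ...]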
cmake/find/opencl.cmake (deleted, 25 lines)
@@ -1,25 +0,0 @@
# TODO: enable by default
if(0)
    option(ENABLE_OPENCL "Enable OpenCL support" ${ENABLE_LIBRARIES})
endif()

if(NOT ENABLE_OPENCL)
    return()
endif()

# Intel OpenCl driver: sudo apt install intel-opencl-icd
# @sa https://github.com/intel/compute-runtime/releases

# OpenCL applications should link with ICD loader
# sudo apt install opencl-headers ocl-icd-libopencl1
# sudo ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so
# TODO: add https://github.com/OCL-dev/ocl-icd as submodule instead

find_package(OpenCL)
if(OpenCL_FOUND)
    set(USE_OPENCL 1)
else()
    message (${RECONFIGURE_MESSAGE_LEVEL} "Can't enable OpenCL support")
endif()

message(STATUS "Using opencl=${USE_OPENCL}: ${OpenCL_INCLUDE_DIRS} : ${OpenCL_LIBRARIES}")
contrib/jemalloc (vendored submodule)
@@ -1 +1 @@
-Subproject commit ea6b3e973b477b8061e0076bb257dbd7f3faa756
+Subproject commit 026764f19995c53583ab25a3b9c06a2fd74e4689
debian/control (vendored, 1 changed line)
@@ -11,7 +11,6 @@ Build-Depends: debhelper (>= 9),
    libicu-dev,
-    libreadline-dev,
    gperf,
    python,
    tzdata
Standards-Version: 3.9.8
@@ -17,10 +17,10 @@ ccache --show-stats ||:
ccache --zero-stats ||:
-ln -s /usr/lib/x86_64-linux-gnu/libOpenCL.so.1.0.0 /usr/lib/libOpenCL.so ||:
rm -f CMakeCache.txt
-cmake --debug-trycompile --verbose=1 -DCMAKE_VERBOSE_MAKEFILE=1 -LA -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DSANITIZE=$SANITIZER $CMAKE_FLAGS ..
+cmake --debug-trycompile --verbose=1 -DCMAKE_VERBOSE_MAKEFILE=1 -LA -DCMAKE_BUILD_TYPE=$BUILD_TYPE -DSANITIZE=$SANITIZER -DENABLE_CHECK_HEAVY_BUILDS=1 $CMAKE_FLAGS ..
ninja $NINJA_FLAGS clickhouse-bundle
mv ./programs/clickhouse* /output
-mv ./src/unit_tests_dbms /output
+mv ./src/unit_tests_dbms /output ||: # may not exist for some binary builds
find . -name '*.so' -print -exec mv '{}' /output \;
find . -name '*.so.*' -print -exec mv '{}' /output \;
@@ -105,6 +105,7 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ
    # Create combined output archive for split build and for performance tests.
    if package_type == "performance":
        result.append("COMBINED_OUTPUT=performance")
+        cmake_flags.append("-DENABLE_TESTS=0")
    elif split_binary:
        result.append("COMBINED_OUTPUT=shared_build")
@@ -1,7 +1,7 @@
# docker build -t yandex/clickhouse-test-base .
FROM ubuntu:19.10

-ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=10
+ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11

RUN apt-get update \
    && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \

@@ -43,7 +43,6 @@ RUN apt-get update \
    llvm-${LLVM_VERSION} \
    moreutils \
-    perl \
    perl \
    pigz \
    pkg-config \
    tzdata \
@@ -1,5 +1,5 @@
# docker build -t yandex/clickhouse-integration-test .
-FROM ubuntu:19.10
+FROM yandex/clickhouse-test-base

RUN apt-get update \
    && env DEBIAN_FRONTEND=noninteractive apt-get -y install \

@@ -8,7 +8,6 @@ RUN apt-get update \
    libreadline-dev \
    libicu-dev \
    bsdutils \
-    llvm-9 \
    gdb \
    unixodbc \
    odbcinst \

@@ -29,9 +28,3 @@ RUN curl 'https://cdn.mysql.com//Downloads/Connector-ODBC/8.0/mysql-connector-od

ENV TZ=Europe/Moscow
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone

-# Sanitizer options
-RUN echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 history_size=7'" >> /etc/environment; \
-    echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \
-    echo "MSAN_OPTIONS='abort_on_error=1'" >> /etc/environment; \
-    ln -s /usr/lib/llvm-9/bin/llvm-symbolizer /usr/bin/llvm-symbolizer;
@@ -29,7 +29,7 @@ RUN apt-get update \
    tzdata \
    vim \
    wget \
-    && pip3 --no-cache-dir install clickhouse_driver \
+    && pip3 --no-cache-dir install clickhouse_driver scipy \
    && apt-get purge --yes python3-dev g++ \
    && apt-get autoremove --yes \
    && apt-get clean \
docker/test/performance-comparison/README.md:
@@ -16,7 +16,7 @@ We also consider the test to be unstable, if the observed difference is less tha
performance differences above 5% more often than in 5% runs, so the test is likely
to have false positives.

-### How to read the report
+### How to Read the Report

The check status summarizes the report in a short text message like `1 faster, 10 unstable`:
* `1 faster` -- how many queries became faster,

@@ -27,28 +27,50 @@ The check status summarizes the report in a short text message like `1 faster, 1

The report page itself constists of a several tables. Some of them always signify errors, e.g. "Run errors" -- the very presence of this table indicates that there were errors during the test, that are not normal and must be fixed. Some tables are mostly informational, e.g. "Test times" -- they reflect normal test results. But if a cell in such table is marked in red, this also means an error, e.g., a test is taking too long to run.

-#### Tested commits
+#### Tested Commits
Informational, no action required. Log messages for the commits that are tested. Note that for the right commit, we show nominal tested commit `pull/*/head` and real tested commit `pull/*/merge`, which is generated by GitHub by merging latest master to the `pull/*/head` and which we actually build and test in CI.

-#### Run errors
-Action required for every item -- these are errors that must be fixed. The errors that ocurred when running some test queries. For more information about the error, download test output archive and see `test-name-err.log`. To reproduce, see 'How to run' below.
+#### Error Summary
+Action required for every item.

-#### Slow on client
-Action required for every item -- these are errors that must be fixed. This table shows queries that take significantly longer to process on the client than on the server. A possible reason might be sending too much data to the client, e.g., a forgotten `format Null`.
+This table summarizes all errors that ocurred during the test. Click the links to go to the description of a particular error.

-#### Short queries not marked as short
-Action required for every item -- these are errors that must be fixed. This table shows queries that are "short" but not explicitly marked as such. "Short" queries are too fast to meaningfully compare performance, because the changes are drowned by the noise. We consider all queries that run faster than 0.02 s to be "short", and only check the performance if they became slower than this threshold. Probably this mode is not what you want, so you have to increase the query run time to be between 1 and 0.1 s, so that the performance can be compared. You do want this "short" mode for queries that complete "immediately", such as some varieties of `select count(*)`. You have to mark them as "short" explicitly by writing `<query short="1">...`. The value of "short" attribute is evaluated as a python expression, and substitutions are performed, so you can write something like `<query short="{column1} = {column2}">select count(*) from table where {column1} > {column2}</query>`, to mark only a particular combination of variables as short.
+#### Run Errors
+Action required for every item -- these are errors that must be fixed.

-#### Partial queries
-Action required for the cells marked in red. Shows the queries we are unable to run on an old server -- probably because they contain a new function. You should see this table when you add a new function and a performance test for it. Check that the run time and variance are acceptable (run time between 0.1 and 1 seconds, variance below 10%). If not, they will be highlighted in red.
+The errors that ocurred when running some test queries. For more information about the error, download test output archive and see `test-name-err.log`. To reproduce, see 'How to run' below.

-#### Changes in performance
-Action required for the cells marked in red, and some cheering is appropriate for the cells marked in green. These are the queries for which we observe a statistically significant change in performance. Note that there will always be some false positives -- we try to filter by p < 0.001, and have 2000 queries, so two false positives per run are expected. In practice we have more -- e.g. code layout changed because of some unknowable jitter in compiler internals, so the change we observe is real, but it is a 'false positive' in the sense that it is not directly caused by your changes. If, based on your knowledge of ClickHouse internals, you can decide that the observed test changes are not relevant to the changes made in the tested PR, you can ignore them.
+#### Slow on Client
+Action required for every item -- these are errors that must be fixed.
+
+This table shows queries that take significantly longer to process on the client than on the server. A possible reason might be sending too much data to the client, e.g., a forgotten `format Null`.
+
+#### Unexpected Query Duration
+Action required for every item -- these are errors that must be fixed.
+
+Queries that have "short" duration (on the order of 0.1 s) can't be reliably tested in a normal way, where we perform a small (about ten) measurements for each server, because the signal-to-noise ratio is much smaller. There is a special mode for such queries that instead runs them for a fixed amount of time, normally with much higher number of measurements (up to thousands). This mode must be explicitly enabled by the test author to avoid accidental errors. It must be used only for queries that are meant to complete "immediately", such as `select count(*)`. If your query is not supposed to be "immediate", try to make it run longer, by e.g. processing more data.
+
+This table shows queries for which the "short" marking is not consistent with the actual query run time -- i.e., a query runs for a long time but is marked as short, or it runs very fast but is not marked as short.
+
+If your query is really supposed to complete "immediately" and can't be made to run longer, you have to mark it as "short". To do so, write `<query short="1">...` in the test file. The value of "short" attribute is evaluated as a python expression, and substitutions are performed, so you can write something like `<query short="{column1} = {column2}">select count(*) from table where {column1} > {column2}</query>`, to mark only a particular combination of variables as short.
+
+
+#### Partial Queries
+Action required for the cells marked in red.
+
+Shows the queries we are unable to run on an old server -- probably because they contain a new function. You should see this table when you add a new function and a performance test for it. Check that the run time and variance are acceptable (run time between 0.1 and 1 seconds, variance below 10%). If not, they will be highlighted in red.
+
+#### Changes in Performance
+Action required for the cells marked in red, and some cheering is appropriate for the cells marked in green.
+
+These are the queries for which we observe a statistically significant change in performance. Note that there will always be some false positives -- we try to filter by p < 0.001, and have 2000 queries, so two false positives per run are expected. In practice we have more -- e.g. code layout changed because of some unknowable jitter in compiler internals, so the change we observe is real, but it is a 'false positive' in the sense that it is not directly caused by your changes. If, based on your knowledge of ClickHouse internals, you can decide that the observed test changes are not relevant to the changes made in the tested PR, you can ignore them.

You can find flame graphs for queries with performance changes in the test output archive, in files named as 'my_test_0_Cpu_SELECT 1 FROM....FORMAT Null.left.svg'. First goes the test name, then the query number in the test, then the trace type (same as in `system.trace_log`), and then the server version (left is old and right is new).

-#### Unstable queries
-Action required for the cells marked in red. These are queries for which we did not observe a statistically significant change in performance, but for which the variance in query performance is very high. This means that we are likely to observe big changes in performance even in the absence of real changes, e.g. when comparing the server to itself. Such queries are going to have bad sensitivity as performance tests -- if a query has, say, 50% expected variability, this means we are going to see changes in performance up to 50%, even when there were no real changes in the code. And because of this, we won't be able to detect changes less than 50% with such a query, which is pretty bad. The reasons for the high variability must be investigated and fixed; ideally, the variability should be brought under 5-10%.
+#### Unstable Queries
+Action required for the cells marked in red.
+
+These are the queries for which we did not observe a statistically significant change in performance, but for which the variance in query performance is very high. This means that we are likely to observe big changes in performance even in the absence of real changes, e.g. when comparing the server to itself. Such queries are going to have bad sensitivity as performance tests -- if a query has, say, 50% expected variability, this means we are going to see changes in performance up to 50%, even when there were no real changes in the code. And because of this, we won't be able to detect changes less than 50% with such a query, which is pretty bad. The reasons for the high variability must be investigated and fixed; ideally, the variability should be brought under 5-10%.

The most frequent reason for instability is that the query is just too short -- e.g. below 0.1 seconds. Bringing query time to 0.2 seconds or above usually helps.
Other reasons may include:
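The false-positive estimate quoted above is just the number of queries times the p-value cutoff: roughly 2000 * 0.001 = 2 expected false positives per run. As a small illustrative sketch of that kind of statistical comparison (the run times below are made up; perf.py in this same commit uses scipy.stats.ttest_ind with equal_var=False in a similar way to decide when to collect profiles, while the report itself uses its own randomization test):

    from scipy import stats

    # Hypothetical per-server run times for one query, in seconds.
    left_times = [0.105, 0.102, 0.108, 0.104, 0.107, 0.103, 0.106]
    right_times = [0.121, 0.118, 0.124, 0.119, 0.122, 0.120, 0.123]

    # Welch's t-test: does the mean run time differ between the two servers?
    pvalue = stats.ttest_ind(left_times, right_times, equal_var=False).pvalue

    n_queries = 2000
    cutoff = 0.001
    print(f"p-value: {pvalue:.2g}, reported as changed: {pvalue < cutoff}")
    print(f"expected false positives per run: {n_queries * cutoff:.0f}")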
@@ -57,24 +79,33 @@ Other reasons may include:

Investigating the instablility is the hardest problem in performance testing, and we still have not been able to understand the reasons behind the instability of some queries. There are some data that can help you in the performance test output archive. Look for files named 'my_unstable_test_0_SELECT 1...FORMAT Null.{left,right}.metrics.rep'. They contain metrics from `system.query_log.ProfileEvents` and functions from stack traces from `system.trace_log`, that vary significantly between query runs. The second column is array of \[min, med, max] values for the metric. Say, if you see `PerfCacheMisses` there, it may mean that the code being tested has not-so-cache-local memory access pattern that is sensitive to memory layout.

-#### Skipped tests
-Informational, no action required. Shows the tests that were skipped, and the reason for it. Normally it is because the data set required for the test was not loaded, or the test is marked as 'long' -- both cases mean that the test is too big to be ran per-commit.
+#### Skipped Tests
+Informational, no action required.

-#### Test performance changes
-Informational, no action required. This table summarizes the changes in performance of queries in each test -- how many queries have changed, how many are unstable, and what is the magnitude of the changes.
+Shows the tests that were skipped, and the reason for it. Normally it is because the data set required for the test was not loaded, or the test is marked as 'long' -- both cases mean that the test is too big to be ran per-commit.

-#### Test times
-Action required for the cells marked in red. This table shows the run times for all the tests. You may have to fix two kinds of errors in this table:
+#### Test Performance Changes
+Informational, no action required.
+
+This table summarizes the changes in performance of queries in each test -- how many queries have changed, how many are unstable, and what is the magnitude of the changes.
+
+#### Test Times
+Action required for the cells marked in red.
+
+This table shows the run times for all the tests. You may have to fix two kinds of errors in this table:
1) Average query run time is too long -- probalby means that the preparatory steps such as creating the table and filling them with data are taking too long. Try to make them faster.
2) Longest query run time is too long -- some particular queries are taking too long, try to make them faster. The ideal query run time is between 0.1 and 1 s.

-#### Concurrent benchmarks
-No action required. This table shows the results of a concurrent behcmark where queries from `website` are ran in parallel using `clickhouse-benchmark`, and requests per second values are compared for old and new servers. It shows variability up to 20% for no apparent reason, so it's probably safe to disregard it. We have it for special cases like investigating concurrency effects in memory allocators, where it may be important.
+#### Metric Changes
+No action required.

-#### Metric changes
-No action required. These are changes in median values of metrics from `system.asynchronous_metrics_log`. Again, they are prone to unexplained variation and you can safely ignore this table unless it's interesting to you for some particular reason (e.g. you want to compare memory usage). There are also graphs of these metrics in the performance test output archive, in the `metrics` folder.
+These are changes in median values of metrics from `system.asynchronous_metrics_log`. These metrics are prone to unexplained variation and you can safely ignore this table unless it's interesting to you for some particular reason (e.g. you want to compare memory usage). There are also graphs of these metrics in the performance test output archive, in the `metrics` folder.

-### How to run
+#### Errors while Building the Report
+Ask a maintainer for help. These errors normally indicate a problem with testing infrastructure.
+
+
+### How to Run
Run the entire docker container, specifying PR number (0 for master)
and SHA of the commit to test. The reference revision is determined as a nearest
ancestor testing release tag. It is possible to specify the reference revision and
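The `short` attribute described above is a template: `{...}` placeholders are substituted for every combination of the test's parameters, and the result is evaluated as a Python expression. This mirrors what perf.py below does with `e.attrib.get('short', '0')` and `eval`. A stripped-down sketch of that mechanism with a made-up parameter set (the parameter names and values are hypothetical):

    # Hypothetical parameter combinations; in a real test they come from <substitutions>.
    combinations = [{'rows': 100}, {'rows': 1000000}]

    short_template = "{rows} < 1000"          # value of the short="..." attribute
    query_template = "select count(*) from table_{rows}"

    for params in combinations:
        is_short = bool(eval(short_template.format(**params)))  # evaluated as a Python expression
        print(f"short={is_short}\t{query_template.format(**params)}")
    # Only the 100-row variant is treated as a "short" query.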
docker/test/performance-comparison/compare.sh:
@@ -63,7 +63,7 @@ function configure
# Make copies of the original db for both servers. Use hardlinks instead
# of copying to save space. Before that, remove preprocessed configs and
# system tables, because sharing them between servers with hardlinks may
-# lead to weird effects.
+# lead to weird effects.
rm -r left/db ||:
rm -r right/db ||:
rm -r db0/preprocessed_configs ||:

@@ -114,14 +114,12 @@ function run_tests
# Just check that the script runs at all
"$script_dir/perf.py" --help > /dev/null

-changed_test_files=""
-
# Find the directory with test files.
if [ -v CHPC_TEST_PATH ]
then
    # Use the explicitly set path to directory with test files.
    test_prefix="$CHPC_TEST_PATH"
-elif [ "$PR_TO_TEST" = "0" ]
+elif [ "$PR_TO_TEST" == "0" ]
then
    # When testing commits from master, use the older test files. This
    # allows the tests to pass even when we add new functions and tests for

@@ -130,14 +128,6 @@ function run_tests
else
    # For PRs, use newer test files so we can test these changes.
    test_prefix=right/performance

-    # If only the perf tests were changed in the PR, we will run only these
-    # tests. The list of changed tests in changed-test.txt is prepared in
-    # entrypoint.sh from git diffs, because it has the cloned repo. Used
-    # to use rsync for that but it was really ugly and not always correct
-    # (e.g. when the reference SHA is really old and has some other
-    # differences to the tested SHA, besides the one introduced by the PR).
-    changed_test_files=$(sed "s/tests\/performance/${test_prefix//\//\\/}/" changed-tests.txt)
fi

# Determine which tests to run.

@@ -146,15 +136,36 @@ function run_tests
    # Run only explicitly specified tests, if any.
    # shellcheck disable=SC2010
    test_files=$(ls "$test_prefix" | grep "$CHPC_TEST_GREP" | xargs -I{} -n1 readlink -f "$test_prefix/{}")
-elif [ "$changed_test_files" != "" ]
+elif [ "$PR_TO_TEST" -ne 0 ] \
+    && [ "$(wc -l < changed-test-definitions.txt)" -gt 0 ] \
+    && [ "$(wc -l < changed-test-scripts.txt)" -eq 0 ] \
+    && [ "$(wc -l < other-changed-files.txt)" -eq 0 ]
then
-    # Use test files that changed in the PR.
-    test_files="$changed_test_files"
+    # If only the perf tests were changed in the PR, we will run only these
+    # tests. The lists of changed files are prepared in entrypoint.sh because
+    # it has the repository.
+    test_files=$(sed "s/tests\/performance/${test_prefix//\//\\/}/" changed-test-definitions.txt)
else
    # The default -- run all tests found in the test dir.
    test_files=$(ls "$test_prefix"/*.xml)
fi

+# For PRs w/o changes in test definitons and scripts, test only a subset of
+# queries, and run them less times. If the corresponding environment variables
+# are already set, keep those values.
+if [ "$PR_TO_TEST" -ne 0 ] \
+    && [ "$(wc -l < changed-test-definitions.txt)" -eq 0 ] \
+    && [ "$(wc -l < changed-test-scripts.txt)" -eq 0 ]
+then
+    CHPC_RUNS=${CHPC_RUNS:-7}
+    CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-20}
+else
+    CHPC_RUNS=${CHPC_RUNS:-13}
+    CHPC_MAX_QUERIES=${CHPC_MAX_QUERIES:-0}
+fi
+export CHPC_RUNS
+export CHPC_MAX_QUERIES

# Determine which concurrent benchmarks to run. For now, the only test
# we run as a concurrent benchmark is 'website'. Run it as benchmark if we
# are also going to run it as a normal test.

@@ -184,11 +195,13 @@ function run_tests
echo test "$test_name"

TIMEFORMAT=$(printf "$test_name\t%%3R\t%%3U\t%%3S\n")
-# the grep is to filter out set -x output and keep only time output
+# The grep is to filter out set -x output and keep only time output.
+# The '2>&1 >/dev/null' redirects stderr to stdout, and discards stdout.
{ \
    time "$script_dir/perf.py" --host localhost localhost --port 9001 9002 \
+        --runs "$CHPC_RUNS" --max-queries "$CHPC_MAX_QUERIES" \
        -- "$test" > "$test_name-raw.tsv" 2> "$test_name-err.log" ; \
-} 2>&1 >/dev/null | grep -v ^+ >> "wall-clock-times.tsv" \
+} 2>&1 >/dev/null | tee >(grep -v ^+ >> "wall-clock-times.tsv") \
    || echo "Test $test_name failed with error code $?" >> "$test_name-err.log"
done

@@ -197,33 +210,9 @@ function run_tests
wait
}

-# Run some queries concurrently and report the resulting TPS. This additional
-# (relatively) short test helps detect concurrency-related effects, because the
-# main performance comparison testing is done query-by-query.
-function run_benchmark
-{
-rm -rf benchmark ||:
-mkdir benchmark ||:
-
-# The list is built by run_tests.
-while IFS= read -r file
-do
-    name=$(basename "$file" ".xml")
-
-    "$script_dir/perf.py" --print-queries "$file" > "benchmark/$name-queries.txt"
-    "$script_dir/perf.py" --print-settings "$file" > "benchmark/$name-settings.txt"
-
-    readarray -t settings < "benchmark/$name-settings.txt"
-    command=(clickhouse-benchmark --concurrency 6 --cumulative --iterations 1000 --randomize 1 --delay 0 --continue_on_errors "${settings[@]}")
-
-    "${command[@]}" --port 9001 --json "benchmark/$name-left.json" < "benchmark/$name-queries.txt"
-    "${command[@]}" --port 9002 --json "benchmark/$name-right.json" < "benchmark/$name-queries.txt"
-done < benchmarks-to-run.txt
-}
-
function get_profiles_watchdog
{
-sleep 6000
+sleep 600

echo "The trace collection did not finish in time." >> profile-errors.log

@@ -490,8 +479,6 @@ build_log_column_definitions
cat analyze/errors.log >> report/errors.log ||:
cat profile-errors.log >> report/errors.log ||:

-short_query_threshold="0.02"
-
clickhouse-local --query "
create view query_display_names as select * from
    file('analyze/query-display-names.tsv', TSV,

@@ -524,18 +511,11 @@ create view query_metric_stats as
-- Main statistics for queries -- query time as reported in query log.
create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
    as select
-        -- Comparison mode doesn't make sense for queries that complete
-        -- immediately (on the same order of time as noise). If query duration is
-        -- less that some threshold, we just skip it. If there is a significant
-        -- regression in such query, the time will exceed the threshold, and we
-        -- well process it normally and detect the regression.
-        right < $short_query_threshold as short,
-
-        not short and abs(diff) > report_threshold        and abs(diff) > stat_threshold as changed_fail,
-        not short and abs(diff) > report_threshold - 0.05 and abs(diff) > stat_threshold as changed_show,
+        abs(diff) > report_threshold        and abs(diff) > stat_threshold as changed_fail,
+        abs(diff) > report_threshold - 0.05 and abs(diff) > stat_threshold as changed_show,

-        not short and not changed_fail and stat_threshold > report_threshold + 0.10 as unstable_fail,
-        not short and not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,
+        not changed_fail and stat_threshold > report_threshold + 0.10 as unstable_fail,
+        not changed_show and stat_threshold > report_threshold - 0.05 as unstable_show,

        left, right, diff, stat_threshold,
        if(report_threshold > 0, report_threshold, 0.10) as report_threshold,
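The changed_fail / unstable_fail expressions above are the core of the report's classification: a query is flagged as changed only when the observed difference is both large and outside the noise threshold, and as unstable when the noise threshold itself is too wide to say anything. Reduced to plain Python with made-up numbers, mirroring the SQL above:

    def classify(diff, stat_threshold, report_threshold=0.10):
        """Mirror of the SQL flags: 'changed' needs both a big difference and statistical
        significance; 'unstable' means the noise threshold itself is too high."""
        changed_fail = abs(diff) > report_threshold and abs(diff) > stat_threshold
        unstable_fail = not changed_fail and stat_threshold > report_threshold + 0.10
        return changed_fail, unstable_fail

    print(classify(diff=0.15, stat_threshold=0.05))   # (True, False): significant change
    print(classify(diff=0.02, stat_threshold=0.30))   # (False, True): too noisy to compare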
@@ -640,24 +620,59 @@ create table wall_clock_time_per_test engine Memory as select *

create table test_time engine Memory as
    select test, sum(client) total_client_time,
-        maxIf(client, not short) query_max,
-        minIf(client, not short) query_min,
-        count(*) queries, sum(short) short_queries
+        max(client) query_max,
+        min(client) query_min,
+        count(*) queries
    from total_client_time_per_query full join queries using (test, query_index)
    group by test;

+create view query_runs as select * from file('analyze/query-runs.tsv', TSV,
+    'test text, query_index int, query_id text, version UInt8, time float');
+
+--
+-- Guess the number of query runs used for this test. The number is required to
+-- calculate and check the average query run time in the report.
+-- We have to be careful, because we will encounter:
+--  1) partial queries which run only on one server
+--  2) short queries which run for a much higher number of times
+--  3) some errors that make query run for a different number of times on a
+--     particular server.
+--
+create view test_runs as
+    select test,
+        -- Default to 7 runs if there are only 'short' queries in the test, and
+        -- we can't determine the number of runs.
+        if((ceil(medianOrDefaultIf(t.runs, not short), 0) as r) != 0, r, 7) runs
+    from (
+        select
+            -- The query id is the same for both servers, so no need to divide here.
+            uniqExact(query_id) runs,
+            (test, query_index) in
+                (select * from file('analyze/marked-short-queries.tsv', TSV,
+                    'test text, query_index int'))
+                as short,
+            test, query_index
+        from query_runs
+        group by test, query_index
+    ) t
+    group by test
+;

create table test_times_report engine File(TSV, 'report/test-times.tsv') as
    select wall_clock_time_per_test.test, real,
        toDecimal64(total_client_time, 3),
        queries,
-        short_queries,
        toDecimal64(query_max, 3),
        toDecimal64(real / queries, 3) avg_real_per_query,
-        toDecimal64(query_min, 3)
+        toDecimal64(query_min, 3),
+        runs
    from test_time
-    -- wall clock times are also measured for skipped tests, so don't
-    -- do full join
-    left join wall_clock_time_per_test using test
+    -- wall clock times are also measured for skipped tests, so don't
+    -- do full join
+    left join wall_clock_time_per_test
+        on wall_clock_time_per_test.test = test_time.test
+    full join test_runs
+        on test_runs.test = test_time.test
    order by avg_real_per_query desc;

-- report for all queries page, only main metric

@@ -685,32 +700,48 @@ create table queries_for_flamegraph engine File(TSVWithNamesAndTypes,
    select test, query_index from queries where unstable_show or changed_show
;

-- List of queries that have 'short' duration, but are not marked as 'short' by
-- the test author (we report them).
-create table unmarked_short_queries_report
-    engine File(TSV, 'report/unmarked-short-queries.tsv')
-    as select time, test, query_index, query_display_name

+create view shortness
+    as select
+        (test, query_index) in
+            (select * from file('analyze/marked-short-queries.tsv', TSV,
+                'test text, query_index int'))
+            as marked_short,
+        time, test, query_index, query_display_name
    from (
-        select right time, test, query_index from queries where short
+        select right time, test, query_index from queries
        union all
        select time_median, test, query_index from partial_query_times
            where time_median < $short_query_threshold
    ) times
    left join query_display_names
        on times.test = query_display_names.test
        and times.query_index = query_display_names.query_index
-    where (test, query_index) not in
-        (select * from file('analyze/marked-short-queries.tsv', TSV,
-            'test text, query_index int'))
-    order by test, query_index
;

+-- Report of queries that have inconsistent 'short' markings:
+-- 1) have short duration, but are not marked as 'short'
+-- 2) the reverse -- marked 'short' but take too long.
+-- The threshold for 2) is significantly larger than the threshold for 1), to
+-- avoid jitter.
+create table inconsistent_short_marking_report
+    engine File(TSV, 'report/unexpected-query-duration.tsv')
+    as select
+        multiIf(marked_short and time > 0.1, '"short" queries must run faster than 0.02 s',
+                not marked_short and time < 0.02, '"normal" queries must run longer than 0.1 s',
+                '') problem,
+        marked_short, time,
+        test, query_index, query_display_name
+    from shortness
+    where problem != ''
+;


--------------------------------------------------------------------------------
-- various compatibility data formats follow, not related to the main report

-- keep the table in old format so that we can analyze new and old data together
create table queries_old_format engine File(TSVWithNamesAndTypes, 'queries.rep')
-    as select short, changed_fail, unstable_fail, left, right, diff,
+    as select 0 short, changed_fail, unstable_fail, left, right, diff,
        stat_threshold, test, query_display_name query
    from queries
;
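The multiIf above uses deliberately asymmetric limits (a marked-short query is flagged only above 0.1 s, and an unmarked one only below 0.02 s), so a query hovering near the boundary does not flip between verdicts from run to run. A plain-Python sketch of that rule with made-up timings:

    def unexpected_duration(marked_short, time_s):
        """Return a problem description, or None. Thresholds mirror the SQL above."""
        if marked_short and time_s > 0.1:
            return '"short" queries must run faster than 0.02 s'
        if not marked_short and time_s < 0.02:
            return '"normal" queries must run longer than 0.1 s'
        return None

    for marked, t in [(True, 0.5), (True, 0.05), (False, 0.005), (False, 0.05)]:
        print(marked, t, unexpected_duration(marked, t))
    # Only the clearly wrong cases (0.5 s "short" query, 0.005 s unmarked query) are reported.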
@@ -1008,9 +1039,6 @@ case "$stage" in
    # Ignore the errors to collect the log and build at least some report, anyway
    time run_tests ||:
    ;&
-"run_benchmark")
-    time run_benchmark 2> >(tee -a run-errors.tsv 1>&2) ||:
-    ;&
"get_profiles")
    # Check for huge pages.
    cat /sys/kernel/mm/transparent_hugepage/enabled > thp-enabled.txt ||:
@@ -1,8 +1,6 @@
<yandex>
<profiles>
<default>
-<query_profiler_real_time_period_ns>10000000</query_profiler_real_time_period_ns>
-<query_profiler_cpu_time_period_ns>0</query_profiler_cpu_time_period_ns>
<allow_introspection_functions>1</allow_introspection_functions>
<log_queries>1</log_queries>
<metrics_perf_events_enabled>1</metrics_perf_events_enabled>
docker/test/performance-comparison/entrypoint.sh:
@@ -97,13 +97,10 @@ then
# tests for use by compare.sh. Compare to merge base, because master might be
# far in the future and have unrelated test changes.
base=$(git -C right/ch merge-base pr origin/master)
-git -C right/ch diff --name-only "$base" pr | tee changed-tests.txt
-if grep -vq '^tests/performance' changed-tests.txt
-then
-    # Have some other changes besides the tests, so truncate the test list,
-    # meaning, run all tests.
-    : > changed-tests.txt
-fi
+git -C right/ch diff --name-only "$base" pr -- . | tee all-changed-files.txt
+git -C right/ch diff --name-only "$base" pr -- tests/performance | tee changed-test-definitions.txt
+git -C right/ch diff --name-only "$base" pr -- docker/test/performance-comparison | tee changed-test-scripts.txt
+git -C right/ch diff --name-only "$base" pr -- :!tests/performance :!docker/test/performance-comparison | tee other-changed-files.txt
fi

# Set python output encoding so that we can print queries with Russian letters.
docker/test/performance-comparison/perf.py:
@@ -1,16 +1,21 @@
#!/usr/bin/python3

-import os
-import sys
-import itertools
-import clickhouse_driver
-import xml.etree.ElementTree as et
import argparse
+import clickhouse_driver
+import itertools
+import functools
+import math
+import os
import pprint
+import random
import re
+import statistics
import string
+import sys
import time
import traceback
+import xml.etree.ElementTree as et
+from scipy import stats

def tsv_escape(s):
    return s.replace('\\', '\\\\').replace('\t', '\\t').replace('\n', '\\n').replace('\r','')

@@ -20,7 +25,8 @@ parser = argparse.ArgumentParser(description='Run performance test.')
parser.add_argument('file', metavar='FILE', type=argparse.FileType('r', encoding='utf-8'), nargs=1, help='test description file')
parser.add_argument('--host', nargs='*', default=['localhost'], help="Server hostname(s). Corresponds to '--port' options.")
parser.add_argument('--port', nargs='*', default=[9000], help="Server port(s). Corresponds to '--host' options.")
-parser.add_argument('--runs', type=int, default=int(os.environ.get('CHPC_RUNS', 7)), help='Number of query runs per server. Defaults to CHPC_RUNS environment variable.')
+parser.add_argument('--runs', type=int, default=1, help='Number of query runs per server.')
+parser.add_argument('--max-queries', type=int, default=None, help='Test no more than this number of queries, chosen at random.')
parser.add_argument('--long', action='store_true', help='Do not skip the tests tagged as long.')
parser.add_argument('--print-queries', action='store_true', help='Print test queries and exit.')
parser.add_argument('--print-settings', action='store_true', help='Print test settings and exit.')

@@ -62,18 +68,13 @@ def substitute_parameters(query_templates, other_templates = []):
# Build a list of test queries, substituting parameters to query templates,
# and reporting the queries marked as short.
test_queries = []
+is_short = []
for e in root.findall('query'):
-    new_queries = []
-    if 'short' in e.attrib:
-        new_queries, [is_short] = substitute_parameters([e.text], [[e.attrib['short']]])
-        for i, s in enumerate(is_short):
-            # Don't print this if we only need to print the queries.
-            if eval(s) and not args.print_queries:
-                print(f'short\t{i + len(test_queries)}')
-    else:
-        new_queries = substitute_parameters([e.text])
-
+    new_queries, [new_is_short] = substitute_parameters([e.text], [[e.attrib.get('short', '0')]])
    test_queries += new_queries
+    is_short += [eval(s) for s in new_is_short]

+assert(len(test_queries) == len(is_short))


# If we're only asked to print the queries, do that and exit

@@ -82,6 +83,11 @@ if args.print_queries:
        print(q)
    exit(0)

+# Print short queries
+for i, s in enumerate(is_short):
+    if s:
+        print(f'short\t{i}')
+
# If we're only asked to print the settings, do that and exit. These are settings
# for clickhouse-benchmark, so we print them as command line arguments, e.g.
# '--max_memory_usage=10000000'.

@@ -98,25 +104,13 @@ if not args.long:
        print('skipped\tTest is tagged as long.')
        sys.exit(0)

-# Check main metric to detect infinite tests. We shouldn't have such tests anymore,
-# but we did in the past, and it is convenient to be able to process old tests.
-main_metric_element = root.find('main_metric/*')
-if main_metric_element is not None and main_metric_element.tag != 'min_time':
-    raise Exception('Only the min_time main metric is supported. This test uses \'{}\''.format(main_metric_element.tag))
-
-# Another way to detect infinite tests. They should have an appropriate main_metric
-# but sometimes they don't.
-infinite_sign = root.find('.//average_speed_not_changing_for_ms')
-if infinite_sign is not None:
-    raise Exception('Looks like the test is infinite (sign 1)')
-
# Print report threshold for the test if it is set.
if 'max_ignored_relative_change' in root.attrib:
    print(f'report-threshold\t{root.attrib["max_ignored_relative_change"]}')

# Open connections
servers = [{'host': host, 'port': port} for (host, port) in zip(args.host, args.port)]
-connections = [clickhouse_driver.Client(**server) for server in servers]
+all_connections = [clickhouse_driver.Client(**server) for server in servers]

for s in servers:
    print('server\t{}\t{}'.format(s['host'], s['port']))

@@ -126,7 +120,7 @@ for s in servers:
# connection loses the changes in settings.
drop_query_templates = [q.text for q in root.findall('drop_query')]
drop_queries = substitute_parameters(drop_query_templates)
-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for q in drop_queries:
        try:
            c.execute(q)

@@ -142,7 +136,7 @@ for conn_index, c in enumerate(connections):
# configurable). So the end result is uncertain, but hopefully we'll be able to
# run at least some queries.
settings = root.findall('settings/*')
-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for s in settings:
        try:
            q = f"set {s.tag} = '{s.text}'"

@@ -154,7 +148,7 @@ for conn_index, c in enumerate(connections):
# Check tables that should exist. If they don't exist, just skip this test.
tables = [e.text for e in root.findall('preconditions/table_exists')]
for t in tables:
-    for c in connections:
+    for c in all_connections:
        try:
            res = c.execute("select 1 from {} limit 1".format(t))
        except:

@@ -176,7 +170,7 @@ for q in create_queries:
            file = sys.stderr)
        sys.exit(1)

-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for q in create_queries:
        c.execute(q)
        print(f'create\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}')

@@ -184,13 +178,19 @@ for conn_index, c in enumerate(connections):
# Run fill queries
fill_query_templates = [q.text for q in root.findall('fill_query')]
fill_queries = substitute_parameters(fill_query_templates)
-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for q in fill_queries:
        c.execute(q)
        print(f'fill\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}')

+# Run the queries in randomized order, but preserve their indexes as specified
+# in the test XML. To avoid using too much time, limit the number of queries
+# we run per test.
+queries_to_run = random.sample(range(0, len(test_queries)), min(len(test_queries), args.max_queries or len(test_queries)))
+
# Run test queries.
-for query_index, q in enumerate(test_queries):
+for query_index in queries_to_run:
+    q = test_queries[query_index]
    query_prefix = f'{test_name}.query{query_index}'

    # We have some crazy long queries (about 100kB), so trim them to a sane

@@ -208,11 +208,12 @@ for query_index, q in enumerate(test_queries):
    # new one. We want to run them on the new server only, so that the PR author
    # can ensure that the test works properly. Remember the errors we had on
    # each server.
-    query_error_on_connection = [None] * len(connections);
-    for conn_index, c in enumerate(connections):
+    query_error_on_connection = [None] * len(all_connections);
+    for conn_index, c in enumerate(all_connections):
        try:
            prewarm_id = f'{query_prefix}.prewarm0'
-            res = c.execute(q, query_id = prewarm_id)
+            # Will also detect too long queries during warmup stage
+            res = c.execute(q, query_id = prewarm_id, settings = {'max_execution_time': 10})
            print(f'prewarm\t{query_index}\t{prewarm_id}\t{conn_index}\t{c.last_query.elapsed}')
        except KeyboardInterrupt:
            raise

@@ -222,7 +223,6 @@ for query_index, q in enumerate(test_queries):
            query_error_on_connection[conn_index] = traceback.format_exc();
            continue

    # Report all errors that ocurred during prewarm and decide what to do next.
    # If prewarm fails for the query on all servers -- skip the query and
    # continue testing the next query.

@@ -236,21 +236,29 @@ for query_index, q in enumerate(test_queries):

    if len(no_errors) == 0:
        continue
-    elif len(no_errors) < len(connections):
+    elif len(no_errors) < len(all_connections):
        print(f'partial\t{query_index}\t{no_errors}')

+    this_query_connections = [all_connections[index] for index in no_errors]
+
    # Now, perform measured runs.
    # Track the time spent by the client to process this query, so that we can
    # notice the queries that take long to process on the client side, e.g. by
    # sending excessive data.
    start_seconds = time.perf_counter()
    server_seconds = 0
-    for run in range(0, args.runs):
-        run_id = f'{query_prefix}.run{run}'
-        for conn_index, c in enumerate(connections):
-            if query_error_on_connection[conn_index]:
-                continue
+    profile_seconds = 0
+    run = 0
+
+    # Arrays of run times for each connection.
+    all_server_times = []
+    for conn_index, c in enumerate(this_query_connections):
+        all_server_times.append([])
+
+    while True:
+        run_id = f'{query_prefix}.run{run}'
+
+        for conn_index, c in enumerate(this_query_connections):
            try:
                res = c.execute(q, query_id = run_id)
            except Exception as e:

@@ -259,22 +267,79 @@ for query_index, q in enumerate(test_queries):
                e.message = run_id + ': ' + e.message
                raise

-            print(f'query\t{query_index}\t{run_id}\t{conn_index}\t{c.last_query.elapsed}')
-            server_seconds += c.last_query.elapsed
+            elapsed = c.last_query.elapsed
+            all_server_times[conn_index].append(elapsed)

-            if c.last_query.elapsed > 10:
+            server_seconds += elapsed
+            print(f'query\t{query_index}\t{run_id}\t{conn_index}\t{elapsed}')
+
+            if elapsed > 10:
                # Stop processing pathologically slow queries, to avoid timing out
                # the entire test task. This shouldn't really happen, so we don't
                # need much handling for this case and can just exit.
-                print(f'The query no. {query_index} is taking too long to run ({c.last_query.elapsed} s)', file=sys.stderr)
+                print(f'The query no. {query_index} is taking too long to run ({elapsed} s)', file=sys.stderr)
                exit(2)

+        # Be careful with the counter, after this line it's the next iteration
+        # already.
+        run += 1
+
+        # Try to run any query for at least the specified number of times,
+        # before considering other stop conditions.
+        if run < args.runs:
+            continue
+
+        # For very short queries we have a special mode where we run them for at
+        # least some time. The recommended lower bound of run time for "normal"
+        # queries is about 0.1 s, and we run them about 10 times, giving the
+        # time per query per server of about one second. Use this value as a
+        # reference for "short" queries.
+        if is_short[query_index]:
+            if server_seconds >= 2 * len(this_query_connections):
+                break
+            # Also limit the number of runs, so that we don't go crazy processing
+            # the results -- 'eqmed.sql' is really suboptimal.
+            if run >= 500:
+                break
+        else:
+            if run >= args.runs:
+                break

    client_seconds = time.perf_counter() - start_seconds
    print(f'client-time\t{query_index}\t{client_seconds}\t{server_seconds}')

+    #print(all_server_times)
+    #print(stats.ttest_ind(all_server_times[0], all_server_times[1], equal_var = False).pvalue)
+
+    # Run additional profiling queries to collect profile data, but only if test times appeared to be different.
+    # We have to do it after normal runs because otherwise it will affect test statistics too much
+    if len(all_server_times) == 2 and stats.ttest_ind(all_server_times[0], all_server_times[1], equal_var = False).pvalue < 0.1:
+        run = 0
+        while True:
+            run_id = f'{query_prefix}.profile{run}'
+
+            for conn_index, c in enumerate(this_query_connections):
+                try:
+                    res = c.execute(q, query_id = run_id, settings = {'query_profiler_real_time_period_ns': 10000000})
+                    print(f'profile\t{query_index}\t{run_id}\t{conn_index}\t{c.last_query.elapsed}')
+                except Exception as e:
+                    # Add query id to the exception to make debugging easier.
+                    e.args = (run_id, *e.args)
+                    e.message = run_id + ': ' + e.message
+                    raise
+
+                elapsed = c.last_query.elapsed
+                profile_seconds += elapsed
+
+            run += 1
+            # Don't spend too much time for profile runs
+            if run > args.runs or profile_seconds > 10:
+                break
+            # And don't bother with short queries

# Run drop queries
drop_queries = substitute_parameters(drop_query_templates)
-for conn_index, c in enumerate(connections):
+for conn_index, c in enumerate(all_connections):
    for q in drop_queries:
        c.execute(q)
        print(f'drop\t{conn_index}\t{c.last_query.elapsed}\t{tsv_escape(q)}')
@ -98,6 +98,9 @@ th {{
|
||||
|
||||
tr:nth-child(odd) td {{filter: brightness(90%);}}
|
||||
|
||||
.unexpected-query-duration tr :nth-child(2),
|
||||
.unexpected-query-duration tr :nth-child(3),
|
||||
.unexpected-query-duration tr :nth-child(5),
|
||||
.all-query-times tr :nth-child(1),
|
||||
.all-query-times tr :nth-child(2),
|
||||
.all-query-times tr :nth-child(3),
|
||||
@ -126,7 +129,6 @@ tr:nth-child(odd) td {{filter: brightness(90%);}}
|
||||
.test-times tr :nth-child(5),
|
||||
.test-times tr :nth-child(6),
|
||||
.test-times tr :nth-child(7),
|
||||
.test-times tr :nth-child(8),
|
||||
.concurrent-benchmarks tr :nth-child(2),
|
||||
.concurrent-benchmarks tr :nth-child(3),
|
||||
.concurrent-benchmarks tr :nth-child(4),
|
||||
@ -205,9 +207,11 @@ def tableStart(title):
|
||||
global table_anchor
|
||||
table_anchor = cls
|
||||
anchor = currentTableAnchor()
|
||||
help_anchor = '-'.join(title.lower().split(' '));
|
||||
return f"""
|
||||
<h2 id="{anchor}">
|
||||
<a class="cancela" href="#{anchor}">{title}</a>
|
||||
<a class="cancela" href="https://github.com/ClickHouse/ClickHouse/tree/master/docker/test/performance-comparison#{help_anchor}"><sup style="color: #888">?</sup></a>
|
||||
</h2>
|
||||
<table class="{cls}">
|
||||
"""
|
||||
@ -250,7 +254,7 @@ def addSimpleTable(caption, columns, rows, pos=None):
|
||||
def add_tested_commits():
|
||||
global report_errors
|
||||
try:
|
||||
addSimpleTable('Tested commits', ['Old', 'New'],
|
||||
addSimpleTable('Tested Commits', ['Old', 'New'],
|
||||
[['<pre>{}</pre>'.format(x) for x in
|
||||
[open('left-commit.txt').read(),
|
||||
open('right-commit.txt').read()]]])
|
||||
@ -276,7 +280,7 @@ def add_report_errors():
|
||||
if not report_errors:
|
||||
return
|
||||
|
||||
text = tableStart('Errors while building the report')
|
||||
text = tableStart('Errors while Building the Report')
|
||||
text += tableHeader(['Error'])
|
||||
for x in report_errors:
|
||||
text += tableRow([x])
|
||||
@ -290,7 +294,7 @@ def add_errors_explained():
|
||||
return
|
||||
|
||||
text = '<a name="fail1"/>'
|
||||
text += tableStart('Error summary')
|
||||
text += tableStart('Error Summary')
|
||||
text += tableHeader(['Description'])
|
||||
for row in errors_explained:
|
||||
text += tableRow(row)
|
||||
@ -308,26 +312,26 @@ if args.report == 'main':
|
||||
|
||||
run_error_rows = tsvRows('run-errors.tsv')
|
||||
error_tests += len(run_error_rows)
|
||||
addSimpleTable('Run errors', ['Test', 'Error'], run_error_rows)
|
||||
addSimpleTable('Run Errors', ['Test', 'Error'], run_error_rows)
|
||||
if run_error_rows:
|
||||
errors_explained.append([f'<a href="#{currentTableAnchor()}">There were some errors while running the tests</a>']);
|
||||
|
||||
|
||||
slow_on_client_rows = tsvRows('report/slow-on-client.tsv')
|
||||
error_tests += len(slow_on_client_rows)
|
||||
addSimpleTable('Slow on client',
|
||||
addSimpleTable('Slow on Client',
|
||||
['Client time, s', 'Server time, s', 'Ratio', 'Test', 'Query'],
|
||||
slow_on_client_rows)
|
||||
if slow_on_client_rows:
|
||||
errors_explained.append([f'<a href="#{currentTableAnchor()}">Some queries are taking noticeable time client-side (missing `FORMAT Null`?)</a>']);
|
||||
|
||||
unmarked_short_rows = tsvRows('report/unmarked-short-queries.tsv')
|
||||
unmarked_short_rows = tsvRows('report/unexpected-query-duration.tsv')
|
||||
error_tests += len(unmarked_short_rows)
|
||||
addSimpleTable('Short queries not marked as short',
|
||||
['New client time, s', 'Test', '#', 'Query'],
|
||||
addSimpleTable('Unexpected Query Duration',
|
||||
['Problem', 'Marked as "short"?', 'Run time, s', 'Test', '#', 'Query'],
|
||||
unmarked_short_rows)
|
||||
if unmarked_short_rows:
|
||||
errors_explained.append([f'<a href="#{currentTableAnchor()}">Some queries have short duration but are not explicitly marked as "short"</a>']);
|
||||
errors_explained.append([f'<a href="#{currentTableAnchor()}">Some queries have unexpected duration</a>']);
|
||||
|
||||
def add_partial():
|
||||
rows = tsvRows('report/partial-queries-report.tsv')
|
||||
@ -335,7 +339,7 @@ if args.report == 'main':
|
||||
return
|
||||
|
||||
global unstable_partial_queries, slow_average_tests, tables
|
||||
text = tableStart('Partial queries')
|
||||
text = tableStart('Partial Queries')
|
||||
columns = ['Median time, s', 'Relative time variance', 'Test', '#', 'Query']
|
||||
text += tableHeader(columns)
|
||||
attrs = ['' for c in columns]
|
||||
@ -366,7 +370,7 @@ if args.report == 'main':
|
||||
|
||||
global faster_queries, slower_queries, tables
|
||||
|
||||
text = tableStart('Changes in performance')
|
||||
text = tableStart('Changes in Performance')
|
||||
columns = [
|
||||
'Old, s', # 0
|
||||
'New, s', # 1
|
||||
@ -423,7 +427,7 @@ if args.report == 'main':
|
||||
'Query' #7
|
||||
]
|
||||
|
||||
text = tableStart('Unstable queries')
|
||||
text = tableStart('Unstable Queries')
|
||||
text += tableHeader(columns)
|
||||
|
||||
attrs = ['' for c in columns]
|
||||
@ -444,9 +448,9 @@ if args.report == 'main':
|
||||
add_unstable_queries()
|
||||
|
||||
skipped_tests_rows = tsvRows('analyze/skipped-tests.tsv')
|
||||
addSimpleTable('Skipped tests', ['Test', 'Reason'], skipped_tests_rows)
|
||||
addSimpleTable('Skipped Tests', ['Test', 'Reason'], skipped_tests_rows)
|
||||
|
||||
addSimpleTable('Test performance changes',
|
||||
addSimpleTable('Test Performance Changes',
|
||||
['Test', 'Ratio of speedup (-) or slowdown (+)', 'Queries', 'Total not OK', 'Changed perf', 'Unstable'],
|
||||
tsvRows('report/test-perf-changes.tsv'))
|
||||
|
||||
@ -457,39 +461,38 @@ if args.report == 'main':
|
||||
return
|
||||
|
||||
columns = [
|
||||
'Test', #0
|
||||
'Test', #0
|
||||
'Wall clock time, s', #1
|
||||
'Total client time, s', #2
|
||||
'Total queries', #3
|
||||
'Ignored short queries', #4
|
||||
'Longest query<br>(sum for all runs), s', #5
|
||||
'Avg wall clock time<br>(sum for all runs), s', #6
|
||||
'Shortest query<br>(sum for all runs), s', #7
|
||||
'Longest query<br>(sum for all runs), s', #4
|
||||
'Avg wall clock time<br>(sum for all runs), s', #5
|
||||
'Shortest query<br>(sum for all runs), s', #6
|
||||
# 'Runs' #7
|
||||
]
|
||||
|
||||
text = tableStart('Test times')
|
||||
text = tableStart('Test Times')
|
||||
text += tableHeader(columns)
|
||||
|
||||
nominal_runs = 7 # FIXME pass this as an argument
|
||||
total_runs = (nominal_runs + 1) * 2 # one prewarm run, two servers
|
||||
allowed_average_run_time = allowed_single_run_time + 60 / total_runs; # some allowance for fill/create queries
|
||||
allowed_average_run_time = 3.75 # 60 seconds per test at 7 runs
|
||||
attrs = ['' for c in columns]
|
||||
for r in rows:
|
||||
anchor = f'{currentTableAnchor()}.{r[0]}'
|
||||
if float(r[6]) > allowed_average_run_time * total_runs:
|
||||
total_runs = (int(r[7]) + 1) * 2 # one prewarm run, two servers
|
||||
if float(r[5]) > allowed_average_run_time * total_runs:
|
||||
# FIXME should be 15s max -- investigate parallel_insert
|
||||
slow_average_tests += 1
|
||||
attrs[6] = f'style="background: {color_bad}"'
|
||||
attrs[5] = f'style="background: {color_bad}"'
|
||||
errors_explained.append([f'<a href="#{anchor}">The test \'{r[0]}\' is too slow to run as a whole. Investigate whether the create and fill queries can be sped up'])
|
||||
else:
|
||||
attrs[6] = ''
|
||||
attrs[5] = ''
|
||||
|
||||
if float(r[5]) > allowed_single_run_time * total_runs:
|
||||
if float(r[4]) > allowed_single_run_time * total_runs:
|
||||
slow_average_tests += 1
|
||||
attrs[5] = f'style="background: {color_bad}"'
|
||||
attrs[4] = f'style="background: {color_bad}"'
|
||||
errors_explained.append([f'<a href="./all-queries.html#all-query-times.{r[0]}.0">Some query of the test \'{r[0]}\' is too slow to run. See the all queries report'])
|
||||
else:
|
||||
attrs[5] = ''
|
||||
attrs[4] = ''
|
||||
|
||||
text += tableRow(r, attrs, anchor)
|
||||
|
||||
@ -498,74 +501,7 @@ if args.report == 'main':
|
||||
|
||||
add_test_times()
|
||||
|
||||
def add_benchmark_results():
|
||||
if not os.path.isfile('benchmark/website-left.json'):
|
||||
return
|
||||
|
||||
json_reports = [json.load(open(f'benchmark/website-{x}.json')) for x in ['left', 'right']]
|
||||
stats = [next(iter(x.values()))["statistics"] for x in json_reports]
|
||||
qps = [x["QPS"] for x in stats]
|
||||
queries = [x["num_queries"] for x in stats]
|
||||
errors = [x["num_errors"] for x in stats]
|
||||
relative_diff = (qps[1] - qps[0]) / max(0.01, qps[0]);
|
||||
times_diff = max(qps) / max(0.01, min(qps))
|
||||
|
||||
all_rows = []
|
||||
header = ['Benchmark', 'Metric', 'Old', 'New', 'Relative difference', 'Times difference'];
|
||||
|
||||
attrs = ['' for x in header]
|
||||
row = ['website', 'queries', f'{queries[0]:d}', f'{queries[1]:d}', '--', '--']
|
||||
attrs[0] = 'rowspan=2'
|
||||
all_rows.append([row, attrs])
|
||||
|
||||
attrs = ['' for x in header]
|
||||
row = [None, 'queries/s', f'{qps[0]:.3f}', f'{qps[1]:.3f}', f'{relative_diff:.3f}', f'x{times_diff:.3f}']
|
||||
if abs(relative_diff) > 0.1:
|
||||
# More queries per second is better.
|
||||
if relative_diff > 0.:
|
||||
attrs[4] = f'style="background: {color_good}"'
|
||||
else:
|
||||
attrs[4] = f'style="background: {color_bad}"'
|
||||
else:
|
||||
attrs[4] = ''
|
||||
all_rows.append([row, attrs]);
|
||||
|
||||
if max(errors):
|
||||
all_rows[0][1][0] = "rowspan=3"
|
||||
row = [''] * (len(header))
|
||||
attrs = ['' for x in header]
|
||||
|
||||
attrs[0] = None
|
||||
row[1] = 'errors'
|
||||
row[2] = f'{errors[0]:d}'
|
||||
row[3] = f'{errors[1]:d}'
|
||||
row[4] = '--'
|
||||
row[5] = '--'
|
||||
if errors[0]:
|
||||
attrs[2] += f' style="background: {color_bad}" '
|
||||
if errors[1]:
|
||||
attrs[3] += f' style="background: {color_bad}" '
|
||||
|
||||
all_rows.append([row, attrs])
|
||||
|
||||
text = tableStart('Concurrent benchmarks')
|
||||
text += tableHeader(header)
|
||||
for row, attrs in all_rows:
|
||||
text += tableRow(row, attrs)
|
||||
text += tableEnd()
|
||||
|
||||
global tables
|
||||
tables.append(text)
|
||||
|
||||
try:
|
||||
add_benchmark_results()
|
||||
except:
|
||||
report_errors.append(
|
||||
traceback.format_exception_only(
|
||||
*sys.exc_info()[:2])[-1])
|
||||
pass
|
||||
|
||||
addSimpleTable('Metric changes',
|
||||
addSimpleTable('Metric Changes',
|
||||
['Metric', 'Old median value', 'New median value',
|
||||
'Relative difference', 'Times difference'],
|
||||
tsvRows('metrics/changes.tsv'))
|
||||
@ -656,7 +592,7 @@ elif args.report == 'all-queries':
|
||||
'Query', #9
|
||||
]
|
||||
|
||||
text = tableStart('All query times')
|
||||
text = tableStart('All Query Times')
|
||||
text += tableHeader(columns)
|
||||
|
||||
attrs = ['' for c in columns]
|
||||
|
@ -48,13 +48,6 @@ fi
|
||||
|
||||
ln -sf /usr/share/clickhouse-test/config/client_config.xml /etc/clickhouse-client/config.xml
|
||||
|
||||
echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 history_size=7'" >> /etc/environment
|
||||
echo "TSAN_SYMBOLIZER_PATH=/usr/lib/llvm-10/bin/llvm-symbolizer" >> /etc/environment
|
||||
echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment
|
||||
echo "ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-10/bin/llvm-symbolizer" >> /etc/environment
|
||||
echo "UBSAN_SYMBOLIZER_PATH=/usr/lib/llvm-10/bin/llvm-symbolizer" >> /etc/environment
|
||||
echo "LLVM_SYMBOLIZER_PATH=/usr/lib/llvm-10/bin/llvm-symbolizer" >> /etc/environment
|
||||
|
||||
service zookeeper start
|
||||
sleep 5
|
||||
service clickhouse-server start && sleep 5
|
||||
|
@ -48,13 +48,6 @@ fi
|
||||
|
||||
ln -sf /usr/share/clickhouse-test/config/client_config.xml /etc/clickhouse-client/config.xml
|
||||
|
||||
echo "TSAN_OPTIONS='verbosity=1000 halt_on_error=1 history_size=7'" >> /etc/environment
|
||||
echo "TSAN_SYMBOLIZER_PATH=/usr/lib/llvm-10/bin/llvm-symbolizer" >> /etc/environment
|
||||
echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment
|
||||
echo "ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-10/bin/llvm-symbolizer" >> /etc/environment
|
||||
echo "UBSAN_SYMBOLIZER_PATH=/usr/lib/llvm-10/bin/llvm-symbolizer" >> /etc/environment
|
||||
echo "LLVM_SYMBOLIZER_PATH=/usr/lib/llvm-10/bin/llvm-symbolizer" >> /etc/environment
|
||||
|
||||
service zookeeper start
|
||||
sleep 5
|
||||
service clickhouse-server start && sleep 5
|
||||
|
@ -43,8 +43,6 @@ ln -s /usr/share/clickhouse-test/config/log_queries.xml /etc/clickhouse-server/u
|
||||
ln -s /usr/share/clickhouse-test/config/part_log.xml /etc/clickhouse-server/config.d/
|
||||
ln -s /usr/share/clickhouse-test/config/text_log.xml /etc/clickhouse-server/config.d/
|
||||
|
||||
echo "TSAN_OPTIONS='halt_on_error=1 history_size=7 ignore_noninstrumented_modules=1 verbosity=1'" >> /etc/environment
|
||||
echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment
|
||||
echo "ASAN_OPTIONS='malloc_context_size=10 verbosity=1 allocator_release_to_os_interval_ms=10000'" >> /etc/environment
|
||||
|
||||
start
|
||||
|
@ -5,12 +5,5 @@ ENV TZ=Europe/Moscow
|
||||
RUN ln -snf /usr/share/zoneinfo/$TZ /etc/localtime && echo $TZ > /etc/timezone
|
||||
RUN apt-get install gdb
|
||||
|
||||
CMD ln -s /usr/lib/llvm-8/bin/llvm-symbolizer /usr/bin/llvm-symbolizer; \
|
||||
echo "TSAN_OPTIONS='halt_on_error=1 history_size=7'" >> /etc/environment; \
|
||||
echo "UBSAN_OPTIONS='print_stacktrace=1'" >> /etc/environment; \
|
||||
echo "ASAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
|
||||
echo "UBSAN_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
|
||||
echo "TSAN_SYMBOLIZER_PATH=/usr/lib/llvm-8/bin/llvm-symbolizer" >> /etc/environment; \
|
||||
echo "LLVM_SYMBOLIZER_PATH=/usr/lib/llvm-6.0/bin/llvm-symbolizer" >> /etc/environment; \
|
||||
service zookeeper start && sleep 7 && /usr/share/zookeeper/bin/zkCli.sh -server localhost:2181 -create create /clickhouse_test ''; \
|
||||
CMD service zookeeper start && sleep 7 && /usr/share/zookeeper/bin/zkCli.sh -server localhost:2181 -create create /clickhouse_test ''; \
|
||||
gdb -q -ex 'set print inferior-events off' -ex 'set confirm off' -ex 'set print thread-events off' -ex run -ex bt -ex quit --args ./unit_tests_dbms | tee test_output/test_result.txt
|
||||
|
@ -38,6 +38,7 @@ toc_title: Adopters
|
||||
| <a href="https://db.com" class="favicon">Deutsche Bank</a> | Finance | BI Analytics | — | — | [Slides in English, October 2019](https://bigdatadays.ru/wp-content/uploads/2019/10/D2-H3-3_Yakunin-Goihburg.pdf) |
|
||||
| <a href="https://www.diva-e.com" class="favicon">Diva-e</a> | Digital consulting | Main Product | — | — | [Slides in English, September 2019](https://github.com/ClickHouse/clickhouse-presentations/blob/master/meetup29/ClickHouse-MeetUp-Unusual-Applications-sd-2019-09-17.pdf) |
|
||||
| <a href="https://www.ecwid.com/" class="favicon">Ecwid</a> | E-commerce SaaS | Metrics, Logging | — | — | [Slides in Russian, April 2019](https://nastachku.ru/var/files/1/presentation/backend/2_Backend_6.pdf) |
|
||||
| <a href="https://www.ebay.com/" class="favicon">eBay</a> | E-commerce | TBA | — | — | [Webinar, Sep 2020](https://altinity.com/webinarspage/2020/09/08/migrating-from-druid-to-next-gen-olap-on-clickhouse-ebays-experience) |
|
||||
| <a href="https://www.exness.com" class="favicon">Exness</a> | Trading | Metrics, Logging | — | — | [Talk in Russian, May 2019](https://youtu.be/_rpU-TvSfZ8?t=3215) |
|
||||
| <a href="https://fastnetmon.com/" class="favicon">FastNetMon</a> | DDoS Protection | Main Product | | — | [Official website](https://fastnetmon.com/docs-fnm-advanced/fastnetmon-advanced-traffic-persistency/) |
|
||||
| <a href="https://www.flipkart.com/" class="favicon">Flipkart</a> | e-Commerce | — | — | — | [Talk in English, July 2020](https://youtu.be/GMiXCMFDMow?t=239) |
|
||||
|
@ -13,49 +13,41 @@ With this instruction you can run basic ClickHouse performance test on any serve
|
||||
4. ssh to the server and download it with wget:
|
||||
```bash
|
||||
# For amd64:
|
||||
wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578163263_binary/clickhouse
|
||||
wget https://clickhouse-builds.s3.yandex.net/0/e29c4c3cc47ab2a6c4516486c1b77d57e7d42643/clickhouse_build_check/gcc-10_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse
|
||||
# For aarch64:
|
||||
wget https://clickhouse-builds.s3.yandex.net/0/00ba767f5d2a929394ea3be193b1f79074a1c4bc/1578161264_binary/clickhouse
|
||||
wget https://clickhouse-builds.s3.yandex.net/0/e29c4c3cc47ab2a6c4516486c1b77d57e7d42643/clickhouse_special_build_check/clang-10-aarch64_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse
|
||||
# Then do:
|
||||
chmod a+x clickhouse
|
||||
```
|
||||
5. Download configs:
|
||||
```bash
|
||||
wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/programs/server/config.xml
|
||||
wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/programs/server/users.xml
|
||||
mkdir config.d
|
||||
wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/programs/server/config.d/path.xml -O config.d/path.xml
|
||||
wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/programs/server/config.d/log_to_console.xml -O config.d/log_to_console.xml
|
||||
```
|
||||
6. Download benchmark files:
|
||||
5. Download benchmark files:
|
||||
```bash
|
||||
wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/benchmark-new.sh
|
||||
chmod a+x benchmark-new.sh
|
||||
wget https://raw.githubusercontent.com/ClickHouse/ClickHouse/master/benchmark/clickhouse/queries.sql
|
||||
```
|
||||
7. Download test data according to the [Yandex.Metrica dataset](../getting-started/example-datasets/metrica.md) instruction (“hits” table containing 100 million rows).
|
||||
6. Download test data according to the [Yandex.Metrica dataset](../getting-started/example-datasets/metrica.md) instruction (“hits” table containing 100 million rows).
|
||||
```bash
|
||||
wget https://clickhouse-datasets.s3.yandex.net/hits/partitions/hits_100m_obfuscated_v1.tar.xz
|
||||
tar xvf hits_100m_obfuscated_v1.tar.xz -C .
|
||||
mv hits_100m_obfuscated_v1/* .
|
||||
```
|
||||
8. Run the server:
|
||||
7. Run the server:
|
||||
```bash
|
||||
./clickhouse server
|
||||
```
|
||||
9. Check the data: ssh to the server in another terminal
|
||||
8. Check the data: ssh to the server in another terminal
|
||||
```bash
|
||||
./clickhouse client --query "SELECT count() FROM hits_100m_obfuscated"
|
||||
100000000
|
||||
```
|
||||
10. Edit the benchmark-new.sh, change `clickhouse-client` to `./clickhouse client` and add `--max_memory_usage 100000000000` parameter.
|
||||
9. Edit the benchmark-new.sh, change `clickhouse-client` to `./clickhouse client` and add `--max_memory_usage 100000000000` parameter.
|
||||
```bash
|
||||
mcedit benchmark-new.sh
|
||||
```
|
||||
11. Run the benchmark:
|
||||
10. Run the benchmark:
|
||||
```bash
|
||||
./benchmark-new.sh hits_100m_obfuscated
|
||||
```
|
||||
12. Send the numbers and the info about your hardware configuration to clickhouse-feedback@yandex-team.com
|
||||
11. Send the numbers and the info about your hardware configuration to clickhouse-feedback@yandex-team.com
|
||||
|
||||
All the results are published here: https://clickhouse.tech/benchmark/hardware/
|
||||
|
@ -521,6 +521,22 @@ For more information, see the MergeTreeSettings.h header file.
|
||||
</merge_tree>
|
||||
```
|
||||
|
||||
## replicated\_merge\_tree {#server_configuration_parameters-replicated_merge_tree}
|
||||
|
||||
Fine tuning for tables in the [ReplicatedMergeTree](../../engines/table-engines/mergetree-family/mergetree.md) family.

These settings have a higher priority than the settings in the `merge_tree` section and apply only to ReplicatedMergeTree tables.
|
||||
|
||||
For more information, see the MergeTreeSettings.h header file.
|
||||
|
||||
**Example**
|
||||
|
||||
``` xml
|
||||
<replicated_merge_tree>
|
||||
<max_suspicious_broken_parts>5</max_suspicious_broken_parts>
|
||||
</replicated_merge_tree>
|
||||
```
|
||||
|
||||
## openSSL {#server_configuration_parameters-openssl}
|
||||
|
||||
SSL client/server configuration.
|
||||
|
@ -60,6 +60,31 @@ A maximum number of bytes (uncompressed data) that can be read from a table when
|
||||
|
||||
What to do when the volume of data read exceeds one of the limits: ‘throw’ or ‘break’. By default, throw.
|
||||
|
||||
## max\_rows\_to\_read_leaf {#max-rows-to-read-leaf}
|
||||
|
||||
The following restrictions can be checked on each block (instead of on each row). That is, the restrictions can be broken a little.
|
||||
|
||||
A maximum number of rows that can be read from a local table on a leaf node when running a distributed query. While
distributed queries can issue multiple sub-queries to each shard (leaf), this limit is checked only at the read stage
on the leaf nodes and is ignored at the result-merging stage on the root node. For example, suppose a cluster consists
of 2 shards and each shard contains a table with 100 rows. A distributed query that is supposed to read all the data
from both tables with the setting `max_rows_to_read=150` will fail, because in total that is 200 rows, while a query
with `max_rows_to_read_leaf=150` will succeed, since each leaf node reads at most 100 rows.
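
A minimal sketch of the difference, assuming a hypothetical Distributed table `dist_hits` over two shards, each holding 100 rows locally:

```sql
-- Hypothetical layout: dist_hits is a Distributed table over 2 shards,
-- each shard storing 100 rows locally (200 rows in total).
SET max_rows_to_read = 150;
SELECT * FROM dist_hits;        -- fails: 200 rows are read in total

SET max_rows_to_read = 0;       -- 0 disables the global limit
SET max_rows_to_read_leaf = 150;
SELECT * FROM dist_hits;        -- succeeds: each shard reads at most 100 rows
```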
|
||||
|
||||
## max\_bytes\_to\_read_leaf {#max-bytes-to-read-leaf}
|
||||
|
||||
A maximum number of bytes (uncompressed data) that can be read from a local table on a leaf node when running
a distributed query. While distributed queries can issue multiple sub-queries to each shard (leaf), this limit is
checked only at the read stage on the leaf nodes and is ignored at the result-merging stage on the root node.
For example, suppose a cluster consists of 2 shards and each shard contains a table with 100 bytes of data.
A distributed query that is supposed to read all the data from both tables with the setting `max_bytes_to_read=150`
will fail, because in total that is 200 bytes, while a query with `max_bytes_to_read_leaf=150` will succeed,
since each leaf node reads at most 100 bytes.
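
The byte-based limit behaves the same way; a sketch under the same hypothetical two-shard layout (about 100 bytes of data per shard):

```sql
SET max_bytes_to_read_leaf = 150;
SELECT * FROM dist_hits;  -- succeeds: the limit is checked per shard, not against the 200-byte total
```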
|
||||
|
||||
## read\_overflow\_mode_leaf {#read-overflow-mode-leaf}
|
||||
|
||||
What to do when the volume of data read exceeds one of the leaf limits: ‘throw’ or ‘break’. By default, throw.
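
For example, to get a partial result instead of an exception when a leaf limit is reached (a sketch):

```sql
SET max_rows_to_read_leaf = 1000000;
SET read_overflow_mode_leaf = 'break';  -- stop reading on the leaf node instead of throwing
```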
|
||||
|
||||
## max\_rows\_to\_group\_by {#settings-max-rows-to-group-by}
|
||||
|
||||
A maximum number of unique keys received from aggregation. This setting lets you limit memory consumption when aggregating.
|
||||
|
@ -940,6 +940,8 @@ This algorithm chooses the first replica in the set or a random replica if the f
|
||||
|
||||
The `first_or_random` algorithm solves the problem of the `in_order` algorithm. With `in_order`, if one replica goes down, the next one gets a double load while the remaining replicas handle the usual amount of traffic. When using the `first_or_random` algorithm, the load is evenly distributed among replicas that are still available.
|
||||
|
||||
It's possible to explicitly define what the first replica is by using the setting `load_balancing_first_offset`. This gives more control to rebalance query workloads among replicas.
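
A sketch of combining the two settings; the offset is assumed here to be zero-based, counting replicas in the order they are listed in the cluster configuration:

```sql
SET load_balancing = 'first_or_random';
SET load_balancing_first_offset = 1;  -- prefer the second listed replica, fall back to a random healthy one
```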
|
||||
|
||||
### Round Robin {#load_balancing-round_robin}
|
||||
|
||||
``` sql
|
||||
@ -1815,7 +1817,7 @@ Default value: 8192.
|
||||
|
||||
Turns on or turns off using of single dictionary for the data part.
|
||||
|
||||
By default, ClickHouse server monitors the size of dictionaries and if a dictionary overflows then the server starts to write the next one. To prohibit creating several dictionaries set `low_cardinality_use_single_dictionary_for_part = 1`.
|
||||
By default, the ClickHouse server monitors the size of dictionaries and if a dictionary overflows then the server starts to write the next one. To prohibit creating several dictionaries set `low_cardinality_use_single_dictionary_for_part = 1`.
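
For instance, a minimal sketch that forbids the extra per-part dictionaries for a session:

```sql
SET low_cardinality_use_single_dictionary_for_part = 1;  -- one shared dictionary per data part
```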
|
||||
|
||||
Possible values:
|
||||
|
||||
@ -1974,4 +1976,54 @@ Possible values:
|
||||
|
||||
Default value: `120` seconds.
|
||||
|
||||
## output_format_pretty_max_value_width {#output_format_pretty_max_value_width}
|
||||
|
||||
Limits the width of a value displayed in [Pretty](../../interfaces/formats.md#pretty) formats. If the value width exceeds the limit, the value is cut.
|
||||
|
||||
Possible values:
|
||||
|
||||
- Positive integer.
|
||||
- 0 — The value is cut completely.
|
||||
|
||||
Default value: `10000` symbols.
|
||||
|
||||
**Examples**
|
||||
|
||||
Query:
|
||||
```sql
|
||||
SET output_format_pretty_max_value_width = 10;
|
||||
SELECT range(number) FROM system.numbers LIMIT 10 FORMAT PrettyCompactNoEscapes;
|
||||
```
|
||||
Result:
|
||||
```text
|
||||
┌─range(number)─┐
|
||||
│ [] │
|
||||
│ [0] │
|
||||
│ [0,1] │
|
||||
│ [0,1,2] │
|
||||
│ [0,1,2,3] │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
└───────────────┘
|
||||
```
|
||||
|
||||
Query with zero width:
|
||||
```sql
|
||||
SET output_format_pretty_max_value_width = 0;
|
||||
SELECT range(number) FROM system.numbers LIMIT 5 FORMAT PrettyCompactNoEscapes;
|
||||
```
|
||||
Result:
|
||||
```text
|
||||
┌─range(number)─┐
|
||||
│ ⋯ │
|
||||
│ ⋯ │
|
||||
│ ⋯ │
|
||||
│ ⋯ │
|
||||
│ ⋯ │
|
||||
└───────────────┘
|
||||
```
|
||||
|
||||
[Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) <!-- hide -->
|
||||
|
@ -6,10 +6,13 @@ toc_priority: 143
|
||||
|
||||
Syntax: `maxMap(key, value)` or `maxMap(Tuple(key, value))`
|
||||
|
||||
Calculates the maximum from `value` array according to the keys specified in the ‘key’ array.
|
||||
Passing tuple of keys and values arrays is synonymical to passing two arrays of keys and values.
|
||||
The number of elements in ‘key’ and ‘value’ must be the same for each row that is totaled.
|
||||
Returns a tuple of two arrays: keys in sorted order, and values calculated for the corresponding keys.
|
||||
Calculates the maximum from `value` array according to the keys specified in the `key` array.
|
||||
|
||||
Passing a tuple of keys and value arrays is identical to passing two arrays of keys and values.
|
||||
|
||||
The number of elements in `key` and `value` must be the same for each row that is totaled.
|
||||
|
||||
Returns a tuple of two arrays: keys and values calculated for the corresponding keys.
|
||||
|
||||
Example:
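
For reference, a query and result mirroring the example in the Russian version of this page:

``` sql
SELECT maxMap(a, b)
FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1]))
```

``` text
┌─maxMap(a, b)──────┐
│ ([1,2,3],[2,2,1]) │
└───────────────────┘
```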
|
||||
|
||||
|
@ -8,7 +8,7 @@ Syntax: `minMap(key, value)` or `minMap(Tuple(key, value))`
|
||||
|
||||
Calculates the minimum from `value` array according to the keys specified in the `key` array.
|
||||
|
||||
Passing tuple of keys and values arrays is a synonym to passing two arrays of keys and values.
|
||||
Passing a tuple of keys and value arrays is identical to passing two arrays of keys and values.
|
||||
|
||||
The number of elements in `key` and `value` must be the same for each row that is totaled.
|
||||
|
||||
|
@ -21,7 +21,7 @@ LowCardinality(data_type)
|
||||
|
||||
`LowCardinality` is a superstructure that changes a data storage method and rules of data processing. ClickHouse applies [dictionary coding](https://en.wikipedia.org/wiki/Dictionary_coder) to `LowCardinality`-columns. Operating with dictionary encoded data significantly increases performance of [SELECT](../../sql-reference/statements/select/index.md) queries for many applications.
|
||||
|
||||
The efficiency of using `LowCarditality` data type depends on data diversity. If a dictionary contains less than 10,000 distinct values, then ClickHouse mostly shows higher efficiency of data reading and storing. If a dictionary contains more than 100,000 distinct values, then ClickHouse can perform worse in comparison with using ordinary data types.
|
||||
The efficiency of using `LowCardinality` data type depends on data diversity. If a dictionary contains less than 10,000 distinct values, then ClickHouse mostly shows higher efficiency of data reading and storing. If a dictionary contains more than 100,000 distinct values, then ClickHouse can perform worse in comparison with using ordinary data types.
|
||||
|
||||
Consider using `LowCardinality` instead of [Enum](../../sql-reference/data-types/enum.md) when working with strings. `LowCardinality` provides more flexibility in use and often reveals the same or higher efficiency.
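
For instance, a table that stores a repetitive string column dictionary-encoded (the same definition as in the Russian version of this page):

```sql
CREATE TABLE lc_t
(
    `id` UInt16,
    `strings` LowCardinality(String)
)
ENGINE = MergeTree()
ORDER BY id
```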
|
||||
|
||||
|
@ -516,14 +516,14 @@ Result:
|
||||
|
||||
**See Also**
|
||||
|
||||
- \[ISO 8601 announcement by @xkcd\](https://xkcd.com/1179/)
|
||||
- [ISO 8601 announcement by @xkcd](https://xkcd.com/1179/)
|
||||
- [RFC 1123](https://tools.ietf.org/html/rfc1123)
|
||||
- [toDate](#todate)
|
||||
- [toDateTime](#todatetime)
|
||||
|
||||
## parseDateTimeBestEffortUS {#parsedatetimebesteffortUS}
|
||||
|
||||
This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebesteffort), the only difference is that this function prefers US style (`MM/DD/YYYY` etc) in case of ambiguouty.
|
||||
This function is similar to [‘parseDateTimeBestEffort’](#parsedatetimebesteffort), the only difference is that this function prefers US date format (`MM/DD/YYYY` etc.) in case of ambiguity.
|
||||
|
||||
**Syntax**
|
||||
|
||||
@ -541,7 +541,7 @@ parseDateTimeBestEffortUS(time_string [, time_zone]);
|
||||
- A string containing 9..10 digit [unix timestamp](https://en.wikipedia.org/wiki/Unix_time).
|
||||
- A string with a date and a time component: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc.
|
||||
- A string with a date, but no time component: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc.
|
||||
- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case `YYYY-MM` are substituted as `2000-01`.
|
||||
- A string with a day and time: `DD`, `DD hh`, `DD hh:mm`. In this case, `YYYY-MM` are substituted as `2000-01`.
|
||||
- A string that includes the date and time along with time zone offset information: `YYYY-MM-DD hh:mm:ss ±h:mm`, etc. For example, `2020-12-12 17:36:00 -5:00`.
|
||||
|
||||
**Returned value**
|
||||
|
@ -6,4 +6,14 @@ toc_title: "\u041A\u043E\u043C\u043C\u0435\u0440\u0447\u0435\u0441\u043A\u0438\u
|
||||
\ \u0443\u0441\u043B\u0443\u0433\u0438"
|
||||
---
|
||||
|
||||
# Коммерческие услуги {#clickhouse-commercial-services}
|
||||
|
||||
Данный раздел содержит описание коммерческих услуг, предоставляемых для ClickHouse. Поставщики этих услуг — независимые компании, которые могут не быть аффилированы с Яндексом.
|
||||
|
||||
Категории услуг:
|
||||
|
||||
- Облачные услуги [Cloud](../commercial/cloud.md)
|
||||
- Поддержка [Support](../commercial/support.md)
|
||||
|
||||
!!! note "Для поставщиков услуг"
|
||||
Если вы — представитель компании-поставщика услуг, вы можете отправить запрос на добавление вашей компании и ваших услуг в соответствующий раздел данной документации (или на добавление нового раздела, если ваши услуги не соответствуют ни одной из существующих категорий). Чтобы отправить запрос (pull-request) на добавление описания в документацию, нажмите на значок "карандаша" в правом верхнем углу страницы. Если ваши услуги доступны в только отдельных регионах, не забудьте указать это на соответствующих локализованных страницах (и обязательно отметьте это при отправке заявки).
|
||||
|
@ -43,9 +43,6 @@ ORDER BY expr
|
||||
|
||||
Описание параметров смотрите в [описании запроса CREATE](../../../engines/table-engines/mergetree-family/mergetree.md).
|
||||
|
||||
!!! note "Примечание"
|
||||
`INDEX` — экспериментальная возможность, смотрите [Индексы пропуска данных](#table_engine-mergetree-data_skipping-indexes).
|
||||
|
||||
### Секции запроса {#mergetree-query-clauses}
|
||||
|
||||
- `ENGINE` — имя и параметры движка. `ENGINE = MergeTree()`. `MergeTree` не имеет параметров.
|
||||
@ -269,7 +266,7 @@ ClickHouse не может использовать индекс, если зн
|
||||
|
||||
ClickHouse использует эту логику не только для последовательностей дней месяца, но и для любого частично-монотонного первичного ключа.
|
||||
|
||||
### Индексы пропуска данных (экспериментальная функциональность) {#table_engine-mergetree-data_skipping-indexes}
|
||||
### Индексы пропуска данных {#table_engine-mergetree-data_skipping-indexes}
|
||||
|
||||
Объявление индексов при определении столбцов в запросе `CREATE`.
|
||||
|
||||
@ -566,7 +563,7 @@ ALTER TABLE example_table
|
||||
- `volume_name_N` — название тома. Названия томов должны быть уникальны.
|
||||
- `disk` — диск, находящийся внутри тома.
|
||||
- `max_data_part_size_bytes` — максимальный размер куска данных, который может находится на любом из дисков этого тома.
|
||||
- `move_factor` — доля свободного места, при превышении которого данные начинают перемещаться на следующий том, если он есть (по умолчанию 0.1).
|
||||
- `move_factor` — доля доступного свободного места на томе, если места становится меньше, то данные начнут перемещение на следующий том, если он есть (по умолчанию 0.1).
|
||||
|
||||
Примеры конфигураций:
|
||||
|
||||
|
@ -1050,13 +1050,13 @@ $ clickhouse-client --query="SELECT * FROM {some_table} FORMAT Parquet" > {some_
|
||||
|
||||
Для обмена данными с экосистемой Hadoop можно использовать движки таблиц [HDFS](../engines/table-engines/integrations/hdfs.md).
|
||||
|
||||
## Arrow {data-format-arrow}
|
||||
## Arrow {#data-format-arrow}
|
||||
|
||||
[Apache Arrow](https://arrow.apache.org/) поставляется с двумя встроенными поколоночнами форматами хранения. ClickHouse поддерживает операции чтения и записи для этих форматов.
|
||||
|
||||
`Arrow` — это Apache Arrow's "file mode" формат. Он предназначен для произвольного доступа в памяти.
|
||||
|
||||
## ArrowStream {data-format-arrow-stream}
|
||||
## ArrowStream {#data-format-arrow-stream}
|
||||
|
||||
`ArrowStream` — это Apache Arrow's "stream mode" формат. Он предназначен для обработки потоков в памяти.
|
||||
|
||||
|
@ -56,6 +56,32 @@
|
||||
|
||||
Что делать, когда количество прочитанных данных превысило одно из ограничений: throw или break. По умолчанию: throw.
|
||||
|
||||
## max\_rows\_to\_read_leaf {#max-rows-to-read-leaf}
|
||||
|
||||
Следующие ограничения могут проверяться на каждый блок (а не на каждую строку). То есть, ограничения могут быть немного нарушены.
|
||||
|
||||
Максимальное количество строчек, которое можно прочитать из таблицы на удалённом сервере при выполнении
|
||||
распределенного запроса. Распределенные запросы могут создавать несколько подзапросов к каждому из шардов в кластере и
|
||||
тогда этот лимит будет применен при выполнении чтения на удаленных серверах (включая и сервер-инициатор) и проигнорирован
|
||||
на сервере-инициаторе запроса во время объединения полученных результатов. Например, кластер состоит из 2 шардов и каждый
|
||||
из них хранит таблицу со 100 строками. Тогда распределённый запрос для получения всех данных из этих таблиц и установленной
|
||||
настройкой `max_rows_to_read=150` выбросит исключение, т.к. в общем он прочитает 200 строк. Но запрос
|
||||
с настройкой `max_rows_to_read_leaf=150` завершится успешно, потому что каждый из шардов прочитает максимум 100 строк.
|
||||
|
||||
## max\_bytes\_to\_read_leaf {#max-bytes-to-read-leaf}
|
||||
|
||||
Максимальное количество байт (несжатых данных), которое можно прочитать из таблицы на удалённом сервере при
|
||||
выполнении распределенного запроса. Распределенные запросы могут создавать несколько подзапросов к каждому из шардов в
|
||||
кластере и тогда этот лимит будет применен при выполнении чтения на удаленных серверах (включая и сервер-инициатор)
|
||||
и проигнорирован на сервере-инициаторе запроса во время объединения полученных результатов. Например, кластер состоит
|
||||
из 2 шардов и каждый из них хранит таблицу со 100 байтами. Тогда распределённый запрос для получения всех данных из этих таблиц
|
||||
и установленной настройкой `max_bytes_to_read=150` выбросит исключение, т.к. в общем он прочитает 200 байт. Но запрос
|
||||
с настройкой `max_bytes_to_read_leaf=150` завершится успешно, потому что каждый из шардов прочитает максимум 100 байт.
|
||||
|
||||
## read\_overflow\_mode_leaf {#read-overflow-mode-leaf}
|
||||
|
||||
Что делать, когда количество прочитанных данных на удаленном сервере превысило одно из ограничений: throw или break. По умолчанию: throw.
|
||||
|
||||
## max\_rows\_to\_group\_by {#settings-max-rows-to-group-by}
|
||||
|
||||
Максимальное количество уникальных ключей, получаемых в процессе агрегации. Позволяет ограничить потребление оперативки при агрегации.
|
||||
|
@ -484,7 +484,7 @@ INSERT INTO test VALUES (lower('Hello')), (lower('world')), (lower('INSERT')), (
|
||||
|
||||
См. также:
|
||||
|
||||
- [JOIN strictness](../../sql-reference/statements/select/join.md#select-join-strictness)
|
||||
- [JOIN strictness](../../sql-reference/statements/select/join.md#join-settings)
|
||||
|
||||
## max\_block\_size {#setting-max_block_size}
|
||||
|
||||
@ -1616,6 +1616,63 @@ SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1;
|
||||
|
||||
- [Обработка значения NULL в операторе IN](../../sql-reference/operators/in.md#in-null-processing)
|
||||
|
||||
## low\_cardinality\_max\_dictionary\_size {#low_cardinality_max_dictionary_size}
|
||||
|
||||
Задает максимальный размер общего глобального словаря (в строках) для типа данных `LowCardinality`, который может быть записан в файловую систему хранилища. Настройка предотвращает проблемы с оперативной памятью в случае неограниченного увеличения словаря. Все данные, которые не могут быть закодированы из-за ограничения максимального размера словаря, ClickHouse записывает обычным способом.
|
||||
|
||||
Допустимые значения:
|
||||
|
||||
- Положительное целое число.
|
||||
|
||||
Значение по умолчанию: 8192.
|
||||
|
||||
## low\_cardinality\_use\_single\_dictionary\_for\_part {#low_cardinality_use_single_dictionary_for_part}
|
||||
|
||||
Включает или выключает использование единого словаря для куска (парта).
|
||||
|
||||
По умолчанию сервер ClickHouse следит за размером словарей, и если словарь переполняется, сервер создает следующий. Чтобы запретить создание нескольких словарей, задайте настройку `low_cardinality_use_single_dictionary_for_part = 1`.
|
||||
|
||||
Допустимые значения:
|
||||
|
||||
- 1 — Создание нескольких словарей для частей данных запрещено.
|
||||
- 0 — Создание нескольких словарей для частей данных не запрещено.
|
||||
|
||||
Значение по умолчанию: 0.
|
||||
|
||||
## low\_cardinality\_allow\_in\_native\_format {#low_cardinality_allow_in_native_format}
|
||||
|
||||
Разрешает или запрещает использование типа данных `LowCardinality` с форматом данных [Native](../../interfaces/formats.md#native).
|
||||
|
||||
Если использование типа `LowCardinality` ограничено, сервер ClickHouse преобразует столбцы `LowCardinality` в обычные столбцы для запросов `SELECT`, а обычные столбцы - в столбцы `LowCardinality` для запросов `INSERT`.
|
||||
|
||||
В основном настройка используется для сторонних клиентов, не поддерживающих тип данных `LowCardinality`.
|
||||
|
||||
Допустимые значения:
|
||||
|
||||
- 1 — Использование `LowCardinality` не ограничено.
|
||||
- 0 — Использование `LowCardinality` ограничено.
|
||||
|
||||
Значение по умолчанию: 1.
|
||||
|
||||
## allow\_suspicious\_low\_cardinality\_types {#allow_suspicious_low_cardinality_types}
|
||||
|
||||
Разрешает или запрещает использование типа данных `LowCardinality` с типами данных с фиксированным размером 8 байт или меньше: числовые типы данных и `FixedString (8_bytes_or_less)`.
|
||||
|
||||
Для небольших фиксированных значений использование `LowCardinality` обычно неэффективно, поскольку ClickHouse хранит числовой индекс для каждой строки. В результате:
|
||||
|
||||
- Используется больше дискового пространства.
|
||||
- Потребление ОЗУ увеличивается, в зависимости от размера словаря.
|
||||
- Некоторые функции работают медленнее из-за дополнительных операций кодирования.
|
||||
|
||||
Время слияния в таблицах на движке [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md) также может увеличиться по описанным выше причинам.
|
||||
|
||||
Допустимые значения:
|
||||
|
||||
- 1 — Использование `LowCardinality` не ограничено.
|
||||
- 0 — Использование `LowCardinality` ограничено.
|
||||
|
||||
Значение по умолчанию: 0.
|
||||
|
||||
## background_buffer_flush_schedule_pool_size {#background_buffer_flush_schedule_pool_size}
|
||||
|
||||
Задает количество потоков для выполнения фонового сброса данных в таблицах с движком [Buffer](../../engines/table-engines/special/buffer.md). Настройка применяется при запуске сервера ClickHouse и не может быть изменена в пользовательском сеансе.
|
||||
@ -1756,6 +1813,60 @@ SELECT idx, i FROM null_in WHERE i IN (1, NULL) SETTINGS transform_null_in = 1;
|
||||
- [Секции и настройки запроса CREATE TABLE](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-query-clauses) (настройка `merge_with_ttl_timeout`)
|
||||
- [Table TTL](../../engines/table-engines/mergetree-family/mergetree.md#mergetree-table-ttl)
|
||||
|
||||
## output_format_pretty_max_value_width {#output_format_pretty_max_value_width}
|
||||
|
||||
Ограничивает длину значения, выводимого в формате [Pretty](../../interfaces/formats.md#pretty). Если значение длиннее указанного количества символов, оно обрезается.
|
||||
|
||||
Возможные значения:
|
||||
|
||||
- Положительное целое число.
|
||||
- 0 — значение обрезается полностью.
|
||||
|
||||
Значение по умолчанию: `10000` символов.
|
||||
|
||||
**Примеры**
|
||||
|
||||
Запрос:
|
||||
|
||||
```sql
|
||||
SET output_format_pretty_max_value_width = 10;
|
||||
SELECT range(number) FROM system.numbers LIMIT 10 FORMAT PrettyCompactNoEscapes;
|
||||
```
|
||||
Результат:
|
||||
|
||||
```text
|
||||
┌─range(number)─┐
|
||||
│ [] │
|
||||
│ [0] │
|
||||
│ [0,1] │
|
||||
│ [0,1,2] │
|
||||
│ [0,1,2,3] │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
│ [0,1,2,3,4⋯ │
|
||||
└───────────────┘
|
||||
```
|
||||
|
||||
Запрос, где длина выводимого значения ограничена 0 символов:
|
||||
|
||||
```sql
|
||||
SET output_format_pretty_max_value_width = 0;
|
||||
SELECT range(number) FROM system.numbers LIMIT 5 FORMAT PrettyCompactNoEscapes;
|
||||
```
|
||||
Результат:
|
||||
|
||||
```text
|
||||
┌─range(number)─┐
|
||||
│ ⋯ │
|
||||
│ ⋯ │
|
||||
│ ⋯ │
|
||||
│ ⋯ │
|
||||
│ ⋯ │
|
||||
└───────────────┘
|
||||
```
|
||||
|
||||
## lock_acquire_timeout {#lock_acquire_timeout}
|
||||
|
||||
Устанавливает, сколько секунд сервер ожидает возможности выполнить блокировку таблицы.
|
||||
|
@ -9,7 +9,7 @@
|
||||
- `volume_priority` ([UInt64](../../sql-reference/data-types/int-uint.md)) — порядковый номер тома согласно конфигурации.
|
||||
- `disks` ([Array(String)](../../sql-reference/data-types/array.md)) — имена дисков, содержащихся в политике хранения.
|
||||
- `max_data_part_size` ([UInt64](../../sql-reference/data-types/int-uint.md)) — максимальный размер куска данных, который может храниться на дисках тома (0 — без ограничений).
|
||||
- `move_factor` ([Float64](../../sql-reference/data-types/float.md))\` — доля свободного места, при превышении которой данные начинают перемещаться на следующий том.
|
||||
- `move_factor` — доля доступного свободного места на томе, если места становится меньше, то данные начнут перемещение на следующий том, если он есть (по умолчанию 0.1).
|
||||
|
||||
Если политика хранения содержит несколько томов, то каждому тому соответствует отдельная запись в таблице.
|
||||
|
||||
|
@ -24,13 +24,16 @@
|
||||
- [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-multiple-volumes)
|
||||
- [Distributed](../../engines/table-engines/special/distributed.md#distributed)
|
||||
|
||||
- `total_rows` (Nullable(UInt64)) - Общее количество строк, если есть возможность быстро определить точное количество строк в таблице, в противном случае `Null` (включая базовую таблицу `Buffer`).
|
||||
- `total_rows` (Nullable(UInt64)) - общее количество строк, если есть возможность быстро определить точное количество строк в таблице, в противном случае `Null` (включая базовую таблицу `Buffer`).
|
||||
|
||||
- `total_bytes` (Nullable(UInt64)) - Общее количество байт, если можно быстро определить точное количество байт для таблицы на накопителе, в противном случае `Null` (**не включает** в себя никакого базового хранилища).
|
||||
- `total_bytes` (Nullable(UInt64)) - общее количество байт, если можно быстро определить точное количество байт для таблицы на накопителе, в противном случае `Null` (**не включает** в себя никакого базового хранилища).
|
||||
|
||||
- Если таблица хранит данные на диске, возвращает используемое пространство на диске (т. е. сжатое).
|
||||
- Если таблица хранит данные в памяти, возвращает приблизительное количество используемых байт в памяти.
|
||||
|
||||
- `lifetime_rows` (Nullable(UInt64)) - общее количество строк, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`).
|
||||
|
||||
- `lifetime_bytes` (Nullable(UInt64)) - общее количество байт, добавленных оператором `INSERT` с момента запуска сервера (только для таблиц `Buffer`).
|
||||
|
||||
Таблица `system.tables` используется при выполнении запроса `SHOW TABLES`.
|
||||
|
||||
|
@ -4,7 +4,7 @@ toc_priority: 128
|
||||
|
||||
# groupBitmap {#groupbitmap}
|
||||
|
||||
Bitmap или агрегатные вычисления для столбца с типом данных `UInt*`, возвращают кардинальность в виде значения типа UInt64, если добавить суффикс -State, то возвращают [объект bitmap](../../../sql-reference/functions/bitmap-functions.md).
|
||||
Bitmap или агрегатные вычисления для столбца с типом данных `UInt*`, возвращают кардинальность в виде значения типа UInt64, если добавить суффикс `-State`, то возвращают [объект bitmap](../../../sql-reference/functions/bitmap-functions.md#bitmap-functions).
|
||||
|
||||
``` sql
|
||||
groupBitmap(expr)
|
||||
|
@ -0,0 +1,28 @@
|
||||
---
|
||||
toc_priority: 143
|
||||
---
|
||||
|
||||
# maxMap {#agg_functions-maxmap}
|
||||
|
||||
Синтаксис: `maxMap(key, value)` or `maxMap(Tuple(key, value))`
|
||||
|
||||
Вычисляет максимальные значения массива `value`, соответствующие ключам, указанным в массиве `key`.
|
||||
|
||||
Передача кортежа ключей и массивов значений идентична передаче двух массивов ключей и значений.
|
||||
|
||||
Количество элементов в параметрах `key` и `value` должно быть одинаковым для каждой суммируемой строки.
|
||||
|
||||
Возвращает кортеж из двух массивов: ключи и значения, рассчитанные для соответствующих ключей.
|
||||
|
||||
Пример:
|
||||
|
||||
``` sql
|
||||
SELECT maxMap(a, b)
|
||||
FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1]))
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─maxMap(a, b)──────┐
|
||||
│ ([1,2,3],[2,2,1]) │
|
||||
└───────────────────┘
|
||||
```
|
@ -0,0 +1,28 @@
|
||||
---
|
||||
toc_priority: 142
|
||||
---
|
||||
|
||||
# minMap {#agg_functions-minmap}
|
||||
|
||||
Синтаксис: `minMap(key, value)` or `minMap(Tuple(key, value))`
|
||||
|
||||
Вычисляет минимальное значение массива `value` в соответствии с ключами, указанными в массиве `key`.
|
||||
|
||||
Передача кортежа ключей и массивов значений идентична передаче двух массивов ключей и значений.
|
||||
|
||||
Количество элементов в параметрах `key` и `value` должно быть одинаковым для каждой суммируемой строки.
|
||||
|
||||
Возвращает кортеж из двух массивов: ключи в отсортированном порядке и значения, рассчитанные для соответствующих ключей.
|
||||
|
||||
Пример:
|
||||
|
||||
``` sql
|
||||
SELECT minMap(a, b)
|
||||
FROM values('a Array(Int32), b Array(Int64)', ([1, 2], [2, 2]), ([2, 3], [1, 1]))
|
||||
```
|
||||
|
||||
``` text
|
||||
┌─minMap(a, b)──────┐
|
||||
│ ([1,2,3],[2,1,1]) │
|
||||
└───────────────────┘
|
||||
```
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 53
|
||||
toc_title: AggregateFunction
|
||||
---
|
||||
|
||||
# AggregateFunction {#data-type-aggregatefunction}
|
||||
|
||||
Агрегатные функции могут обладать определяемым реализацией промежуточным состоянием, которое может быть сериализовано в тип данных, соответствующий AggregateFunction(…), и быть записано в таблицу обычно посредством [материализованного представления] (../../sql-reference/statements/create.md#create-view). Чтобы получить промежуточное состояние, обычно используются агрегатные функции с суффиксом `-State`. Чтобы в дальнейшем получить агрегированные данные необходимо использовать те же агрегатные функции с суффиксом `-Merge`.
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 52
|
||||
toc_title: Array(T)
|
||||
---
|
||||
|
||||
# Array(T) {#data-type-array}
|
||||
|
||||
Массив из элементов типа `T`.
|
||||
|
59
docs/ru/sql-reference/data-types/lowcardinality.md
Normal file
59
docs/ru/sql-reference/data-types/lowcardinality.md
Normal file
@ -0,0 +1,59 @@
|
||||
---
|
||||
toc_priority: 51
|
||||
toc_title: LowCardinality
|
||||
---
|
||||
|
||||
# LowCardinality {#lowcardinality-data-type}
|
||||
|
||||
Изменяет внутреннее представление других типов данных, превращая их в тип со словарным кодированием.
|
||||
|
||||
## Синтаксис {#lowcardinality-syntax}
|
||||
|
||||
```sql
|
||||
LowCardinality(data_type)
|
||||
```
|
||||
|
||||
**Параметры**
|
||||
|
||||
- `data_type` — [String](string.md), [FixedString](fixedstring.md), [Date](date.md), [DateTime](datetime.md) и числа за исключением типа [Decimal](decimal.md). `LowCardinality` неэффективен для некоторых типов данных, см. описание настройки [allow_suspicious_low_cardinality_types](../../operations/settings/settings.md#allow_suspicious_low_cardinality_types).
|
||||
|
||||
## Описание {#lowcardinality-dscr}
|
||||
|
||||
`LowCardinality` — это надстройка, изменяющая способ хранения и правила обработки данных. ClickHouse применяет [словарное кодирование](https://en.wikipedia.org/wiki/Dictionary_coder) в столбцы типа `LowCardinality`. Работа с данными, представленными в словарном виде, может значительно увеличивать производительность запросов [SELECT](../statements/select/index.md) для многих приложений.
|
||||
|
||||
Эффективность использования типа данных `LowCardinality` зависит от разнообразия данных. Если словарь содержит менее 10 000 различных значений, ClickHouse в основном показывает более высокую эффективность чтения и хранения данных. Если же словарь содержит более 100 000 различных значений, ClickHouse может работать хуже, чем при использовании обычных типов данных.
|
||||
|
||||
При работе со строками рассмотрите возможность использования `LowCardinality` вместо [Enum](enum.md). `LowCardinality` обеспечивает большую гибкость в использовании и часто показывает такую же или более высокую эффективность.
|
||||
|
||||
## Пример
|
||||
|
||||
Создать таблицу со столбцами типа `LowCardinality`:
|
||||
|
||||
```sql
|
||||
CREATE TABLE lc_t
|
||||
(
|
||||
`id` UInt16,
|
||||
`strings` LowCardinality(String)
|
||||
)
|
||||
ENGINE = MergeTree()
|
||||
ORDER BY id
|
||||
```
|
||||
|
||||
## Связанные настройки и функции
|
||||
|
||||
Настройки:
|
||||
|
||||
- [low_cardinality_max_dictionary_size](../../operations/settings/settings.md#low_cardinality_max_dictionary_size)
|
||||
- [low_cardinality_use_single_dictionary_for_part](../../operations/settings/settings.md#low_cardinality_use_single_dictionary_for_part)
|
||||
- [low_cardinality_allow_in_native_format](../../operations/settings/settings.md#low_cardinality_allow_in_native_format)
|
||||
- [allow_suspicious_low_cardinality_types](../../operations/settings/settings.md#allow_suspicious_low_cardinality_types)
|
||||
|
||||
Функции:
|
||||
|
||||
- [toLowCardinality](../functions/type-conversion-functions.md#tolowcardinality)
|
||||
|
||||
## Смотрите также
|
||||
|
||||
- [A Magical Mystery Tour of the LowCardinality Data Type](https://www.altinity.com/blog/2019/3/27/low-cardinality).
|
||||
- [Reducing Clickhouse Storage Cost with the Low Cardinality Type – Lessons from an Instana Engineer](https://www.instana.com/blog/reducing-clickhouse-storage-cost-with-the-low-cardinality-type-lessons-from-an-instana-engineer/).
|
||||
- [String Optimization (video presentation in Russian)](https://youtu.be/rqf-ILRgBdY?list=PL0Z2YDlm0b3iwXCpEFiOOYmwXzVmjJfEt). [Slides in English](https://github.com/yandex/clickhouse-presentations/raw/master/meetup19/string_optimization.pdf).
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 55
|
||||
toc_title: Nullable
|
||||
---
|
||||
|
||||
# Nullable(TypeName) {#data_type-nullable}
|
||||
|
||||
Позволяет работать как со значением типа `TypeName` так и с отсутствием этого значения ([NULL](../../sql-reference/data-types/nullable.md)) в одной и той же переменной, в том числе хранить `NULL` в таблицах вместе со значения типа `TypeName`. Например, в столбце типа `Nullable(Int8)` можно хранить значения типа `Int8`, а в тех строках, где значения нет, будет храниться `NULL`.
|
||||
|
@ -1,3 +1,8 @@
|
||||
---
|
||||
toc_priority: 54
|
||||
toc_title: Tuple(T1, T2, ...)
|
||||
---
|
||||
|
||||
# Tuple(T1, T2, …) {#tuplet1-t2}
|
||||
|
||||
Кортеж из элементов любого [типа](index.md#data_types). Элементы кортежа могут быть одного или разных типов.
|
||||
|
@ -1,4 +1,4 @@
|
||||
# Функции для битмапов {#funktsii-dlia-bitmapov}
|
||||
# Функции для битмапов {#bitmap-functions}
|
||||
|
||||
## bitmapBuild {#bitmap_functions-bitmapbuild}
|
||||
|
||||
@ -61,8 +61,8 @@ bitmapSubsetLimit(bitmap, range_start, cardinality_limit)
|
||||
**Параметры**
|
||||
|
||||
- `bitmap` – Битмап. [Bitmap object](#bitmap_functions-bitmapbuild).
|
||||
- `range_start` – Начальная точка подмножества. [UInt32](../../sql-reference/functions/bitmap-functions.md).
|
||||
- `cardinality_limit` – Верхний предел подмножества. [UInt32](../../sql-reference/functions/bitmap-functions.md).
|
||||
- `range_start` – Начальная точка подмножества. [UInt32](../../sql-reference/functions/bitmap-functions.md#bitmap-functions).
|
||||
- `cardinality_limit` – Верхний предел подмножества. [UInt32](../../sql-reference/functions/bitmap-functions.md#bitmap-functions).
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
@ -97,7 +97,7 @@ bitmapContains(haystack, needle)
|
||||
**Параметры**
|
||||
|
||||
- `haystack` – [объект Bitmap](#bitmap_functions-bitmapbuild), в котором функция ищет значение.
|
||||
- `needle` – значение, которое функция ищет. Тип — [UInt32](../../sql-reference/functions/bitmap-functions.md).
|
||||
- `needle` – значение, которое функция ищет. Тип — [UInt32](../../sql-reference/functions/bitmap-functions.md#bitmap-functions).
|
||||
|
||||
**Возвращаемые значения**
|
||||
|
||||
|
@ -100,5 +100,6 @@ FROM numbers(3)
|
||||
│ a*cjab+ │
|
||||
│ aeca2A │
|
||||
└───────────────────────────────────────┘
|
||||
```
|
||||
|
||||
[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/random_functions/) <!--hide-->
|
||||
|
@ -508,11 +508,85 @@ SELECT parseDateTimeBestEffort('10 20:19')
|
||||
|
||||
**См. также**
|
||||
|
||||
- \[Информация о формате ISO 8601 от @xkcd\](https://xkcd.com/1179/)
|
||||
- [Информация о формате ISO 8601 от @xkcd](https://xkcd.com/1179/)
|
||||
- [RFC 1123](https://tools.ietf.org/html/rfc1123)
|
||||
- [toDate](#todate)
|
||||
- [toDateTime](#todatetime)
|
||||
|
||||
## parseDateTimeBestEffortUS {#parsedatetimebesteffortUS}
|
||||
|
||||
Эта функция похожа на [‘parseDateTimeBestEffort’](#parsedatetimebesteffort), но разница состоит в том, что она предполагает американский формат даты (`MM/DD/YYYY` etc.) в случае неоднозначности.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
``` sql
|
||||
parseDateTimeBestEffortUS(time_string [, time_zone]);
|
||||
```
|
||||
|
||||
**Параметры**
|
||||
|
||||
- `time_string` — строка, содержащая дату и время для преобразования. [String](../../sql-reference/data-types/string.md).
|
||||
- `time_zone` — часовой пояс. Функция анализирует `time_string` в соответствии с часовым поясом. [String](../../sql-reference/data-types/string.md).
|
||||
|
||||
**Поддерживаемые нестандартные форматы**
|
||||
|
||||
- Строка, содержащая 9-10 цифр [unix timestamp](https://en.wikipedia.org/wiki/Unix_time).
|
||||
- Строка, содержащая дату и время: `YYYYMMDDhhmmss`, `MM/DD/YYYY hh:mm:ss`, `MM-DD-YY hh:mm`, `YYYY-MM-DD hh:mm:ss`, etc.
|
||||
- Строка с датой, но без времени: `YYYY`, `YYYYMM`, `YYYY*MM`, `MM/DD/YYYY`, `MM-DD-YY` etc.
|
||||
- Строка, содержащая день и время: `DD`, `DD hh`, `DD hh:mm`. В этом случае `YYYY-MM` заменяется на `2000-01`.
|
||||
- Строка, содержащая дату и время, а также информацию о часовом поясе: `YYYY-MM-DD hh:mm:ss ±h:mm` и т.д. Например, `2020-12-12 17:36:00 -5:00`.
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- `time_string` преобразован в тип данных `DateTime`.
|
||||
|
||||
**Примеры**
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57')
|
||||
AS parseDateTimeBestEffortUS;
|
||||
```
|
||||
|
||||
Ответ:
|
||||
|
||||
``` text
|
||||
┌─parseDateTimeBestEffortUS─┐
│ 2020-09-12 12:12:57       │
└───────────────────────────┘
```
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT parseDateTimeBestEffortUS('09-12-2020 12:12:57')
|
||||
AS parseDateTimeBestEffortUS;
|
||||
```
|
||||
|
||||
Ответ:
|
||||
|
||||
``` text
|
||||
┌─parseDateTimeBestEffortUS─┐
│ 2020-09-12 12:12:57       │
└───────────────────────────┘
```
|
||||
|
||||
Запрос:
|
||||
|
||||
``` sql
|
||||
SELECT parseDateTimeBestEffortUS('09.12.2020 12:12:57')
|
||||
AS parseDateTimeBestEffortUS;
|
||||
```
|
||||
|
||||
Ответ:
|
||||
|
||||
``` text
|
||||
┌─parseDateTimeBestEffortUS─┐
│ 2020-09-12 12:12:57       │
└───────────────────────────┘
```
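One more sketch using the optional `time_zone` argument from the syntax above; the chosen time zone is arbitrary:

``` sql
SELECT parseDateTimeBestEffortUS('09/12/2020 12:12:57', 'Europe/Moscow')
    AS parseDateTimeBestEffortUS;
```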
## toUnixTimestamp64Milli
|
||||
## toUnixTimestamp64Micro
|
||||
## toUnixTimestamp64Nano
|
||||
@ -604,4 +678,43 @@ SELECT fromUnixTimestamp64Milli(i64, 'UTC')
|
||||
└──────────────────────────────────────┘
|
||||
```
|
||||
|
||||
## toLowCardinality {#tolowcardinality}
|
||||
|
||||
Преобразует входные данные в версию [LowCardinality](../data-types/lowcardinality.md) того же типа данных.
|
||||
|
||||
Чтобы преобразовать данные из типа `LowCardinality`, используйте функцию [CAST](#type_conversion_function-cast). Например, `CAST(x as String)`.
|
||||
|
||||
**Синтаксис**
|
||||
|
||||
```sql
|
||||
toLowCardinality(expr)
|
||||
```
|
||||
|
||||
**Параметры**
|
||||
|
||||
- `expr` — [Выражение](../syntax.md#syntax-expressions), которое в результате преобразуется в один из [поддерживаемых типов данных](../data-types/index.md#data_types).
|
||||
|
||||
|
||||
**Возвращаемое значение**
|
||||
|
||||
- Результат преобразования `expr`.
|
||||
|
||||
Тип: `LowCardinality(expr_result_type)`
|
||||
|
||||
**Пример**
|
||||
|
||||
Запрос:
|
||||
|
||||
```sql
|
||||
SELECT toLowCardinality('1')
|
||||
```
|
||||
|
||||
Результат:
|
||||
|
||||
```text
|
||||
┌─toLowCardinality('1')─┐
|
||||
│ 1 │
|
||||
└───────────────────────┘
|
||||
```
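A complementary sketch of the reverse conversion mentioned above, done with `CAST`:

``` sql
-- Converts the LowCardinality(String) value back to a plain String.
SELECT CAST(toLowCardinality('1') AS String) AS s, toTypeName(s);
```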
[Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/type_conversion_functions/) <!--hide-->
|
||||
|
@ -3,4 +3,28 @@ toc_folder_title: "\u0412\u044B\u0440\u0430\u0436\u0435\u043D\u0438\u044F"
|
||||
toc_priority: 31
|
||||
---
|
||||
|
||||
# SQL выражения в ClickHouse {#clickhouse-sql-statements}
|
||||
|
||||
Выражения описывают различные действия, которые можно выполнить с помощью SQL запросов. Каждый вид выражения имеет свой синтаксис и особенности использования, которые описаны в соответствующих разделах документации:
|
||||
|
||||
- [SELECT](../../sql-reference/statements/select/index.md)
|
||||
- [INSERT INTO](../../sql-reference/statements/insert-into.md)
|
||||
- [CREATE](../../sql-reference/statements/create/index.md)
|
||||
- [ALTER](../../sql-reference/statements/alter/index.md)
|
||||
- [SYSTEM](../../sql-reference/statements/system.md)
|
||||
- [SHOW](../../sql-reference/statements/show.md)
|
||||
- [GRANT](../../sql-reference/statements/grant.md)
|
||||
- [REVOKE](../../sql-reference/statements/revoke.md)
|
||||
- [ATTACH](../../sql-reference/statements/attach.md)
|
||||
- [CHECK TABLE](../../sql-reference/statements/check-table.md)
|
||||
- [DESCRIBE TABLE](../../sql-reference/statements/describe-table.md)
|
||||
- [DETACH](../../sql-reference/statements/detach.md)
|
||||
- [DROP](../../sql-reference/statements/drop.md)
|
||||
- [EXISTS](../../sql-reference/statements/exists.md)
|
||||
- [KILL](../../sql-reference/statements/kill.md)
|
||||
- [OPTIMIZE](../../sql-reference/statements/optimize.md)
|
||||
- [RENAME](../../sql-reference/statements/rename.md)
|
||||
- [SET](../../sql-reference/statements/set.md)
|
||||
- [SET ROLE](../../sql-reference/statements/set-role.md)
|
||||
- [TRUNCATE](../../sql-reference/statements/truncate.md)
|
||||
- [USE](../../sql-reference/statements/use.md)
|
||||
|
@ -18,11 +18,11 @@ Markdown==3.2.1
|
||||
MarkupSafe==1.1.1
|
||||
mkdocs==1.1.2
|
||||
mkdocs-htmlproofer-plugin==0.0.3
|
||||
mkdocs-macros-plugin==0.4.9
|
||||
mkdocs-macros-plugin==0.4.13
|
||||
nltk==3.5
|
||||
nose==1.3.7
|
||||
protobuf==3.13.0
|
||||
numpy==1.19.1
|
||||
numpy==1.19.2
|
||||
Pygments==2.5.2
|
||||
pymdown-extensions==8.0
|
||||
python-slugify==4.0.1
|
||||
|
@ -92,7 +92,7 @@ def test_single_page(input_path, lang):
|
||||
logging.warning('Found %d duplicate anchor points' % duplicate_anchor_points)
|
||||
|
||||
if links_to_nowhere:
|
||||
if lang == 'en': # TODO: check all languages again
|
||||
if lang == 'en' or lang == 'ru': # TODO: check all languages again
|
||||
logging.error(f'Found {links_to_nowhere} links to nowhere in {lang}')
|
||||
sys.exit(1)
|
||||
else:
|
||||
|
@ -1,12 +1,15 @@
|
||||
# AggregatingMergeTree {#aggregatingmergetree}
|
||||
|
||||
该引擎继承自 [MergeTree](mergetree.md),并改变了数据片段的合并逻辑。 ClickHouse 会将相同主键的所有行(在一个数据片段内)替换为单个存储一系列聚合函数状态的行。
|
||||
该引擎继承自 [MergeTree](mergetree.md),并改变了数据片段的合并逻辑。 ClickHouse 会将一个数据片段内所有具有相同主键(准确的说是 [排序键](../../../engines/table-engines/mergetree-family/mergetree.md))的行替换成一行,这一行会存储一系列聚合函数的状态。
|
||||
|
||||
可以使用 `AggregatingMergeTree` 表来做增量数据统计聚合,包括物化视图的数据聚合。
|
||||
可以使用 `AggregatingMergeTree` 表来做增量数据的聚合统计,包括物化视图的数据聚合。
|
||||
|
||||
引擎需使用 [AggregateFunction](../../../engines/table-engines/mergetree-family/aggregatingmergetree.md) 类型来处理所有列。
|
||||
引擎使用以下类型来处理所有列:
|
||||
|
||||
如果要按一组规则来合并减少行数,则使用 `AggregatingMergeTree` 是合适的。
|
||||
- [AggregateFunction](../../../sql-reference/data-types/aggregatefunction.md)
|
||||
- [SimpleAggregateFunction](../../../sql-reference/data-types/simpleaggregatefunction.md)
|
||||
|
||||
`AggregatingMergeTree` 适用于能够按照一定的规则缩减行数的情况。
|
||||
|
||||
## 建表 {#jian-biao}
|
||||
|
||||
@ -20,10 +23,11 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
[PARTITION BY expr]
|
||||
[ORDER BY expr]
|
||||
[SAMPLE BY expr]
|
||||
[TTL expr]
|
||||
[SETTINGS name=value, ...]
|
||||
```
|
||||
|
||||
语句参数的说明,请参阅 [语句描述](../../../engines/table-engines/mergetree-family/aggregatingmergetree.md)。
|
||||
语句参数的说明,请参阅 [建表语句描述](../../../sql-reference/statements/create.md#create-table-query)。
|
||||
|
||||
**子句**
|
||||
|
||||
@ -33,7 +37,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
|
||||
<summary>已弃用的建表方法</summary>
|
||||
|
||||
!!! 注意 "注意"
|
||||
!!! attention "注意"
|
||||
不要在新项目中使用该方法,可能的话,请将旧项目切换到上述方法。
|
||||
|
||||
``` sql
|
||||
@ -45,15 +49,15 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
) ENGINE [=] AggregatingMergeTree(date-column [, sampling_expression], (primary, key), index_granularity)
|
||||
```
|
||||
|
||||
上面的所有参数跟 `MergeTree` 中的一样。
|
||||
上面的所有参数的含义跟 `MergeTree` 中的一样。
|
||||
</details>
|
||||
|
||||
## SELECT 和 INSERT {#select-he-insert}
|
||||
|
||||
插入数据,需使用带有聚合 -State- 函数的 [INSERT SELECT](../../../engines/table-engines/mergetree-family/aggregatingmergetree.md) 语句。
|
||||
要插入数据,需使用带有 -State- 聚合函数的 [INSERT SELECT](../../../sql-reference/statements/insert-into.md) 语句。
|
||||
从 `AggregatingMergeTree` 表中查询数据时,需使用 `GROUP BY` 子句并且要使用与插入时相同的聚合函数,但后缀要改为 `-Merge` 。
|
||||
|
||||
在 `SELECT` 查询的结果中,对于 ClickHouse 的所有输出格式 `AggregateFunction` 类型的值都实现了特定的二进制表示法。如果直接用 `SELECT` 导出这些数据,例如如用 `TabSeparated` 格式,那么这些导出数据也能直接用 `INSERT` 语句加载导入。
|
||||
对于 `SELECT` 查询的结果, `AggregateFunction` 类型的值对 ClickHouse 的所有输出格式都实现了特定的二进制表示法。在进行数据转储时,例如使用 `TabSeparated` 格式进行 `SELECT` 查询,那么这些转储数据也能直接用 `INSERT` 语句导回。
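A minimal sketch of the `-State` / `-Merge` workflow described above; the table and column names (`agg_visits`, `visits`, `Sign`, `UserID`, `StartDate`) are hypothetical:

``` sql
-- Insert aggregate-function states produced by the -State combinators.
INSERT INTO agg_visits
SELECT StartDate, sumState(Sign) AS Visits, uniqState(UserID) AS Users
FROM visits
GROUP BY StartDate;

-- Read the data back with GROUP BY and the matching -Merge combinators.
SELECT StartDate, sumMerge(Visits) AS Visits, uniqMerge(Users) AS Users
FROM agg_visits
GROUP BY StartDate;
```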
## 聚合物化视图的示例 {#ju-he-wu-hua-shi-tu-de-shi-li}
|
||||
|
||||
|
@ -2,9 +2,9 @@
|
||||
|
||||
[MergeTree](mergetree.md) 系列的表(包括 [可复制表](replication.md) )可以使用分区。基于 MergeTree 表的 [物化视图](../special/materializedview.md#materializedview) 也支持分区。
|
||||
|
||||
一个分区是指按指定规则逻辑组合一起的表的记录集。可以按任意标准进行分区,如按月,按日或按事件类型。为了减少需要操作的数据,每个分区都是分开存储的。访问数据时,ClickHouse 尽量使用这些分区的最小子集。
|
||||
分区是在一个表中通过指定的规则划分而成的逻辑数据集。可以按任意标准进行分区,如按月,按日或按事件类型。为了减少需要操作的数据,每个分区都是分开存储的。访问数据时,ClickHouse 尽量使用这些分区的最小子集。
|
||||
|
||||
分区是在 [建表](mergetree.md#table_engine-mergetree-creating-a-table) 的 `PARTITION BY expr` 子句中指定。分区键可以是关于列的任何表达式。例如,指定按月分区,表达式为 `toYYYYMM(date_column)`:
|
||||
分区是在 [建表](mergetree.md#table_engine-mergetree-creating-a-table) 时通过 `PARTITION BY expr` 子句指定的。分区键可以是表中列的任意表达式。例如,指定按月分区,表达式为 `toYYYYMM(date_column)`:
|
||||
|
||||
``` sql
|
||||
CREATE TABLE visits
|
||||
@ -30,10 +30,10 @@ ORDER BY (CounterID, StartDate, intHash32(UserID));
|
||||
|
||||
新数据插入到表中时,这些数据会存储为按主键排序的新片段(块)。插入后 10-15 分钟,同一分区的各个片段会合并为一整个片段。
|
||||
|
||||
!!! attention "注意"
|
||||
那些有相同分区表达式值的数据片段才会合并。这意味着 **你不应该用太精细的分区方案**(超过一千个分区)。否则,会因为文件系统中的文件数量和需要找开的文件描述符过多,导致 `SELECT` 查询效率不佳。
|
||||
!!! info "注意"
|
||||
那些有相同分区表达式值的数据片段才会合并。这意味着 **你不应该用太精细的分区方案**(超过一千个分区)。否则,会因为文件系统中的文件数量过多和需要打开的文件描述符过多,导致 `SELECT` 查询效率不佳。
|
||||
|
||||
可以通过 [系统。零件](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md#system_tables-parts) 表查看表片段和分区信息。例如,假设我们有一个 `visits` 表,按月分区。对 `system.parts` 表执行 `SELECT`:
|
||||
可以通过 [system.parts](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md#system_tables-parts) 表查看表片段和分区信息。例如,假设我们有一个 `visits` 表,按月分区。对 `system.parts` 表执行 `SELECT`:
|
||||
|
||||
``` sql
|
||||
SELECT
|
||||
@ -44,55 +44,59 @@ FROM system.parts
|
||||
WHERE table = 'visits'
|
||||
```
|
||||
|
||||
┌─partition─┬─name───────────┬─active─┐
|
||||
│ 201901 │ 201901_1_3_1 │ 0 │
|
||||
│ 201901 │ 201901_1_9_2 │ 1 │
|
||||
│ 201901 │ 201901_8_8_0 │ 0 │
|
||||
│ 201901 │ 201901_9_9_0 │ 0 │
|
||||
│ 201902 │ 201902_4_6_1 │ 1 │
|
||||
│ 201902 │ 201902_10_10_0 │ 1 │
|
||||
│ 201902 │ 201902_11_11_0 │ 1 │
|
||||
└───────────┴────────────────┴────────┘
|
||||
``` text
|
||||
┌─partition─┬─name───────────┬─active─┐
|
||||
│ 201901 │ 201901_1_3_1 │ 0 │
|
||||
│ 201901 │ 201901_1_9_2 │ 1 │
|
||||
│ 201901 │ 201901_8_8_0 │ 0 │
|
||||
│ 201901 │ 201901_9_9_0 │ 0 │
|
||||
│ 201902 │ 201902_4_6_1 │ 1 │
|
||||
│ 201902 │ 201902_10_10_0 │ 1 │
|
||||
│ 201902 │ 201902_11_11_0 │ 1 │
|
||||
└───────────┴────────────────┴────────┘
|
||||
```
|
||||
|
||||
`partition` 列存储分区的名称。此示例中有两个分区:`201901` 和 `201902`。在 [ALTER … PARTITION](#alter_manipulations-with-partitions) 语句中你可以使用该列值来指定分区名称。
|
||||
|
||||
`name` 列为分区中数据片段的名称。在 [ALTER ATTACH PART](#alter_attach-partition) 语句中你可以使用此列值来指定片段名称。
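Hedged sketches of how these column values can be used; partition `201901` and part `201901_9_9_0` are taken from the sample output above:

``` sql
-- Detach a whole partition using a value from the `partition` column.
ALTER TABLE visits DETACH PARTITION 201901;

-- Attach a single data part using a value from the `name` column.
ALTER TABLE visits ATTACH PART '201901_9_9_0';
```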
这里我们拆解下第一部分的名称:`201901_1_3_1`:
|
||||
这里我们拆解下第一个数据片段的名称:`201901_1_3_1`:
|
||||
|
||||
- `201901` 是分区名称。
|
||||
- `1` 是数据块的最小编号。
|
||||
- `3` 是数据块的最大编号。
|
||||
- `1` 是块级别(即在由块组成的合并树中,该块在树中的深度)。
|
||||
|
||||
!!! attention "注意"
|
||||
!!! info "注意"
|
||||
旧类型表的片段名称为:`20190117_20190123_2_2_0`(最小日期 - 最大日期 - 最小块编号 - 最大块编号 - 块级别)。
|
||||
|
||||
`active` 列为片段状态。`1` 激活状态;`0` 非激活状态。非激活片段是那些在合并到较大片段之后剩余的源数据片段。损坏的数据片段也表示为非活动状态。
|
||||
`active` 列为片段状态。`1` 代表激活状态;`0` 代表非激活状态。非激活片段是那些在合并到较大片段之后剩余的源数据片段。损坏的数据片段也表示为非活动状态。
|
||||
|
||||
正如在示例中所看到的,同一分区中有几个独立的片段(例如,`201901_1_3_1`和`201901_1_9_2`)。这意味着这些片段尚未合并。ClickHouse 大约在插入后15分钟定期报告合并操作,合并插入的数据片段。此外,你也可以使用 [OPTIMIZE](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md#misc_operations-optimize) 语句直接执行合并。例:
|
||||
正如在示例中所看到的,同一分区中有几个独立的片段(例如,`201901_1_3_1`和`201901_1_9_2`)。这意味着这些片段尚未合并。ClickHouse 会定期的对插入的数据片段进行合并,大约是在插入后15分钟左右。此外,你也可以使用 [OPTIMIZE](../../../sql-reference/statements/misc.md#misc_operations-optimize) 语句发起一个计划外的合并。例如:
|
||||
|
||||
``` sql
|
||||
OPTIMIZE TABLE visits PARTITION 201902;
|
||||
```
|
||||
|
||||
┌─partition─┬─name───────────┬─active─┐
|
||||
│ 201901 │ 201901_1_3_1 │ 0 │
|
||||
│ 201901 │ 201901_1_9_2 │ 1 │
|
||||
│ 201901 │ 201901_8_8_0 │ 0 │
|
||||
│ 201901 │ 201901_9_9_0 │ 0 │
|
||||
│ 201902 │ 201902_4_6_1 │ 0 │
|
||||
│ 201902 │ 201902_4_11_2 │ 1 │
|
||||
│ 201902 │ 201902_10_10_0 │ 0 │
|
||||
│ 201902 │ 201902_11_11_0 │ 0 │
|
||||
└───────────┴────────────────┴────────┘
|
||||
```
|
||||
┌─partition─┬─name───────────┬─active─┐
|
||||
│ 201901 │ 201901_1_3_1 │ 0 │
|
||||
│ 201901 │ 201901_1_9_2 │ 1 │
|
||||
│ 201901 │ 201901_8_8_0 │ 0 │
|
||||
│ 201901 │ 201901_9_9_0 │ 0 │
|
||||
│ 201902 │ 201902_4_6_1 │ 0 │
|
||||
│ 201902 │ 201902_4_11_2 │ 1 │
|
||||
│ 201902 │ 201902_10_10_0 │ 0 │
|
||||
│ 201902 │ 201902_11_11_0 │ 0 │
|
||||
└───────────┴────────────────┴────────┘
|
||||
```
|
||||
|
||||
非激活片段会在合并后的10分钟左右删除。
|
||||
非激活片段会在合并后的10分钟左右被删除。
|
||||
|
||||
查看片段和分区信息的另一种方法是进入表的目录:`/var/lib/clickhouse/data/<database>/<table>/`。例如:
|
||||
|
||||
``` bash
|
||||
dev:/var/lib/clickhouse/data/default/visits$ ls -l
|
||||
/var/lib/clickhouse/data/default/visits$ ls -l
|
||||
total 40
|
||||
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 201901_1_3_1
|
||||
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 16:17 201901_1_9_2
|
||||
@ -105,12 +109,12 @@ drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 5 12:09 201902_4_6_1
|
||||
drwxr-xr-x 2 clickhouse clickhouse 4096 Feb 1 16:48 detached
|
||||
```
|
||||
|
||||
文件夹 ‘201901\_1\_1\_0’,‘201901\_1\_7\_1’ 等是片段的目录。每个片段都与一个对应的分区相关,并且只包含这个月的数据(本例中的表按月分区)。
|
||||
‘201901\_1\_1\_0’,‘201901\_1\_7\_1’ 等文件夹是数据片段的目录。每个片段都与一个对应的分区相关,并且只包含这个月的数据(本例中的表按月分区)。
|
||||
|
||||
`detached` 目录存放着使用 [DETACH](../../../sql-reference/statements/alter.md#alter_detach-partition) 语句从表中分离的片段。损坏的片段也会移到该目录,而不是删除。服务器不使用`detached`目录中的片段。可以随时添加,删除或修改此目录中的数据 – 在运行 [ATTACH](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md#alter_attach-partition) 语句前,服务器不会感知到。
|
||||
`detached` 目录存放着使用 [DETACH](../../../sql-reference/statements/alter.md#alter_detach-partition) 语句从表中卸载的片段。损坏的片段不会被删除而是也会移到该目录下。服务器不会去使用`detached`目录中的数据片段。因此你可以随时添加,删除或修改此目录中的数据 – 在运行 [ATTACH](../../../sql-reference/statements/alter.md#alter_attach-partition) 语句前,服务器不会感知到。
|
||||
|
||||
注意,在操作服务器时,你不能手动更改文件系统上的片段集或其数据,因为服务器不会感知到这些修改。对于非复制表,可以在服务器停止时执行这些操作,但不建议这样做。对于复制表,在任何情况下都不要更改片段文件。
|
||||
|
||||
ClickHouse 支持对分区执行这些操作:删除分区,从一个表复制到另一个表,或创建备份。了解分区的所有操作,请参阅 [分区和片段的操作](../../../engines/table-engines/mergetree-family/custom-partitioning-key.md#alter_manipulations-with-partitions) 一节。
|
||||
ClickHouse 支持对分区执行这些操作:删除分区,将分区从一个表复制到另一个表,或创建备份。了解分区的所有操作,请参阅 [分区和片段的操作](../../../sql-reference/statements/alter.md#alter_manipulations-with-partitions) 一节。
|
||||
|
||||
[来源文章](https://clickhouse.tech/docs/en/operations/table_engines/custom_partitioning_key/) <!--hide-->
|
||||
|
@ -1,8 +1,8 @@
|
||||
# 替换合并树 {#replacingmergetree}
|
||||
# ReplacingMergeTree {#replacingmergetree}
|
||||
|
||||
该引擎和[MergeTree](mergetree.md)的不同之处在于它会删除具有相同主键的重复项。
|
||||
该引擎和 [MergeTree](mergetree.md) 的不同之处在于它会删除排序键值相同的重复项。
|
||||
|
||||
数据的去重只会在合并的过程中出现。合并会在未知的时间在后台进行,因此你无法预先作出计划。有一些数据可能仍未被处理。尽管你可以调用 `OPTIMIZE` 语句发起计划外的合并,但请不要指望使用它,因为 `OPTIMIZE` 语句会引发对大量数据的读和写。
|
||||
数据的去重只会在数据合并期间进行。合并会在后台一个不确定的时间进行,因此你无法预先作出计划。有一些数据可能仍未被处理。尽管你可以调用 `OPTIMIZE` 语句发起计划外的合并,但请不要依靠它,因为 `OPTIMIZE` 语句会引发对数据的大量读写。
|
||||
|
||||
因此,`ReplacingMergeTree` 适用于在后台清除重复的数据以节省空间,但是它不保证没有重复的数据出现。
|
||||
|
||||
@ -21,19 +21,20 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]
|
||||
[SETTINGS name=value, ...]
|
||||
```
|
||||
|
||||
请求参数的描述,参考[请求参数](../../../engines/table-engines/mergetree-family/replacingmergetree.md)。
|
||||
有关建表参数的描述,可参考 [创建表](../../../sql-reference/statements/create.md#create-table-query)。
|
||||
|
||||
**参数**
|
||||
**ReplacingMergeTree 的参数**
|
||||
|
||||
- `ver` — 版本列。类型为 `UInt*`, `Date` 或 `DateTime`。可选参数。
|
||||
|
||||
合并的时候,`ReplacingMergeTree` 从所有具有相同主键的行中选择一行留下:
|
||||
- 如果 `ver` 列未指定,选择最后一条。
|
||||
- 如果 `ver` 列已指定,选择 `ver` 值最大的版本。
|
||||
在数据合并的时候,`ReplacingMergeTree` 从所有具有相同排序键的行中选择一行留下:
|
||||
|
||||
- 如果 `ver` 列未指定,保留最后一条。
|
||||
- 如果 `ver` 列已指定,保留 `ver` 值最大的版本。
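A minimal sketch of a table that keeps, for every sorting key value, the row with the largest `ver`; the table name `rmt_demo` is hypothetical:

``` sql
CREATE TABLE rmt_demo
(
    key UInt64,
    value String,
    ver UInt32
)
ENGINE = ReplacingMergeTree(ver)
ORDER BY key;
```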
**子句**
|
||||
|
||||
创建 `ReplacingMergeTree` 表时,需要与创建 `MergeTree` 表时相同的[子句](mergetree.md)。
|
||||
创建 `ReplacingMergeTree` 表时,需要使用与创建 `MergeTree` 表时相同的 [子句](mergetree.md)。
|
||||
|
||||
<details markdown="1">
|
||||
|
||||
|
@ -80,7 +80,7 @@ clickhouse-client --query='INSERT INTO table FORMAT TabSeparated' < data.tsv
|
||||
|
||||
## 导入示例数据集 {#import-sample-dataset}
|
||||
|
||||
现在是时候用一些示例数据填充我们的ClickHouse服务器。 在本教程中,我们将使用Yandex的匿名数据。Metrica,在成为开源之前以生产方式运行ClickHouse的第一个服务(更多关于这一点 [历史科](../introduction/history.md)). 有 [多种导入Yandex的方式。梅里卡数据集](example-datasets/metrica.md),为了本教程,我们将使用最现实的一个。
|
||||
现在是时候用一些示例数据填充我们的ClickHouse服务端。在本教程中,我们将使用Yandex.Metrica的匿名数据,它是在ClickHouse成为开源之前作为生产环境运行的第一个服务(关于这一点的更多内容请参阅[ClickHouse历史](../introduction/history.md))。有 [多种导入Yandex.Metrica数据集的方法](example-datasets/metrica.md),为了本教程,我们将使用最现实的一个。
|
||||
|
||||
### 下载并提取表数据 {#download-and-extract-table-data}
|
||||
|
||||
@ -93,22 +93,22 @@ curl https://clickhouse-datasets.s3.yandex.net/visits/tsv/visits_v1.tsv.xz | unx
|
||||
|
||||
### 创建表 {#create-tables}
|
||||
|
||||
与大多数数据库管理系统一样,ClickHouse在逻辑上将表分组为 “databases”. 有一个 `default` 数据库,但我们将创建一个名为新的 `tutorial`:
|
||||
与大多数数据库管理系统一样,ClickHouse在逻辑上将表分组为数据库。包含一个 `default` 数据库,但我们将创建一个新的数据库 `tutorial`:
|
||||
|
||||
``` bash
|
||||
clickhouse-client --query "CREATE DATABASE IF NOT EXISTS tutorial"
|
||||
```
|
||||
|
||||
与数据库相比,创建表的语法要复杂得多(请参阅 [参考资料](../sql-reference/statements/create.md). 一般 `CREATE TABLE` 声明必须指定三个关键的事情:
|
||||
与创建数据库相比,创建表的语法要复杂得多(请参阅 [参考资料](../sql-reference/statements/create.md))。一般 `CREATE TABLE` 声明必须指定三个关键的事情:
|
||||
|
||||
1. 要创建的表的名称。
|
||||
2. Table schema, i.e. list of columns and their [数据类型](../sql-reference/data-types/index.md).
|
||||
3. [表引擎](../engines/table-engines/index.md) 及其设置,这决定了如何物理执行对此表的查询的所有细节。
|
||||
2. 表结构,例如:列名和对应的[数据类型](../sql-reference/data-types/index.md)。
|
||||
3. [表引擎](../engines/table-engines/index.md) 及其设置,这决定了对此表的查询操作是如何在物理层面执行的所有细节。
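A hedged illustration of these three parts (the table name `tutorial.demo` is made up; the real tutorial tables follow below):

``` sql
CREATE TABLE tutorial.demo      -- 1. table name
(
    id  UInt64,                 -- 2. schema: columns and their data types
    msg String
)
ENGINE = MergeTree()            -- 3. table engine and its settings
ORDER BY id;
```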
YandexMetrica是一个网络分析服务,样本数据集不包括其全部功能,因此只有两个表可以创建:
|
||||
Yandex.Metrica是一个网络分析服务,样本数据集不包括其全部功能,因此只有两个表可以创建:
|
||||
|
||||
- `hits` 是一个表格,其中包含所有用户在服务所涵盖的所有网站上完成的每个操作。
|
||||
- `visits` 是一个包含预先构建的会话而不是单个操作的表。
|
||||
- `hits` 表包含所有用户在服务所涵盖的所有网站上完成的每个操作。
|
||||
- `visits` 表包含预先构建的会话,而不是单个操作。
|
||||
|
||||
让我们看看并执行这些表的实际创建表查询:
|
||||
|
||||
@ -453,9 +453,9 @@ SAMPLE BY intHash32(UserID)
|
||||
SETTINGS index_granularity = 8192
|
||||
```
|
||||
|
||||
您可以使用以下交互模式执行这些查询 `clickhouse-client` (只需在终端中启动它,而不需要提前指定查询)或尝试一些 [替代接口](../interfaces/index.md) 如果你愿意的话
|
||||
您可以使用`clickhouse-client`的交互模式执行这些查询(只需在终端中启动它,而不需要提前指定查询)。或者如果你愿意,可以尝试一些[替代接口](../interfaces/index.md)。
|
||||
|
||||
正如我们所看到的, `hits_v1` 使用 [基本MergeTree引擎](../engines/table-engines/mergetree-family/mergetree.md),而 `visits_v1` 使用 [崩溃](../engines/table-engines/mergetree-family/collapsingmergetree.md) 变体。
|
||||
正如我们所看到的, `hits_v1` 使用 [基本的MergeTree引擎](../engines/table-engines/mergetree-family/mergetree.md),而 `visits_v1` 使用 [折叠树](../engines/table-engines/mergetree-family/collapsingmergetree.md) 变体。
|
||||
|
||||
### 导入数据 {#import-data}
|
||||
|
||||
|
@ -13,7 +13,7 @@ Yandex.Metrica基于用户定义的字段,对实时访问、连接会话,生
|
||||
|
||||
ClickHouse还被使用在:
|
||||
|
||||
- 存储来自Yandex.Metrica回话重放数据。
|
||||
- 存储来自Yandex.Metrica的会话重放数据。
|
||||
- 处理中间数据
|
||||
- 与Analytics一起构建全球报表。
|
||||
- 为调试Yandex.Metrica引擎运行查询
|
||||
|
@ -1,6 +1,6 @@
|
||||
---
|
||||
toc_priority: 33
|
||||
toc_title: 简介
|
||||
toc_title: 聚合函数
|
||||
---
|
||||
|
||||
# 聚合函数 {#aggregate-functions}
|
||||
|
@ -34,7 +34,7 @@
|
||||
│ 2 │ 3 │
|
||||
└───┴──────┘
|
||||
|
||||
执行查询 `SELECT multiIf(isNull(y) x, y < 3, y, NULL) FROM t_null`。结果:
|
||||
执行查询 `SELECT multiIf(isNull(y), x, y < 3, y, NULL) FROM t_null`。结果:
|
||||
|
||||
┌─multiIf(isNull(y), x, less(y, 3), y, NULL)─┐
|
||||
│ 1 │
|
||||
|
@ -1167,6 +1167,9 @@ private:
|
||||
dump_of_cloned_ast.str().c_str());
|
||||
fprintf(stderr, "dump after fuzz:\n");
|
||||
fuzz_base->dumpTree(std::cerr);
|
||||
|
||||
fmt::print(stderr, "IAST::clone() is broken for some AST node. This is a bug. The original AST ('dump before fuzz') and its cloned copy ('dump of cloned AST') refer to the same nodes, which must never happen. This means that their parent node doesn't implement clone() correctly.");
|
||||
|
||||
assert(false);
|
||||
}
|
||||
|
||||
@ -1504,7 +1507,18 @@ private:
|
||||
{
|
||||
/// Send data contained in the query.
|
||||
ReadBufferFromMemory data_in(parsed_insert_query->data, parsed_insert_query->end - parsed_insert_query->data);
|
||||
sendDataFrom(data_in, sample, columns_description);
|
||||
try
|
||||
{
|
||||
sendDataFrom(data_in, sample, columns_description);
|
||||
}
|
||||
catch (Exception & e)
|
||||
{
|
||||
/// The following query will use data from input
|
||||
// "INSERT INTO data FORMAT TSV\n " < data.csv
|
||||
// And may be pretty hard to debug, so add information about data source to make it easier.
|
||||
e.addMessage("data for INSERT was parsed from query");
|
||||
throw;
|
||||
}
|
||||
// Remember where the data ended. We use this info later to determine
|
||||
// where the next query begins.
|
||||
parsed_insert_query->end = data_in.buffer().begin() + data_in.count();
|
||||
@ -1512,7 +1526,15 @@ private:
|
||||
else if (!is_interactive)
|
||||
{
|
||||
/// Send data read from stdin.
|
||||
sendDataFrom(std_in, sample, columns_description);
|
||||
try
|
||||
{
|
||||
sendDataFrom(std_in, sample, columns_description);
|
||||
}
|
||||
catch (Exception & e)
|
||||
{
|
||||
e.addMessage("data for INSERT was parsed from stdin");
|
||||
throw;
|
||||
}
|
||||
}
|
||||
else
|
||||
throw Exception("No data to insert", ErrorCodes::NO_DATA_TO_INSERT);
|
||||
|
@ -32,6 +32,7 @@
|
||||
#include <Common/getExecutablePath.h>
|
||||
#include <Common/ThreadProfileEvents.h>
|
||||
#include <Common/ThreadStatus.h>
|
||||
#include <Common/getMappedArea.h>
|
||||
#include <Common/remapExecutable.h>
|
||||
#include <IO/HTTPCommon.h>
|
||||
#include <IO/UseSSL.h>
|
||||
@ -43,7 +44,6 @@
|
||||
#include <Interpreters/loadMetadata.h>
|
||||
#include <Interpreters/DatabaseCatalog.h>
|
||||
#include <Interpreters/DNSCacheUpdater.h>
|
||||
#include <Interpreters/SystemLog.cpp>
|
||||
#include <Interpreters/ExternalLoaderXMLConfigRepository.h>
|
||||
#include <Access/AccessControlManager.h>
|
||||
#include <Storages/StorageReplicatedMergeTree.h>
|
||||
@ -90,6 +90,23 @@ namespace CurrentMetrics
|
||||
extern const Metric MemoryTracking;
|
||||
}
|
||||
|
||||
|
||||
int mainEntryClickHouseServer(int argc, char ** argv)
|
||||
{
|
||||
DB::Server app;
|
||||
try
|
||||
{
|
||||
return app.run(argc, argv);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
std::cerr << DB::getCurrentExceptionMessage(true) << "\n";
|
||||
auto code = DB::getCurrentExceptionCode();
|
||||
return code ? code : 1;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
namespace
|
||||
{
|
||||
|
||||
@ -280,6 +297,11 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
||||
global_context->makeGlobalContext();
|
||||
global_context->setApplicationType(Context::ApplicationType::SERVER);
|
||||
|
||||
// Initialize global thread pool. Do it before we fetch configs from zookeeper
|
||||
// nodes (`from_zk`), because ZooKeeper interface uses the pool. We will
|
||||
// ignore `max_thread_pool_size` in configs we fetch from ZK, but oh well.
|
||||
GlobalThreadPool::initialize(config().getUInt("max_thread_pool_size", 10000));
|
||||
|
||||
bool has_zookeeper = config().has("zookeeper");
|
||||
|
||||
zkutil::ZooKeeperNodeCache main_config_zk_node_cache([&] { return global_context->getZooKeeper(); });
|
||||
@ -317,11 +339,16 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
||||
{
|
||||
if (hasLinuxCapability(CAP_IPC_LOCK))
|
||||
{
|
||||
LOG_TRACE(log, "Will mlockall to prevent executable memory from being paged out. It may take a few seconds.");
|
||||
if (0 != mlockall(MCL_CURRENT))
|
||||
LOG_WARNING(log, "Failed mlockall: {}", errnoToString(ErrorCodes::SYSTEM_ERROR));
|
||||
/// Get the memory area with (current) code segment.
|
||||
/// It's better to lock only the code segment instead of calling "mlockall",
|
||||
/// because otherwise debug info will be also locked in memory, and it can be huge.
|
||||
auto [addr, len] = getMappedArea(reinterpret_cast<void *>(mainEntryClickHouseServer));
|
||||
|
||||
LOG_TRACE(log, "Will do mlock to prevent executable memory from being paged out. It may take a few seconds.");
|
||||
if (0 != mlock(addr, len))
|
||||
LOG_WARNING(log, "Failed mlock: {}", errnoToString(ErrorCodes::SYSTEM_ERROR));
|
||||
else
|
||||
LOG_TRACE(log, "The memory map of clickhouse executable has been mlock'ed");
|
||||
LOG_TRACE(log, "The memory map of clickhouse executable has been mlock'ed, total {}", ReadableSize(len));
|
||||
}
|
||||
else
|
||||
{
|
||||
@ -414,9 +441,6 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
||||
DateLUT::instance();
|
||||
LOG_TRACE(log, "Initialized DateLUT with time zone '{}'.", DateLUT::instance().getTimeZone());
|
||||
|
||||
/// Initialize global thread pool
|
||||
GlobalThreadPool::initialize(config().getUInt("max_thread_pool_size", 10000));
|
||||
|
||||
/// Storage with temporary data for processing of heavy queries.
|
||||
{
|
||||
std::string tmp_path = config().getString("tmp_path", path + "tmp/");
|
||||
@ -607,6 +631,7 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
||||
|
||||
/// Check sanity of MergeTreeSettings on server startup
|
||||
global_context->getMergeTreeSettings().sanityCheck(settings);
|
||||
global_context->getReplicatedMergeTreeSettings().sanityCheck(settings);
|
||||
|
||||
/// Limit on total memory usage
|
||||
size_t max_server_memory_usage = config().getUInt64("max_server_memory_usage", 0);
|
||||
@ -719,7 +744,10 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
||||
{
|
||||
/// DDL worker should be started after all tables were loaded
|
||||
String ddl_zookeeper_path = config().getString("distributed_ddl.path", "/clickhouse/task_queue/ddl/");
|
||||
global_context->setDDLWorker(std::make_unique<DDLWorker>(ddl_zookeeper_path, *global_context, &config(), "distributed_ddl"));
|
||||
int pool_size = config().getInt("distributed_ddl.pool_size", 1);
|
||||
if (pool_size < 1)
|
||||
throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
|
||||
global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, *global_context, &config(), "distributed_ddl"));
|
||||
}
|
||||
|
||||
std::unique_ptr<DNSCacheUpdater> dns_cache_updater;
|
||||
@ -1135,21 +1163,3 @@ int Server::main(const std::vector<std::string> & /*args*/)
|
||||
return Application::EXIT_OK;
|
||||
}
|
||||
}
|
||||
|
||||
#pragma GCC diagnostic ignored "-Wunused-function"
|
||||
#pragma GCC diagnostic ignored "-Wmissing-declarations"
|
||||
|
||||
int mainEntryClickHouseServer(int argc, char ** argv)
|
||||
{
|
||||
DB::Server app;
|
||||
try
|
||||
{
|
||||
return app.run(argc, argv);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
std::cerr << DB::getCurrentExceptionMessage(true) << "\n";
|
||||
auto code = DB::getCurrentExceptionCode();
|
||||
return code ? code : 1;
|
||||
}
|
||||
}
|
||||
|
@ -671,6 +671,9 @@
|
||||
|
||||
<!-- Settings from this profile will be used to execute DDL queries -->
|
||||
<!-- <profile>default</profile> -->
|
||||
|
||||
<!-- Controls how much ON CLUSTER queries can be run simultaneously. -->
|
||||
<!-- <pool_size>1</pool_size> -->
|
||||
</distributed_ddl>
|
||||
|
||||
<!-- Settings to fine tune MergeTree tables. See documentation in source code, in MergeTreeSettings.h -->
|
||||
|
2
release
2
release
@ -106,7 +106,7 @@ elif [[ $BUILD_TYPE == 'debug' ]]; then
|
||||
VERSION_POSTFIX+="+debug"
|
||||
fi
|
||||
|
||||
CMAKE_FLAGS=" $MALLOC_OPTS -DSANITIZE=$SANITIZER $CMAKE_FLAGS"
|
||||
CMAKE_FLAGS=" $MALLOC_OPTS -DSANITIZE=$SANITIZER -DENABLE_CHECK_HEAVY_BUILDS=1 $CMAKE_FLAGS"
|
||||
[[ -n "$CMAKE_BUILD_TYPE" ]] && CMAKE_FLAGS=" -DCMAKE_BUILD_TYPE=$CMAKE_BUILD_TYPE $CMAKE_FLAGS"
|
||||
|
||||
export CMAKE_FLAGS
|
||||
|
@ -339,6 +339,11 @@ void AccessControlManager::addStoragesFromMainConfig(
|
||||
}
|
||||
|
||||
|
||||
UUID AccessControlManager::login(const String & user_name, const String & password, const Poco::Net::IPAddress & address) const
|
||||
{
|
||||
return MultipleAccessStorage::login(user_name, password, address, *external_authenticators);
|
||||
}
|
||||
|
||||
void AccessControlManager::setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config)
|
||||
{
|
||||
external_authenticators->setConfig(config, getLogger());
|
||||
|
@ -106,6 +106,7 @@ public:
|
||||
bool isSettingNameAllowed(const std::string_view & name) const;
|
||||
void checkSettingNameIsAllowed(const std::string_view & name) const;
|
||||
|
||||
UUID login(const String & user_name, const String & password, const Poco::Net::IPAddress & address) const;
|
||||
void setExternalAuthenticatorsConfig(const Poco::Util::AbstractConfiguration & config);
|
||||
|
||||
std::shared_ptr<const ContextAccess> getContextAccess(
|
||||
|
@ -288,23 +288,6 @@ void ContextAccess::calculateAccessRights() const
|
||||
}
|
||||
|
||||
|
||||
bool ContextAccess::isCorrectPassword(const String & password) const
|
||||
{
|
||||
std::lock_guard lock{mutex};
|
||||
if (!user)
|
||||
return false;
|
||||
return user->authentication.isCorrectPassword(password, user_name, manager->getExternalAuthenticators());
|
||||
}
|
||||
|
||||
bool ContextAccess::isClientHostAllowed() const
|
||||
{
|
||||
std::lock_guard lock{mutex};
|
||||
if (!user)
|
||||
return false;
|
||||
return user->allowed_client_hosts.contains(params.address);
|
||||
}
|
||||
|
||||
|
||||
UserPtr ContextAccess::getUser() const
|
||||
{
|
||||
std::lock_guard lock{mutex};
|
||||
|
@ -63,9 +63,6 @@ public:
|
||||
UserPtr getUser() const;
|
||||
String getUserName() const;
|
||||
|
||||
bool isCorrectPassword(const String & password) const;
|
||||
bool isClientHostAllowed() const;
|
||||
|
||||
/// Returns information about current and enabled roles.
|
||||
/// The function can return nullptr.
|
||||
std::shared_ptr<const EnabledRolesInfo> getRolesInfo() const;
|
||||
|
@ -1,4 +1,5 @@
|
||||
#include <Access/IAccessStorage.h>
|
||||
#include <Access/User.h>
|
||||
#include <Common/Exception.h>
|
||||
#include <Common/quoteString.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
@ -13,6 +14,7 @@ namespace ErrorCodes
|
||||
extern const int ACCESS_ENTITY_ALREADY_EXISTS;
|
||||
extern const int ACCESS_ENTITY_NOT_FOUND;
|
||||
extern const int ACCESS_STORAGE_READONLY;
|
||||
extern const int AUTHENTICATION_FAILED;
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
@ -412,6 +414,57 @@ void IAccessStorage::notify(const Notifications & notifications)
|
||||
}
|
||||
|
||||
|
||||
UUID IAccessStorage::login(
|
||||
const String & user_name,
|
||||
const String & password,
|
||||
const Poco::Net::IPAddress & address,
|
||||
const ExternalAuthenticators & external_authenticators) const
|
||||
{
|
||||
return loginImpl(user_name, password, address, external_authenticators);
|
||||
}
|
||||
|
||||
|
||||
UUID IAccessStorage::loginImpl(
|
||||
const String & user_name,
|
||||
const String & password,
|
||||
const Poco::Net::IPAddress & address,
|
||||
const ExternalAuthenticators & external_authenticators) const
|
||||
{
|
||||
if (auto id = find<User>(user_name))
|
||||
{
|
||||
if (auto user = tryRead<User>(*id))
|
||||
{
|
||||
if (isPasswordCorrectImpl(*user, password, external_authenticators) && isAddressAllowedImpl(*user, address))
|
||||
return *id;
|
||||
}
|
||||
}
|
||||
throwCannotAuthenticate(user_name);
|
||||
}
|
||||
|
||||
|
||||
bool IAccessStorage::isPasswordCorrectImpl(const User & user, const String & password, const ExternalAuthenticators & external_authenticators) const
|
||||
{
|
||||
return user.authentication.isCorrectPassword(password, user.getName(), external_authenticators);
|
||||
}
|
||||
|
||||
|
||||
bool IAccessStorage::isAddressAllowedImpl(const User & user, const Poco::Net::IPAddress & address) const
|
||||
{
|
||||
return user.allowed_client_hosts.contains(address);
|
||||
}
|
||||
|
||||
UUID IAccessStorage::getIDOfLoggedUser(const String & user_name) const
|
||||
{
|
||||
return getIDOfLoggedUserImpl(user_name);
|
||||
}
|
||||
|
||||
|
||||
UUID IAccessStorage::getIDOfLoggedUserImpl(const String & user_name) const
|
||||
{
|
||||
return getID<User>(user_name);
|
||||
}
|
||||
|
||||
|
||||
UUID IAccessStorage::generateRandomID()
|
||||
{
|
||||
static Poco::UUIDGenerator generator;
|
||||
@ -500,4 +553,13 @@ void IAccessStorage::throwReadonlyCannotRemove(EntityType type, const String & n
|
||||
"Cannot remove " + outputEntityTypeAndName(type, name) + " from " + getStorageName() + " because this storage is readonly",
|
||||
ErrorCodes::ACCESS_STORAGE_READONLY);
|
||||
}
|
||||
|
||||
|
||||
void IAccessStorage::throwCannotAuthenticate(const String & user_name)
|
||||
{
|
||||
/// We use the same message for all authentication failures because we don't want to give away any unnecessary information for security reasons,
|
||||
/// only the log will show the exact reason.
|
||||
throw Exception(user_name + ": Authentication failed: password is incorrect or there is no user with such name", ErrorCodes::AUTHENTICATION_FAILED);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -11,9 +11,13 @@
|
||||
|
||||
|
||||
namespace Poco { class Logger; }
|
||||
namespace Poco::Net { class IPAddress; }
|
||||
|
||||
namespace DB
|
||||
{
|
||||
struct User;
|
||||
class ExternalAuthenticators;
|
||||
|
||||
/// Contains entities, i.e. instances of classes derived from IAccessEntity.
|
||||
/// The implementations of this class MUST be thread-safe.
|
||||
class IAccessStorage
|
||||
@ -138,6 +142,14 @@ public:
|
||||
bool hasSubscription(EntityType type) const;
|
||||
bool hasSubscription(const UUID & id) const;
|
||||
|
||||
/// Finds a user, checks the password and returns the ID of the user.
/// Throws an exception if there is no such user or the password is incorrect.
|
||||
UUID login(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const;
|
||||
|
||||
/// Returns the ID of a user who has logged in (maybe on another node).
|
||||
/// The function assumes that the password has been already checked somehow, so we can skip checking it now.
|
||||
UUID getIDOfLoggedUser(const String & user_name) const;
|
||||
|
||||
protected:
|
||||
virtual std::optional<UUID> findImpl(EntityType type, const String & name) const = 0;
|
||||
virtual std::vector<UUID> findAllImpl(EntityType type) const = 0;
|
||||
@ -152,6 +164,10 @@ protected:
|
||||
virtual ext::scope_guard subscribeForChangesImpl(EntityType type, const OnChangedHandler & handler) const = 0;
|
||||
virtual bool hasSubscriptionImpl(const UUID & id) const = 0;
|
||||
virtual bool hasSubscriptionImpl(EntityType type) const = 0;
|
||||
virtual UUID loginImpl(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const;
|
||||
virtual bool isPasswordCorrectImpl(const User & user, const String & password, const ExternalAuthenticators & external_authenticators) const;
|
||||
virtual bool isAddressAllowedImpl(const User & user, const Poco::Net::IPAddress & address) const;
|
||||
virtual UUID getIDOfLoggedUserImpl(const String & user_name) const;
|
||||
|
||||
static UUID generateRandomID();
|
||||
Poco::Logger * getLogger() const;
|
||||
@ -166,6 +182,7 @@ protected:
|
||||
[[noreturn]] void throwReadonlyCannotInsert(EntityType type, const String & name) const;
|
||||
[[noreturn]] void throwReadonlyCannotUpdate(EntityType type, const String & name) const;
|
||||
[[noreturn]] void throwReadonlyCannotRemove(EntityType type, const String & name) const;
|
||||
[[noreturn]] static void throwCannotAuthenticate(const String & user_name);
|
||||
|
||||
using Notification = std::tuple<OnChangedHandler, UUID, AccessEntityPtr>;
|
||||
using Notifications = std::vector<Notification>;
|
||||
|
@ -392,4 +392,58 @@ void MultipleAccessStorage::updateSubscriptionsToNestedStorages(std::unique_lock
|
||||
added_subscriptions->clear();
|
||||
}
|
||||
|
||||
|
||||
UUID MultipleAccessStorage::loginImpl(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const
|
||||
{
|
||||
auto storages = getStoragesInternal();
|
||||
for (const auto & storage : *storages)
|
||||
{
|
||||
try
|
||||
{
|
||||
auto id = storage->login(user_name, password, address, external_authenticators);
|
||||
std::lock_guard lock{mutex};
|
||||
ids_cache.set(id, storage);
|
||||
return id;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
if (!storage->find(EntityType::USER, user_name))
|
||||
{
|
||||
/// The authentication failed because there are no users with such a name in the `storage`,
|
||||
/// thus we can try to search in other nested storages.
|
||||
continue;
|
||||
}
|
||||
throw;
|
||||
}
|
||||
}
|
||||
throwCannotAuthenticate(user_name);
|
||||
}
|
||||
|
||||
|
||||
UUID MultipleAccessStorage::getIDOfLoggedUserImpl(const String & user_name) const
|
||||
{
|
||||
auto storages = getStoragesInternal();
|
||||
for (const auto & storage : *storages)
|
||||
{
|
||||
try
|
||||
{
|
||||
auto id = storage->getIDOfLoggedUser(user_name);
|
||||
std::lock_guard lock{mutex};
|
||||
ids_cache.set(id, storage);
|
||||
return id;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
if (!storage->find(EntityType::USER, user_name))
|
||||
{
|
||||
/// The authentication failed because there are no users with such a name in the `storage`,
|
||||
/// thus we can try to search in other nested storages.
|
||||
continue;
|
||||
}
|
||||
throw;
|
||||
}
|
||||
}
|
||||
throwNotFound(EntityType::USER, user_name);
|
||||
}
|
||||
|
||||
}
|
||||
|
@ -47,6 +47,8 @@ protected:
|
||||
ext::scope_guard subscribeForChangesImpl(EntityType type, const OnChangedHandler & handler) const override;
|
||||
bool hasSubscriptionImpl(const UUID & id) const override;
|
||||
bool hasSubscriptionImpl(EntityType type) const override;
|
||||
UUID loginImpl(const String & user_name, const String & password, const Poco::Net::IPAddress & address, const ExternalAuthenticators & external_authenticators) const override;
|
||||
UUID getIDOfLoggedUserImpl(const String & user_name) const override;
|
||||
|
||||
private:
|
||||
using Storages = std::vector<StoragePtr>;
|
||||
|
@ -67,6 +67,7 @@ set(dbms_sources)
|
||||
add_headers_and_sources(clickhouse_common_io Common)
|
||||
add_headers_and_sources(clickhouse_common_io Common/HashTable)
|
||||
add_headers_and_sources(clickhouse_common_io IO)
|
||||
add_headers_and_sources(clickhouse_common_io IO/S3)
|
||||
list (REMOVE_ITEM clickhouse_common_io_sources Common/malloc.cpp Common/new_delete.cpp)
|
||||
|
||||
if(USE_RDKAFKA)
|
||||
@ -378,11 +379,6 @@ if (USE_BROTLI)
|
||||
target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${BROTLI_INCLUDE_DIR})
|
||||
endif()
|
||||
|
||||
if (USE_OPENCL)
|
||||
target_link_libraries (clickhouse_common_io PRIVATE ${OpenCL_LIBRARIES})
|
||||
target_include_directories (clickhouse_common_io SYSTEM BEFORE PRIVATE ${OpenCL_INCLUDE_DIRS})
|
||||
endif ()
|
||||
|
||||
if (USE_CASSANDRA)
|
||||
dbms_target_link_libraries(PUBLIC ${CASSANDRA_LIBRARY})
|
||||
dbms_target_include_directories (SYSTEM BEFORE PUBLIC ${CASS_INCLUDE_DIR})
|
||||
|
@ -56,6 +56,9 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts
|
||||
return tryGetEntry(pool, timeouts, fail_message, settings);
|
||||
};
|
||||
|
||||
size_t offset = 0;
|
||||
if (settings)
|
||||
offset = settings->load_balancing_first_offset % nested_pools.size();
|
||||
GetPriorityFunc get_priority;
|
||||
switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing)
|
||||
{
|
||||
@ -68,7 +71,7 @@ IConnectionPool::Entry ConnectionPoolWithFailover::get(const ConnectionTimeouts
|
||||
case LoadBalancing::RANDOM:
|
||||
break;
|
||||
case LoadBalancing::FIRST_OR_RANDOM:
|
||||
get_priority = [](size_t i) -> size_t { return i >= 1; };
|
||||
get_priority = [offset](size_t i) -> size_t { return i != offset; };
|
||||
break;
|
||||
case LoadBalancing::ROUND_ROBIN:
|
||||
if (last_used >= nested_pools.size())
|
||||
@ -190,6 +193,9 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
|
||||
else
|
||||
throw DB::Exception("Unknown pool allocation mode", DB::ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
size_t offset = 0;
|
||||
if (settings)
|
||||
offset = settings->load_balancing_first_offset % nested_pools.size();
|
||||
GetPriorityFunc get_priority;
|
||||
switch (settings ? LoadBalancing(settings->load_balancing) : default_load_balancing)
|
||||
{
|
||||
@ -202,7 +208,7 @@ std::vector<ConnectionPoolWithFailover::TryResult> ConnectionPoolWithFailover::g
|
||||
case LoadBalancing::RANDOM:
|
||||
break;
|
||||
case LoadBalancing::FIRST_OR_RANDOM:
|
||||
get_priority = [](size_t i) -> size_t { return i >= 1; };
|
||||
get_priority = [offset](size_t i) -> size_t { return i != offset; };
|
||||
break;
|
||||
case LoadBalancing::ROUND_ROBIN:
|
||||
if (last_used >= nested_pools.size())
|
||||
|
@ -781,18 +781,21 @@ void ColumnArray::getPermutation(bool reverse, size_t limit, int nan_direction_h
|
||||
|
||||
void ColumnArray::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, EqualRanges & equal_range) const
|
||||
{
|
||||
if (equal_range.empty())
|
||||
return;
|
||||
|
||||
if (limit >= size() || limit >= equal_range.back().second)
|
||||
limit = 0;
|
||||
|
||||
size_t n = equal_range.size();
|
||||
size_t number_of_ranges = equal_range.size();
|
||||
|
||||
if (limit)
|
||||
--n;
|
||||
--number_of_ranges;
|
||||
|
||||
EqualRanges new_ranges;
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
for (size_t i = 0; i < number_of_ranges; ++i)
|
||||
{
|
||||
const auto& [first, last] = equal_range[i];
|
||||
const auto & [first, last] = equal_range[i];
|
||||
|
||||
if (reverse)
|
||||
std::sort(res.begin() + first, res.begin() + last, Less<false>(*this, nan_direction_hint));
|
||||
@ -817,7 +820,13 @@ void ColumnArray::updatePermutation(bool reverse, size_t limit, int nan_directio
|
||||
|
||||
if (limit)
|
||||
{
|
||||
const auto& [first, last] = equal_range.back();
|
||||
const auto & [first, last] = equal_range.back();
|
||||
|
||||
if (limit < first || limit > last)
|
||||
return;
|
||||
|
||||
/// Since then we are working inside the interval.
|
||||
|
||||
if (reverse)
|
||||
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, Less<false>(*this, nan_direction_hint));
|
||||
else
|
||||
|
@ -7,6 +7,7 @@
|
||||
#include <Core/BigInt.h>
|
||||
|
||||
#include <common/unaligned.h>
|
||||
#include <ext/scope_guard.h>
|
||||
|
||||
#include <IO/WriteHelpers.h>
|
||||
|
||||
@ -142,25 +143,31 @@ void ColumnDecimal<T>::getPermutation(bool reverse, size_t limit, int , IColumn:
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
void ColumnDecimal<T>::updatePermutation(bool reverse, size_t limit, int, IColumn::Permutation & res, EqualRanges & equal_range) const
|
||||
void ColumnDecimal<T>::updatePermutation(bool reverse, size_t limit, int, IColumn::Permutation & res, EqualRanges & equal_ranges) const
|
||||
{
|
||||
if (limit >= data.size() || limit >= equal_range.back().second)
|
||||
if (equal_ranges.empty())
|
||||
return;
|
||||
|
||||
if (limit >= data.size() || limit >= equal_ranges.back().second)
|
||||
limit = 0;
|
||||
|
||||
size_t n = equal_range.size();
|
||||
size_t number_of_ranges = equal_ranges.size();
|
||||
if (limit)
|
||||
--n;
|
||||
--number_of_ranges;
|
||||
|
||||
EqualRanges new_ranges;
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
SCOPE_EXIT({equal_ranges = std::move(new_ranges);});
|
||||
|
||||
for (size_t i = 0; i < number_of_ranges; ++i)
|
||||
{
|
||||
const auto& [first, last] = equal_range[i];
|
||||
const auto& [first, last] = equal_ranges[i];
|
||||
if (reverse)
|
||||
std::partial_sort(res.begin() + first, res.begin() + last, res.begin() + last,
|
||||
[this](size_t a, size_t b) { return data[a] > data[b]; });
|
||||
else
|
||||
std::partial_sort(res.begin() + first, res.begin() + last, res.begin() + last,
|
||||
[this](size_t a, size_t b) { return data[a] < data[b]; });
|
||||
|
||||
auto new_first = first;
|
||||
for (auto j = first + 1; j < last; ++j)
|
||||
{
|
||||
@ -178,13 +185,20 @@ void ColumnDecimal<T>::updatePermutation(bool reverse, size_t limit, int, IColum
|
||||
|
||||
if (limit)
|
||||
{
|
||||
const auto& [first, last] = equal_range.back();
|
||||
const auto & [first, last] = equal_ranges.back();
|
||||
|
||||
if (limit < first || limit > last)
|
||||
return;
|
||||
|
||||
/// Since then we are working inside the interval.
|
||||
|
||||
if (reverse)
|
||||
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last,
|
||||
[this](size_t a, size_t b) { return data[a] > data[b]; });
|
||||
else
|
||||
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last,
|
||||
[this](size_t a, size_t b) { return data[a] < data[b]; });
|
||||
|
||||
auto new_first = first;
|
||||
for (auto j = first + 1; j < limit; ++j)
|
||||
{
|
||||
@ -208,7 +222,6 @@ void ColumnDecimal<T>::updatePermutation(bool reverse, size_t limit, int, IColum
|
||||
if (new_last - new_first > 1)
|
||||
new_ranges.emplace_back(new_first, new_last);
|
||||
}
|
||||
equal_range = std::move(new_ranges);
|
||||
}
|
||||
|
||||
template <typename T>
|
||||
|
@ -9,6 +9,8 @@
|
||||
#include <Common/WeakHash.h>
|
||||
#include <Common/HashTable/Hash.h>
|
||||
|
||||
#include <ext/scope_guard.h>
|
||||
|
||||
#include <DataStreams/ColumnGathererStream.h>
|
||||
|
||||
#include <IO/WriteHelpers.h>
|
||||
@ -168,24 +170,29 @@ void ColumnFixedString::getPermutation(bool reverse, size_t limit, int /*nan_dir
|
||||
}
|
||||
}
|
||||
|
||||
void ColumnFixedString::updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_range) const
|
||||
void ColumnFixedString::updatePermutation(bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_ranges) const
|
||||
{
|
||||
if (limit >= size() || limit >= equal_range.back().second)
|
||||
if (equal_ranges.empty())
|
||||
return;
|
||||
|
||||
if (limit >= size() || limit >= equal_ranges.back().second)
|
||||
limit = 0;
|
||||
|
||||
size_t k = equal_range.size();
|
||||
size_t number_of_ranges = equal_ranges.size();
|
||||
if (limit)
|
||||
--k;
|
||||
--number_of_ranges;
|
||||
|
||||
EqualRanges new_ranges;
|
||||
SCOPE_EXIT({equal_ranges = std::move(new_ranges);});
|
||||
|
||||
for (size_t i = 0; i < k; ++i)
|
||||
for (size_t i = 0; i < number_of_ranges; ++i)
|
||||
{
|
||||
const auto& [first, last] = equal_range[i];
|
||||
const auto& [first, last] = equal_ranges[i];
|
||||
if (reverse)
|
||||
std::sort(res.begin() + first, res.begin() + last, less<false>(*this));
|
||||
else
|
||||
std::sort(res.begin() + first, res.begin() + last, less<true>(*this));
|
||||
|
||||
auto new_first = first;
|
||||
for (auto j = first + 1; j < last; ++j)
|
||||
{
|
||||
@ -202,11 +209,18 @@ void ColumnFixedString::updatePermutation(bool reverse, size_t limit, int, Permu
|
||||
}
|
||||
if (limit)
|
||||
{
|
||||
const auto& [first, last] = equal_range.back();
|
||||
const auto & [first, last] = equal_ranges.back();
|
||||
|
||||
if (limit < first || limit > last)
|
||||
return;
|
||||
|
||||
/// Since then we are working inside the interval.
|
||||
|
||||
if (reverse)
|
||||
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less<false>(*this));
|
||||
else
|
||||
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less<true>(*this));
|
||||
|
||||
auto new_first = first;
|
||||
for (auto j = first + 1; j < limit; ++j)
|
||||
{
|
||||
@ -230,7 +244,6 @@ void ColumnFixedString::updatePermutation(bool reverse, size_t limit, int, Permu
|
||||
if (new_last - new_first > 1)
|
||||
new_ranges.emplace_back(new_first, new_last);
|
||||
}
|
||||
equal_range = std::move(new_ranges);
|
||||
}
|
||||
|
||||
void ColumnFixedString::insertRangeFrom(const IColumn & src, size_t start, size_t length)
|
||||
|
@ -6,6 +6,7 @@
|
||||
#include <Common/assert_cast.h>
|
||||
#include <Common/WeakHash.h>
|
||||
|
||||
#include <ext/scope_guard.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
@ -329,19 +330,24 @@ void ColumnLowCardinality::getPermutation(bool reverse, size_t limit, int nan_di
|
||||
}
|
||||
}
|
||||
|
||||
void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_range) const
|
||||
void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
|
||||
{
|
||||
if (limit >= size() || limit >= equal_range.back().second)
|
||||
if (equal_ranges.empty())
|
||||
return;
|
||||
|
||||
if (limit >= size() || limit >= equal_ranges.back().second)
|
||||
limit = 0;
|
||||
|
||||
size_t n = equal_range.size();
|
||||
size_t number_of_ranges = equal_ranges.size();
|
||||
if (limit)
|
||||
--n;
|
||||
--number_of_ranges;
|
||||
|
||||
EqualRanges new_ranges;
|
||||
for (size_t i = 0; i < n; ++i)
|
||||
SCOPE_EXIT({equal_ranges = std::move(new_ranges);});
|
||||
|
||||
for (size_t i = 0; i < number_of_ranges; ++i)
|
||||
{
|
||||
const auto& [first, last] = equal_range[i];
|
||||
const auto& [first, last] = equal_ranges[i];
|
||||
if (reverse)
|
||||
std::sort(res.begin() + first, res.begin() + last, [this, nan_direction_hint](size_t a, size_t b)
|
||||
{return getDictionary().compareAt(getIndexes().getUInt(a), getIndexes().getUInt(b), getDictionary(), nan_direction_hint) > 0; });
|
||||
@ -366,7 +372,13 @@ void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan
|
||||
|
||||
if (limit)
|
||||
{
|
||||
const auto& [first, last] = equal_range.back();
|
||||
const auto & [first, last] = equal_ranges.back();
|
||||
|
||||
if (limit < first || limit > last)
|
||||
return;
|
||||
|
||||
/// Since then we are working inside the interval.
|
||||
|
||||
if (reverse)
|
||||
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, [this, nan_direction_hint](size_t a, size_t b)
|
||||
{return getDictionary().compareAt(getIndexes().getUInt(a), getIndexes().getUInt(b), getDictionary(), nan_direction_hint) > 0; });
|
||||
@ -374,6 +386,7 @@ void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan
|
||||
std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, [this, nan_direction_hint](size_t a, size_t b)
|
||||
{return getDictionary().compareAt(getIndexes().getUInt(a), getIndexes().getUInt(b), getDictionary(), nan_direction_hint) < 0; });
|
||||
auto new_first = first;
|
||||
|
||||
for (auto j = first + 1; j < limit; ++j)
|
||||
{
|
||||
if (getDictionary().compareAt(getIndexes().getUInt(res[new_first]), getIndexes().getUInt(res[j]), getDictionary(), nan_direction_hint) != 0)
|
||||
@ -384,6 +397,7 @@ void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan
|
||||
new_first = j;
|
||||
}
|
||||
}
|
||||
|
||||
auto new_last = limit;
|
||||
for (auto j = limit; j < last; ++j)
|
||||
{
|
||||
@ -396,7 +410,6 @@ void ColumnLowCardinality::updatePermutation(bool reverse, size_t limit, int nan
|
||||
if (new_last - new_first > 1)
|
||||
new_ranges.emplace_back(new_first, new_last);
|
||||
}
|
||||
equal_range = std::move(new_ranges);
|
||||
}
|
||||
|
||||
std::vector<MutableColumnPtr> ColumnLowCardinality::scatter(ColumnIndex num_columns, const Selector & selector) const
|
||||
|
@ -329,73 +329,113 @@ void ColumnNullable::getPermutation(bool reverse, size_t limit, int null_directi
|
||||
}
|
||||
}
|
||||
|
||||
void ColumnNullable::updatePermutation(bool reverse, size_t limit, int null_direction_hint, IColumn::Permutation & res, EqualRanges & equal_range) const
|
||||
void ColumnNullable::updatePermutation(bool reverse, size_t limit, int null_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
|
||||
{
|
||||
if (limit >= equal_range.back().second || limit >= size())
|
||||
limit = 0;
|
||||
if (equal_ranges.empty())
|
||||
return;
|
||||
|
||||
EqualRanges new_ranges, temp_ranges;
|
||||
/// We will sort nested columns into `new_ranges` and call updatePermutation in next columns with `null_ranges`.
|
||||
EqualRanges new_ranges, null_ranges;
|
||||
|
||||
for (const auto &[first, last] : equal_range)
|
||||
const auto is_nulls_last = ((null_direction_hint > 0) != reverse);
|
||||
|
||||
if (is_nulls_last)
|
||||
{
|
||||
bool direction = ((null_direction_hint > 0) != reverse);
|
||||
/// Shift all NULL values to the end.
|
||||
|
||||
size_t read_idx = first;
|
||||
size_t write_idx = first;
|
||||
while (read_idx < last && (isNullAt(res[read_idx])^direction))
|
||||
for (const auto & [first, last] : equal_ranges)
|
||||
{
|
||||
++read_idx;
|
||||
++write_idx;
|
||||
}
|
||||
/// Current interval is righter than limit.
|
||||
if (limit && first > limit)
|
||||
break;
|
||||
|
||||
++read_idx;
|
||||
/// Consider a half interval [first, last)
|
||||
size_t read_idx = first;
|
||||
size_t write_idx = first;
|
||||
size_t end_idx = last;
|
||||
|
||||
/// Invariants:
|
||||
/// write_idx < read_idx
|
||||
/// write_idx points to NULL
|
||||
/// read_idx will be incremented to position of next not-NULL
|
||||
/// there are range of NULLs between write_idx and read_idx - 1,
|
||||
/// We are moving elements from end to begin of this range,
|
||||
/// so range will "bubble" towards the end.
|
||||
/// Relative order of NULL elements could be changed,
|
||||
/// but relative order of non-NULLs is preserved.
|
||||
|
||||
while (read_idx < last && write_idx < last)
|
||||
{
|
||||
if (isNullAt(res[read_idx])^direction)
|
||||
/// We can't check the limit here because the interval is not sorted by nested column.
|
||||
while (read_idx < end_idx && !isNullAt(res[read_idx]))
|
||||
{
|
||||
std::swap(res[read_idx], res[write_idx]);
|
||||
++read_idx;
|
||||
++write_idx;
|
||||
}
|
||||
++read_idx;
|
||||
}
|
||||
|
||||
if (write_idx - first > 1)
|
||||
{
|
||||
if (direction)
|
||||
temp_ranges.emplace_back(first, write_idx);
|
||||
else
|
||||
++read_idx;
|
||||
|
||||
/// Invariants:
|
||||
/// write_idx < read_idx
|
||||
/// write_idx points to NULL
|
||||
/// read_idx will be incremented to position of next not-NULL
|
||||
/// there are range of NULLs between write_idx and read_idx - 1,
|
||||
/// We are moving elements from end to begin of this range,
|
||||
/// so range will "bubble" towards the end.
|
||||
/// Relative order of NULL elements could be changed,
|
||||
/// but relative order of non-NULLs is preserved.
|
||||
|
||||
while (read_idx < end_idx && write_idx < end_idx)
|
||||
{
|
||||
if (!isNullAt(res[read_idx]))
|
||||
{
|
||||
std::swap(res[read_idx], res[write_idx]);
|
||||
++write_idx;
|
||||
}
|
||||
++read_idx;
|
||||
}
|
||||
|
||||
/// We have a range [first, write_idx) of non-NULL values
|
||||
if (first != write_idx)
|
||||
new_ranges.emplace_back(first, write_idx);
|
||||
|
||||
}
|
||||
|
||||
if (last - write_idx > 1)
|
||||
{
|
||||
if (direction)
|
||||
new_ranges.emplace_back(write_idx, last);
|
||||
else
|
||||
temp_ranges.emplace_back(write_idx, last);
|
||||
/// We have a range [write_idx, last) of NULL values
|
||||
if (write_idx != last)
|
||||
null_ranges.emplace_back(write_idx, last);
|
||||
}
|
||||
}
|
||||
while (!new_ranges.empty() && limit && limit <= new_ranges.back().first)
|
||||
new_ranges.pop_back();
|
||||
else
|
||||
{
|
||||
/// Shift all NULL values to the beginning.
|
||||
for (const auto & [first, last] : equal_ranges)
|
||||
{
|
||||
/// Current interval is righter than limit.
|
||||
if (limit && first > limit)
|
||||
break;
|
||||
|
||||
if (!temp_ranges.empty())
|
||||
getNestedColumn().updatePermutation(reverse, limit, null_direction_hint, res, temp_ranges);
|
||||
ssize_t read_idx = last - 1;
|
||||
ssize_t write_idx = last - 1;
|
||||
ssize_t begin_idx = first;
|
||||
|
||||
equal_range.resize(temp_ranges.size() + new_ranges.size());
|
||||
std::merge(temp_ranges.begin(), temp_ranges.end(), new_ranges.begin(), new_ranges.end(), equal_range.begin());
|
||||
while (read_idx >= begin_idx && !isNullAt(res[read_idx]))
|
||||
{
|
||||
--read_idx;
|
||||
--write_idx;
|
||||
}
|
||||
|
||||
--read_idx;
|
||||
|
||||
while (read_idx >= begin_idx && write_idx >= begin_idx)
|
||||
{
|
||||
if (!isNullAt(res[read_idx]))
|
||||
{
|
||||
std::swap(res[read_idx], res[write_idx]);
|
||||
--write_idx;
|
||||
}
|
||||
--read_idx;
|
||||
}
|
||||
|
||||
/// We have a range [write_idx+1, last) of non-NULL values
|
||||
if (write_idx != static_cast<ssize_t>(last))
|
||||
new_ranges.emplace_back(write_idx + 1, last);
|
||||
|
||||
/// We have a range [first, write_idx+1) of NULL values
|
||||
if (static_cast<ssize_t>(first) != write_idx)
|
||||
null_ranges.emplace_back(first, write_idx + 1);
|
||||
}
|
||||
}
|
||||
|
||||
getNestedColumn().updatePermutation(reverse, limit, null_direction_hint, res, new_ranges);
|
||||
|
||||
equal_ranges = std::move(new_ranges);
|
||||
std::move(null_ranges.begin(), null_ranges.end(), std::back_inserter(equal_ranges));
|
||||
}
|
||||
|
||||
void ColumnNullable::gather(ColumnGathererStream & gatherer)
|
||||
|
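The invariant comments in the hunk above describe a single forward pass over an equal range: every non-NULL row is swapped down into the write position, so the NULL entries of the permutation "bubble" towards the end while the relative order of the non-NULL rows is preserved. A self-contained sketch of just that pass, with std::optional standing in for a nullable column and a plain index vector for the permutation (illustrative names, not ClickHouse code):

#include <cstddef>
#include <iostream>
#include <optional>
#include <utility>
#include <vector>

/// Move the positions of NULL rows to the end of [first, last),
/// keeping the relative order of the non-NULL positions.
void shiftNullsToEnd(const std::vector<std::optional<int>> & column, std::vector<size_t> & perm, size_t first, size_t last)
{
    size_t read_idx = first;
    size_t write_idx = first;

    /// Skip the leading run of non-NULL rows: nothing has to move yet.
    while (read_idx < last && column[perm[read_idx]].has_value())
    {
        ++read_idx;
        ++write_idx;
    }

    /// From here on perm[write_idx] points at a NULL row; each later non-NULL row
    /// is swapped into the write position, so the NULL block drifts to the end.
    while (read_idx < last)
    {
        if (column[perm[read_idx]].has_value())
        {
            std::swap(perm[read_idx], perm[write_idx]);
            ++write_idx;
        }
        ++read_idx;
    }
}

int main()
{
    std::vector<std::optional<int>> column = {3, std::nullopt, 1, std::nullopt, 2};
    std::vector<size_t> perm = {0, 1, 2, 3, 4};

    shiftNullsToEnd(column, perm, 0, perm.size());

    for (size_t pos : perm)      /// prints 0 2 4 3 1
        std::cout << pos << ' ';
    std::cout << '\n';
}

As the comment in the diff warns, the non-NULL positions (0, 2, 4) keep their original order, while the NULL positions may come out reordered (3 before 1 here).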
@ -9,7 +9,7 @@
#include <DataStreams/ColumnGathererStream.h>

#include <common/unaligned.h>

#include <ext/scope_guard.h>

namespace DB
{
@ -325,25 +325,30 @@ void ColumnString::getPermutation(bool reverse, size_t limit, int /*nan_directio
    }
}

void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res, EqualRanges & equal_range) const
void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direction_hint*/, Permutation & res, EqualRanges & equal_ranges) const
{
    if (limit >= size() || limit > equal_range.back().second)
    if (equal_ranges.empty())
        return;

    if (limit >= size() || limit > equal_ranges.back().second)
        limit = 0;

    EqualRanges new_ranges;
    auto less_true = less<true>(*this);
    auto less_false = less<false>(*this);
    size_t n = equal_range.size();
    if (limit)
        --n;
    SCOPE_EXIT({equal_ranges = std::move(new_ranges);});

    for (size_t i = 0; i < n; ++i)
    size_t number_of_ranges = equal_ranges.size();
    if (limit)
        --number_of_ranges;

    for (size_t i = 0; i < number_of_ranges; ++i)
    {
        const auto &[first, last] = equal_range[i];
        const auto & [first, last] = equal_ranges[i];

        if (reverse)
            std::sort(res.begin() + first, res.begin() + last, less_false);
            std::sort(res.begin() + first, res.begin() + last, less<false>(*this));
        else
            std::sort(res.begin() + first, res.begin() + last, less_true);
            std::sort(res.begin() + first, res.begin() + last, less<true>(*this));

        size_t new_first = first;
        for (size_t j = first + 1; j < last; ++j)
        {
@ -363,11 +368,18 @@ void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direc

    if (limit)
    {
        const auto &[first, last] = equal_range.back();
        const auto & [first, last] = equal_ranges.back();

        if (limit < first || limit > last)
            return;

        /// Since then we are working inside the interval.

        if (reverse)
            std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less_false);
            std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less<false>(*this));
        else
            std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less_true);
            std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, less<true>(*this));

        size_t new_first = first;
        for (size_t j = first + 1; j < limit; ++j)
        {
@ -394,7 +406,6 @@ void ColumnString::updatePermutation(bool reverse, size_t limit, int /*nan_direc
        if (new_last - new_first > 1)
            new_ranges.emplace_back(new_first, new_last);
    }
    equal_range = std::move(new_ranges);
}

ColumnPtr ColumnString::replicate(const Offsets & replicate_offsets) const
@ -534,19 +545,25 @@ void ColumnString::getPermutationWithCollation(const Collator & collator, bool r
    }
}

void ColumnString::updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int, Permutation &res, EqualRanges &equal_range) const
void ColumnString::updatePermutationWithCollation(const Collator & collator, bool reverse, size_t limit, int, Permutation & res, EqualRanges & equal_ranges) const
{
    if (limit >= size() || limit >= equal_range.back().second)
    if (equal_ranges.empty())
        return;

    if (limit >= size() || limit >= equal_ranges.back().second)
        limit = 0;

    size_t n = equal_range.size();
    size_t number_of_ranges = equal_ranges.size();
    if (limit)
        --n;
        --number_of_ranges;

    EqualRanges new_ranges;
    for (size_t i = 0; i < n; ++i)
    SCOPE_EXIT({equal_ranges = std::move(new_ranges);});

    for (size_t i = 0; i < number_of_ranges; ++i)
    {
        const auto& [first, last] = equal_range[i];
        const auto& [first, last] = equal_ranges[i];

        if (reverse)
            std::sort(res.begin() + first, res.begin() + last, lessWithCollation<false>(*this, collator));
        else
@ -566,16 +583,22 @@ void ColumnString::updatePermutationWithCollation(const Collator & collator, boo
        }
        if (last - new_first > 1)
            new_ranges.emplace_back(new_first, last);

    }

    if (limit)
    {
        const auto& [first, last] = equal_range.back();
        const auto & [first, last] = equal_ranges.back();

        if (limit < first || limit > last)
            return;

        /// Since then we are working inside the interval.

        if (reverse)
            std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, lessWithCollation<false>(*this, collator));
        else
            std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, lessWithCollation<true>(*this, collator));

        auto new_first = first;
        for (auto j = first + 1; j < limit; ++j)
        {
@ -603,7 +626,6 @@ void ColumnString::updatePermutationWithCollation(const Collator & collator, boo
        if (new_last - new_first > 1)
            new_ranges.emplace_back(new_first, new_last);
    }
    equal_range = std::move(new_ranges);
}

void ColumnString::protect()
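When the limit falls inside the last equal range, only the first `limit` positions of that range have to come out in the right order, which is why the code above uses std::partial_sort there instead of std::sort. The same idea in isolation, with plain vectors standing in for ColumnString and Permutation (illustrative, not the ClickHouse types):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <vector>

int main()
{
    std::vector<std::string> column = {"pear", "apple", "fig", "plum", "banana"};
    std::vector<size_t> perm = {0, 1, 2, 3, 4};

    size_t limit = 3;    /// only the first 3 rows of the result are needed

    std::partial_sort(perm.begin(), perm.begin() + limit, perm.end(),
        [&](size_t lhs, size_t rhs) { return column[lhs] < column[rhs]; });

    for (size_t i = 0; i < limit; ++i)    /// prints apple, banana, fig
        std::cout << column[perm[i]] << '\n';
}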
@ -344,15 +344,19 @@ void ColumnTuple::getPermutation(bool reverse, size_t limit, int nan_direction_h
    }
}

void ColumnTuple::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_range) const
void ColumnTuple::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
{
    for (const auto& column : columns)
    {
        column->updatePermutation(reverse, limit, nan_direction_hint, res, equal_range);
        while (limit && !equal_range.empty() && limit <= equal_range.back().first)
            equal_range.pop_back();
    if (equal_ranges.empty())
        return;

        if (equal_range.empty())
    for (const auto & column : columns)
    {
        column->updatePermutation(reverse, limit, nan_direction_hint, res, equal_ranges);

        while (limit && !equal_ranges.empty() && limit <= equal_ranges.back().first)
            equal_ranges.pop_back();

        if (equal_ranges.empty())
            break;
    }
}
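Between columns, ranges that start at or beyond the limit are dropped, since sorting them can no longer change the first `limit` rows of the permutation. The pruning step on its own (assumed container types, not the ClickHouse EqualRanges alias):

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

int main()
{
    using Ranges = std::vector<std::pair<size_t, size_t>>;

    Ranges ranges = {{0, 3}, {3, 8}, {8, 12}};
    size_t limit = 5;    /// only the first 5 rows of the permutation matter

    /// A range that starts at or after the limit cannot influence the visible prefix.
    while (!ranges.empty() && limit <= ranges.back().first)
        ranges.pop_back();

    for (const auto & [first, last] : ranges)    /// prints [0,3) [3,8)
        std::cout << '[' << first << ',' << last << ") ";
    std::cout << '\n';
}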
@ -382,17 +382,20 @@ int ColumnUnique<ColumnType>::compareAt(size_t n, size_t m, const IColumn & rhs,
        }
    }

    auto & column_unique = static_cast<const IColumnUnique &>(rhs);
    const auto & column_unique = static_cast<const IColumnUnique &>(rhs);
    return getNestedColumn()->compareAt(n, m, *column_unique.getNestedColumn(), nan_direction_hint);
}

template <typename ColumnType>
void ColumnUnique<ColumnType>::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_range) const
void ColumnUnique<ColumnType>::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_ranges) const
{
    if (equal_ranges.empty())
        return;

    bool found_null_value_index = false;
    for (size_t i = 0; i < equal_range.size() && !found_null_value_index; ++i)
    for (size_t i = 0; i < equal_ranges.size() && !found_null_value_index; ++i)
    {
        auto& [first, last] = equal_range[i];
        auto & [first, last] = equal_ranges[i];
        for (auto j = first; j < last; ++j)
        {
            if (res[j] == getNullValueIndex())
@ -409,14 +412,14 @@ void ColumnUnique<ColumnType>::updatePermutation(bool reverse, size_t limit, int
                }
                if (last - first <= 1)
                {
                    equal_range.erase(equal_range.begin() + i);
                    equal_ranges.erase(equal_ranges.begin() + i);
                }
                found_null_value_index = true;
                break;
            }
        }
    }
    getNestedColumn()->updatePermutation(reverse, limit, nan_direction_hint, res, equal_range);
    getNestedColumn()->updatePermutation(reverse, limit, nan_direction_hint, res, equal_ranges);
}

template <typename IndexType>
@ -15,17 +15,9 @@
#include <Columns/ColumnsCommon.h>
#include <DataStreams/ColumnGathererStream.h>
#include <ext/bit_cast.h>
#include <ext/scope_guard.h>
#include <pdqsort.h>
#include <numeric>

#if !defined(ARCADIA_BUILD)
# include <Common/config.h>
# if USE_OPENCL
# include "Common/BitonicSort.h" // Y_IGNORE
# endif
#else
#undef USE_OPENCL
#endif

#ifdef __SSE2__
#include <emmintrin.h>
@ -38,7 +30,6 @@ namespace ErrorCodes
{
    extern const int PARAMETER_OUT_OF_BOUND;
    extern const int SIZES_OF_COLUMNS_DOESNT_MATCH;
    extern const int OPENCL_ERROR;
    extern const int LOGICAL_ERROR;
}

@ -146,29 +137,6 @@ namespace
    };
}

template <typename T>
void ColumnVector<T>::getSpecialPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res,
                                            IColumn::SpecialSort special_sort) const
{
    if (special_sort == IColumn::SpecialSort::OPENCL_BITONIC)
    {
#if !defined(ARCADIA_BUILD)
#if USE_OPENCL
        if (!limit || limit >= data.size())
        {
            res.resize(data.size());

            if (data.empty() || BitonicSort::getInstance().sort(data, res, !reverse))
                return;
        }
#else
        throw DB::Exception("'special_sort = bitonic' specified but OpenCL not available", DB::ErrorCodes::OPENCL_ERROR);
#endif
#endif
    }

    getPermutation(reverse, limit, nan_direction_hint, res);
}

template <typename T>
void ColumnVector<T>::getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const
@ -243,10 +211,14 @@ void ColumnVector<T>::getPermutation(bool reverse, size_t limit, int nan_directi
template <typename T>
void ColumnVector<T>::updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges & equal_range) const
{
    if (equal_range.empty())
        return;

    if (limit >= data.size() || limit >= equal_range.back().second)
        limit = 0;

    EqualRanges new_ranges;
    SCOPE_EXIT({equal_range = std::move(new_ranges);});

    for (size_t i = 0; i < equal_range.size() - bool(limit); ++i)
    {
@ -275,6 +247,12 @@ void ColumnVector<T>::updatePermutation(bool reverse, size_t limit, int nan_dire
    if (limit)
    {
        const auto & [first, last] = equal_range.back();

        if (limit < first || limit > last)
            return;

        /// Since then, we are working inside the interval.

        if (reverse)
            std::partial_sort(res.begin() + first, res.begin() + limit, res.begin() + last, greater(*this, nan_direction_hint));
        else
@ -307,7 +285,6 @@ void ColumnVector<T>::updatePermutation(bool reverse, size_t limit, int nan_dire
            new_ranges.emplace_back(new_first, new_last);
        }
    }
    equal_range = std::move(new_ranges);
}

template <typename T>
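The added SCOPE_EXIT line replaces the assignment that used to sit at the end of the function, so `equal_range` is also updated on the early `return` paths introduced by this hunk. SCOPE_EXIT comes from ext/scope_guard.h; a stripped-down sketch of the same pattern with a hand-rolled guard (illustrative only, not the ClickHouse implementation):

#include <cstddef>
#include <iostream>
#include <utility>
#include <vector>

/// A minimal scope guard: runs the stored callback when the enclosing scope ends.
template <typename F>
struct ScopeGuard
{
    F func;
    ~ScopeGuard() { func(); }
};
template <typename F> ScopeGuard(F) -> ScopeGuard<F>;

using Ranges = std::vector<std::pair<size_t, size_t>>;

void refineRanges(Ranges & ranges)
{
    Ranges new_ranges;
    /// Whichever path leaves this function, `ranges` ends up holding `new_ranges`.
    ScopeGuard guard{[&] { ranges = std::move(new_ranges); }};

    for (const auto & [first, last] : ranges)
    {
        if (last - first < 2)
            return;    /// early return still publishes new_ranges
        new_ranges.emplace_back(first, first + (last - first) / 2);
    }
}

int main()
{
    Ranges ranges = {{0, 4}, {4, 8}};
    refineRanges(ranges);
    std::cout << ranges.size() << '\n';    /// prints 2
}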
@ -218,8 +218,6 @@ public:
    }

    void getPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res) const override;
    void getSpecialPermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res,
                               IColumn::SpecialSort) const override;

    void updatePermutation(bool reverse, size_t limit, int nan_direction_hint, IColumn::Permutation & res, EqualRanges& equal_range) const override;
@ -267,17 +267,6 @@
      */
    virtual void getPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res) const = 0;

    enum class SpecialSort
    {
        NONE = 0,
        OPENCL_BITONIC,
    };

    virtual void getSpecialPermutation(bool reverse, size_t limit, int nan_direction_hint, Permutation & res, SpecialSort) const
    {
        getPermutation(reverse, limit, nan_direction_hint, res);
    }

    /*in updatePermutation we pass the current permutation and the intervals at which it should be sorted
     * Then for each interval separately (except for the last one, if there is a limit)
     * We sort it based on data about the current column, and find all the intervals within this
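The doc comment above is truncated by the hunk boundary, but the contract it describes is: for each equal range, sort the permutation by the current column, then report the sub-ranges that are still equal so the next column can break the remaining ties. A plain-vector sketch of that cascade (hypothetical helper, not the IColumn API):

#include <algorithm>
#include <cstddef>
#include <iostream>
#include <string>
#include <utility>
#include <vector>

using Ranges = std::vector<std::pair<size_t, size_t>>;

/// Sort `perm` inside each equal range by `column` and compute the ranges
/// that are still equal afterwards (these need the next column to break ties).
template <typename T>
void updatePermutationBy(const std::vector<T> & column, std::vector<size_t> & perm, Ranges & ranges)
{
    Ranges new_ranges;
    for (const auto & [first, last] : ranges)
    {
        std::sort(perm.begin() + first, perm.begin() + last,
            [&](size_t lhs, size_t rhs) { return column[lhs] < column[rhs]; });

        size_t new_first = first;
        for (size_t j = first + 1; j < last; ++j)
        {
            if (column[perm[j]] != column[perm[new_first]])
            {
                if (j - new_first > 1)
                    new_ranges.emplace_back(new_first, j);
                new_first = j;
            }
        }
        if (last - new_first > 1)
            new_ranges.emplace_back(new_first, last);
    }
    ranges = std::move(new_ranges);
}

int main()
{
    std::vector<std::string> city = {"Oslo", "Bern", "Oslo", "Bern"};
    std::vector<int> year = {2021, 2019, 2020, 2022};

    std::vector<size_t> perm = {0, 1, 2, 3};
    Ranges ranges = {{0, 4}};                     /// initially one big "all equal" range

    updatePermutationBy(city, perm, ranges);      /// sort by city; ranges become the two city groups
    updatePermutationBy(year, perm, ranges);      /// break ties inside each group by year

    for (size_t pos : perm)    /// prints (Bern,2019) (Bern,2022) (Oslo,2020) (Oslo,2021)
        std::cout << '(' << city[pos] << ',' << year[pos] << ") ";
    std::cout << '\n';
}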
@ -2,6 +2,8 @@
LIBRARY()

ADDINCL(
    contrib/libs/icu/common
    contrib/libs/icu/i18n
    contrib/libs/pdqsort
)
@ -1,221 +0,0 @@
#pragma once

#include <string.h>
#if !defined(__APPLE__) && !defined(__FreeBSD__)
#include <malloc.h>
#endif

#ifdef __APPLE__
#include <OpenCL/opencl.h>
#else
#include <CL/cl.h>
#endif

#include <ext/bit_cast.h>
#include <common/types.h>
#include <Core/Defines.h>
#include <Common/PODArray.h>
#include <Columns/ColumnsCommon.h>

#include "oclBasics.h"
#include "bitonicSortKernels.cl"

class BitonicSort
{
public:
    using KernelType = OCL::KernelType;

    enum Types
    {
        KernelInt8 = 0,
        KernelUInt8,
        KernelInt16,
        KernelUInt16,
        KernelInt32,
        KernelUInt32,
        KernelInt64,
        KernelUInt64,
        KernelMax
    };

    static BitonicSort & getInstance()
    {
        static BitonicSort instance = BitonicSort();
        return instance;
    }

    /// Sorts given array in specified order. Returns `true` if given sequence was sorted, `false` otherwise.
    template <typename T>
    bool sort(const DB::PaddedPODArray<T> & data, DB::IColumn::Permutation & res, cl_uint sort_ascending [[maybe_unused]]) const
    {
        if constexpr (
            std::is_same_v<T, Int8> ||
            std::is_same_v<T, UInt8> ||
            std::is_same_v<T, Int16> ||
            std::is_same_v<T, UInt16> ||
            std::is_same_v<T, Int32> ||
            std::is_same_v<T, UInt32> ||
            std::is_same_v<T, Int64> ||
            std::is_same_v<T, UInt64>)
        {
            size_t data_size = data.size();

            /// Getting the nearest power of 2.
            size_t power = 8;
            while (power < data_size)
                power <<= 1;

            /// Allocates more space for additional stubs to be added if needed.
            std::vector<T> pairs_content(power);
            std::vector<UInt32> pairs_indices(power);

            memcpy(&pairs_content[0], &data[0], sizeof(T) * data_size);
            for (UInt32 i = 0; i < data_size; ++i)
                pairs_indices[i] = i;

            fillWithStubs(pairs_content.data(), pairs_indices.data(), data_size, power - data_size, sort_ascending);
            sort(pairs_content.data(), pairs_indices.data(), power, sort_ascending);

            for (size_t i = 0, shift = 0; i < power; ++i)
            {
                if (pairs_indices[i] >= data_size)
                {
                    ++shift;
                    continue;
                }
                res[i - shift] = pairs_indices[i];
            }

            return true;
        }

        return false;
    }

    /// Creating a configuration instance with making all OpenCl required variables
    /// such as device, platform, context, queue, program and kernel.
    void configure()
    {
        OCL::Settings settings = OCL::Settings(1, nullptr, 1, nullptr, 1, 0);

        cl_platform_id platform = OCL::getPlatformID(settings);
        cl_device_id device = OCL::getDeviceID(platform, settings);
        cl_context gpu_context = OCL::makeContext(device, settings);
        cl_command_queue command_queue = OCL::makeCommandQueue<2>(device, gpu_context, settings);

        cl_program program = OCL::makeProgram(bitonic_sort_kernels, gpu_context, device, settings);

        /// Creating kernels for each specified data type.
        cl_int error = 0;
        kernels.resize(KernelMax);

        kernels[KernelInt8] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_char", &error), clReleaseKernel);
        OCL::checkError(error);

        kernels[KernelUInt8] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_uchar", &error), clReleaseKernel);
        OCL::checkError(error);

        kernels[KernelInt16] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_short", &error), clReleaseKernel);
        OCL::checkError(error);

        kernels[KernelUInt16] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_ushort", &error), clReleaseKernel);
        OCL::checkError(error);

        kernels[KernelInt32] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_int", &error), clReleaseKernel);
        OCL::checkError(error);

        kernels[KernelUInt32] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_uint", &error), clReleaseKernel);
        OCL::checkError(error);

        kernels[KernelInt64] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_long", &error), clReleaseKernel);
        OCL::checkError(error);

        kernels[KernelUInt64] = std::shared_ptr<KernelType>(clCreateKernel(program, "bitonicSort_ulong", &error), clReleaseKernel);
        OCL::checkError(error);

        configuration = std::shared_ptr<OCL::Configuration>(new OCL::Configuration(device, gpu_context, command_queue, program));
    }

private:
    /// Dictionary with kernels for each type from list: uchar, char, ushort, short, uint, int, ulong and long.
    std::vector<std::shared_ptr<KernelType>> kernels;
    /// Current configuration with core OpenCL instances.
    std::shared_ptr<OCL::Configuration> configuration = nullptr;

    cl_kernel getKernel(Int8) const { return kernels[KernelInt8].get(); }
    cl_kernel getKernel(UInt8) const { return kernels[KernelUInt8].get(); }
    cl_kernel getKernel(Int16) const { return kernels[KernelInt16].get(); }
    cl_kernel getKernel(UInt16) const { return kernels[KernelUInt16].get(); }
    cl_kernel getKernel(Int32) const { return kernels[KernelInt32].get(); }
    cl_kernel getKernel(UInt32) const { return kernels[KernelUInt32].get(); }
    cl_kernel getKernel(Int64) const { return kernels[KernelInt64].get(); }
    cl_kernel getKernel(UInt64) const { return kernels[KernelUInt64].get(); }

    /// Sorts p_input inplace with indices. Works only with arrays which size equals to power of two.
    template <class T>
    void sort(T * p_input, cl_uint * indices, cl_int array_size, cl_uint sort_ascending) const
    {
        cl_kernel kernel = getKernel(T(0));
        cl_int error = CL_SUCCESS;
        cl_int num_stages = 0;

        for (cl_int temp = array_size; temp > 2; temp >>= 1)
            num_stages++;

        /// Creating OpenCL buffers using input arrays memory.
        cl_mem cl_input_buffer = OCL::createBuffer<T>(p_input, array_size, configuration.get()->context());
        cl_mem cl_indices_buffer = OCL::createBuffer<cl_uint>(indices, array_size, configuration.get()->context());

        configureKernel<cl_mem>(kernel, 0, static_cast<void *>(&cl_input_buffer));
        configureKernel<cl_mem>(kernel, 1, static_cast<void *>(&cl_indices_buffer));
        configureKernel<cl_uint>(kernel, 4, static_cast<void *>(&sort_ascending));

        for (cl_int stage = 0; stage < num_stages; stage++)
        {
            configureKernel<cl_uint>(kernel, 2, static_cast<void *>(&stage));

            for (cl_int pass_of_stage = stage; pass_of_stage >= 0; pass_of_stage--)
            {
                configureKernel<cl_uint>(kernel, 3, static_cast<void *>(&pass_of_stage));

                /// Setting work-item dimensions.
                size_t gsize = array_size / (2 * 4);
                size_t global_work_size[1] = {pass_of_stage ? gsize : gsize << 1 }; // number of quad items in input array

                /// Executing kernel.
                error = clEnqueueNDRangeKernel(configuration.get()->commandQueue(), kernel, 1, nullptr,
                                               global_work_size, nullptr, 0, nullptr, nullptr);
                OCL::checkError(error);
            }
        }

        /// Syncs all threads.
        OCL::finishCommandQueue(configuration.get()->commandQueue());

        OCL::releaseData(p_input, array_size, cl_input_buffer, configuration.get()->commandQueue());
        OCL::releaseData(indices, array_size, cl_indices_buffer, configuration.get()->commandQueue());
    }

    template <class T>
    void configureKernel(cl_kernel kernel, int number_of_argument, void * source) const
    {
        cl_int error = clSetKernelArg(kernel, number_of_argument, sizeof(T), source);
        OCL::checkError(error);
    }

    /// Fills given sequences from `arraySize` index with `numberOfStubs` values.
    template <class T>
    void fillWithStubs(T * p_input, cl_uint * indices, cl_int array_size, cl_int number_of_stubs, cl_uint sort_ascending) const
    {
        T value = sort_ascending ? std::numeric_limits<T>::max() : std::numeric_limits<T>::min();
        for (cl_int index = 0; index < number_of_stubs; ++index)
        {
            p_input[array_size + index] = value;
            indices[array_size + index] = array_size + index;
        }
    }

    BitonicSort() = default;
    BitonicSort(BitonicSort const &) = delete;
    void operator = (BitonicSort const &) = delete;
};
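The deleted header pads the input up to the nearest power of two (a bitonic network needs that), fills the tail with stub values that sort to the back, sorts (value, index) pairs, and finally skips the stub indices when writing the resulting permutation. The same padding-and-filtering trick, with std::sort standing in for the OpenCL kernel (a CPU-only sketch, not the removed implementation):

#include <algorithm>
#include <cstdint>
#include <iostream>
#include <limits>
#include <utility>
#include <vector>

int main()
{
    std::vector<int64_t> data = {42, 7, 19, 7, 100};

    /// Round the size up to a power of two, starting from 8 as the deleted header did.
    size_t power = 8;
    while (power < data.size())
        power <<= 1;

    /// Pad with stubs that sort to the back; their indices lie outside the valid range.
    std::vector<std::pair<int64_t, uint32_t>> pairs(power);
    for (size_t i = 0; i < data.size(); ++i)
        pairs[i] = {data[i], static_cast<uint32_t>(i)};
    for (size_t i = data.size(); i < power; ++i)
        pairs[i] = {std::numeric_limits<int64_t>::max(), static_cast<uint32_t>(i)};

    std::sort(pairs.begin(), pairs.end());    /// the bitonic kernel's job in the real code

    /// Collect the permutation, dropping stub indices (>= data.size()).
    std::vector<uint32_t> res;
    for (const auto & pair : pairs)
        if (pair.second < data.size())
            res.push_back(pair.second);

    for (uint32_t index : res)    /// prints 1 3 2 0 4  (values 7, 7, 19, 42, 100)
        std::cout << index << ' ';
    std::cout << '\n';
}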
@ -486,7 +486,6 @@ namespace ErrorCodes
    extern const int NO_REMOTE_SHARD_AVAILABLE = 519;
    extern const int CANNOT_DETACH_DICTIONARY_AS_TABLE = 520;
    extern const int ATOMIC_RENAME_FAIL = 521;
    extern const int OPENCL_ERROR = 522;
    extern const int UNKNOWN_ROW_POLICY = 523;
    extern const int ALTER_OF_COLUMN_IS_FORBIDDEN = 524;
    extern const int INCORRECT_DISK_INDEX = 525;
@ -214,6 +214,9 @@ public:
    void clear() { c_end = c_start; }

    template <typename ... TAllocatorParams>
#if defined(__clang__)
    ALWAYS_INLINE /// Better performance in clang build, worse performance in gcc build.
#endif
    void reserve(size_t n, TAllocatorParams &&... allocator_params)
    {
        if (n > capacity())
@ -59,7 +59,7 @@ Otherwise you will get only exported symbols from program headers.
# pragma clang diagnostic ignored "-Wunused-macros"
#endif

#define __msan_unpoison_string(X)
#define __msan_unpoison_string(X) // NOLINT
#if defined(__has_feature)
# if __has_feature(memory_sanitizer)
# undef __msan_unpoison_string
@ -31,10 +31,8 @@ namespace DB
  *
  * Notes:
  * - it can be also implemented with instrumentation (example: LLVM Xray) instead of signals.
  * - it's also reasonable to insert glitches around interesting functions (example: mutex lock/unlock, starting of threads, etc.),
  *   it is doable with wrapping these functions (todo).
  * - we should also make the sleep time random.
  * - sleep obviously helps, but the effect of yield and migration is unclear.
  * - sleep and migration obviously helps, but the effect of yield is unclear.
  *
  * In addition, we allow to inject glitches around thread synchronization functions.
  * Example:
@ -13,6 +13,7 @@ namespace DB
namespace ErrorCodes
{
    extern const int CANNOT_SCHEDULE_TASK;
    extern const int LOGICAL_ERROR;
}
}

@ -233,6 +234,7 @@ void ThreadPoolImpl<Thread>::worker(typename std::list<Thread>::iterator thread_
        std::is_same_v<Thread, std::thread> ? CurrentMetrics::GlobalThreadActive : CurrentMetrics::LocalThreadActive);

        job();
        job = {};
    }
    catch (...)
    {
@ -276,7 +278,11 @@ std::unique_ptr<GlobalThreadPool> GlobalThreadPool::the_instance;

void GlobalThreadPool::initialize(size_t max_threads)
{
    assert(!the_instance);
    if (the_instance)
    {
        throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR,
            "The global thread pool is initialized twice");
    }

    the_instance.reset(new GlobalThreadPool(max_threads,
        1000 /*max_free_threads*/, 10000 /*max_queue_size*/,
70
src/Common/UnicodeBar.cpp
Normal file
@ -0,0 +1,70 @@
#include <cstring>
#include <cmath>
#include <string>
#include <common/types.h>
#include <common/arithmeticOverflow.h>
#include <Common/Exception.h>
#include <Common/UnicodeBar.h>


namespace DB
{
    namespace ErrorCodes
    {
        extern const int PARAMETER_OUT_OF_BOUND;
    }
}


namespace UnicodeBar
{
    double getWidth(Int64 x, Int64 min, Int64 max, double max_width)
    {
        if (x <= min)
            return 0;

        if (x >= max)
            return max_width;

        /// The case when max - min overflows
        Int64 max_difference;
        if (common::subOverflow(max, min, max_difference))
            throw DB::Exception(DB::ErrorCodes::PARAMETER_OUT_OF_BOUND, "The arguments to render unicode bar will lead to arithmetic overflow");

        return (x - min) * max_width / max_difference;
    }

    size_t getWidthInBytes(double width)
    {
        return ceil(width - 1.0 / 8) * UNICODE_BAR_CHAR_SIZE;
    }

    void render(double width, char * dst)
    {
        size_t floor_width = floor(width);

        for (size_t i = 0; i < floor_width; ++i)
        {
            memcpy(dst, "█", UNICODE_BAR_CHAR_SIZE);
            dst += UNICODE_BAR_CHAR_SIZE;
        }

        size_t remainder = floor((width - floor_width) * 8);

        if (remainder)
        {
            memcpy(dst, &"▏▎▍▌▋▋▊▉"[(remainder - 1) * UNICODE_BAR_CHAR_SIZE], UNICODE_BAR_CHAR_SIZE);
            dst += UNICODE_BAR_CHAR_SIZE;
        }

        *dst = 0;
    }

    std::string render(double width)
    {
        std::string res(getWidthInBytes(width), '\0');
        render(width, res.data());
        return res;
    }
}
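Typical usage of the functions that now live in this translation unit is unchanged; assuming the declarations from Common/UnicodeBar.h and a build inside the ClickHouse tree, a 10-cell bar for the value 37 out of 100 is rendered like this:

#include <iostream>

#include <Common/UnicodeBar.h>

int main()
{
    /// 37 in [0, 100], drawn into at most 10 character cells with 1/8-cell resolution.
    double width = UnicodeBar::getWidth(37, 0, 100, 10);    /// 3.7
    std::cout << UnicodeBar::render(width) << '\n';         /// three full blocks plus a 5/8 block: ███▋
}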
@ -1,7 +1,5 @@
#pragma once

#include <cstring>
#include <cmath>
#include <string>
#include <common/types.h>

@ -10,54 +8,12 @@

/** Allows you to draw a unicode-art bar whose width is displayed with a resolution of 1/8 character.
  */


namespace UnicodeBar
{
    using DB::Int64;

    inline double getWidth(Int64 x, Int64 min, Int64 max, double max_width)
    {
        if (x <= min)
            return 0;

        if (x >= max)
            return max_width;

        return (x - min) * max_width / (max - min);
    }

    inline size_t getWidthInBytes(double width)
    {
        return ceil(width - 1.0 / 8) * UNICODE_BAR_CHAR_SIZE;
    }
    double getWidth(Int64 x, Int64 min, Int64 max, double max_width);
    size_t getWidthInBytes(double width);

    /// In `dst` there must be a space for barWidthInBytes(width) characters and a trailing zero.
    inline void render(double width, char * dst)
    {
        size_t floor_width = floor(width);

        for (size_t i = 0; i < floor_width; ++i)
        {
            memcpy(dst, "█", UNICODE_BAR_CHAR_SIZE);
            dst += UNICODE_BAR_CHAR_SIZE;
        }

        size_t remainder = floor((width - floor_width) * 8);

        if (remainder)
        {
            memcpy(dst, &"▏▎▍▌▋▋▊▉"[(remainder - 1) * UNICODE_BAR_CHAR_SIZE], UNICODE_BAR_CHAR_SIZE);
            dst += UNICODE_BAR_CHAR_SIZE;
        }

        *dst = 0;
    }

    inline std::string render(double width)
    {
        std::string res(getWidthInBytes(width), '\0');
        render(width, res.data());
        return res;
    }
    void render(double width, char * dst);
    std::string render(double width);
}
Some files were not shown because too many files have changed in this diff.