From 7e6cd00cee84394a6d2ead085fcb7b30acfca2da Mon Sep 17 00:00:00 2001
From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com>
Date: Mon, 15 Feb 2021 09:31:35 +0300
Subject: [PATCH 001/177] Try to switch to llvm-12/clang-12

---
 docker/builder/build.sh            |   2 +-
 docker/packager/binary/Dockerfile  |   7 +-
 docker/packager/deb/Dockerfile     |   7 +-
 docker/packager/packager           |   1 +
 docker/test/codebrowser/Dockerfile |   2 +-
 docker/test/fuzzer/run-fuzzer.sh   |   2 +-
 docs/en/development/build.md       |  10 +--
 tests/ci/ci_config.json            | 110 ++++++++++++++---------------
 8 files changed, 76 insertions(+), 65 deletions(-)

diff --git a/docker/builder/build.sh b/docker/builder/build.sh
index d4cf662e91b..7c7a8893751 100755
--- a/docker/builder/build.sh
+++ b/docker/builder/build.sh
@@ -4,7 +4,7 @@ set -e
 #ccache -s # uncomment to display CCache statistics
 mkdir -p /server/build_docker
 cd /server/build_docker
-cmake -G Ninja /server "-DCMAKE_C_COMPILER=$(command -v clang-11)" "-DCMAKE_CXX_COMPILER=$(command -v clang++-11)"
+cmake -G Ninja /server "-DCMAKE_C_COMPILER=$(command -v clang-12)" "-DCMAKE_CXX_COMPILER=$(command -v clang++-12)"
 
 # Set the number of build jobs to the half of number of virtual CPU cores (rounded up).
 # By default, ninja use all virtual CPU cores, that leads to very high memory consumption without much improvement in build time.
diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile
index 91036d88d8c..e8071c79a50 100644
--- a/docker/packager/binary/Dockerfile
+++ b/docker/packager/binary/Dockerfile
@@ -1,7 +1,7 @@
 # docker build -t yandex/clickhouse-binary-builder .
 FROM ubuntu:20.04
 
-ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11
+ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=12
 
 RUN apt-get update \
     && apt-get install \
@@ -57,6 +57,11 @@ RUN cat /etc/resolv.conf \
         lld-11 \
         llvm-11 \
         llvm-11-dev \
+        clang-12 \
+        clang-tidy-12 \
+        lld-12 \
+        llvm-12 \
+        llvm-12-dev \
         libicu-dev \
         libreadline-dev \
         ninja-build \
diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile
index 8fd89d60f85..42a55ab72bd 100644
--- a/docker/packager/deb/Dockerfile
+++ b/docker/packager/deb/Dockerfile
@@ -1,7 +1,7 @@
 # docker build -t yandex/clickhouse-deb-builder .
FROM ubuntu:20.04 -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=12 RUN apt-get update \ && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ @@ -36,6 +36,11 @@ RUN apt-get update \ && apt-get install \ gcc-9 \ g++-9 \ + clang-12 \ + clang-tidy-12 \ + lld-12 \ + llvm-12 \ + llvm-12-dev \ clang-11 \ clang-tidy-11 \ lld-11 \ diff --git a/docker/packager/packager b/docker/packager/packager index 65c03cc10e3..a681086f955 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -184,6 +184,7 @@ if __name__ == "__main__": parser.add_argument("--build-type", choices=("debug", ""), default="") parser.add_argument("--compiler", choices=("clang-10", "clang-10-darwin", "clang-10-aarch64", "clang-10-freebsd", "clang-11", "clang-11-darwin", "clang-11-aarch64", "clang-11-freebsd", + "clang-12", "clang-12-darwin", "clang-12-aarch64", "clang-12-freebsd", "gcc-9", "gcc-10"), default="gcc-9") parser.add_argument("--sanitizer", choices=("address", "thread", "memory", "undefined", ""), default="") parser.add_argument("--unbundled", action="store_true") diff --git a/docker/test/codebrowser/Dockerfile b/docker/test/codebrowser/Dockerfile index e03f94a85e0..8f6c760c4a0 100644 --- a/docker/test/codebrowser/Dockerfile +++ b/docker/test/codebrowser/Dockerfile @@ -22,7 +22,7 @@ ENV SHA=nosha ENV DATA="data" CMD mkdir -p $BUILD_DIRECTORY && cd $BUILD_DIRECTORY && \ - cmake $SOURCE_DIRECTORY -DCMAKE_CXX_COMPILER=/usr/bin/clang\+\+-11 -DCMAKE_C_COMPILER=/usr/bin/clang-11 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON && \ + cmake $SOURCE_DIRECTORY -DCMAKE_CXX_COMPILER=/usr/bin/clang\+\+-12 -DCMAKE_C_COMPILER=/usr/bin/clang-12 -DCMAKE_EXPORT_COMPILE_COMMANDS=ON && \ mkdir -p $HTML_RESULT_DIRECTORY && \ $CODEGEN -b $BUILD_DIRECTORY -a -o $HTML_RESULT_DIRECTORY -p ClickHouse:$SOURCE_DIRECTORY:$SHA -d $DATA && \ cp -r $STATIC_DATA $HTML_RESULT_DIRECTORY/ &&\ diff --git a/docker/test/fuzzer/run-fuzzer.sh b/docker/test/fuzzer/run-fuzzer.sh index e21f9efae66..b62c573f586 100755 --- a/docker/test/fuzzer/run-fuzzer.sh +++ b/docker/test/fuzzer/run-fuzzer.sh @@ -12,7 +12,7 @@ stage=${stage:-} script_dir="$( cd "$( dirname "${BASH_SOURCE[0]}" )" >/dev/null 2>&1 && pwd )" echo "$script_dir" repo_dir=ch -BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-11_debug_none_bundled_unsplitted_disable_False_binary"} +BINARY_TO_DOWNLOAD=${BINARY_TO_DOWNLOAD:="clang-12_debug_none_bundled_unsplitted_disable_False_binary"} function clone { diff --git a/docs/en/development/build.md b/docs/en/development/build.md index 3181f26800d..cc184e0302a 100644 --- a/docs/en/development/build.md +++ b/docs/en/development/build.md @@ -23,7 +23,7 @@ $ sudo apt-get install git cmake python ninja-build Or cmake3 instead of cmake on older systems. -### Install clang-11 (recommended) {#install-clang-11} +### Install clang-12 (recommended) {#install-clang-12} On Ubuntu/Debian you can use the automatic installation script (check [official webpage](https://apt.llvm.org/)) @@ -33,16 +33,16 @@ sudo bash -c "$(wget -O - https://apt.llvm.org/llvm.sh)" For other Linux distribution - check the availability of the [prebuild packages](https://releases.llvm.org/download.html) or build clang [from sources](https://clang.llvm.org/get_started.html). 
-#### Use clang-11 for Builds {#use-gcc-10-for-builds} +#### Use clang-12 for Builds {#use-clang-12-for-builds} ``` bash -$ export CC=clang-11 -$ export CXX=clang++-11 +$ export CC=clang-12 +$ export CXX=clang++-12 ``` ### Install GCC 10 {#install-gcc-10} -We recommend building ClickHouse with clang-11, GCC-10 also supported, but it is not used for production builds. +We recommend building ClickHouse with clang-12, GCC-10 also supported, but it is not used for production builds. If you want to use GCC-10 there are several ways to install it. diff --git a/tests/ci/ci_config.json b/tests/ci/ci_config.json index 0e467319285..703cdc10fed 100644 --- a/tests/ci/ci_config.json +++ b/tests/ci/ci_config.json @@ -1,7 +1,7 @@ { "build_config": [ { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "", "sanitizer": "", "package-type": "deb", @@ -12,7 +12,7 @@ "with_coverage": false }, { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "", "sanitizer": "", "package-type": "performance", @@ -32,7 +32,7 @@ "with_coverage": false }, { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "", "sanitizer": "address", "package-type": "deb", @@ -42,7 +42,7 @@ "with_coverage": false }, { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "", "sanitizer": "undefined", "package-type": "deb", @@ -52,7 +52,7 @@ "with_coverage": false }, { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "", "sanitizer": "thread", "package-type": "deb", @@ -62,7 +62,7 @@ "with_coverage": false }, { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "", "sanitizer": "memory", "package-type": "deb", @@ -82,7 +82,7 @@ "with_coverage": false }, { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "debug", "sanitizer": "", "package-type": "deb", @@ -102,7 +102,7 @@ "with_coverage": false }, { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "", "sanitizer": "", "package-type": "binary", @@ -114,7 +114,7 @@ ], "special_build_config": [ { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "debug", "sanitizer": "", "package-type": "deb", @@ -124,7 +124,7 @@ "with_coverage": true }, { - "compiler": "clang-11", + "compiler": "clang-12", "build-type": "", "sanitizer": "", "package-type": "binary", @@ -134,7 +134,7 @@ "with_coverage": false }, { - "compiler": "clang-11-darwin", + "compiler": "clang-12-darwin", "build-type": "", "sanitizer": "", "package-type": "binary", @@ -144,7 +144,7 @@ "with_coverage": false }, { - "compiler": "clang-11-aarch64", + "compiler": "clang-12-aarch64", "build-type": "", "sanitizer": "", "package-type": "binary", @@ -154,7 +154,7 @@ "with_coverage": false }, { - "compiler": "clang-11-freebsd", + "compiler": "clang-12-freebsd", "build-type": "", "sanitizer": "", "package-type": "binary", @@ -167,7 +167,7 @@ "tests_config": { "Functional stateful tests (address)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "address", @@ -179,7 +179,7 @@ }, "Functional stateful tests (thread)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "thread", @@ -191,7 +191,7 @@ }, "Functional stateful tests (memory)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "memory", @@ -203,7 
+203,7 @@ }, "Functional stateful tests (ubsan)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "undefined", @@ -215,7 +215,7 @@ }, "Functional stateful tests (debug)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "debug", "sanitizer": "none", @@ -227,7 +227,7 @@ }, "Functional stateless tests (ANTLR debug)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "debug", "sanitizer": "none", @@ -239,7 +239,7 @@ }, "Functional stateful tests (release)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -251,7 +251,7 @@ }, "Functional stateful tests (release, DatabaseOrdinary)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -275,7 +275,7 @@ }, "Functional stateless tests (address)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "address", @@ -287,7 +287,7 @@ }, "Functional stateless tests (thread)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "thread", @@ -299,7 +299,7 @@ }, "Functional stateless tests (memory)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "memory", @@ -311,7 +311,7 @@ }, "Functional stateless tests (ubsan)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "undefined", @@ -323,7 +323,7 @@ }, "Functional stateless tests (debug)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "debug", "sanitizer": "none", @@ -335,7 +335,7 @@ }, "Functional stateless tests (release)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -347,7 +347,7 @@ }, "Functional stateless tests (pytest)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -371,7 +371,7 @@ }, "Functional stateless tests (release, wide parts enabled)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -383,7 +383,7 @@ }, "Functional stateless tests (release, DatabaseOrdinary)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -407,7 +407,7 @@ }, "Stress test (address)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "address", @@ -419,7 +419,7 @@ }, "Stress test (thread)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", 
"build_type": "relwithdebuginfo", "sanitizer": "thread", @@ -431,7 +431,7 @@ }, "Stress test (undefined)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "undefined", @@ -443,7 +443,7 @@ }, "Stress test (memory)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "memory", @@ -455,7 +455,7 @@ }, "Stress test (debug)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "debug", "sanitizer": "none", @@ -467,7 +467,7 @@ }, "Integration tests (asan)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "address", @@ -479,7 +479,7 @@ }, "Integration tests (thread)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "thread", @@ -491,7 +491,7 @@ }, "Integration tests (release)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -503,7 +503,7 @@ }, "Integration tests (memory)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "memory", @@ -515,7 +515,7 @@ }, "Integration tests flaky check (asan)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "address", @@ -527,7 +527,7 @@ }, "Compatibility check": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -539,7 +539,7 @@ }, "Split build smoke test": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -551,7 +551,7 @@ }, "Testflows check": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -575,7 +575,7 @@ }, "Unit tests release clang": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -587,7 +587,7 @@ }, "Unit tests ASAN": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "address", @@ -599,7 +599,7 @@ }, "Unit tests MSAN": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "memory", @@ -611,7 +611,7 @@ }, "Unit tests TSAN": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "thread", @@ -623,7 +623,7 @@ }, "Unit tests UBSAN": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "thread", @@ -635,7 +635,7 @@ }, "AST fuzzer (debug)": { "required_build_properties": { - 
"compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "debug", "sanitizer": "none", @@ -647,7 +647,7 @@ }, "AST fuzzer (ASan)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "address", @@ -659,7 +659,7 @@ }, "AST fuzzer (MSan)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "memory", @@ -671,7 +671,7 @@ }, "AST fuzzer (TSan)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "thread", @@ -683,7 +683,7 @@ }, "AST fuzzer (UBSan)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "binary", "build_type": "relwithdebuginfo", "sanitizer": "undefined", @@ -695,7 +695,7 @@ }, "Release": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -707,7 +707,7 @@ }, "Functional stateless tests flaky check (address)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "address", From aabf5307c417a4757470bfff1da1ddb428ef3773 Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Fri, 30 Apr 2021 13:26:13 +0300 Subject: [PATCH 002/177] more --- docker/test/keeper-jepsen/run.sh | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/keeper-jepsen/run.sh b/docker/test/keeper-jepsen/run.sh index 352585e16e3..8d31b5b7f1c 100644 --- a/docker/test/keeper-jepsen/run.sh +++ b/docker/test/keeper-jepsen/run.sh @@ -2,7 +2,7 @@ set -euo pipefail -CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-11_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"} +CLICKHOUSE_PACKAGE=${CLICKHOUSE_PACKAGE:="https://clickhouse-builds.s3.yandex.net/$PR_TO_TEST/$SHA_TO_TEST/clickhouse_build_check/clang-12_relwithdebuginfo_none_bundled_unsplitted_disable_False_binary/clickhouse"} CLICKHOUSE_REPO_PATH=${CLICKHOUSE_REPO_PATH:=""} From cf277a67846b909dff43d09060e27a75c585ad6f Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Fri, 30 Apr 2021 17:55:38 +0300 Subject: [PATCH 003/177] find llvm --- cmake/find/llvm.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/find/llvm.cmake b/cmake/find/llvm.cmake index e0ba1d9b039..0025cc0f9d3 100644 --- a/cmake/find/llvm.cmake +++ b/cmake/find/llvm.cmake @@ -26,7 +26,7 @@ endif () if (NOT USE_INTERNAL_LLVM_LIBRARY) set (LLVM_PATHS "/usr/local/lib/llvm") - foreach(llvm_v 10 9 8) + foreach(llvm_v 12 11) if (NOT LLVM_FOUND) find_package (LLVM ${llvm_v} CONFIG PATHS ${LLVM_PATHS}) endif () From a4e6a96c8243d8a50907f6d831c2ff91d0477516 Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Fri, 30 Apr 2021 19:02:23 +0300 Subject: [PATCH 004/177] fasttest change --- docker/test/fasttest/Dockerfile | 2 +- docker/test/fasttest/run.sh | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index 2864f7fc4da..0c7e2af6ec6 100644 --- 
a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -1,7 +1,7 @@ # docker build -t yandex/clickhouse-fasttest . FROM ubuntu:20.04 -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=12 RUN apt-get update \ && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index a7cc398e5c9..d7bf73f4755 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -9,7 +9,7 @@ trap 'kill $(jobs -pr) ||:' EXIT stage=${stage:-} # Compiler version, normally set by Dockerfile -export LLVM_VERSION=${LLVM_VERSION:-11} +export LLVM_VERSION=${LLVM_VERSION:-12} # A variable to pass additional flags to CMake. # Here we explicitly default it to nothing so that bash doesn't complain about From 366a7fe45bbfdbfa6f6ad07bafe293054500c0b5 Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Wed, 12 May 2021 18:24:27 +0300 Subject: [PATCH 005/177] linker path required to be specific one --- docker/packager/binary/build.sh | 22 ++++++++++++++++------ docker/packager/packager | 5 ++++- 2 files changed, 20 insertions(+), 7 deletions(-) diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index cf74105fbbb..d746aed76ed 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -2,14 +2,23 @@ set -x -e -mkdir -p build/cmake/toolchain/darwin-x86_64 -tar xJf MacOSX10.15.sdk.tar.xz -C build/cmake/toolchain/darwin-x86_64 --strip-components=1 +if [ "1" == "${IS_CROSS_DARWIN:0}" ] +then + mkdir -p build/cmake/toolchain/darwin-x86_64 + tar xJf MacOSX10.15.sdk.tar.xz -C build/cmake/toolchain/darwin-x86_64 --strip-components=1 +fi -mkdir -p build/cmake/toolchain/linux-aarch64 -tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build/cmake/toolchain/linux-aarch64 --strip-components=1 +if [ "1" == "${IS_CROSS_ARM:0}" ] +then + mkdir -p build/cmake/toolchain/linux-aarch64 + tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build/cmake/toolchain/linux-aarch64 --strip-components=1 +fi -mkdir -p build/cmake/toolchain/freebsd-x86_64 -tar xJf freebsd-11.3-toolchain.tar.xz -C build/cmake/toolchain/freebsd-x86_64 --strip-components=1 +if [ "1" == "${IS_CROSS_ARM:0}" ] +then + mkdir -p build/cmake/toolchain/freebsd-x86_64 + tar xJf freebsd-11.3-toolchain.tar.xz -C build/cmake/toolchain/freebsd-x86_64 --strip-components=1 +fi # Uncomment to debug ccache. Don't put ccache log in /output right away, or it # will be confusingly packed into the "performance" package. @@ -21,6 +30,7 @@ cd build/build_docker rm -f CMakeCache.txt # Read cmake arguments into array (possibly empty) read -ra CMAKE_FLAGS <<< "${CMAKE_FLAGS:-}" +env cmake --debug-trycompile --verbose=1 -DCMAKE_VERBOSE_MAKEFILE=1 -LA "-DCMAKE_BUILD_TYPE=$BUILD_TYPE" "-DSANITIZE=$SANITIZER" -DENABLE_CHECK_HEAVY_BUILDS=1 "${CMAKE_FLAGS[@]}" .. ccache --show-config ||: diff --git a/docker/packager/packager b/docker/packager/packager index 9b7692b57ae..6c9cfcc7a1a 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -73,9 +73,10 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ # Explicitly use LLD with Clang by default. # Don't force linker for cross-compilation. 
if is_clang and not is_cross_compile: - cmake_flags.append("-DLINKER_NAME=lld") + cmake_flags.append("-DLINKER_NAME=ld.lld") if is_cross_darwin: + result.append("IS_CROSS_DARWIN=1") cc = compiler[:-len(DARWIN_SUFFIX)] cmake_flags.append("-DCMAKE_AR:FILEPATH=/cctools/bin/x86_64-apple-darwin-ar") cmake_flags.append("-DCMAKE_INSTALL_NAME_TOOL=/cctools/bin/x86_64-apple-darwin-install_name_tool") @@ -83,9 +84,11 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ cmake_flags.append("-DLINKER_NAME=/cctools/bin/x86_64-apple-darwin-ld") cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/darwin/toolchain-x86_64.cmake") elif is_cross_arm: + result.append("IS_CROSS_ARM=1") cc = compiler[:-len(ARM_SUFFIX)] cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-aarch64.cmake") elif is_cross_freebsd: + result.append("IS_CROSS_FREEBSD=1") cc = compiler[:-len(FREEBSD_SUFFIX)] cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/freebsd/toolchain-x86_64.cmake") else: From 783e9b3c1c440787a4a10a6f1d14f19223743aac Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Thu, 13 May 2021 14:13:37 +0300 Subject: [PATCH 006/177] more --- docker/builder/Dockerfile | 2 +- docker/test/base/Dockerfile | 2 +- tests/ci/ci_config.json | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/docker/builder/Dockerfile b/docker/builder/Dockerfile index 199b5217d79..e9ba6b2ccc1 100644 --- a/docker/builder/Dockerfile +++ b/docker/builder/Dockerfile @@ -1,6 +1,6 @@ FROM ubuntu:20.04 -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=12 RUN apt-get update \ && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ diff --git a/docker/test/base/Dockerfile b/docker/test/base/Dockerfile index 44b9d42d6a1..5e41ee11ea0 100644 --- a/docker/test/base/Dockerfile +++ b/docker/test/base/Dockerfile @@ -1,7 +1,7 @@ # docker build -t yandex/clickhouse-test-base . 
FROM ubuntu:20.04 -ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=11 +ENV DEBIAN_FRONTEND=noninteractive LLVM_VERSION=12 RUN apt-get update \ && apt-get install ca-certificates lsb-release wget gnupg apt-transport-https \ diff --git a/tests/ci/ci_config.json b/tests/ci/ci_config.json index ee0e1a4c09d..f2c70fa1b8e 100644 --- a/tests/ci/ci_config.json +++ b/tests/ci/ci_config.json @@ -253,7 +253,7 @@ }, "Functional stateful tests (release, DatabaseReplicated)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", @@ -385,7 +385,7 @@ }, "Functional stateless tests (release, DatabaseReplicated)": { "required_build_properties": { - "compiler": "clang-11", + "compiler": "clang-12", "package_type": "deb", "build_type": "relwithdebuginfo", "sanitizer": "none", From d3149ae61cd8cfdbf6d7f876db7c73e2c36df960 Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Fri, 21 May 2021 17:42:04 +0300 Subject: [PATCH 007/177] more --- cmake/tools.cmake | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/cmake/tools.cmake b/cmake/tools.cmake index 8ff94ab867b..f94f4b289a3 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -79,8 +79,9 @@ endif () if (LINKER_NAME) if (COMPILER_CLANG AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 12.0.0 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 12.0.0)) - set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LINKER_NAME}") - set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LINKER_NAME}") + find_program (LLD_PATH NAMES ${LINKER_NAME}) + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} --ld-path=${LLD_PATH}") + set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} --ld-path=${LLD_PATH}") else () set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") set (CMAKE_SHARED_LINKER_FLAGS "${CMAKE_SHARED_LINKER_FLAGS} -fuse-ld=${LINKER_NAME}") From 50e233680a6d7f259dfc5670eccbb4cae1bda656 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 21 May 2021 01:04:26 +0300 Subject: [PATCH 008/177] LLVM remove non internal build --- cmake/find/llvm.cmake | 125 ++++++++++++++++--------------- cmake/find/termcap.cmake | 28 +++---- contrib/CMakeLists.txt | 2 +- utils/ci/build-normal.sh | 5 -- utils/ci/default-config | 1 - utils/ci/install-libraries.sh | 4 - utils/ci/jobs/quick-build/run.sh | 1 - 7 files changed, 78 insertions(+), 88 deletions(-) diff --git a/cmake/find/llvm.cmake b/cmake/find/llvm.cmake index c2259fc7757..88ce8927497 100644 --- a/cmake/find/llvm.cmake +++ b/cmake/find/llvm.cmake @@ -1,81 +1,82 @@ -if (APPLE OR SPLIT_SHARED_LIBRARIES OR NOT ARCH_AMD64) +if (APPLE OR SPLIT_SHARED_LIBRARIES OR NOT ARCH_AMD64 OR SANITIZE STREQUAL "undefined") set (ENABLE_EMBEDDED_COMPILER OFF CACHE INTERNAL "") endif() option (ENABLE_EMBEDDED_COMPILER "Enable support for 'compile_expressions' option for query execution" ON) + # Broken in macos. TODO: update clang, re-test, enable on Apple -if (ENABLE_EMBEDDED_COMPILER AND NOT SPLIT_SHARED_LIBRARIES AND ARCH_AMD64 AND NOT (SANITIZE STREQUAL "undefined")) - option (USE_INTERNAL_LLVM_LIBRARY "Use bundled or system LLVM library." ${NOT_UNBUNDLED}) -endif() +# if (ENABLE_EMBEDDED_COMPILER AND NOT SPLIT_SHARED_LIBRARIES AND ARCH_AMD64 AND NOT (SANITIZE STREQUAL "undefined")) +# option (USE_INTERNAL_LLVM_LIBRARY "Use bundled or system LLVM library." 
${NOT_UNBUNDLED}) +# endif() if (NOT ENABLE_EMBEDDED_COMPILER) - if(USE_INTERNAL_LLVM_LIBRARY) - message (${RECONFIGURE_MESSAGE_LEVEL} "Cannot use internal LLVM library with ENABLE_EMBEDDED_COMPILER=OFF") - endif() +# if(USE_INTERNAL_LLVM_LIBRARY) +# message (${RECONFIGURE_MESSAGE_LEVEL} "Cannot use internal LLVM library with ENABLE_EMBEDDED_COMPILER=OFF") +# endif() + set (USE_EMBEDDED_COMPILER 0) return() endif() if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/llvm/llvm/CMakeLists.txt") - if (USE_INTERNAL_LLVM_LIBRARY) - message (WARNING "submodule contrib/llvm is missing. to fix try run: \n git submodule update --init --recursive") - message (${RECONFIGURE_MESSAGE_LEVEL} "Can't fidd internal LLVM library") - endif() - set (MISSING_INTERNAL_LLVM_LIBRARY 1) + # if (USE_INTERNAL_LLVM_LIBRARY) + message (${RECONFIGURE_MESSAGE_LEVEL} "submodule /contrib/llvm is missing. to fix try run: \n git submodule update --init --recursive") + # message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal LLVM library") + # endif() + # set (MISSING_INTERNAL_LLVM_LIBRARY 1) endif () -if (NOT USE_INTERNAL_LLVM_LIBRARY) - set (LLVM_PATHS "/usr/local/lib/llvm" "/usr/lib/llvm") +# if (NOT USE_INTERNAL_LLVM_LIBRARY) +# set (LLVM_PATHS "/usr/local/lib/llvm" "/usr/lib/llvm") +# foreach(llvm_v 12 11.1 11) +# if (NOT LLVM_FOUND) +# find_package (LLVM ${llvm_v} CONFIG PATHS ${LLVM_PATHS}) +# endif () +# endforeach () - foreach(llvm_v 12 11.1 11) - if (NOT LLVM_FOUND) - find_package (LLVM ${llvm_v} CONFIG PATHS ${LLVM_PATHS}) - endif () - endforeach () +# if (LLVM_FOUND) +# # Remove dynamically-linked zlib and libedit from LLVM's dependencies: +# set_target_properties(LLVMSupport PROPERTIES INTERFACE_LINK_LIBRARIES "-lpthread;LLVMDemangle;${ZLIB_LIBRARIES}") +# set_target_properties(LLVMLineEditor PROPERTIES INTERFACE_LINK_LIBRARIES "LLVMSupport") - if (LLVM_FOUND) - # Remove dynamically-linked zlib and libedit from LLVM's dependencies: - set_target_properties(LLVMSupport PROPERTIES INTERFACE_LINK_LIBRARIES "-lpthread;LLVMDemangle;${ZLIB_LIBRARIES}") - set_target_properties(LLVMLineEditor PROPERTIES INTERFACE_LINK_LIBRARIES "LLVMSupport") +# option(LLVM_HAS_RTTI "Enable if LLVM was build with RTTI enabled" ON) +set (USE_EMBEDDED_COMPILER 1) +# else() +# message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system LLVM") +# set (USE_EMBEDDED_COMPILER 0) +# endif() - option(LLVM_HAS_RTTI "Enable if LLVM was build with RTTI enabled" ON) - set (USE_EMBEDDED_COMPILER 1) - else() - message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system LLVM") - set (USE_EMBEDDED_COMPILER 0) - endif() +# if (LLVM_FOUND AND OS_LINUX AND USE_LIBCXX AND NOT FORCE_LLVM_WITH_LIBCXX) +# message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is not set but the LLVM library from OS packages " +# "in Linux is incompatible with libc++ ABI. LLVM Will be disabled. Force: -DFORCE_LLVM_WITH_LIBCXX=ON") +# message (${RECONFIGURE_MESSAGE_LEVEL} "Unsupported LLVM configuration, cannot enable LLVM") +# set (LLVM_FOUND 0) +# set (USE_EMBEDDED_COMPILER 0) +# endif () +# endif() - if (LLVM_FOUND AND OS_LINUX AND USE_LIBCXX AND NOT FORCE_LLVM_WITH_LIBCXX) - message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is not set but the LLVM library from OS packages " - "in Linux is incompatible with libc++ ABI. LLVM Will be disabled. 
Force: -DFORCE_LLVM_WITH_LIBCXX=ON") - message (${RECONFIGURE_MESSAGE_LEVEL} "Unsupported LLVM configuration, cannot enable LLVM") - set (LLVM_FOUND 0) - set (USE_EMBEDDED_COMPILER 0) - endif () -endif() - -if(NOT LLVM_FOUND AND NOT MISSING_INTERNAL_LLVM_LIBRARY) - if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR) - message(WARNING "Option ENABLE_EMBEDDED_COMPILER is set but internal LLVM library cannot build if build directory is the same as source directory.") - set (LLVM_FOUND 0) - set (USE_EMBEDDED_COMPILER 0) - elseif (SPLIT_SHARED_LIBRARIES) +# if(NOT LLVM_FOUND AND NOT MISSING_INTERNAL_LLVM_LIBRARY) +# if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR) +# message(WARNING "Option ENABLE_EMBEDDED_COMPILER is set but internal LLVM library cannot build if build directory is the same as source directory.") +# set (LLVM_FOUND 0) +# set (USE_EMBEDDED_COMPILER 0) +# elseif (SPLIT_SHARED_LIBRARIES) # llvm-tablegen cannot find shared libraries that we build. Probably can be easily fixed. - message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is not compatible with SPLIT_SHARED_LIBRARIES. Build of LLVM will be disabled.") - set (LLVM_FOUND 0) - set (USE_EMBEDDED_COMPILER 0) - elseif (NOT ARCH_AMD64) + # message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is not compatible with SPLIT_SHARED_LIBRARIES. Build of LLVM will be disabled.") + # set (LLVM_FOUND 0) + # set (USE_EMBEDDED_COMPILER 0) + # elseif (NOT ARCH_AMD64) # It's not supported yet, but you can help. - message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is only available for x86_64. Build of LLVM will be disabled.") - set (LLVM_FOUND 0) - set (USE_EMBEDDED_COMPILER 0) - elseif (SANITIZE STREQUAL "undefined") - # llvm-tblgen, that is used during LLVM build, doesn't work with UBSan. - message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY does not work with UBSan, because 'llvm-tblgen' tool from LLVM has undefined behaviour. Build of LLVM will be disabled.") - set (LLVM_FOUND 0) - set (USE_EMBEDDED_COMPILER 0) - else () - set (USE_INTERNAL_LLVM_LIBRARY ON) + # message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is only available for x86_64. Build of LLVM will be disabled.") + # set (LLVM_FOUND 0) + # set (USE_EMBEDDED_COMPILER 0) + # elseif (SANITIZE STREQUAL "undefined") + # # llvm-tblgen, that is used during LLVM build, doesn't work with UBSan. + # message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY does not work with UBSan, because 'llvm-tblgen' tool from LLVM has undefined behaviour. Build of LLVM will be disabled.") + # set (LLVM_FOUND 0) + # set (USE_EMBEDDED_COMPILER 0) + # else () + # set (USE_INTERNAL_LLVM_LIBRARY ON) set (LLVM_FOUND 1) set (USE_EMBEDDED_COMPILER 1) set (LLVM_VERSION "9.0.0bundled") @@ -87,13 +88,13 @@ if(NOT LLVM_FOUND AND NOT MISSING_INTERNAL_LLVM_LIBRARY) endif() endif() -if (LLVM_FOUND) +# if (LLVM_FOUND) message(STATUS "LLVM include Directory: ${LLVM_INCLUDE_DIRS}") message(STATUS "LLVM library Directory: ${LLVM_LIBRARY_DIRS}") message(STATUS "LLVM C++ compiler flags: ${LLVM_CXXFLAGS}") -else() - message (${RECONFIGURE_MESSAGE_LEVEL} "Can't enable LLVM") -endif() +# else() +# message (${RECONFIGURE_MESSAGE_LEVEL} "Can't enable LLVM") +# endif() # This list was generated by listing all LLVM libraries, compiling the binary and removing all libraries while it still compiles. 
set (REQUIRED_LLVM_LIBRARIES diff --git a/cmake/find/termcap.cmake b/cmake/find/termcap.cmake index 58454165785..448ef34f3c3 100644 --- a/cmake/find/termcap.cmake +++ b/cmake/find/termcap.cmake @@ -1,17 +1,17 @@ -if (ENABLE_EMBEDDED_COMPILER AND NOT USE_INTERNAL_LLVM_LIBRARY AND USE_STATIC_LIBRARIES) - find_library (TERMCAP_LIBRARY tinfo) - if (NOT TERMCAP_LIBRARY) - find_library (TERMCAP_LIBRARY ncurses) - endif() - if (NOT TERMCAP_LIBRARY) - find_library (TERMCAP_LIBRARY termcap) - endif() +# if (ENABLE_EMBEDDED_COMPILER AND NOT USE_INTERNAL_LLVM_LIBRARY AND USE_STATIC_LIBRARIES) +# find_library (TERMCAP_LIBRARY tinfo) +# if (NOT TERMCAP_LIBRARY) +# find_library (TERMCAP_LIBRARY ncurses) +# endif() +# if (NOT TERMCAP_LIBRARY) +# find_library (TERMCAP_LIBRARY termcap) +# endif() - if (NOT TERMCAP_LIBRARY) - message (FATAL_ERROR "Statically Linking external LLVM requires termcap") - endif() +# if (NOT TERMCAP_LIBRARY) +# message (FATAL_ERROR "Statically Linking external LLVM requires termcap") +# endif() - target_link_libraries(LLVMSupport INTERFACE ${TERMCAP_LIBRARY}) +# target_link_libraries(LLVMSupport INTERFACE ${TERMCAP_LIBRARY}) - message (STATUS "Using termcap: ${TERMCAP_LIBRARY}") -endif() +# message (STATUS "Using termcap: ${TERMCAP_LIBRARY}") +# endif() diff --git a/contrib/CMakeLists.txt b/contrib/CMakeLists.txt index 9eafec23f51..21d26695e33 100644 --- a/contrib/CMakeLists.txt +++ b/contrib/CMakeLists.txt @@ -205,7 +205,7 @@ elseif(GTEST_SRC_DIR) target_compile_definitions(gtest INTERFACE GTEST_HAS_POSIX_RE=0) endif() -if (USE_EMBEDDED_COMPILER AND USE_INTERNAL_LLVM_LIBRARY) +if (USE_EMBEDDED_COMPILER) # ld: unknown option: --color-diagnostics if (APPLE) set (LINKER_SUPPORTS_COLOR_DIAGNOSTICS 0 CACHE INTERNAL "") diff --git a/utils/ci/build-normal.sh b/utils/ci/build-normal.sh index b937269c8a3..328bd2c9f51 100755 --- a/utils/ci/build-normal.sh +++ b/utils/ci/build-normal.sh @@ -8,11 +8,6 @@ source default-config mkdir -p "${WORKSPACE}/build" pushd "${WORKSPACE}/build" -if [[ "${ENABLE_EMBEDDED_COMPILER}" == 1 ]]; then - [[ "$USE_LLVM_LIBRARIES_FROM_SYSTEM" == 0 ]] && CMAKE_FLAGS="$CMAKE_FLAGS -DUSE_INTERNAL_LLVM_LIBRARY=1" - [[ "$USE_LLVM_LIBRARIES_FROM_SYSTEM" != 0 ]] && CMAKE_FLAGS="$CMAKE_FLAGS -DUSE_INTERNAL_LLVM_LIBRARY=0" -fi - cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DENABLE_EMBEDDED_COMPILER=${ENABLE_EMBEDDED_COMPILER} $CMAKE_FLAGS ../sources [[ "$BUILD_TARGETS" != 'all' ]] && BUILD_TARGETS_STRING="--target $BUILD_TARGETS" diff --git a/utils/ci/default-config b/utils/ci/default-config index cd6f25ecf9b..b66121cc757 100644 --- a/utils/ci/default-config +++ b/utils/ci/default-config @@ -27,7 +27,6 @@ CLANG_SOURCES_BRANCH=trunk # or tags/RELEASE_600/final GCC_SOURCES_VERSION=latest # or gcc-7.1.0 # install-libraries -USE_LLVM_LIBRARIES_FROM_SYSTEM=0 # 0 or 1 ENABLE_EMBEDDED_COMPILER=1 # build diff --git a/utils/ci/install-libraries.sh b/utils/ci/install-libraries.sh index d7fb856dbed..7615375fbc1 100755 --- a/utils/ci/install-libraries.sh +++ b/utils/ci/install-libraries.sh @@ -5,7 +5,3 @@ source default-config ./install-os-packages.sh libicu-dev ./install-os-packages.sh libreadline-dev - -if [[ "$ENABLE_EMBEDDED_COMPILER" == 1 && "$USE_LLVM_LIBRARIES_FROM_SYSTEM" == 1 ]]; then - ./install-os-packages.sh llvm-libs-5.0 -fi diff --git a/utils/ci/jobs/quick-build/run.sh b/utils/ci/jobs/quick-build/run.sh index 3d755625c8d..af977d14465 100755 --- a/utils/ci/jobs/quick-build/run.sh +++ b/utils/ci/jobs/quick-build/run.sh @@ -15,7 +15,6 @@ SOURCES_METHOD=local COMPILER=clang 
COMPILER_INSTALL_METHOD=packages COMPILER_PACKAGE_VERSION=6.0 -USE_LLVM_LIBRARIES_FROM_SYSTEM=0 BUILD_METHOD=normal BUILD_TARGETS=clickhouse BUILD_TYPE=Debug From 07556fac2ce0b04907f2ca2c52aacdbb3bf4e73b Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Tue, 25 May 2021 12:52:53 +0300 Subject: [PATCH 009/177] try fix aarch64 --- cmake/tools.cmake | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/cmake/tools.cmake b/cmake/tools.cmake index f94f4b289a3..0e213c285d8 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -77,6 +77,11 @@ if (OS_LINUX AND NOT LINKER_NAME) endif () endif () +if (LINKER_NAME AND NOT LLD_PATH) + find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "lld-${COMPILER_VERSION_MAJOR}" "ld.lld" "lld") + find_program (GOLD_PATH NAMES "ld.gold" "gold") +endif () + if (LINKER_NAME) if (COMPILER_CLANG AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 12.0.0 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 12.0.0)) find_program (LLD_PATH NAMES ${LINKER_NAME}) From 4f711ee038f82a4d506cc3ab8176d5e17bfd3adb Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Tue, 25 May 2021 16:05:30 +0300 Subject: [PATCH 010/177] fix darwin --- cmake/tools.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/tools.cmake b/cmake/tools.cmake index 0e213c285d8..7c15332a51a 100644 --- a/cmake/tools.cmake +++ b/cmake/tools.cmake @@ -77,7 +77,7 @@ if (OS_LINUX AND NOT LINKER_NAME) endif () endif () -if (LINKER_NAME AND NOT LLD_PATH) +if (NOT OS_DARWIN AND LINKER_NAME AND NOT LLD_PATH) find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "lld-${COMPILER_VERSION_MAJOR}" "ld.lld" "lld") find_program (GOLD_PATH NAMES "ld.gold" "gold") endif () From 227eb9fda5a386ec4aa64fa23a203b876be322dc Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Tue, 25 May 2021 18:29:04 +0300 Subject: [PATCH 011/177] try --- cmake/freebsd/toolchain-x86_64.cmake | 2 +- cmake/linux/toolchain-aarch64.cmake | 2 +- cmake/tools.cmake | 5 ----- 3 files changed, 2 insertions(+), 7 deletions(-) diff --git a/cmake/freebsd/toolchain-x86_64.cmake b/cmake/freebsd/toolchain-x86_64.cmake index d9839ec74ee..f9e45686db7 100644 --- a/cmake/freebsd/toolchain-x86_64.cmake +++ b/cmake/freebsd/toolchain-x86_64.cmake @@ -10,7 +10,7 @@ set (CMAKE_TRY_COMPILE_TARGET_TYPE STATIC_LIBRARY) # disable linkage check - it set (CMAKE_AR "/usr/bin/ar" CACHE FILEPATH "" FORCE) set (CMAKE_RANLIB "/usr/bin/ranlib" CACHE FILEPATH "" FORCE) -set (LINKER_NAME "lld" CACHE STRING "" FORCE) +set (LINKER_NAME "ld.lld" CACHE STRING "" FORCE) set (CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld") set (CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld") diff --git a/cmake/linux/toolchain-aarch64.cmake b/cmake/linux/toolchain-aarch64.cmake index e3924fdc537..b4dc6e45cbb 100644 --- a/cmake/linux/toolchain-aarch64.cmake +++ b/cmake/linux/toolchain-aarch64.cmake @@ -13,7 +13,7 @@ set (CMAKE_C_FLAGS_INIT "${CMAKE_C_FLAGS} --gcc-toolchain=${CMAKE_CURRENT_LIST_D set (CMAKE_CXX_FLAGS_INIT "${CMAKE_CXX_FLAGS} --gcc-toolchain=${CMAKE_CURRENT_LIST_DIR}/../toolchain/linux-aarch64") set (CMAKE_ASM_FLAGS_INIT "${CMAKE_ASM_FLAGS} --gcc-toolchain=${CMAKE_CURRENT_LIST_DIR}/../toolchain/linux-aarch64") -set (LINKER_NAME "lld" CACHE STRING "" FORCE) +set (LINKER_NAME "ld.lld" CACHE STRING "" FORCE) set (CMAKE_EXE_LINKER_FLAGS_INIT "-fuse-ld=lld") set (CMAKE_SHARED_LINKER_FLAGS_INIT "-fuse-ld=lld") diff --git a/cmake/tools.cmake 
b/cmake/tools.cmake
index 7c15332a51a..f94f4b289a3 100644
--- a/cmake/tools.cmake
+++ b/cmake/tools.cmake
@@ -77,11 +77,6 @@ if (OS_LINUX AND NOT LINKER_NAME)
     endif ()
 endif ()
 
-if (NOT OS_DARWIN AND LINKER_NAME AND NOT LLD_PATH)
-    find_program (LLD_PATH NAMES "ld.lld-${COMPILER_VERSION_MAJOR}" "lld-${COMPILER_VERSION_MAJOR}" "ld.lld" "lld")
-    find_program (GOLD_PATH NAMES "ld.gold" "gold")
-endif ()
-
 if (LINKER_NAME)
     if (COMPILER_CLANG AND (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 12.0.0 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 12.0.0))
         find_program (LLD_PATH NAMES ${LINKER_NAME})

From fc212753d5d1019e08bdab633a41e003d480fe51 Mon Sep 17 00:00:00 2001
From: romanzhukov
Date: Sun, 22 Aug 2021 17:38:07 +0300
Subject: [PATCH 012/177] Add Snowflake ID-date docs.

---
 .../functions/type-conversion-functions.md   |  32 ++--
 .../functions/type-conversion-functions.md   | 141 ++++++++++++++++++
 .../sql-reference/statements/create/table.md |   1 +
 3 files changed, 155 insertions(+), 19 deletions(-)

diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md
index ad6edaea312..df1994e86fa 100644
--- a/docs/en/sql-reference/functions/type-conversion-functions.md
+++ b/docs/en/sql-reference/functions/type-conversion-functions.md
@@ -1342,7 +1342,7 @@ Result:
 
 ## snowflakeToDateTime {#snowflakeToDateTime}
 
-Extract time from snowflake id as DateTime format.
+Extracts time from Snowflake ID as [DateTime](../data-types/datetime.md) format.
 
 **Syntax**
 
@@ -1352,12 +1352,12 @@ snowflakeToDateTime(value [, time_zone])
 
 **Parameters**
 
-- `value` — `snowflake id`, Int64 value.
+- `value` — Snowflake ID. [Int64](../data-types/int-uint.md).
 - `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md).
 
 **Returned value**
 
-- value converted to the `DateTime` data type.
+- Value converted to the [DateTime](../data-types/datetime.md) data type.
 
 **Example**
 
@@ -1378,7 +1378,7 @@ Result:
 
 ## snowflakeToDateTime64 {#snowflakeToDateTime64}
 
-Extract time from snowflake id as DateTime64 format.
+Extracts time from Snowflake ID as [DateTime64](../data-types/datetime64.md) format.
 
 **Syntax**
 
@@ -1388,12 +1388,12 @@ snowflakeToDateTime64(value [, time_zone])
 
 **Parameters**
 
-- `value` — `snowflake id`, Int64 value.
+- `value` — Snowflake ID. [Int64](../data-types/int-uint.md).
 - `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The function parses `time_string` according to the timezone. Optional. [String](../../sql-reference/data-types/string.md).
 
 **Returned value**
 
-- value converted to the `DateTime64` data type.
+- Value converted to the [DateTime64](../data-types/datetime64.md) data type.
 
 **Example**
 
@@ -1414,7 +1414,7 @@ Result:
 
 ## dateTimeToSnowflake {#dateTimeToSnowflake}
 
-Convert DateTime to the first snowflake id at the giving time.
+Converts [DateTime](../data-types/datetime.md) value to the first Snowflake ID at the given time.
 
 **Syntax**
 
@@ -1426,33 +1426,29 @@ dateTimeToSnowflake(value)
 
 - `value` — Date and time. [DateTime](../../sql-reference/data-types/datetime.md).
 
-
 **Returned value**
 
-- `value` converted to the `Int64` data type as the first snowflake id at that time.
+- `value` converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time.
 
 **Example**
 
 Query:
 
 ``` sql
-WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt
-SELECT dateTimeToSnowflake(dt);
+WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt SELECT dateTimeToSnowflake(dt);
 ```
 
 Result:
 
 ``` text
-
 ┌─dateTimeToSnowflake(dt)─┐
 │     1426860702823350272 │
 └─────────────────────────┘
 ```
 
-
 ## dateTime64ToSnowflake {#dateTime64ToSnowflake}
 
-Convert DateTime64 to the first snowflake id at the giving time.
+Converts [DateTime64](../data-types/datetime64.md) to the first Snowflake ID at the given time.
 
 **Syntax**
 
@@ -1464,18 +1460,16 @@ dateTime64ToSnowflake(value)
 
 - `value` — Date and time. [DateTime64](../../sql-reference/data-types/datetime64.md).
 
-
 **Returned value**
 
-- `value` converted to the `Int64` data type as the first snowflake id at that time.
+- `value` converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time.
 
 **Example**
 
 Query:
 
 ``` sql
-WITH toDateTime64('2021-08-15 18:57:56.492', 3, 'Asia/Shanghai') AS dt64
-SELECT dateTime64ToSnowflake(dt64);
+WITH toDateTime64('2021-08-15 18:57:56.492', 3, 'Asia/Shanghai') AS dt64 SELECT dateTime64ToSnowflake(dt64);
 ```
 
 Result:
 
@@ -1484,4 +1478,4 @@ Result:
 ┌─dateTime64ToSnowflake(dt64)─┐
 │         1426860704886947840 │
 └─────────────────────────────┘
-```
\ No newline at end of file
+```
diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md
index 757afca9588..0793864f526 100644
--- a/docs/ru/sql-reference/functions/type-conversion-functions.md
+++ b/docs/ru/sql-reference/functions/type-conversion-functions.md
@@ -1338,3 +1338,144 @@ FROM numbers(3);
 │ 2,"good" │
 └───────────────────────────────────────────┘
 ```
+
+## snowflakeToDateTime {#snowflakeToDateTime}
+
+Извлекает время из Snowflake ID в формате [DateTime](../data-types/datetime.md).
+
+**Синтаксис**
+
+``` sql
+snowflakeToDateTime(value [, time_zone])
+```
+
+**Аргументы**
+
+- `value` — Snowflake ID. [Int64](../data-types/int-uint.md).
+- `time_zone` — [временная зона сервера](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). Функция распознает `time_string` в соответствии с часовым поясом. Необязательный. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемое значение**
+
+- Значение, преобразованное в формат [DateTime](../data-types/datetime.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC');
+```
+
+Результат:
+
+``` text
+
+┌─snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC')─┐
+│ 2021-08-15 10:57:56 │
+└──────────────────────────────────────────────────────────────────┘
+```
+
+## snowflakeToDateTime64 {#snowflakeToDateTime64}
+
+Извлекает время из Snowflake ID в формате [DateTime64](../data-types/datetime64.md).
+
+**Синтаксис**
+
+``` sql
+snowflakeToDateTime64(value [, time_zone])
+```
+
+**Аргументы**
+
+- `value` — Snowflake ID. [Int64](../data-types/int-uint.md).
+- `time_zone` — [временная зона сервера](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). Функция распознает `time_string` в соответствии с часовым поясом. Необязательный. [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемое значение**
+
+- Значение, преобразованное в формат [DateTime64](../data-types/datetime64.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC');
+```
+
+Результат:
+
+``` text
+
+┌─snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC')─┐
+│ 2021-08-15 10:58:19.841 │
+└────────────────────────────────────────────────────────────────────┘
+```
+
+## dateTimeToSnowflake {#dateTimeToSnowflake}
+
+Преобразует значение [DateTime](../data-types/datetime.md) в первый идентификатор Snowflake ID на текущий момент.
+
+**Синтаксис**
+
+``` sql
+dateTimeToSnowflake(value)
+```
+
+**Аргументы**
+
+- `value` — дата и время. [DateTime](../../sql-reference/data-types/datetime.md).
+
+**Возвращаемое значение**
+
+- Значение, преобразованное в [Int64](../data-types/int-uint.md), как первый идентификатор Snowflake ID в момент выполнения.
+
+**Пример**
+
+Запрос:
+
+``` sql
+WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt SELECT dateTimeToSnowflake(dt);
+```
+
+Результат:
+
+``` text
+┌─dateTimeToSnowflake(dt)─┐
+│     1426860702823350272 │
+└─────────────────────────┘
+```
+
+## dateTime64ToSnowflake {#dateTime64ToSnowflake}
+
+Преобразует значение [DateTime64](../data-types/datetime64.md) в первый идентификатор Snowflake ID на текущий момент.
+
+**Синтаксис**
+
+``` sql
+dateTime64ToSnowflake(value)
+```
+
+**Аргументы**
+
+- `value` — дата и время. [DateTime64](../data-types/datetime64.md).
+
+**Возвращаемое значение**
+
+- Значение, преобразованное в [Int64](../data-types/int-uint.md), как первый идентификатор Snowflake ID в момент выполнения.
+
+
+**Пример**
+
+Запрос:
+
+``` sql
+WITH toDateTime64('2021-08-15 18:57:56.492', 3, 'Asia/Shanghai') AS dt64 SELECT dateTime64ToSnowflake(dt64);
+```
+
+Результат:
+
+``` text
+┌─dateTime64ToSnowflake(dt64)─┐
+│         1426860704886947840 │
+└─────────────────────────────┘
+```
diff --git a/docs/ru/sql-reference/statements/create/table.md b/docs/ru/sql-reference/statements/create/table.md
index 073cd4fa7c1..77c192b2b26 100644
--- a/docs/ru/sql-reference/statements/create/table.md
+++ b/docs/ru/sql-reference/statements/create/table.md
@@ -247,6 +247,7 @@ CREATE TABLE codec_example
 )
 ENGINE = MergeTree()
 ```
+
 ## Временные таблицы {#temporary-tables}
 
 ClickHouse поддерживает временные таблицы со следующими характеристиками:

From 57664d4802e2bf8d09f865f9e0ac5e4cc6da2977 Mon Sep 17 00:00:00 2001
From: romanzhukov
Date: Sun, 22 Aug 2021 17:48:35 +0300
Subject: [PATCH 013/177] Minor fix

---
 .../sql-reference/functions/type-conversion-functions.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md
index df1994e86fa..32822aae488 100644
--- a/docs/en/sql-reference/functions/type-conversion-functions.md
+++ b/docs/en/sql-reference/functions/type-conversion-functions.md
@@ -1357,7 +1357,7 @@ snowflakeToDateTime(value [, time_zone])
 
 **Returned value**
 
-- Value converted to the [DateTime](../data-types/datetime.md) data type.
+- Input value converted to the [DateTime](../data-types/datetime.md) data type.
 
 **Example**
 
@@ -1393,7 +1393,7 @@ snowflakeToDateTime64(value [, time_zone])
 
 **Returned value**
 
-- Value converted to the [DateTime64](../data-types/datetime64.md) data type.
+- Input value converted to the [DateTime64](../data-types/datetime64.md) data type.
 
 **Example**
 
@@ -1428,7 +1428,7 @@ dateTimeToSnowflake(value)
 
 **Returned value**
 
-- `value` converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time.
+- Input value converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time. **Example** @@ -1462,7 +1462,7 @@ dateTime64ToSnowflake(value) **Returned value** -- `value` converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time. +- Input value converted to the [Int64](../data-types/int-uint.md) data type as the first Snowflake ID at that time. **Example** From 30f404f4b6b8dbee29c086154efe3aea95e536d4 Mon Sep 17 00:00:00 2001 From: Roman Zhukov Date: Tue, 24 Aug 2021 16:43:56 +0000 Subject: [PATCH 014/177] Update setting ZooKeeper --- .../settings.md | 14 ++++++++++++-- .../settings.md | 18 ++++++++++++++++-- 2 files changed, 28 insertions(+), 4 deletions(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 66a2bde4b4b..adc7c3e0845 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1253,7 +1253,7 @@ If this section is specified, the path from [users_config](../../operations/serv The `user_directories` section can contain any number of items, the order of the items means their precedence (the higher the item the higher the precedence). -**Example** +**Examples** ``` xml @@ -1263,13 +1263,23 @@ The `user_directories` section can contain any number of items, the order of the /var/lib/clickhouse/access/ + +``` + +Users, roles, row policies, quotas and profiles can be also stored in ZooKeeper: + +``` xml + + + /etc/clickhouse-server/users.xml + /clickhouse/access/ ``` -You can also specify settings `memory` — means storing information only in memory, without writing to disk, and `ldap` — means storing information on an LDAP server. +You can also define sections memory `memory` — means storing information only in memory, without writing to disk, and `ldap` — means storing information on an LDAP server. To add an LDAP server as a remote user directory of users that are not defined locally, define a single `ldap` section with a following parameters: - `server` — one of LDAP server names defined in `ldap_servers` config section. This parameter is mandatory and cannot be empty. diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index 98c5748ba41..8c6033b75ae 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -1200,12 +1200,13 @@ ClickHouse использует ZooKeeper для хранения метадан Секция конфигурационного файла,которая содержит настройки: - Путь к конфигурационному файлу с предустановленными пользователями. - Путь к файлу, в котором содержатся пользователи, созданные при помощи SQL команд. +- Путь к узлу ZooKeeper, где хранятся и реплицируются пользователи, созданные с помощью команд SQL (экспериментально). Если эта секция определена, путь из [users_config](../../operations/server-configuration-parameters/settings.md#users-config) и [access_control_path](../../operations/server-configuration-parameters/settings.md#access_control_path) не используется. Секция `user_directories` может содержать любое количество элементов, порядок расположения элементов обозначает их приоритет (чем выше элемент, тем выше приоритет). 
-**Пример** +**Примеры** ``` xml @@ -1218,7 +1219,20 @@ ClickHouse использует ZooKeeper для хранения метадан ``` -Также вы можете указать настройку `memory` — означает хранение информации только в памяти, без записи на диск, и `ldap` — означает хранения информации на [LDAP-сервере](https://en.wikipedia.org/wiki/Lightweight_Directory_Access_Protocol). +Пользователи, роли, политики доступа к строкам, квоты и профили могут храниться в ZooKeeper: + +``` xml + + + /etc/clickhouse-server/users.xml + + + /clickhouse/access/ + + +``` + +Также вы можете секцию `memory` — означает хранение информации только в памяти, без записи на диск, и `ldap` — означает хранения информации на [LDAP-сервере](https://en.wikipedia.org/wiki/Lightweight_Directory_Access_Protocol). Чтобы добавить LDAP-сервер в качестве удаленного каталога пользователей, которые не определены локально, определите один раздел `ldap` со следующими параметрами: - `server` — имя одного из LDAP-серверов, определенных в секции `ldap_servers` конфигурациионного файла. Этот параметр явялется необязательным и может быть пустым. From 6caced9de215dbfa511ea9180dbb2527010bb39f Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:43:39 +0300 Subject: [PATCH 015/177] Update docs/en/operations/server-configuration-parameters/settings.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/operations/server-configuration-parameters/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index adc7c3e0845..199e287702f 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1266,7 +1266,7 @@ The `user_directories` section can contain any number of items, the order of the ``` -Users, roles, row policies, quotas and profiles can be also stored in ZooKeeper: +Users, roles, row policies, quotas, and profiles can be also stored in ZooKeeper: ``` xml From a140891ac078540db80b07c91b224d248d74ee5d Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:44:01 +0300 Subject: [PATCH 016/177] Update docs/en/operations/server-configuration-parameters/settings.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/operations/server-configuration-parameters/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/operations/server-configuration-parameters/settings.md b/docs/en/operations/server-configuration-parameters/settings.md index 199e287702f..41cfc75fe9a 100644 --- a/docs/en/operations/server-configuration-parameters/settings.md +++ b/docs/en/operations/server-configuration-parameters/settings.md @@ -1279,7 +1279,7 @@ Users, roles, row policies, quotas, and profiles can be also stored in ZooKeeper ``` -You can also define sections memory `memory` — means storing information only in memory, without writing to disk, and `ldap` — means storing information on an LDAP server. +You can also define sections `memory` — means storing information only in memory, without writing to disk, and `ldap` — means storing information on an LDAP server. To add an LDAP server as a remote user directory of users that are not defined locally, define a single `ldap` section with a following parameters: - `server` — one of LDAP server names defined in `ldap_servers` config section. This parameter is mandatory and cannot be empty. 
From e5b600eadc908b18318719ee8cb1ee9e08df71e3 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:44:08 +0300 Subject: [PATCH 017/177] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 32822aae488..ff3c5375a9a 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1376,7 +1376,7 @@ Result: └──────────────────────────────────────────────────────────────────┘ ``` -## snowflakeToDateTime64 {#snowflakeToDateTime64} +## snowflakeToDateTime64 {#snowflaketodatetime64} Extracts time from Snowflake ID as [DateTime64](../data-types/datetime64.md) format. From c5b9bbb7e139262618440ee60c030c158b31411e Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:44:32 +0300 Subject: [PATCH 018/177] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index ff3c5375a9a..8766f8c4392 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1412,7 +1412,7 @@ Result: └────────────────────────────────────────────────────────────────────┘ ``` -## dateTimeToSnowflake {#dateTimeToSnowflake} +## dateTimeToSnowflake {#datetimetosnowflake} Converts [DateTime](../data-types/datetime.md) value to the first Snowflake ID at the giving time. From e3e9ac5c9c6b92e1cbb504247c39c3ea28d22df8 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:44:43 +0300 Subject: [PATCH 019/177] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 8766f8c4392..488ff127b7d 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1446,7 +1446,7 @@ Result: └─────────────────────────┘ ``` -## dateTime64ToSnowflake {#dateTime64ToSnowflake} +## dateTime64ToSnowflake {#datetime64tosnowflake} Convert [DateTime64](../data-types/datetime64.md) to the first Snowflake ID at the giving time. 
From fd580de663954455bf9f97fdfdc414bbf4606082 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:44:50 +0300 Subject: [PATCH 020/177] Update docs/ru/operations/server-configuration-parameters/settings.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/operations/server-configuration-parameters/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index 8c6033b75ae..0850b59cdb7 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -1232,7 +1232,7 @@ ClickHouse использует ZooKeeper для хранения метадан ``` -Также вы можете секцию `memory` — означает хранение информации только в памяти, без записи на диск, и `ldap` — означает хранения информации на [LDAP-сервере](https://en.wikipedia.org/wiki/Lightweight_Directory_Access_Protocol). +Также вы можете добавить секции `memory` — означает хранение информации только в памяти, без записи на диск, и `ldap` — означает хранения информации на [LDAP-сервере](https://en.wikipedia.org/wiki/Lightweight_Directory_Access_Protocol). Чтобы добавить LDAP-сервер в качестве удаленного каталога пользователей, которые не определены локально, определите один раздел `ldap` со следующими параметрами: - `server` — имя одного из LDAP-серверов, определенных в секции `ldap_servers` конфигурациионного файла. Этот параметр явялется необязательным и может быть пустым. From a151aff2ca6cf33b4e8fe4a4e14f18a1b345503b Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:44:57 +0300 Subject: [PATCH 021/177] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 0793864f526..ab38222ed10 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1339,7 +1339,7 @@ FROM numbers(3); └───────────────────────────────────────────┘ ``` -## snowflakeToDateTime {#snowflakeToDateTime} +## snowflakeToDateTime {#snowflaketodatetime} Извлекает время из Snowflake ID в формате [DateTime](../data-types/datetime.md). 
From 30cfd8bcc2e0140646b8040972bf6227f1a0ab3c Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:45:05 +0300 Subject: [PATCH 022/177] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index ab38222ed10..ee2ac927e68 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1375,7 +1375,7 @@ SELECT snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC'); └──────────────────────────────────────────────────────────────────┘ ``` -## snowflakeToDateTime64 {#snowflakeToDateTime64} +## snowflakeToDateTime64 {#snowflaketodatetime64} Извлекает время из Snowflake ID в формате [DateTime64](../data-types/datetime64.md). From 73cf90a674332107940bdcc35ae6e2cd6b10c2c6 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:47:02 +0300 Subject: [PATCH 023/177] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index ee2ac927e68..1e26eb023fb 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1411,7 +1411,7 @@ SELECT snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC'); └────────────────────────────────────────────────────────────────────┘ ``` -## dateTimeToSnowflake {#dateTimeToSnowflake} +## dateTimeToSnowflake {#datetimetosnowflake} Преобразует значение [DateTime](../data-types/datetime.md) в первый идентификатор Snowflake ID на текущий момент. From f0573add400d3bb3d7a72b8d4b4bf8373805d4ef Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:47:07 +0300 Subject: [PATCH 024/177] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 1e26eb023fb..76fc13e09f1 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1445,7 +1445,7 @@ WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt SELECT dateTimeToS └─────────────────────────┘ ``` -## dateTime64ToSnowflake {#dateTime64ToSnowflake} +## dateTime64ToSnowflake {#datetime64tosnowflake} Преобразует значение [DateTime64](../data-types/datetime64.md) в первый идентификатор Snowflake ID на текущий момент. 
From 82129dc2fed77263f87bd62a6358197e9e37c326 Mon Sep 17 00:00:00 2001
From: Roman Bug
Date: Wed, 25 Aug 2021 10:47:14 +0300
Subject: [PATCH 025/177] Update
 docs/en/sql-reference/functions/type-conversion-functions.md

Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com>
---
 docs/en/sql-reference/functions/type-conversion-functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md
index 488ff127b7d..52b4da796f4 100644
--- a/docs/en/sql-reference/functions/type-conversion-functions.md
+++ b/docs/en/sql-reference/functions/type-conversion-functions.md
@@ -1378,7 +1378,7 @@ Result:

 ## snowflakeToDateTime64 {#snowflaketodatetime64}

-Extracts time from Snowflake ID as [DateTime64](../data-types/datetime64.md) format.
+Extracts time from [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) as [DateTime64](../data-types/datetime64.md) format.

 **Syntax**

From 62724a1566fcf85ff4171b8b8eb7674958e9edc6 Mon Sep 17 00:00:00 2001
From: Roman Bug
Date: Wed, 25 Aug 2021 10:47:20 +0300
Subject: [PATCH 026/177] Update
 docs/en/sql-reference/functions/type-conversion-functions.md

Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com>
---
 docs/en/sql-reference/functions/type-conversion-functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md
index 52b4da796f4..10c9d9f4664 100644
--- a/docs/en/sql-reference/functions/type-conversion-functions.md
+++ b/docs/en/sql-reference/functions/type-conversion-functions.md
@@ -1414,7 +1414,7 @@ Result:

 ## dateTimeToSnowflake {#datetimetosnowflake}

-Converts [DateTime](../data-types/datetime.md) value to the first Snowflake ID at the giving time.
+Converts [DateTime](../data-types/datetime.md) value to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the given time.

 **Syntax**

From af874dd7e052d0d6dbfbef8d0150c98d947e92e3 Mon Sep 17 00:00:00 2001
From: Roman Bug
Date: Wed, 25 Aug 2021 10:47:26 +0300
Subject: [PATCH 027/177] Update
 docs/en/sql-reference/functions/type-conversion-functions.md

Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com>
---
 docs/en/sql-reference/functions/type-conversion-functions.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md
index 10c9d9f4664..d8c40f6f551 100644
--- a/docs/en/sql-reference/functions/type-conversion-functions.md
+++ b/docs/en/sql-reference/functions/type-conversion-functions.md
@@ -1448,7 +1448,7 @@ Result:

 ## dateTime64ToSnowflake {#datetime64tosnowflake}

-Convert [DateTime64](../data-types/datetime64.md) to the first Snowflake ID at the giving time.
+Converts [DateTime64](../data-types/datetime64.md) to the first [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) at the given time.
**Syntax** From 40cb7171a89f75ee67a387c7ecf19eb164c618aa Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:47:32 +0300 Subject: [PATCH 028/177] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 76fc13e09f1..9cb8b9875b2 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1377,7 +1377,7 @@ SELECT snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC'); ## snowflakeToDateTime64 {#snowflaketodatetime64} -Извлекает время из Snowflake ID в формате [DateTime64](../data-types/datetime64.md). +Извлекает время из [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) в формате [DateTime64](../data-types/datetime64.md). **Синтаксис** From feb149f0ee533fc70d276a3f78d7a2b248a8fe1f Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:47:39 +0300 Subject: [PATCH 029/177] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 9cb8b9875b2..f6bc06771e9 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1447,7 +1447,7 @@ WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt SELECT dateTimeToS ## dateTime64ToSnowflake {#datetime64tosnowflake} -Преобразует значение [DateTime64](../data-types/datetime64.md) в первый идентификатор Snowflake ID на текущий момент. +Преобразует значение [DateTime64](../data-types/datetime64.md) в первый идентификатор [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) на текущий момент. **Синтаксис** From a8fd2813184e1b91ee70443dc22c8d59c5c33b00 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:47:47 +0300 Subject: [PATCH 030/177] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index f6bc06771e9..543b80f4cc0 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1413,7 +1413,7 @@ SELECT snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC'); ## dateTimeToSnowflake {#datetimetosnowflake} -Преобразует значение [DateTime](../data-types/datetime.md) в первый идентификатор Snowflake ID на текущий момент. +Преобразует значение [DateTime](../data-types/datetime.md) в первый идентификатор [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) на текущий момент. 
**Syntax** From 1b4ea013475ba6f4bec9ac3dee3c35ea75eb9d87 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:48:10 +0300 Subject: [PATCH 031/177] Update docs/ru/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/ru/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/functions/type-conversion-functions.md b/docs/ru/sql-reference/functions/type-conversion-functions.md index 543b80f4cc0..912f4ec0b1c 100644 --- a/docs/ru/sql-reference/functions/type-conversion-functions.md +++ b/docs/ru/sql-reference/functions/type-conversion-functions.md @@ -1341,7 +1341,7 @@ FROM numbers(3); ## snowflakeToDateTime {#snowflaketodatetime} -Извлекает время из Snowflake ID в формате [DateTime](../data-types/datetime.md). +Извлекает время из [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) в формате [DateTime](../data-types/datetime.md). **Синтаксис** From 4ddf7a2841fe2cc8bb1118a3c689d0ad6c5922f8 Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:48:16 +0300 Subject: [PATCH 032/177] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index d8c40f6f551..0ec144f5d69 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1342,7 +1342,7 @@ Result: ## snowflakeToDateTime {#snowflakeToDateTime} -Extracts time from Snowflake ID as [DateTime](../data-types/datetime.md) format. +Extracts time from [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) as [DateTime](../data-types/datetime.md) format. **Syntax** From 3ce50c3b4ac315420d52a5c8adfe9b27ac12799b Mon Sep 17 00:00:00 2001 From: Roman Bug Date: Wed, 25 Aug 2021 10:48:23 +0300 Subject: [PATCH 033/177] Update docs/en/sql-reference/functions/type-conversion-functions.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/sql-reference/functions/type-conversion-functions.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 0ec144f5d69..22a15a4a4f8 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -1340,7 +1340,7 @@ Result: └───────────────────────────────────────────┘ ``` -## snowflakeToDateTime {#snowflakeToDateTime} +## snowflakeToDateTime {#snowflaketodatetime} Extracts time from [Snowflake ID](https://en.wikipedia.org/wiki/Snowflake_ID) as [DateTime](../data-types/datetime.md) format. From c902afddde705547cfc6e89f5c385d7eb30a6108 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Fri, 5 Mar 2021 16:57:16 +0200 Subject: [PATCH 034/177] Added system.session_log table Which logs all the info about LogIn, LogOut and LogIn Failure events. 
Additional info that is logged: - User name - event type (LogIn, LogOut, LoginFailure) - Event date\time\time with microseconds - authentication type (same as for IDENTIFIED BY of CREATE USER statement) - array of active settings profiles upon login - array of active roles upon login - array of changed settings with corresponding values - client address and port - interface (TCP\HTTP\MySQL\PostgreSQL, etc.) - client info (name, version info) - optional LoginFailure reason text message. Added some tests to verify that events are properly saved with all necessary info via following interfaces: - TCP - HTTP - MySQL Known limitations - Not tested against named HTTP sessions, PostgreSQL and gRPC, hence those are not guaranteed to work 100% properly. --- docker/test/fasttest/run.sh | 3 + programs/local/LocalServer.cpp | 2 + programs/server/Server.cpp | 7 +- programs/server/config.xml | 8 + programs/server/users.d/session_log_test.xml | 1 + src/Access/AccessControlManager.h | 11 +- src/Access/SettingsProfilesCache.cpp | 1 - src/Access/SettingsProfilesInfo.h | 10 + src/Core/MySQL/Authentication.cpp | 3 + src/Core/MySQL/MySQLSession.h | 19 + src/Core/PostgreSQLProtocol.h | 5 +- src/Interpreters/Context.cpp | 28 +- src/Interpreters/Context.h | 15 + src/Interpreters/InterpreterSetQuery.cpp | 5 + src/Interpreters/InterpreterSystemQuery.cpp | 4 +- src/Interpreters/Session.cpp | 66 +++- src/Interpreters/Session.h | 10 +- src/Interpreters/SessionLog.cpp | 261 ++++++++++++ src/Interpreters/SessionLog.h | 74 ++++ src/Interpreters/SystemLog.cpp | 14 + src/Interpreters/SystemLog.h | 3 + src/Interpreters/ya.make | 2 + .../Formats/Impl/MySQLOutputFormat.h | 2 + src/Server/HTTPHandler.h | 3 +- src/Server/MySQLHandler.cpp | 2 +- src/TableFunctions/TableFunctionMySQL.cpp | 3 +- tests/config/install.sh | 1 + tests/config/users.d/session_log_test.xml | 30 ++ .../0_stateless/01033_quota_dcl.reference | 2 +- .../01702_system_query_log.reference | 2 +- .../01747_system_session_log_long.reference | 218 +++++++++++ .../01747_system_session_log_long.sh | 370 ++++++++++++++++++ tests/queries/skip_list.json | 4 +- 33 files changed, 1164 insertions(+), 25 deletions(-) create mode 120000 programs/server/users.d/session_log_test.xml create mode 100644 src/Core/MySQL/MySQLSession.h create mode 100644 src/Interpreters/SessionLog.cpp create mode 100644 src/Interpreters/SessionLog.h create mode 100644 tests/config/users.d/session_log_test.xml create mode 100644 tests/queries/0_stateless/01747_system_session_log_long.reference create mode 100755 tests/queries/0_stateless/01747_system_session_log_long.sh diff --git a/docker/test/fasttest/run.sh b/docker/test/fasttest/run.sh index 00af261f6c8..108544779b4 100755 --- a/docker/test/fasttest/run.sh +++ b/docker/test/fasttest/run.sh @@ -399,6 +399,9 @@ function run_tests # depends on Go 02013_zlib_read_after_eof + + # Accesses CH via mysql table function (which is unavailable) + 01747_system_session_log_long ) time clickhouse-test --hung-check -j 8 --order=random --use-skip-list \ diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 2b1b6185321..258743c7e16 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -9,6 +9,7 @@ #include #include #include +#include #include #include #include @@ -393,6 +394,7 @@ void LocalServer::processQueries() auto context = session.makeQueryContext(); context->makeSessionContext(); /// initial_create_query requires a session context to be set. 
     context->setCurrentQueryId("");
+    applyCmdSettings(context);
 
     /// Use the same query_id (and thread group) for all queries
diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp
index ddbc4c4e433..09b6add62d0 100644
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@@ -45,16 +45,21 @@
 #include
 #include
 #include
+#include
+#include
 #include
+#include
 #include
+#include
 #include
+#include
 #include
 #include
 #include
 #include
 #include
-#include
 #include
+#include
 #include
 #include
 #include
diff --git a/programs/server/config.xml b/programs/server/config.xml
index 510a5e230f8..98c4416da46 100644
--- a/programs/server/config.xml
+++ b/programs/server/config.xml
@@ -964,6 +964,14 @@
         1000
 
+
+    <session_log>
+        <database>system</database>
+        <table>session_log</table>
+
+        <partition_by>toYYYYMM(event_date)</partition_by>
+        <flush_interval_milliseconds>7500</flush_interval_milliseconds>
+    </session_log>
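With the section above in place, the new table can be inspected like any other system table. The query below is only an illustrative sketch: the full column set is defined by SessionLog.cpp later in this patch, and only columns referenced elsewhere in this series (user, interface, type, event_date, event_time_microseconds) are used here.

    SELECT user, interface, type, event_time_microseconds
    FROM system.session_log
    WHERE event_date = today()
    ORDER BY event_time_microseconds DESC
    LIMIT 10;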
+ + + + + none + + + + + + + + + + + + ::1 + 127.0.0.1 + + session_log_test_xml_profile + default + + + diff --git a/tests/queries/0_stateless/01033_quota_dcl.reference b/tests/queries/0_stateless/01033_quota_dcl.reference index 7478adac441..e732ea2fcd6 100644 --- a/tests/queries/0_stateless/01033_quota_dcl.reference +++ b/tests/queries/0_stateless/01033_quota_dcl.reference @@ -1 +1 @@ -CREATE QUOTA default KEYED BY user_name FOR INTERVAL 1 hour TRACKING ONLY TO default, readonly +CREATE QUOTA default KEYED BY user_name FOR INTERVAL 1 hour TRACKING ONLY TO default, readonly, session_log_test_xml_user diff --git a/tests/queries/0_stateless/01702_system_query_log.reference b/tests/queries/0_stateless/01702_system_query_log.reference index 1f329feac22..3458c2e5ed4 100644 --- a/tests/queries/0_stateless/01702_system_query_log.reference +++ b/tests/queries/0_stateless/01702_system_query_log.reference @@ -8,6 +8,7 @@ GRANT queries REVOKE queries Misc queries ACTUAL LOG CONTENT: + -- fire all kinds of queries and then check if those are present in the system.query_log\nSET log_comment=\'system.query_log logging test\'; Select SELECT \'DROP queries and also a cleanup before the test\'; Drop DROP DATABASE IF EXISTS sqllt SYNC; DROP USER IF EXISTS sqllt_user; @@ -82,5 +83,4 @@ Rename RENAME TABLE sqllt.table TO sqllt.table_new; Rename RENAME TABLE sqllt.table_new TO sqllt.table; Drop TRUNCATE TABLE sqllt.table; Drop DROP TABLE sqllt.table SYNC; - SET log_comment=\'\'; DROP queries and also a cleanup after the test diff --git a/tests/queries/0_stateless/01747_system_session_log_long.reference b/tests/queries/0_stateless/01747_system_session_log_long.reference new file mode 100644 index 00000000000..9ecf7e05421 --- /dev/null +++ b/tests/queries/0_stateless/01747_system_session_log_long.reference @@ -0,0 +1,218 @@ + +# no_password - User with profile from XML +TCP endpoint +TCP 'wrong password' case is skipped for no_password. +HTTP endpoint +HTTP 'wrong password' case is skipped for no_password. +MySQL endpoint +MySQL 'wrong password' case is skipped for no_password. + +# no_password - No profiles no roles +TCP endpoint +TCP 'wrong password' case is skipped for no_password. +HTTP endpoint +HTTP 'wrong password' case is skipped for no_password. +MySQL endpoint +MySQL 'wrong password' case is skipped for no_password. + +# no_password - Two profiles, no roles +TCP endpoint +TCP 'wrong password' case is skipped for no_password. +HTTP endpoint +HTTP 'wrong password' case is skipped for no_password. +MySQL endpoint +MySQL 'wrong password' case is skipped for no_password. + +# no_password - Two profiles and two simple roles +TCP endpoint +TCP 'wrong password' case is skipped for no_password. +HTTP endpoint +HTTP 'wrong password' case is skipped for no_password. +MySQL endpoint +MySQL 'wrong password' case is skipped for no_password. + +# plaintext_password - No profiles no roles +TCP endpoint +HTTP endpoint +MySQL endpoint + +# plaintext_password - Two profiles, no roles +TCP endpoint +HTTP endpoint +MySQL endpoint + +# plaintext_password - Two profiles and two simple roles +TCP endpoint +HTTP endpoint +MySQL endpoint + +# sha256_password - No profiles no roles +TCP endpoint +HTTP endpoint +MySQL endpoint +MySQL 'successful login' case is skipped for sha256_password. + +# sha256_password - Two profiles, no roles +TCP endpoint +HTTP endpoint +MySQL endpoint +MySQL 'successful login' case is skipped for sha256_password. 
+ +# sha256_password - Two profiles and two simple roles +TCP endpoint +HTTP endpoint +MySQL endpoint +MySQL 'successful login' case is skipped for sha256_password. + +# double_sha1_password - No profiles no roles +TCP endpoint +HTTP endpoint +MySQL endpoint + +# double_sha1_password - Two profiles, no roles +TCP endpoint +HTTP endpoint +MySQL endpoint + +# double_sha1_password - Two profiles and two simple roles +TCP endpoint +HTTP endpoint +MySQL endpoint +${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles TCP LoginFailure 1 +${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles TCP LoginSuccess 1 +${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles TCP Logout 1 +${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles HTTP LoginFailure 1 +${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles HTTP Logout 1 +${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles MySQL LoginFailure many +${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles MySQL LoginSuccess 1 +${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles MySQL Logout 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles TCP LoginFailure 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles TCP LoginSuccess 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles TCP Logout 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles HTTP LoginFailure 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles HTTP Logout 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles MySQL LoginFailure many +${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles MySQL LoginSuccess 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles MySQL Logout 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles TCP LoginFailure 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles TCP LoginSuccess 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles TCP Logout 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles HTTP LoginFailure 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles HTTP Logout 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles MySQL LoginFailure many +${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles MySQL LoginSuccess 1 +${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles MySQL Logout 1 +${BASE_USERNAME}_no_password_no_profiles_no_roles TCP LoginSuccess 1 +${BASE_USERNAME}_no_password_no_profiles_no_roles TCP Logout 1 +${BASE_USERNAME}_no_password_no_profiles_no_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_no_password_no_profiles_no_roles HTTP Logout 1 +${BASE_USERNAME}_no_password_no_profiles_no_roles MySQL LoginSuccess 1 +${BASE_USERNAME}_no_password_no_profiles_no_roles MySQL Logout 1 +${BASE_USERNAME}_no_password_two_profiles_no_roles TCP LoginSuccess 1 +${BASE_USERNAME}_no_password_two_profiles_no_roles TCP Logout 1 +${BASE_USERNAME}_no_password_two_profiles_no_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_no_password_two_profiles_no_roles HTTP Logout 1 +${BASE_USERNAME}_no_password_two_profiles_no_roles MySQL LoginSuccess 1 +${BASE_USERNAME}_no_password_two_profiles_no_roles MySQL Logout 1 +${BASE_USERNAME}_no_password_two_profiles_two_roles TCP LoginSuccess 1 +${BASE_USERNAME}_no_password_two_profiles_two_roles TCP 
Logout 1 +${BASE_USERNAME}_no_password_two_profiles_two_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_no_password_two_profiles_two_roles HTTP Logout 1 +${BASE_USERNAME}_no_password_two_profiles_two_roles MySQL LoginSuccess 1 +${BASE_USERNAME}_no_password_two_profiles_two_roles MySQL Logout 1 +${BASE_USERNAME}_plaintext_password_no_profiles_no_roles TCP LoginFailure 1 +${BASE_USERNAME}_plaintext_password_no_profiles_no_roles TCP LoginSuccess 1 +${BASE_USERNAME}_plaintext_password_no_profiles_no_roles TCP Logout 1 +${BASE_USERNAME}_plaintext_password_no_profiles_no_roles HTTP LoginFailure 1 +${BASE_USERNAME}_plaintext_password_no_profiles_no_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_plaintext_password_no_profiles_no_roles HTTP Logout 1 +${BASE_USERNAME}_plaintext_password_no_profiles_no_roles MySQL LoginFailure many +${BASE_USERNAME}_plaintext_password_no_profiles_no_roles MySQL LoginSuccess 1 +${BASE_USERNAME}_plaintext_password_no_profiles_no_roles MySQL Logout 1 +${BASE_USERNAME}_plaintext_password_two_profiles_no_roles TCP LoginFailure 1 +${BASE_USERNAME}_plaintext_password_two_profiles_no_roles TCP LoginSuccess 1 +${BASE_USERNAME}_plaintext_password_two_profiles_no_roles TCP Logout 1 +${BASE_USERNAME}_plaintext_password_two_profiles_no_roles HTTP LoginFailure 1 +${BASE_USERNAME}_plaintext_password_two_profiles_no_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_plaintext_password_two_profiles_no_roles HTTP Logout 1 +${BASE_USERNAME}_plaintext_password_two_profiles_no_roles MySQL LoginFailure many +${BASE_USERNAME}_plaintext_password_two_profiles_no_roles MySQL LoginSuccess 1 +${BASE_USERNAME}_plaintext_password_two_profiles_no_roles MySQL Logout 1 +${BASE_USERNAME}_plaintext_password_two_profiles_two_roles TCP LoginFailure 1 +${BASE_USERNAME}_plaintext_password_two_profiles_two_roles TCP LoginSuccess 1 +${BASE_USERNAME}_plaintext_password_two_profiles_two_roles TCP Logout 1 +${BASE_USERNAME}_plaintext_password_two_profiles_two_roles HTTP LoginFailure 1 +${BASE_USERNAME}_plaintext_password_two_profiles_two_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_plaintext_password_two_profiles_two_roles HTTP Logout 1 +${BASE_USERNAME}_plaintext_password_two_profiles_two_roles MySQL LoginFailure many +${BASE_USERNAME}_plaintext_password_two_profiles_two_roles MySQL LoginSuccess 1 +${BASE_USERNAME}_plaintext_password_two_profiles_two_roles MySQL Logout 1 +${BASE_USERNAME}_sha256_password_no_profiles_no_roles TCP LoginFailure 1 +${BASE_USERNAME}_sha256_password_no_profiles_no_roles TCP LoginSuccess 1 +${BASE_USERNAME}_sha256_password_no_profiles_no_roles TCP Logout 1 +${BASE_USERNAME}_sha256_password_no_profiles_no_roles HTTP LoginFailure 1 +${BASE_USERNAME}_sha256_password_no_profiles_no_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_sha256_password_no_profiles_no_roles HTTP Logout 1 +${BASE_USERNAME}_sha256_password_no_profiles_no_roles MySQL LoginFailure many +${BASE_USERNAME}_sha256_password_two_profiles_no_roles TCP LoginFailure 1 +${BASE_USERNAME}_sha256_password_two_profiles_no_roles TCP LoginSuccess 1 +${BASE_USERNAME}_sha256_password_two_profiles_no_roles TCP Logout 1 +${BASE_USERNAME}_sha256_password_two_profiles_no_roles HTTP LoginFailure 1 +${BASE_USERNAME}_sha256_password_two_profiles_no_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_sha256_password_two_profiles_no_roles HTTP Logout 1 +${BASE_USERNAME}_sha256_password_two_profiles_no_roles MySQL LoginFailure many +${BASE_USERNAME}_sha256_password_two_profiles_two_roles TCP LoginFailure 1 +${BASE_USERNAME}_sha256_password_two_profiles_two_roles TCP 
LoginSuccess 1 +${BASE_USERNAME}_sha256_password_two_profiles_two_roles TCP Logout 1 +${BASE_USERNAME}_sha256_password_two_profiles_two_roles HTTP LoginFailure 1 +${BASE_USERNAME}_sha256_password_two_profiles_two_roles HTTP LoginSuccess 1 +${BASE_USERNAME}_sha256_password_two_profiles_two_roles HTTP Logout 1 +${BASE_USERNAME}_sha256_password_two_profiles_two_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_double_sha1_password_no_profiles_no_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_double_sha1_password_two_profiles_no_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_double_sha1_password_two_profiles_two_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_no_password_no_profiles_no_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_no_password_no_profiles_no_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_no_password_no_profiles_no_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_no_password_two_profiles_no_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_no_password_two_profiles_no_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_no_password_two_profiles_no_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_no_password_two_profiles_two_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_no_password_two_profiles_two_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_no_password_two_profiles_two_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_plaintext_password_no_profiles_no_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_plaintext_password_no_profiles_no_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_plaintext_password_no_profiles_no_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_plaintext_password_two_profiles_no_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_plaintext_password_two_profiles_no_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_plaintext_password_two_profiles_no_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_plaintext_password_two_profiles_two_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_plaintext_password_two_profiles_two_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_plaintext_password_two_profiles_two_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_sha256_password_no_profiles_no_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_sha256_password_no_profiles_no_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_sha256_password_no_profiles_no_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_sha256_password_two_profiles_no_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_sha256_password_two_profiles_no_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_sha256_password_two_profiles_no_roles MySQL LoginFailure many +invalid_${BASE_USERNAME}_sha256_password_two_profiles_two_roles TCP LoginFailure 1 +invalid_${BASE_USERNAME}_sha256_password_two_profiles_two_roles HTTP LoginFailure 1 +invalid_${BASE_USERNAME}_sha256_password_two_profiles_two_roles MySQL LoginFailure many +invalid_session_log_test_xml_user TCP LoginFailure 1 
+invalid_session_log_test_xml_user HTTP LoginFailure 1 +invalid_session_log_test_xml_user MySQL LoginFailure many +session_log_test_xml_user TCP LoginSuccess 1 +session_log_test_xml_user TCP Logout 1 +session_log_test_xml_user HTTP LoginSuccess 1 +session_log_test_xml_user HTTP Logout 1 +session_log_test_xml_user MySQL LoginSuccess 1 +session_log_test_xml_user MySQL Logout 1 diff --git a/tests/queries/0_stateless/01747_system_session_log_long.sh b/tests/queries/0_stateless/01747_system_session_log_long.sh new file mode 100755 index 00000000000..16b32a08442 --- /dev/null +++ b/tests/queries/0_stateless/01747_system_session_log_long.sh @@ -0,0 +1,370 @@ +#!/usr/bin/env bash + +################################################################################################## +# Verify that login, logout, and login failure events are properly stored in system.session_log +# when different `IDENTIFIED BY` clauses are used on user. +# +# Make sure that system.session_log entries are non-empty and provide enough info on each event. +# +# Using multiple protocols +# * native TCP protocol with CH client +# * HTTP with CURL +# * MySQL - CH server accesses itself via mysql table function, query typically fails (unrelated) +# but auth should be performed properly. +# * PostgreSQL - CH server accesses itself via postgresql table function (currently out of order). +# * gRPC - not done yet +# +# There is way to control how many time a query (e.g. via mysql table function) is retried +# and hence variable number of records in session_log. To mitigate this and simplify final query, +# each auth_type is tested for separate user. That way SELECT DISTINCT doesn't exclude log entries +# from different cases. +# +# All created users added to the ALL_USERNAMES and later cleaned up. +################################################################################################## + +# To minimize amount of error context sent on failed queries when talking to CH via MySQL protocol. +export CLICKHOUSE_CLIENT_SERVER_LOGS_LEVEL=none + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +set -eu + +# Since there is no way to cleanup system.session_log table, +# make sure that we can identify log entries from this test by a random user name. +readonly BASE_USERNAME="session_log_test_user_$(cat /dev/urandom | tr -cd 'a-f0-9' | head -c 32)" +readonly TMP_QUERY_FILE=$(mktemp /tmp/tmp_query.log.XXXXXX) +declare -a ALL_USERNAMES +ALL_USERNAMES+=("${BASE_USERNAME}") + +function reportError() +{ + if [ -s "${TMP_QUERY_FILE}" ] ; + then + echo "!!!!!! ERROR ${CLICKHOUSE_CLIENT} ${*} --queries-file ${TMP_QUERY_FILE}" >&2 + echo "query:" >&2 + cat "${TMP_QUERY_FILE}" >&2 + rm -f "${TMP_QUERY_FILE}" + fi +} + +function executeQuery() +{ + ## Execute query (provided via heredoc or herestring) and print query in case of error. + trap 'rm -f ${TMP_QUERY_FILE}; trap - ERR RETURN' RETURN + # Since we want to report with current values supplied to this function call + # shellcheck disable=SC2064 + trap "reportError $*" ERR + + cat - > "${TMP_QUERY_FILE}" + ${CLICKHOUSE_CLIENT} "${@}" --queries-file "${TMP_QUERY_FILE}" +} + +function cleanup() +{ + local usernames_to_cleanup + usernames_to_cleanup="$(IFS=, ; echo "${ALL_USERNAMES[*]}")" + executeQuery < "${TMP_QUERY_FILE}" + ! 
${CLICKHOUSE_CLIENT} "${@}" --multiquery --queries-file "${TMP_QUERY_FILE}" 2>&1 | tee -a ${TMP_QUERY_FILE} +} + +function createUser() +{ + local auth_type="${1}" + local username="${2}" + local password="${3}" + + if [[ "${auth_type}" == "no_password" ]] + then + password="" + + elif [[ "${auth_type}" == "plaintext_password" ]] + then + password="${password}" + + elif [[ "${auth_type}" == "sha256_password" ]] + then + password="$(executeQuery <<< "SELECT hex(SHA256('${password}'))")" + + elif [[ "${auth_type}" == "double_sha1_password" ]] + then + password="$(executeQuery <<< "SELECT hex(SHA1(SHA1('${password}')))")" + + else + echo "Invalid auth_type: ${auth_type}" >&2 + exit 1 + fi + + export RESULTING_PASS="${password}" + if [ -n "${password}" ] + then + password="BY '${password}'" + fi + + executeQuery < 1, 'many', toString(count(*))) -- do not rely on count value since MySQL does arbitrary number of retries +FROM + system.session_log +WHERE + (user LIKE '%session_log_test_xml_user%' OR user LIKE '%${BASE_USERNAME}%') + AND + event_time_microseconds >= test_start_time +GROUP BY + user_name, interface, type +ORDER BY + user_name, interface, type; +EOF \ No newline at end of file diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 0143cc78dbe..91fca7eb5d5 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -466,7 +466,7 @@ "polygon_dicts", // they use an explicitly specified database "01658_read_file_to_stringcolumn", "01721_engine_file_truncate_on_insert", // It's ok to execute in parallel but not several instances of the same test. - "01702_system_query_log", // It's ok to execute in parallel with oter tests but not several instances of the same test. + "01702_system_query_log", // It's ok to execute in parallel but not several instances of the same test. 
"01748_dictionary_table_dot", // creates database "00950_dict_get", "01615_random_one_shard_insertion", @@ -514,5 +514,7 @@ "02001_add_default_database_to_system_users", ///create user "02002_row_level_filter_bug", ///create user "02015_system_views" + "02002_row_level_filter_bug", ///create user + "01747_system_session_log_long" // Reads from system.session_log and can't be run in parallel with any other test (since almost any other test writes to session_log) ] } From 3ca0b0c8605f1933d237c870b0038d40401331d4 Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Mon, 30 Aug 2021 19:08:02 +0300 Subject: [PATCH 035/177] Fixed GCC-9 build --- src/Interpreters/SessionLog.cpp | 8 ++++---- src/Interpreters/SessionLog.h | 2 +- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/SessionLog.cpp b/src/Interpreters/SessionLog.cpp index 2d2f1358656..4967cb867c8 100644 --- a/src/Interpreters/SessionLog.cpp +++ b/src/Interpreters/SessionLog.cpp @@ -202,11 +202,11 @@ void SessionLogElement::appendToBlock(MutableColumns & columns) const columns[i++]->insertData(auth_failure_reason.data(), auth_failure_reason.length()); } -void SessionLog::addLoginSuccess(const UUID & session_id, std::optional session_name, const Context & context) +void SessionLog::addLoginSuccess(const UUID & session_id, std::optional session_name, const Context & login_context) { - const auto access = context.getAccess(); - const auto & settings = context.getSettingsRef(); - const auto & client_info = context.getClientInfo(); + const auto access = login_context.getAccess(); + const auto & settings = login_context.getSettingsRef(); + const auto & client_info = login_context.getClientInfo(); DB::SessionLogElement log_entry(session_id, SESSION_LOGIN_SUCCESS); log_entry.client_info = client_info; diff --git a/src/Interpreters/SessionLog.h b/src/Interpreters/SessionLog.h index 2530809f9f9..fddabf45e4e 100644 --- a/src/Interpreters/SessionLog.h +++ b/src/Interpreters/SessionLog.h @@ -66,7 +66,7 @@ class SessionLog : public SystemLog using SystemLog::SystemLog; public: - void addLoginSuccess(const UUID & session_id, std::optional session_name, const Context & context); + void addLoginSuccess(const UUID & session_id, std::optional session_name, const Context & login_context); void addLoginFailure(const UUID & session_id, const ClientInfo & info, const String & user, const Exception & reason); void addLogOut(const UUID & session_id, const String & user, const ClientInfo & client_info); }; From c4416906c8d70da4bebcb4086ca4ff49367f9a92 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 30 Aug 2021 19:37:03 +0000 Subject: [PATCH 036/177] done --- programs/local/LocalServer.cpp | 2 + programs/server/Server.cpp | 2 + src/Interpreters/Context.cpp | 52 ++- src/Interpreters/Context.h | 23 +- .../MergeTree/BackgroundJobsExecutor.cpp | 306 ++++++------------ .../MergeTree/BackgroundJobsExecutor.h | 137 ++------ src/Storages/MergeTree/ExecutableTask.h | 21 ++ .../MergeTree/MergeMutateExecutor.cpp | 66 ++++ src/Storages/MergeTree/MergeMutateExecutor.h | 204 ++++++++++++ src/Storages/MergeTree/MergeTask.h | 253 +++++++++++++++ src/Storages/MergeTree/MergeTreeData.cpp | 11 +- src/Storages/MergeTree/MergeTreeData.h | 9 +- src/Storages/MergeTree/MergeTreeSink.cpp | 2 +- .../MergeTree/ReplicatedMergeTreeQueue.cpp | 6 +- src/Storages/StorageMergeTree.cpp | 62 ++-- src/Storages/StorageMergeTree.h | 20 +- src/Storages/StorageReplicatedMergeTree.cpp | 30 +- src/Storages/StorageReplicatedMergeTree.h | 20 +- 18 files changed, 841 
insertions(+), 385 deletions(-) create mode 100644 src/Storages/MergeTree/ExecutableTask.h create mode 100644 src/Storages/MergeTree/MergeMutateExecutor.cpp create mode 100644 src/Storages/MergeTree/MergeMutateExecutor.h create mode 100644 src/Storages/MergeTree/MergeTask.h diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 2b1b6185321..acbb84ea870 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -246,6 +246,8 @@ try /// Sets external authenticators config (LDAP, Kerberos). global_context->setExternalAuthenticatorsConfig(config()); + global_context->initializeBackgroundExecutors(); + setupUsers(); /// Limit on total number of concurrently executing queries. diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index ddbc4c4e433..19245eed36b 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -549,6 +549,8 @@ if (ThreadFuzzer::instance().isEffective()) // ignore `max_thread_pool_size` in configs we fetch from ZK, but oh well. GlobalThreadPool::initialize(config().getUInt("max_thread_pool_size", 10000)); + global_context->initializeBackgroundExecutors(); + ConnectionCollector::init(global_context, config().getUInt("max_threads_for_connection_collector", 10)); bool has_zookeeper = config().has("zookeeper"); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index d984a350c80..0d323b141a2 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -100,6 +100,13 @@ namespace CurrentMetrics extern const Metric BackgroundBufferFlushSchedulePoolTask; extern const Metric BackgroundDistributedSchedulePoolTask; extern const Metric BackgroundMessageBrokerSchedulePoolTask; + + + extern const Metric DelayedInserts; + extern const Metric BackgroundPoolTask; + extern const Metric BackgroundMovePoolTask; + extern const Metric BackgroundFetchesPoolTask; + } namespace DB @@ -1503,9 +1510,9 @@ BackgroundSchedulePool & Context::getBufferFlushSchedulePool() const return *shared->buffer_flush_schedule_pool; } -BackgroundTaskSchedulingSettings Context::getBackgroundProcessingTaskSchedulingSettings() const +ExecutableTaskSchedulingSettings Context::getBackgroundProcessingTaskSchedulingSettings() const { - BackgroundTaskSchedulingSettings task_settings; + ExecutableTaskSchedulingSettings task_settings; const auto & config = getConfigRef(); task_settings.thread_sleep_seconds = config.getDouble("background_processing_pool_thread_sleep_seconds", 10); @@ -1518,9 +1525,9 @@ BackgroundTaskSchedulingSettings Context::getBackgroundProcessingTaskSchedulingS return task_settings; } -BackgroundTaskSchedulingSettings Context::getBackgroundMoveTaskSchedulingSettings() const +ExecutableTaskSchedulingSettings Context::getBackgroundMoveTaskSchedulingSettings() const { - BackgroundTaskSchedulingSettings task_settings; + ExecutableTaskSchedulingSettings task_settings; const auto & config = getConfigRef(); task_settings.thread_sleep_seconds = config.getDouble("background_move_processing_pool_thread_sleep_seconds", 10); @@ -2691,4 +2698,41 @@ PartUUIDsPtr Context::getIgnoredPartUUIDs() const return ignored_part_uuids; } + +void Context::initializeBackgroundExecutors() +{ + merge_mutate_executor = MergeTreeBackgroundExecutor::create(); + moves_executor = MergeTreeBackgroundExecutor::create(); + fetch_executor = MergeTreeBackgroundExecutor::create(); + + merge_mutate_executor->setThreadsCount([this] () { return getSettingsRef().background_pool_size; }); + 
merge_mutate_executor->setTasksCount([this] () { return getSettingsRef().background_pool_size; }); + merge_mutate_executor->setMetric(CurrentMetrics::BackgroundPoolTask); + + moves_executor->setThreadsCount([this] () { return getSettingsRef().background_move_pool_size; }); + moves_executor->setTasksCount([this] () { return getSettingsRef().background_move_pool_size; }); + moves_executor->setMetric(CurrentMetrics::BackgroundMovePoolTask); + + fetch_executor->setThreadsCount([this] () { return getSettingsRef().background_fetches_pool_size; }); + fetch_executor->setTasksCount([this] () { return getSettingsRef().background_fetches_pool_size; }); + fetch_executor->setMetric(CurrentMetrics::BackgroundFetchesPoolTask); +} + + +MergeTreeBackgroundExecutorPtr Context::getMergeMutateExecutor() const +{ + return merge_mutate_executor; +} + +MergeTreeBackgroundExecutorPtr Context::getMovesExecutor() const +{ + return moves_executor; +} + +MergeTreeBackgroundExecutorPtr Context::getFetchesExecutor() const +{ + return fetch_executor; +} + + } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 6af2c3c4d62..f7c90c9377b 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -100,6 +100,8 @@ using StoragePolicyPtr = std::shared_ptr; using StoragePoliciesMap = std::map; class StoragePolicySelector; using StoragePolicySelectorPtr = std::shared_ptr; +class MergeTreeBackgroundExecutor; +using MergeTreeBackgroundExecutorPtr = std::shared_ptr; struct PartUUIDs; using PartUUIDsPtr = std::shared_ptr; class KeeperDispatcher; @@ -110,7 +112,7 @@ using OutputFormatPtr = std::shared_ptr; class IVolume; using VolumePtr = std::shared_ptr; struct NamedSession; -struct BackgroundTaskSchedulingSettings; +struct ExecutableTaskSchedulingSettings; #if USE_NLP class SynonymsExtensions; @@ -279,6 +281,12 @@ private: /// A flag, used to distinguish between user query and internal query to a database engine (MaterializePostgreSQL). bool is_internal_query = false; + /// Background executors for *MergeTree tables + /// Must be in global context + MergeTreeBackgroundExecutorPtr merge_mutate_executor; + MergeTreeBackgroundExecutorPtr moves_executor; + MergeTreeBackgroundExecutorPtr fetch_executor; + public: // Top-level OpenTelemetry trace context for the query. Makes sense only for a query context. 
OpenTelemetryTraceContext query_trace_context; @@ -689,8 +697,8 @@ public: void dropCaches() const; /// Settings for MergeTree background tasks stored in config.xml - BackgroundTaskSchedulingSettings getBackgroundProcessingTaskSchedulingSettings() const; - BackgroundTaskSchedulingSettings getBackgroundMoveTaskSchedulingSettings() const; + ExecutableTaskSchedulingSettings getBackgroundProcessingTaskSchedulingSettings() const; + ExecutableTaskSchedulingSettings getBackgroundMoveTaskSchedulingSettings() const; BackgroundSchedulePool & getBufferFlushSchedulePool() const; BackgroundSchedulePool & getSchedulePool() const; @@ -825,6 +833,15 @@ public: ReadTaskCallback getReadTaskCallback() const; void setReadTaskCallback(ReadTaskCallback && callback); + + /// Background executors related methods + + void initializeBackgroundExecutors(); + + MergeTreeBackgroundExecutorPtr getMergeMutateExecutor() const; + MergeTreeBackgroundExecutorPtr getMovesExecutor() const; + MergeTreeBackgroundExecutorPtr getFetchesExecutor() const; + private: std::unique_lock getLock() const; diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp index f3d957117e8..11039edc057 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp @@ -5,258 +5,142 @@ #include #include -namespace CurrentMetrics -{ - extern const Metric BackgroundPoolTask; - extern const Metric BackgroundMovePoolTask; - extern const Metric BackgroundFetchesPoolTask; -} - namespace DB { -IBackgroundJobExecutor::IBackgroundJobExecutor( - ContextPtr global_context_, - const BackgroundTaskSchedulingSettings & sleep_settings_, - const std::vector & pools_configs_) +BackgroundJobAssignee::BackgroundJobAssignee(MergeTreeData & data_, BackgroundJobAssignee::Type type_, ContextPtr global_context_) : WithContext(global_context_) - , sleep_settings(sleep_settings_) + , data(data_) + , sleep_settings(global_context_->getBackgroundMoveTaskSchedulingSettings()) , rng(randomSeed()) + , storage_id(data.getStorageID()) + , type(type_) { - for (const auto & pool_config : pools_configs_) - { - const auto max_pool_size = pool_config.get_max_pool_size(); - pools.try_emplace(pool_config.pool_type, max_pool_size, 0, max_pool_size, false); - pools_configs.emplace(pool_config.pool_type, pool_config); - } } -double IBackgroundJobExecutor::getSleepRandomAdd() +void BackgroundJobAssignee::trigger() { - std::lock_guard random_lock(random_mutex); - return std::uniform_real_distribution(0, sleep_settings.task_sleep_seconds_when_no_work_random_part)(rng); -} + std::lock_guard lock(holder_mutex); + + if (!holder) + return; -void IBackgroundJobExecutor::runTaskWithoutDelay() -{ no_work_done_count = 0; /// We have background jobs, schedule task as soon as possible - scheduling_task->schedule(); + holder->schedule(); } -void IBackgroundJobExecutor::scheduleTask(bool with_backoff) +void BackgroundJobAssignee::postpone() { - size_t next_time_to_execute; - if (with_backoff) - { - auto no_work_done_times = no_work_done_count.fetch_add(1, std::memory_order_relaxed); + std::lock_guard lock(holder_mutex); - next_time_to_execute = 1000 * (std::min( - sleep_settings.task_sleep_seconds_when_no_work_max, - sleep_settings.thread_sleep_seconds_if_nothing_to_do * std::pow(sleep_settings.task_sleep_seconds_when_no_work_multiplier, no_work_done_times)) - + getSleepRandomAdd()); - } + if (!holder) + return; + + auto no_work_done_times = no_work_done_count.fetch_add(1, 
std::memory_order_relaxed); + double random_addition = std::uniform_real_distribution(0, sleep_settings.task_sleep_seconds_when_no_work_random_part)(rng); + + size_t next_time_to_execute = 1000 * (std::min( + sleep_settings.task_sleep_seconds_when_no_work_max, + sleep_settings.thread_sleep_seconds_if_nothing_to_do * std::pow(sleep_settings.task_sleep_seconds_when_no_work_multiplier, no_work_done_times)) + + random_addition); + + holder->scheduleAfter(next_time_to_execute, false); +} + + +void BackgroundJobAssignee::scheduleMergeMutateTask(ExecutableTaskPtr merge_task) +{ + bool res = getContext()->getMergeMutateExecutor()->trySchedule(merge_task); + if (res) + trigger(); else - { - no_work_done_count = 0; - next_time_to_execute = 1000 * sleep_settings.thread_sleep_seconds_if_nothing_to_do; - } - - scheduling_task->scheduleAfter(next_time_to_execute, false); + postpone(); } -namespace + +void BackgroundJobAssignee::scheduleFetchTask(ExecutableTaskPtr fetch_task) { + bool res = getContext()->getFetchesExecutor()->trySchedule(fetch_task); + if (res) + trigger(); + else + postpone(); +} -/// Tricky function: we have separate thread pool with max_threads in each background executor for each table -/// But we want total background threads to be less than max_threads value. So we use global atomic counter (BackgroundMetric) -/// to limit total number of background threads. -bool incrementMetricIfLessThanMax(std::atomic & atomic_value, Int64 max_value) + +void BackgroundJobAssignee::scheduleMoveTask(ExecutableTaskPtr move_task) { - auto value = atomic_value.load(std::memory_order_relaxed); - while (value < max_value) + bool res = getContext()->getMovesExecutor()->trySchedule(move_task); + if (res) + trigger(); + else + postpone(); +} + + +String BackgroundJobAssignee::toString(Type type) +{ + switch (type) { - if (atomic_value.compare_exchange_weak(value, value + 1, std::memory_order_release, std::memory_order_relaxed)) - return true; + case Type::DataProcessing: + return "DataProcessing"; + case Type::Moving: + return "Moving"; } - return false; } +void BackgroundJobAssignee::start() +{ + std::lock_guard lock(holder_mutex); + if (!holder) + holder = getContext()->getSchedulePool().createTask("BackgroundJobAssignee:" + toString(type), [this]{ main(); }); + + holder->activateAndSchedule(); } -void IBackgroundJobExecutor::execute(JobAndPool job_and_pool) +void BackgroundJobAssignee::finish() +{ + /// No lock here, because scheduled tasks could call trigger method + if (holder) + { + holder->deactivate(); + + auto context = getContext(); + + context->getMovesExecutor()->removeTasksCorrespondingToStorage(storage_id); + context->getFetchesExecutor()->removeTasksCorrespondingToStorage(storage_id); + context->getMergeMutateExecutor()->removeTasksCorrespondingToStorage(storage_id); + } +} + + +void BackgroundJobAssignee::main() try { - auto & pool_config = pools_configs[job_and_pool.pool_type]; - const auto max_pool_size = pool_config.get_max_pool_size(); - - /// If corresponding pool is not full increment metric and assign new job - if (incrementMetricIfLessThanMax(CurrentMetrics::values[pool_config.tasks_metric], max_pool_size)) + bool succeed = false; + switch (type) { - try /// this try required because we have to manually decrement metric - { - /// Synchronize pool size, because config could be reloaded - pools[job_and_pool.pool_type].setMaxThreads(max_pool_size); - pools[job_and_pool.pool_type].setQueueSize(max_pool_size); - - pools[job_and_pool.pool_type].scheduleOrThrowOnError([this, pool_config, 
job{std::move(job_and_pool.job)}] () - { - try /// We don't want exceptions in background pool - { - bool job_success = job(); - /// Job done, decrement metric and reset no_work counter - CurrentMetrics::values[pool_config.tasks_metric]--; - - if (job_success) - { - /// Job done, new empty space in pool, schedule background task - runTaskWithoutDelay(); - } - else - { - /// Job done, but failed, schedule with backoff - scheduleTask(/* with_backoff = */ true); - } - - } - catch (...) - { - CurrentMetrics::values[pool_config.tasks_metric]--; - tryLogCurrentException(__PRETTY_FUNCTION__); - scheduleTask(/* with_backoff = */ true); - } - }); - /// We've scheduled task in the background pool and when it will finish we will be triggered again. But this task can be - /// extremely long and we may have a lot of other small tasks to do, so we schedule ourselves here. - runTaskWithoutDelay(); - } - catch (...) - { - /// With our Pool settings scheduleOrThrowOnError shouldn't throw exceptions, but for safety catch added here - CurrentMetrics::values[pool_config.tasks_metric]--; - tryLogCurrentException(__PRETTY_FUNCTION__); - scheduleTask(/* with_backoff = */ true); - } - } - else /// Pool is full and we have some work to do - { - scheduleTask(/* with_backoff = */ false); - } -} -catch (...) /// Exception while we looking for a task, reschedule -{ - tryLogCurrentException(__PRETTY_FUNCTION__); - - /// Why do we scheduleTask again? - /// To retry on exception, since it may be some temporary exception. - scheduleTask(/* with_backoff = */ true); -} - -void IBackgroundJobExecutor::start() -{ - std::lock_guard lock(scheduling_task_mutex); - if (!scheduling_task) - { - scheduling_task = getContext()->getSchedulePool().createTask( - getBackgroundTaskName(), [this]{ backgroundTaskFunction(); }); + case Type::DataProcessing: + succeed = data.scheduleDataProcessingJob(*this); + break; + case Type::Moving: + succeed = data.scheduleDataMovingJob(*this); + break; } - scheduling_task->activateAndSchedule(); -} - -void IBackgroundJobExecutor::finish() -{ - std::lock_guard lock(scheduling_task_mutex); - if (scheduling_task) - { - scheduling_task->deactivate(); - for (auto & [pool_type, pool] : pools) - pool.wait(); - } -} - -void IBackgroundJobExecutor::triggerTask() -{ - std::lock_guard lock(scheduling_task_mutex); - if (scheduling_task) - runTaskWithoutDelay(); -} - -void IBackgroundJobExecutor::backgroundTaskFunction() -try -{ - if (!scheduleJob()) - scheduleTask(/* with_backoff = */ true); + if (!succeed) + postpone(); } catch (...) /// Catch any exception to avoid thread termination. 
{ tryLogCurrentException(__PRETTY_FUNCTION__); - scheduleTask(/* with_backoff = */ true); + postpone(); } -IBackgroundJobExecutor::~IBackgroundJobExecutor() +BackgroundJobAssignee::~BackgroundJobAssignee() { finish(); } -BackgroundJobsExecutor::BackgroundJobsExecutor( - MergeTreeData & data_, - ContextPtr global_context_) - : IBackgroundJobExecutor( - global_context_, - global_context_->getBackgroundProcessingTaskSchedulingSettings(), - {PoolConfig - { - .pool_type = PoolType::MERGE_MUTATE, - .get_max_pool_size = [global_context_] () { return global_context_->getSettingsRef().background_pool_size; }, - .tasks_metric = CurrentMetrics::BackgroundPoolTask - }, - PoolConfig - { - .pool_type = PoolType::FETCH, - .get_max_pool_size = [global_context_] () { return global_context_->getSettingsRef().background_fetches_pool_size; }, - .tasks_metric = CurrentMetrics::BackgroundFetchesPoolTask - } - }) - , data(data_) -{ -} - -String BackgroundJobsExecutor::getBackgroundTaskName() const -{ - return data.getStorageID().getFullTableName() + " (dataProcessingTask)"; -} - -bool BackgroundJobsExecutor::scheduleJob() -{ - return data.scheduleDataProcessingJob(*this); -} - -BackgroundMovesExecutor::BackgroundMovesExecutor( - MergeTreeData & data_, - ContextPtr global_context_) - : IBackgroundJobExecutor( - global_context_, - global_context_->getBackgroundMoveTaskSchedulingSettings(), - {PoolConfig - { - .pool_type = PoolType::MOVE, - .get_max_pool_size = [global_context_] () { return global_context_->getSettingsRef().background_move_pool_size; }, - .tasks_metric = CurrentMetrics::BackgroundMovePoolTask - } - }) - , data(data_) -{ -} - -String BackgroundMovesExecutor::getBackgroundTaskName() const -{ - return data.getStorageID().getFullTableName() + " (dataMovingTask)"; -} - -bool BackgroundMovesExecutor::scheduleJob() -{ - return data.scheduleDataMovingJob(*this); -} - } diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.h b/src/Storages/MergeTree/BackgroundJobsExecutor.h index 0d9856a2f40..d89d8721697 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.h +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.h @@ -1,6 +1,7 @@ #pragma once #include +#include #include #include #include @@ -12,7 +13,7 @@ namespace DB /// Settings for background tasks scheduling. Each background executor has one /// BackgroundSchedulingPoolTask and depending on execution result may put this /// task to sleep according to settings. Look at scheduleTask function for details. -struct BackgroundTaskSchedulingSettings +struct ExecutableTaskSchedulingSettings { double thread_sleep_seconds_random_part = 1.0; double thread_sleep_seconds_if_nothing_to_do = 0.1; @@ -27,48 +28,14 @@ struct BackgroundTaskSchedulingSettings double task_sleep_seconds_when_no_work_min = 10; }; -/// Pool type where we must execute new job. Each background executor can have several -/// background pools. When it receives new job it will execute new task in corresponding pool. -enum class PoolType + +class BackgroundJobAssignee : protected WithContext { - MERGE_MUTATE, - MOVE, - FETCH, -}; - -using BackgroundJobFunc = std::function; - -/// Result from background job providers. Function which will be executed in pool and pool type. -struct JobAndPool -{ - BackgroundJobFunc job; - PoolType pool_type; -}; - -/// Background jobs executor which execute heavy-weight background tasks for MergTree tables, like -/// background merges, moves, mutations, fetches and so on. 
-/// Consists of two important parts: -/// 1) Task in background scheduling pool which receives new jobs from storages and put them into required pool. -/// 2) One or more ThreadPool objects, which execute background jobs. -class IBackgroundJobExecutor : protected WithContext -{ -protected: - /// Configuration for single background ThreadPool - struct PoolConfig - { - /// This pool type - PoolType pool_type; - /// Max pool size in threads - const std::function get_max_pool_size; - /// Metric that we have to increment when we execute task in this pool - CurrentMetrics::Metric tasks_metric; - }; - private: - /// Name for task in background scheduling pool - String task_name; + MergeTreeData & data; + /// Settings for execution control of background scheduling task - BackgroundTaskSchedulingSettings sleep_settings; + ExecutableTaskSchedulingSettings sleep_settings; /// Useful for random backoff timeouts generation pcg64 rng; @@ -76,87 +43,45 @@ private: /// no new jobs. std::atomic no_work_done_count{0}; - /// Pools where we execute background jobs - std::unordered_map pools; - /// Configs for background pools - std::unordered_map pools_configs; - /// Scheduling task which assign jobs in background pool - BackgroundSchedulePool::TaskHolder scheduling_task; + BackgroundSchedulePool::TaskHolder holder; /// Mutex for thread safety - std::mutex scheduling_task_mutex; - /// Mutex for pcg random generator thread safety - std::mutex random_mutex; + std::mutex holder_mutex; + + /// Save storage id to prevent use-after-free in destructor + StorageID storage_id; public: - /// These three functions are thread safe + enum class Type + { + DataProcessing, + Moving + }; + Type type{Type::DataProcessing}; - /// Start background task and start to assign jobs void start(); - /// Schedule background task as soon as possible, even if it sleep at this - /// moment for some reason. - void triggerTask(); - /// Finish execution: deactivate background task and wait already scheduled jobs + void trigger(); + void postpone(); void finish(); - /// Executes job in a nested pool - void execute(JobAndPool job_and_pool); + void scheduleMergeMutateTask(ExecutableTaskPtr merge_task); + void scheduleFetchTask(ExecutableTaskPtr fetch_task); + void scheduleMoveTask(ExecutableTaskPtr move_task); /// Just call finish - virtual ~IBackgroundJobExecutor(); + virtual ~BackgroundJobAssignee(); -protected: - IBackgroundJobExecutor( - ContextPtr global_context_, - const BackgroundTaskSchedulingSettings & sleep_settings_, - const std::vector & pools_configs_); - - /// Name for task in background schedule pool - virtual String getBackgroundTaskName() const = 0; - - /// Schedules a job in a nested pool in this class. 
- virtual bool scheduleJob() = 0; + BackgroundJobAssignee( + MergeTreeData & data_, + Type type, + ContextPtr global_context_); private: + static String toString(Type type); + /// Function that executes in background scheduling pool - void backgroundTaskFunction(); - /// Recalculate timeouts when we have to check for a new job - void scheduleTask(bool with_backoff); - /// Run background task as fast as possible and reset errors counter - void runTaskWithoutDelay(); - /// Return random add for sleep in case of error - double getSleepRandomAdd(); + void main(); }; -/// Main jobs executor: merges, mutations, fetches and so on -class BackgroundJobsExecutor final : public IBackgroundJobExecutor -{ -private: - MergeTreeData & data; -public: - BackgroundJobsExecutor( - MergeTreeData & data_, - ContextPtr global_context_); - -protected: - String getBackgroundTaskName() const override; - bool scheduleJob() override; -}; - -/// Move jobs executor, move parts between disks in the background -/// Does nothing in case of default configuration -class BackgroundMovesExecutor final : public IBackgroundJobExecutor -{ -private: - MergeTreeData & data; -public: - BackgroundMovesExecutor( - MergeTreeData & data_, - ContextPtr global_context_); - -protected: - String getBackgroundTaskName() const override; - bool scheduleJob() override; -}; } diff --git a/src/Storages/MergeTree/ExecutableTask.h b/src/Storages/MergeTree/ExecutableTask.h new file mode 100644 index 00000000000..5834ec04bfa --- /dev/null +++ b/src/Storages/MergeTree/ExecutableTask.h @@ -0,0 +1,21 @@ +#pragma once + +#include + +#include + +namespace DB +{ + +class ExecutableTask +{ +public: + virtual bool execute() = 0; + virtual void onCompleted() = 0; + virtual StorageID getStorageID() = 0; + virtual ~ExecutableTask() = default; +}; + +using ExecutableTaskPtr = std::shared_ptr; + +} diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp new file mode 100644 index 00000000000..cf7283d9c2b --- /dev/null +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -0,0 +1,66 @@ +#include + +#include + + +namespace DB +{ + + +void MergeTreeBackgroundExecutor::schedulerThreadFunction() +{ + while (true) + { + ExecutableTaskPtr current; + { + std::unique_lock lock(mutex); + has_tasks.wait(lock, [this](){ return !tasks.empty() || shutdown_suspend; }); + + if (shutdown_suspend) + break; + + current = std::move(tasks.front()); + tasks.pop_front(); + + /// This is needed to increase / decrease the number of threads at runtime + updatePoolConfiguration(); + } + + bool res = pool.trySchedule([this, task = current] () + { + try + { + if (task->execute()) + { + std::lock_guard guard(mutex); + tasks.emplace_back(task); + has_tasks.notify_one(); + return; + } + + decrementTasksCount(); + task->onCompleted(); + + std::lock_guard guard(mutex); + has_tasks.notify_one(); + } + catch(...) 
+ { + decrementTasksCount(); + task->onCompleted(); + std::lock_guard guard(mutex); + has_tasks.notify_one(); + tryLogCurrentException(__PRETTY_FUNCTION__); + } + }); + + if (!res) + { + std::lock_guard guard(mutex); + tasks.emplace_back(current); + } + } +} + + +} diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h new file mode 100644 index 00000000000..ec0b71910ef --- /dev/null +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -0,0 +1,204 @@ +#pragma once + +#include +#include +#include +#include +#include + +#include +#include +#include +#include + + +namespace DB +{ + +class LambdaAdapter : public shared_ptr_helper, public ExecutableTask +{ +public: + + template + explicit LambdaAdapter(T && inner_, MergeTreeData & data_) : inner(inner_), data(data_) {} + + bool execute() override + { + res = inner(); + return false; + } + + void onCompleted() override + { + data.triggerBackgroundOperationTask(!res); + } + + StorageID getStorageID() override + { + return data.getStorageID(); + } + +private: + bool res = false; + std::function inner; + MergeTreeData & data; +}; + + +class MergeTreeBackgroundExecutor : public shared_ptr_helper +{ +public: + + using CountGetter = std::function; + using Callback = std::function; + + + MergeTreeBackgroundExecutor() + { + scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); + } + + ~MergeTreeBackgroundExecutor() + { + wait(); + } + + void setThreadsCount(CountGetter && getter) + { + threads_count_getter = getter; + } + + void setTasksCount(CountGetter && getter) + { + max_task_count_getter = getter; + } + + void setMetric(CurrentMetrics::Metric metric_) + { + metric = metric_; + } + + bool trySchedule(ExecutableTaskPtr task) + { + std::lock_guard lock(mutex); + + if (shutdown_suspend) + return false; + + auto & value = CurrentMetrics::values[metric]; + if (value.load() >= static_cast(max_task_count_getter())) + return false; + + CurrentMetrics::add(metric); + + tasks.emplace_back(task); + ++scheduled_tasks_count; + has_tasks.notify_one(); + return true; + } + + void removeTasksCorrespondingToStorage(StorageID id) + { + /// Stop scheduler thread and pool + auto lock = getUniqueLock(); + /// Get lock to the tasks + std::lock_guard second_lock(mutex); + + size_t erased_count = std::erase_if(tasks, [id = std::move(id)] (auto task) -> bool { return task->getStorageID() == id; }); + CurrentMetrics::sub(metric, erased_count); + } + + + void wait() + { + { + std::lock_guard lock(mutex); + shutdown_suspend = true; + has_tasks.notify_all(); + } + + if (scheduler.joinable()) + scheduler.join(); + + pool.wait(); + } + +private: + + using ExecutorSuspender = std::unique_lock; + friend class std::unique_lock; + + ExecutorSuspender getUniqueLock() + { + return ExecutorSuspender(*this); + } + + /// This is needed to achive mutual exclusion + std::mutex lock_mutex; + + void lock() + { + lock_mutex.lock(); + suspend(); + } + + void unlock() + { + resume(); + lock_mutex.unlock(); + } + + void suspend() + { + { + std::unique_lock lock(mutex); + shutdown_suspend = true; + has_tasks.notify_one(); + } + scheduler.join(); + pool.wait(); + } + + + void resume() + { + shutdown_suspend = false; + scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); + } + + + void updatePoolConfiguration() + { + const auto max_threads = threads_count_getter(); + pool.setMaxFreeThreads(0); + pool.setMaxThreads(max_threads); + pool.setQueueSize(max_threads); + } + + void decrementTasksCount() + { + 
--scheduled_tasks_count; + CurrentMetrics::sub(metric); + } + + void schedulerThreadFunction(); + + + CountGetter threads_count_getter; + CountGetter max_task_count_getter; + CurrentMetrics::Metric metric; + + using TasksQueue = std::deque; + TasksQueue tasks; + + std::mutex mutex; + std::condition_variable has_tasks; + + std::atomic_size_t scheduled_tasks_count{0}; + std::atomic_bool shutdown_suspend{false}; + + ThreadPool pool; + ThreadFromGlobalPool scheduler; +}; + +} diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h new file mode 100644 index 00000000000..5f016c7ea08 --- /dev/null +++ b/src/Storages/MergeTree/MergeTask.h @@ -0,0 +1,253 @@ +#pragma once + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +namespace DB +{ + +class MergeTask; +using MergeTaskPtr = std::shared_ptr; + +/** + * Overview of the merge algorithm + * + * Each merge is executed sequentially block by block. + * The main idea is to make a merge not a subroutine which is executed + * in a thread pool and may occupy a thread for a period of time, + * but to make a merge a coroutine which can suspend the execution + * in some points and then resume the execution from this point. + * + * A perfect point where to suspend the execution is after the work over a block is finished. + * The task itself will be executed via BackgroundJobAssignee. + * + * The interface of the task is simple. + * The main method is `execute()` which will return true, if the task wants to be executed again and false otherwise. + * + * With this kind of task we can give a merge a priority. + * A priority is simple - the lower the size of the merge, the higher priority. + * So, if ClickHouse wants to merge some really big parts into a bigger part, + * then it will be executed for a long time, because the result of the merge is not really needed immediately. + * It is better to merge small parts as soon as possible. 
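 *
 * To make the control flow concrete, the execute() of such a task can be pictured
 * roughly as follows (an illustrative sketch only, not code from this patch; it is
 * organised around the MergeTaskState values declared further down in this class,
 * and the real transitions live in the implementation file):
 *
 *     bool MergeTask::execute()
 *     {
 *         switch (state)
 *         {
 *             case MergeTaskState::NEED_PREPARE:
 *                 prepare();
 *                 state = MergeTaskState::NEED_EXECUTE_HORIZONTAL;
 *                 return true;                        /// please call me again
 *             case MergeTaskState::NEED_EXECUTE_HORIZONTAL:
 *                 if (!executeHorizontalForBlock())   /// one block per call
 *                     state = MergeTaskState::NEED_FINALIZE_HORIZONTAL;
 *                 return true;
 *             /// ... the remaining states advance in the same one-step-per-call way ...
 *             case MergeTaskState::NEED_FINISH:
 *                 finalize();
 *                 return false;                       /// finished, do not reschedule
 *         }
 *     }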
+*/ +class MergeTask +{ +public: + + MergeTask( + FutureMergedMutatedPartPtr future_part_, + StorageMetadataPtr metadata_snapshot_, + MergeList::Entry & merge_entry_, + TableLockHolder & holder_, + time_t time_of_merge_, + ContextPtr context_, + ReservationSharedPtr space_reservation_, + bool deduplicate_, + Names deduplicate_by_columns_, + MergeTreeData::MergingParams merging_params_, + MergeTreeDataPartPtr parent_part_, + String prefix_, + MergeTreeData & data_, + ActionBlocker & merges_blocker_, + ActionBlocker & ttl_merges_blocker_) + : future_part(future_part_) + , metadata_snapshot(metadata_snapshot_) + , merge_entry(merge_entry_) + , holder(holder_) + , time_of_merge(time_of_merge_) + , context(context_) + , space_reservation(std::move(space_reservation_)) + , deduplicate(deduplicate_) + , deduplicate_by_columns(std::move(deduplicate_by_columns_)) + , merging_params(merging_params_) + , parent_part(parent_part_) + , prefix(prefix_) + , data(data_) + , merges_blocker(merges_blocker_) + , ttl_merges_blocker(ttl_merges_blocker_) + {} + + std::future getFuture() + { + return promise.get_future(); + } + + bool execute(); + + void prepare(); + +private: + void createMergedStream(); + + MergeAlgorithm chooseMergeAlgorithm() const; + + bool executeHorizontalForBlock(); + void finalizeHorizontalPartOfTheMerge(); + void prepareVertical(); + bool executeVerticalMergeForAllColumns(); + + void prepareVerticalMergeForOneColumn(); + bool executeVerticalMergeForOneColumn(); + void finalizeVerticalMergeForOneColumn(); + + void finalizeVerticalMergeForAllColumns(); + + void mergeMinMaxIndex(); + + void prepareProjections(); + bool executeProjections(); + void finalizeProjections(); + + void finalize(); + + + std::promise promise; + + /** + * States of MergeTask state machine. + * Transitions are from up to down. + * But for vertical merge there are horizontal part of the merge and vertical part. + * For horizontal there is horizontal part only. 
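 *
 * For illustration (inferred from the comment above rather than from code shown in
 * this patch): a purely horizontal merge is expected to go
 *     NEED_PREPARE -> NEED_EXECUTE_HORIZONTAL -> NEED_FINALIZE_HORIZONTAL
 *       -> NEED_MERGE_MIN_MAX_INDEX -> projection states -> NEED_FINISH
 * while a vertical merge additionally passes through the NEED_*_VERTICAL states
 * between its horizontal part and NEED_MERGE_MIN_MAX_INDEX.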
+ */ + enum class MergeTaskState + { + NEED_PREPARE, + NEED_EXECUTE_HORIZONTAL, + NEED_FINALIZE_HORIZONTAL, + NEED_PREPARE_VERTICAL, + NEED_EXECUTE_VERTICAL, + NEED_FINISH_VERTICAL, + NEED_MERGE_MIN_MAX_INDEX, + + NEED_PREPARE_PROJECTIONS, + NEED_EXECUTE_PROJECTIONS, + NEED_FINISH_PROJECTIONS, + + NEED_FINISH + }; + MergeTaskState state{MergeTaskState::NEED_PREPARE}; + + enum class VecticalMergeOneColumnState + { + NEED_PREPARE, + NEED_EXECUTE, + NEED_FINISH + }; + VecticalMergeOneColumnState vertical_merge_one_column_state{VecticalMergeOneColumnState::NEED_PREPARE}; + + FutureMergedMutatedPartPtr future_part; + StorageMetadataPtr metadata_snapshot; + MergeList::Entry & merge_entry; + TableLockHolder & holder; + time_t time_of_merge; + ContextPtr context; + /// It is necessary, because of projections presense + ReservationSharedPtr space_reservation; + bool deduplicate; + Names deduplicate_by_columns; + MergeTreeData::MergingParams merging_params; + MergeTreeDataPartPtr parent_part; + String prefix; + + /// From MergeTreeDataMergerMutator + + MergeTreeData & data; + Poco::Logger * log{&Poco::Logger::get("MergeTask")}; + + ActionBlocker & merges_blocker; + ActionBlocker & ttl_merges_blocker; + + + /// Previously stack located variables + + NamesAndTypesList gathering_columns; + NamesAndTypesList merging_columns; + Names gathering_column_names; + Names merging_column_names; + + NamesAndTypesList storage_columns; + Names all_column_names; + + String new_part_tmp_path; + + size_t sum_input_rows_upper_bound{0}; + + bool need_remove_expired_values{false}; + bool force_ttl{false}; + + DiskPtr tmp_disk{nullptr}; + DiskPtr disk{nullptr}; + + std::unique_ptr horizontal_stage_progress{nullptr}; + std::unique_ptr column_progress{nullptr}; + + std::unique_ptr rows_sources_file; + std::unique_ptr rows_sources_uncompressed_write_buf{nullptr}; + std::unique_ptr rows_sources_write_buf{nullptr}; + std::optional column_sizes; + + SyncGuardPtr sync_guard{nullptr}; + MergeTreeData::MutableDataPartPtr new_data_part; + CompressionCodecPtr compression_codec; + + MergeAlgorithm chosen_merge_algorithm{MergeAlgorithm::Undecided}; + + std::shared_ptr to; + BlockInputStreamPtr merged_stream; + + bool blocks_are_granules_size{false}; + + /// Variables that are needed for horizontal merge execution + + size_t rows_written{0}; + size_t initial_reservation{0}; + UInt64 watch_prev_elapsed{0}; + + std::function is_cancelled; + + bool need_sync{false}; + bool read_with_direct_io{false}; + + + MergeTreeData::DataPart::Checksums checksums_gathered_columns; + + std::list::const_iterator it_name_and_type; + size_t column_num_for_vertical_merge{0}; + size_t gathering_column_names_size{0}; + + /// This class has no default constructor, so we wrap it with unique_ptr + std::unique_ptr rows_sources_read_buf{nullptr}; + IMergedBlockOutputStream::WrittenOffsetColumns written_offset_columns; + std::unique_ptr column_to; + + BlockInputStreams column_part_streams; + std::unique_ptr column_gathered_stream; + + size_t column_elems_written = 0; + Float64 progress_before = 0; + + + using MergeTasks = std::deque; + MergeTasks tasks_for_projections; + MergeTasks::iterator projections_iterator; +}; + +/// FIXME +[[ maybe_unused]] static MergeTreeData::MutableDataPartPtr executeHere(MergeTaskPtr task) +{ + while (task->execute()) {} + return task->getFuture().get(); +} + +} diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 36ebcf825e5..1e7aef646ad 100644 --- 
a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -4889,7 +4889,7 @@ MergeTreeData::CurrentlyMovingPartsTagger::~CurrentlyMovingPartsTagger() } } -bool MergeTreeData::scheduleDataMovingJob(IBackgroundJobExecutor & executor) +bool MergeTreeData::scheduleDataMovingJob(BackgroundJobAssignee & executor) { if (parts_mover.moves_blocker.isCancelled()) return false; @@ -4898,10 +4898,11 @@ bool MergeTreeData::scheduleDataMovingJob(IBackgroundJobExecutor & executor) if (moving_tagger->parts_to_move.empty()) return false; - executor.execute({[this, moving_tagger] () mutable - { - return moveParts(moving_tagger); - }, PoolType::MOVE}); + executor.scheduleMoveTask(LambdaAdapter::create( + [this, moving_tagger] () mutable + { + return moveParts(moving_tagger); + }, *this)); return true; } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index ef5f22ed096..1abfe3925e8 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -57,7 +57,7 @@ class ExpressionActions; using ExpressionActionsPtr = std::shared_ptr; using ManyExpressionActions = std::vector; class MergeTreeDeduplicationLog; -class IBackgroundJobExecutor; +class BackgroundJobAssignee; namespace ErrorCodes { @@ -824,9 +824,9 @@ public: PinnedPartUUIDsPtr getPinnedPartUUIDs() const; /// Schedules background job to like merge/mutate/fetch an executor - virtual bool scheduleDataProcessingJob(IBackgroundJobExecutor & executor) = 0; + virtual bool scheduleDataProcessingJob(BackgroundJobAssignee & executor) = 0; /// Schedules job to move parts between disks/volumes and so on. - bool scheduleDataMovingJob(IBackgroundJobExecutor & executor); + bool scheduleDataMovingJob(BackgroundJobAssignee & executor); bool areBackgroundMovesNeeded() const; /// Lock part in zookeeper for shared data in several nodes @@ -849,6 +849,9 @@ public: /// Mutex for currently_submerging_parts and currently_emerging_parts mutable std::mutex currently_submerging_emerging_mutex; + /// Trigger merge scheduling task + virtual void triggerBackgroundOperationTask(bool delay) = 0; + protected: friend class IMergeTreeDataPart; diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 73c753386a4..600c25b3774 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -37,7 +37,7 @@ void MergeTreeSink::consume(Chunk chunk) PartLog::addNewPart(storage.getContext(), part, watch.elapsed()); /// Initiate async merge - it will be done if it's good time for merge and if there are space in 'background_pool'. 
- storage.background_executor.triggerTask(); + storage.background_executor.trigger(); } } } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index c71a79d2009..dee1aa47cb4 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -624,7 +624,7 @@ int32_t ReplicatedMergeTreeQueue::pullLogsToQueue(zkutil::ZooKeeperPtr zookeeper } } - storage.background_executor.triggerTask(); + storage.background_executor.trigger(); } return stat.version; @@ -713,7 +713,7 @@ void ReplicatedMergeTreeQueue::updateMutations(zkutil::ZooKeeperPtr zookeeper, C } if (some_active_mutations_were_killed) - storage.background_executor.triggerTask(); + storage.background_executor.trigger(); if (!entries_to_load.empty()) { @@ -847,7 +847,7 @@ ReplicatedMergeTreeMutationEntryPtr ReplicatedMergeTreeQueue::removeMutation( } if (mutation_was_active) - storage.background_executor.triggerTask(); + storage.background_executor.trigger(); return entry; } diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 898e990f7d3..f0ad289ae1c 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -82,8 +82,8 @@ StorageMergeTree::StorageMergeTree( , reader(*this) , writer(*this) , merger_mutator(*this, getContext()->getSettingsRef().background_pool_size) - , background_executor(*this, getContext()) - , background_moves_executor(*this, getContext()) + , background_executor(*this, BackgroundJobAssignee::Type::DataProcessing, getContext()) + , background_moves_executor(*this, BackgroundJobAssignee::Type::Moving, getContext()) { loadDataParts(has_force_restore_data_flag); @@ -414,7 +414,7 @@ Int64 StorageMergeTree::startMutation(const MutationCommands & commands, String LOG_INFO(log, "Added mutation: {}", mutation_file_name); } - background_executor.triggerTask(); + background_executor.trigger(); return version; } @@ -640,7 +640,7 @@ CancellationCode StorageMergeTree::killMutation(const String & mutation_id) } /// Maybe there is another mutation that was blocked by the killed one. Try to execute it immediately. 
- background_executor.triggerTask(); + background_executor.trigger(); return CancellationCode::CancelSent; } @@ -1044,7 +1044,7 @@ bool StorageMergeTree::mutateSelectedPart(const StorageMetadataPtr & metadata_sn return true; } -bool StorageMergeTree::scheduleDataProcessingJob(IBackgroundJobExecutor & executor) //-V657 +bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee & executor) //-V657 { if (shutdown_called) return false; @@ -1076,42 +1076,46 @@ bool StorageMergeTree::scheduleDataProcessingJob(IBackgroundJobExecutor & execut if (merge_entry) { - executor.execute({[this, metadata_snapshot, merge_entry, share_lock] () mutable - { - return mergeSelectedParts(metadata_snapshot, false, {}, *merge_entry, share_lock); - }, PoolType::MERGE_MUTATE}); + executor.scheduleMergeMutateTask(LambdaAdapter::create( + [this, metadata_snapshot, merge_entry, share_lock] () mutable + { + return mergeSelectedParts(metadata_snapshot, false, {}, *merge_entry, share_lock); + }, *this)); return true; } if (mutate_entry) { - executor.execute({[this, metadata_snapshot, merge_entry, mutate_entry, share_lock] () mutable - { + executor.scheduleMergeMutateTask(LambdaAdapter::create( + [this, metadata_snapshot, merge_entry, mutate_entry, share_lock] () mutable + { return mutateSelectedPart(metadata_snapshot, *mutate_entry, share_lock); - }, PoolType::MERGE_MUTATE}); + }, *this)); return true; } bool executed = false; if (time_after_previous_cleanup_temporary_directories.compareAndRestartDeferred(getContext()->getSettingsRef().merge_tree_clear_old_temporary_directories_interval_seconds)) { - executor.execute({[this, share_lock] () - { - clearOldTemporaryDirectories(getSettings()->temporary_directories_lifetime.totalSeconds()); - return true; - }, PoolType::MERGE_MUTATE}); + executor.scheduleMergeMutateTask(LambdaAdapter::create( + [this, share_lock] () + { + clearOldTemporaryDirectories(getSettings()->temporary_directories_lifetime.totalSeconds()); + return true; + }, *this)); executed = true; } if (time_after_previous_cleanup_parts.compareAndRestartDeferred(getContext()->getSettingsRef().merge_tree_clear_old_parts_interval_seconds)) { - executor.execute({[this, share_lock] () - { - /// All use relative_data_path which changes during rename - /// so execute under share lock. - clearOldPartsFromFilesystem(); - clearOldWriteAheadLogs(); - clearOldMutations(); - clearEmptyParts(); - return true; - }, PoolType::MERGE_MUTATE}); + executor.scheduleMergeMutateTask(LambdaAdapter::create( + [this, share_lock] () + { + /// All use relative_data_path which changes during rename + /// so execute under share lock. 
+ clearOldPartsFromFilesystem(); + clearOldWriteAheadLogs(); + clearOldMutations(); + clearEmptyParts(); + return true; + }, *this)); executed = true; } @@ -1566,9 +1570,9 @@ ActionLock StorageMergeTree::getActionLock(StorageActionBlockType action_type) void StorageMergeTree::onActionLockRemove(StorageActionBlockType action_type) { if (action_type == ActionLocks::PartsMerge || action_type == ActionLocks::PartsTTLMerge) - background_executor.triggerTask(); + background_executor.trigger(); else if (action_type == ActionLocks::PartsMove) - background_moves_executor.triggerTask(); + background_moves_executor.trigger(); } CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_context) diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index e2dbc217135..a0d0583429f 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -96,9 +96,23 @@ public: RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context) override; - bool scheduleDataProcessingJob(IBackgroundJobExecutor & executor) override; + bool scheduleDataProcessingJob(BackgroundJobAssignee & executor) override; MergeTreeDeduplicationLog * getDeduplicationLog() { return deduplication_log.get(); } + + void triggerBackgroundOperationTask(bool delay) override + { + if (delay) + background_executor.postpone(); + else + background_executor.trigger(); + + if (delay) + background_moves_executor.postpone(); + else + background_moves_executor.trigger(); + } + private: /// Mutex and condvar for synchronous mutations wait @@ -108,8 +122,8 @@ private: MergeTreeDataSelectExecutor reader; MergeTreeDataWriter writer; MergeTreeDataMergerMutator merger_mutator; - BackgroundJobsExecutor background_executor; - BackgroundMovesExecutor background_moves_executor; + BackgroundJobAssignee background_executor; + BackgroundJobAssignee background_moves_executor; std::unique_ptr deduplication_log; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index d24ca0f3161..30c0f63bde5 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -284,8 +284,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( , merge_strategy_picker(*this) , queue(*this, merge_strategy_picker) , fetcher(*this) - , background_executor(*this, getContext()) - , background_moves_executor(*this, getContext()) + , background_executor(*this, BackgroundJobAssignee::Type::DataProcessing, getContext()) + , background_moves_executor(*this, BackgroundJobAssignee::Type::Moving, getContext()) , cleanup_thread(*this) , part_check_thread(*this) , restarting_thread(*this) @@ -3210,7 +3210,7 @@ bool StorageReplicatedMergeTree::processQueueEntry(ReplicatedMergeTreeQueue::Sel }); } -bool StorageReplicatedMergeTree::scheduleDataProcessingJob(IBackgroundJobExecutor & executor) +bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee & executor) { /// If replication queue is stopped exit immediately as we successfully executed the task if (queue.actions_blocker.isCancelled()) @@ -3225,18 +3225,20 @@ bool StorageReplicatedMergeTree::scheduleDataProcessingJob(IBackgroundJobExecuto /// Depending on entry type execute in fetches (small) pool or big merge_mutate pool if (selected_entry->log_entry->type == LogEntry::GET_PART) { - executor.execute({[this, selected_entry] () mutable - { - return 
processQueueEntry(selected_entry); - }, PoolType::FETCH}); + executor.scheduleFetchTask(LambdaAdapter::create( + [this, selected_entry] () mutable + { + return processQueueEntry(selected_entry); + }, *this)); return true; } else { - executor.execute({[this, selected_entry] () mutable - { - return processQueueEntry(selected_entry); - }, PoolType::MERGE_MUTATE}); + executor.scheduleMergeMutateTask(LambdaAdapter::create( + [this, selected_entry] () mutable + { + return processQueueEntry(selected_entry); + }, *this)); return true; } } @@ -6982,9 +6984,9 @@ void StorageReplicatedMergeTree::onActionLockRemove(StorageActionBlockType actio if (action_type == ActionLocks::PartsMerge || action_type == ActionLocks::PartsTTLMerge || action_type == ActionLocks::PartsFetch || action_type == ActionLocks::PartsSend || action_type == ActionLocks::ReplicationQueue) - background_executor.triggerTask(); + background_executor.trigger(); else if (action_type == ActionLocks::PartsMove) - background_moves_executor.triggerTask(); + background_moves_executor.trigger(); } bool StorageReplicatedMergeTree::waitForShrinkingQueueSize(size_t queue_size, UInt64 max_wait_milliseconds) @@ -6996,7 +6998,7 @@ bool StorageReplicatedMergeTree::waitForShrinkingQueueSize(size_t queue_size, UI /// This is significant, because the execution of this task could be delayed at BackgroundPool. /// And we force it to be executed. - background_executor.triggerTask(); + background_executor.trigger(); Poco::Event target_size_event; auto callback = [&target_size_event, queue_size] (size_t new_queue_size) diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 73c39bed3e3..164b82af47a 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -218,7 +218,7 @@ public: const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger); /// Schedules job to execute in background pool (merge, mutate, drop range and so on) - bool scheduleDataProcessingJob(IBackgroundJobExecutor & executor) override; + bool scheduleDataProcessingJob(BackgroundJobAssignee & executor) override; /// Checks that fetches are not disabled with action blocker and pool for fetches /// is not overloaded @@ -262,6 +262,20 @@ public: bool createEmptyPartInsteadOfLost(zkutil::ZooKeeperPtr zookeeper, const String & lost_part_name); + + void triggerBackgroundOperationTask(bool delay) override + { + if (delay) + background_executor.postpone(); + else + background_executor.trigger(); + + if (delay) + background_moves_executor.postpone(); + else + background_moves_executor.trigger(); + } + private: std::atomic_bool are_restoring_replica {false}; @@ -356,8 +370,8 @@ private: int metadata_version = 0; /// Threads. - BackgroundJobsExecutor background_executor; - BackgroundMovesExecutor background_moves_executor; + BackgroundJobAssignee background_executor; + BackgroundJobAssignee background_moves_executor; /// A task that keeps track of the updates in the logs of all replicas and loads them into the queue. 
bool queue_update_in_progress = false; From 1adb9bfe23d5082ea5e9f9fe3b2607081e7dbd32 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 31 Aug 2021 11:02:39 +0000 Subject: [PATCH 037/177] better --- .../MergeTree/BackgroundJobsExecutor.cpp | 2 +- .../MergeTree/BackgroundJobsExecutor.h | 3 - .../MergeTree/MergeMutateExecutor.cpp | 55 +++++++++++++++- src/Storages/MergeTree/MergeMutateExecutor.h | 62 +++---------------- src/Storages/StorageReplicatedMergeTree.cpp | 1 + 5 files changed, 62 insertions(+), 61 deletions(-) diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp index 11039edc057..ed6f353c219 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp @@ -13,7 +13,6 @@ BackgroundJobAssignee::BackgroundJobAssignee(MergeTreeData & data_, BackgroundJo , data(data_) , sleep_settings(global_context_->getBackgroundMoveTaskSchedulingSettings()) , rng(randomSeed()) - , storage_id(data.getStorageID()) , type(type_) { } @@ -107,6 +106,7 @@ void BackgroundJobAssignee::finish() holder->deactivate(); auto context = getContext(); + auto storage_id = data.getStorageID(); context->getMovesExecutor()->removeTasksCorrespondingToStorage(storage_id); context->getFetchesExecutor()->removeTasksCorrespondingToStorage(storage_id); diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.h b/src/Storages/MergeTree/BackgroundJobsExecutor.h index d89d8721697..aba30aa06e7 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.h +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.h @@ -48,9 +48,6 @@ private: /// Mutex for thread safety std::mutex holder_mutex; - /// Save storage id to prevent use-after-free in destructor - StorageID storage_id; - public: enum class Type { diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index cf7283d9c2b..e28c5b7f867 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -6,12 +6,45 @@ namespace DB { +void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id) +{ + std::lock_guard remove_lock(remove_mutex); + + /// First stop the scheduler thread + { + std::unique_lock lock(mutex); + shutdown_suspend = true; + has_tasks.notify_one(); + } + + scheduler.join(); + + /// Remove tasks + { + std::lock_guard lock(currently_executing_mutex); + + for (auto & [task, future] : currently_executing) + { + if (task->getStorageID() == id) + future.wait(); + } + + /// Remove tasks from original queue + size_t erased_count = std::erase_if(tasks, [id = std::move(id)] (auto task) -> bool { return task->getStorageID() == id; }); + CurrentMetrics::sub(metric, erased_count); + } + + shutdown_suspend = false; + scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); +} + void MergeTreeBackgroundExecutor::schedulerThreadFunction() { while (true) { ExecutableTaskPtr current; + auto current_promise = std::make_shared>(); { std::unique_lock lock(mutex); has_tasks.wait(lock, [this](){ return !tasks.empty() || shutdown_suspend; }); @@ -26,11 +59,29 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() updatePoolConfiguration(); } - bool res = pool.trySchedule([this, task = current] () { + std::lock_guard lock(currently_executing_mutex); + currently_executing.emplace(current, current_promise->get_future()); + } + + bool res = pool.trySchedule([this, task = current, promise = current_promise] () mutable + { 
+ auto on_exit = [&] () + { + promise->set_value(); + { + std::lock_guard lock(currently_executing_mutex); + currently_executing.erase(task); + } + }; + + SCOPE_EXIT({ on_exit(); }); + try { - if (task->execute()) + bool result = task->execute(); + + if (result) { std::lock_guard guard(mutex); tasks.emplace_back(task); diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index ec0b71910ef..9f1759ff307 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -97,17 +97,7 @@ public: return true; } - void removeTasksCorrespondingToStorage(StorageID id) - { - /// Stop scheduler thread and pool - auto lock = getUniqueLock(); - /// Get lock to the tasks - std::lock_guard second_lock(mutex); - - size_t erased_count = std::erase_if(tasks, [id = std::move(id)] (auto task) -> bool { return task->getStorageID() == id; }); - CurrentMetrics::sub(metric, erased_count); - } - + void removeTasksCorrespondingToStorage(StorageID id); void wait() { @@ -125,48 +115,6 @@ public: private: - using ExecutorSuspender = std::unique_lock; - friend class std::unique_lock; - - ExecutorSuspender getUniqueLock() - { - return ExecutorSuspender(*this); - } - - /// This is needed to achive mutual exclusion - std::mutex lock_mutex; - - void lock() - { - lock_mutex.lock(); - suspend(); - } - - void unlock() - { - resume(); - lock_mutex.unlock(); - } - - void suspend() - { - { - std::unique_lock lock(mutex); - shutdown_suspend = true; - has_tasks.notify_one(); - } - scheduler.join(); - pool.wait(); - } - - - void resume() - { - shutdown_suspend = false; - scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); - } - - void updatePoolConfiguration() { const auto max_threads = threads_count_getter(); @@ -188,8 +136,12 @@ private: CountGetter max_task_count_getter; CurrentMetrics::Metric metric; - using TasksQueue = std::deque; - TasksQueue tasks; + std::deque tasks; + + std::mutex remove_mutex; + + std::mutex currently_executing_mutex; + std::map> currently_executing; std::mutex mutex; std::condition_variable has_tasks; diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 30c0f63bde5..4ba7b120fc2 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -17,6 +17,7 @@ #include #include #include +#include #include #include #include From 061a4c4a275c0782acd29ec9c0ed88a72e907ff1 Mon Sep 17 00:00:00 2001 From: romanzhukov Date: Tue, 31 Aug 2021 14:55:46 +0300 Subject: [PATCH 038/177] Fix PR comment. --- docs/ru/operations/server-configuration-parameters/settings.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/operations/server-configuration-parameters/settings.md b/docs/ru/operations/server-configuration-parameters/settings.md index 0850b59cdb7..b6806ae6598 100644 --- a/docs/ru/operations/server-configuration-parameters/settings.md +++ b/docs/ru/operations/server-configuration-parameters/settings.md @@ -1200,7 +1200,7 @@ ClickHouse использует ZooKeeper для хранения метадан Секция конфигурационного файла,которая содержит настройки: - Путь к конфигурационному файлу с предустановленными пользователями. - Путь к файлу, в котором содержатся пользователи, созданные при помощи SQL команд. -- Путь к узлу ZooKeeper, где хранятся и реплицируются пользователи, созданные с помощью команд SQL (экспериментально). 
+- Путь к узлу ZooKeeper, где хранятся и реплицируются пользователи, созданные с помощью команд SQL (экспериментальная функциональность). Если эта секция определена, путь из [users_config](../../operations/server-configuration-parameters/settings.md#users-config) и [access_control_path](../../operations/server-configuration-parameters/settings.md#access_control_path) не используется. From 3027efe860531944a4c81a86a83db8e8715fa2b8 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 31 Aug 2021 12:09:35 +0000 Subject: [PATCH 039/177] after merge with master --- .../MergeTree/MergeMutateExecutor.cpp | 33 +++++++++++++++++-- src/Storages/MergeTree/MergeMutateExecutor.h | 8 ----- 2 files changed, 31 insertions(+), 10 deletions(-) diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index e28c5b7f867..df73bcef5b1 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -6,6 +6,32 @@ namespace DB { + +/// This is a RAII class which only decrements metric. +/// It is added because after all other fixes a bug non-executing merges was occurred again. +/// Last hypothesis: task was successfully added to pool, however, was not executed because of internal exception in it. +class ParanoidMetricDecrementor +{ +public: + explicit ParanoidMetricDecrementor(CurrentMetrics::Metric metric_) : metric(metric_) {} + void alarm() { is_alarmed = true; } + void decrement() + { + if (is_alarmed.exchange(false)) + { + CurrentMetrics::values[metric]--; + } + } + + ~ParanoidMetricDecrementor() { decrement(); } + +private: + + CurrentMetrics::Metric metric; + std::atomic_bool is_alarmed = false; +}; + + void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id) { std::lock_guard remove_lock(remove_mutex); @@ -66,6 +92,9 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() bool res = pool.trySchedule([this, task = current, promise = current_promise] () mutable { + auto metric_decrementor = std::make_shared(metric); + metric_decrementor->alarm(); + auto on_exit = [&] () { promise->set_value(); @@ -89,7 +118,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() return; } - decrementTasksCount(); + metric_decrementor->decrement(); task->onCompleted(); std::lock_guard guard(mutex); @@ -97,7 +126,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() } catch(...) 
{ - decrementTasksCount(); + metric_decrementor->decrement(); task->onCompleted(); std::lock_guard guard(mutex); has_tasks.notify_one(); diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index 9f1759ff307..22296e391ba 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -92,7 +92,6 @@ public: CurrentMetrics::add(metric); tasks.emplace_back(task); - ++scheduled_tasks_count; has_tasks.notify_one(); return true; } @@ -123,12 +122,6 @@ private: pool.setQueueSize(max_threads); } - void decrementTasksCount() - { - --scheduled_tasks_count; - CurrentMetrics::sub(metric); - } - void schedulerThreadFunction(); @@ -146,7 +139,6 @@ private: std::mutex mutex; std::condition_variable has_tasks; - std::atomic_size_t scheduled_tasks_count{0}; std::atomic_bool shutdown_suspend{false}; ThreadPool pool; From becd288cb9f168e7219e72275ae91ff38cfe3f3c Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 26 Jun 2021 14:27:58 +0300 Subject: [PATCH 040/177] Try to remove useless applyJoinKeyConvert --- src/Interpreters/ExpressionAnalyzer.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index dbb1addf202..c21103c1e7f 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -981,8 +981,6 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( if (syntax->analyzed_join->dictionary_reader) joined_plan.reset(); } - else - syntax->analyzed_join->applyJoinKeyConvert(left_sample_columns, {}); return join; } From c4654d6b4557e98857c83857ca60132b43e56b21 Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 26 Jun 2021 16:59:07 +0300 Subject: [PATCH 041/177] Add option to getMostSubtype/getLeastSupertype to not throw exception --- src/DataTypes/getLeastSupertype.cpp | 98 ++++++++++++++++++----------- src/DataTypes/getLeastSupertype.h | 5 +- src/DataTypes/getMostSubtype.cpp | 44 +++++++------ src/DataTypes/getMostSubtype.h | 5 +- src/Interpreters/TableJoin.cpp | 84 ++++++++++++++----------- src/Interpreters/TableJoin.h | 2 +- 6 files changed, 140 insertions(+), 98 deletions(-) diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index a950d18b50a..2f06ff8b954 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -47,19 +47,47 @@ namespace return res.str(); } + + template + DataTypePtr wrapWithType(DataTypePtr type) + { + if (type == nullptr) + return nullptr; + return std::make_shared(type); + } + + template + DataTypePtr wrapWithType(DataTypePtr type1, DataTypePtr type2) + { + if (type1 == nullptr || type2 == nullptr) + return nullptr; + return std::make_shared(type1, type2); + } + + template + DataTypePtr wrapWithType(const DataTypes & types) + { + for (const auto & type : types) + { + if (type == nullptr) + return nullptr; + } + return std::make_shared(types); + } } - -DataTypePtr getLeastSupertype(const DataTypes & types) +DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_type) { - /// Trivial cases + auto on_error = [throw_on_no_common_type](const String & msg) -> DataTypePtr + { + if (throw_on_no_common_type) + throw Exception(msg, ErrorCodes::NO_COMMON_TYPE); + return nullptr; + }; if (types.empty()) return std::make_shared(); - if (types.size() == 1) - return types[0]; - /// All types are equal { bool all_equal = true; @@ -88,7 +116,7 @@ DataTypePtr getLeastSupertype(const 
DataTypes & types) non_nothing_types.emplace_back(type); if (non_nothing_types.size() < types.size()) - return getLeastSupertype(non_nothing_types); + return getLeastSupertype(non_nothing_types, throw_on_no_common_type); } /// For Arrays @@ -113,9 +141,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_array) { if (!all_arrays) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not"); - return std::make_shared(getLeastSupertype(nested_types)); + return wrapWithType(getLeastSupertype(nested_types, throw_on_no_common_type)); } } @@ -139,7 +167,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) nested_types[elem_idx].reserve(types.size()); } else if (tuple_size != type_tuple->getElements().size()) - throw Exception(getExceptionMessagePrefix(types) + " because Tuples have different sizes", ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types) + " because Tuples have different sizes"); have_tuple = true; @@ -153,13 +181,13 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_tuple) { if (!all_tuples) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not"); DataTypes common_tuple_types(tuple_size); for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) - common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx]); + common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx], throw_on_no_common_type); - return std::make_shared(common_tuple_types); + return wrapWithType(common_tuple_types); } } @@ -187,9 +215,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_maps) { if (!all_maps) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not"); - return std::make_shared(getLeastSupertype(key_types), getLeastSupertype(value_types)); + return wrapWithType(getLeastSupertype(key_types, throw_on_no_common_type), getLeastSupertype(value_types, throw_on_no_common_type)); } } @@ -220,9 +248,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_low_cardinality) { if (have_not_low_cardinality) - return getLeastSupertype(nested_types); + return getLeastSupertype(nested_types, throw_on_no_common_type); else - return std::make_shared(getLeastSupertype(nested_types)); + return wrapWithType(getLeastSupertype(nested_types, throw_on_no_common_type)); } } @@ -248,7 +276,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (have_nullable) { - return std::make_shared(getLeastSupertype(nested_types)); + return wrapWithType(getLeastSupertype(nested_types, throw_on_no_common_type)); } } @@ -268,7 +296,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) { bool all_strings = type_ids.size() == (have_string + have_fixed_string); if (!all_strings) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and 
some of them are not"); return std::make_shared(); } @@ -284,8 +312,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) { bool all_date_or_datetime = type_ids.size() == (have_date + have_datetime + have_datetime64); if (!all_date_or_datetime) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are Date/DateTime/DateTime64 and some of them are not", - ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types) + " because some of them are Date/DateTime/DateTime64 and some of them are not"); if (have_datetime64 == 0) { @@ -345,8 +372,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) } if (num_supported != type_ids.size()) - throw Exception(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal", - ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal"); UInt32 max_scale = 0; for (const auto & type : types) @@ -368,9 +394,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types) } if (min_precision > DataTypeDecimal::maxPrecision()) - throw Exception(getExceptionMessagePrefix(types) + " because the least supertype is Decimal(" - + toString(min_precision) + ',' + toString(max_scale) + ')', - ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types) + " because the least supertype is Decimal(" + + toString(min_precision) + ',' + toString(max_scale) + ')'); if (have_decimal128 || min_precision > DataTypeDecimal::maxPrecision()) return std::make_shared>(DataTypeDecimal::maxPrecision(), max_scale); @@ -431,7 +456,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating) { if (!all_numbers) - throw Exception(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not"); /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit. /// Example, common of Int32, UInt32 = Int64. @@ -446,11 +471,10 @@ DataTypePtr getLeastSupertype(const DataTypes & types) if (min_bit_width_of_integer != 64) ++min_bit_width_of_integer; else - throw Exception( + return on_error( getExceptionMessagePrefix(types) + " because some of them are signed integers and some are unsigned integers," - " but there is no signed integer type, that can exactly represent all required unsigned integer values", - ErrorCodes::NO_COMMON_TYPE); + " but there is no signed integer type, that can exactly represent all required unsigned integer values"); } /// If the result must be floating. @@ -462,9 +486,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types) else if (min_mantissa_bits <= 53) return std::make_shared(); else - throw Exception(getExceptionMessagePrefix(types) + return on_error(getExceptionMessagePrefix(types) + " because some of them are integers and some are floating point," - " but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE); + " but there is no floating point type, that can exactly represent all required integers"); } /// If the result must be signed integer. 
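To make the numeric rules above concrete, a few results this logic is expected to give
(an illustrative summary, not test output from this patch):

    getLeastSupertype({Int32, UInt32})   -> Int64      /// one extra bit is needed for the sign
    getLeastSupertype({Int64, UInt64})   -> exception, or nullptr when throw_on_no_common_type = false
    getLeastSupertype({UInt8, Float32})  -> Float32    /// 8 value bits fit into the 24-bit mantissa
    getLeastSupertype({Int32, Float32})  -> Float64    /// 32 value bits do not fit into 24, but fit into 53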
@@ -483,9 +507,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types) else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - throw Exception(getExceptionMessagePrefix(types) + return on_error(getExceptionMessagePrefix(types) + " because some of them are signed integers and some are unsigned integers," - " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); + " but there is no signed integer type, that can exactly represent all required unsigned integer values"); } /// All unsigned. @@ -503,14 +527,14 @@ DataTypePtr getLeastSupertype(const DataTypes & types) else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - throw Exception("Logical error: " + getExceptionMessagePrefix(types) - + " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); + return on_error("Logical error: " + getExceptionMessagePrefix(types) + + " but as all data types are unsigned integers, we must have found maximum unsigned integer type"); } } } /// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases). - throw Exception(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE); + return on_error(getExceptionMessagePrefix(types)); } } diff --git a/src/DataTypes/getLeastSupertype.h b/src/DataTypes/getLeastSupertype.h index 57e011a0529..4edecf49d17 100644 --- a/src/DataTypes/getLeastSupertype.h +++ b/src/DataTypes/getLeastSupertype.h @@ -2,16 +2,15 @@ #include - namespace DB { /** Get data type that covers all possible values of passed data types. - * If there is no such data type, throws an exception. + * If there is no such data type, throws an exception or return nullptr. * * Examples: least common supertype for UInt8, Int8 - Int16. * Examples: there is no least common supertype for Array(UInt8), Int8. 
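 *
 * A minimal usage sketch of the non-throwing mode added here (variable names are
 * illustrative):
 *
 *     DataTypePtr common = getLeastSupertype({left_type, right_type}, false);  /// do not throw
 *     if (!common)
 *         ... no lossless common type, the caller picks a fallback ...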
*/ -DataTypePtr getLeastSupertype(const DataTypes & types); +DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_type = true); } diff --git a/src/DataTypes/getMostSubtype.cpp b/src/DataTypes/getMostSubtype.cpp index fcf966780eb..4b968415521 100644 --- a/src/DataTypes/getMostSubtype.cpp +++ b/src/DataTypes/getMostSubtype.cpp @@ -23,7 +23,8 @@ extern const int NO_COMMON_TYPE; namespace { -String getExceptionMessagePrefix(const DataTypes & types) + +String getExceptionMessage(const DataTypes & types, const std::string & reason) { WriteBufferFromOwnString res; res << "There is no subtype for types "; @@ -37,19 +38,18 @@ String getExceptionMessagePrefix(const DataTypes & types) res << type->getName(); } - + res << reason; return res.str(); } } - -DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_nothing, bool force_support_conversion) +DataTypePtr getMostSubtype(const DataTypes & types, bool throw_on_no_common_type, bool force_support_conversion) { - auto get_nothing_or_throw = [throw_if_result_is_nothing, & types](const std::string & reason) + auto get_nothing_or_throw = [throw_on_no_common_type, & types](const std::string & reason) -> DataTypePtr { - if (throw_if_result_is_nothing) - throw Exception(getExceptionMessagePrefix(types) + reason, ErrorCodes::NO_COMMON_TYPE); + if (throw_on_no_common_type) + throw Exception(getExceptionMessage(types, reason), ErrorCodes::NO_COMMON_TYPE); return std::make_shared(); }; @@ -57,14 +57,14 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_noth if (types.empty()) { - if (throw_if_result_is_nothing) + if (throw_on_no_common_type) throw Exception("There is no common type for empty type list", ErrorCodes::NO_COMMON_TYPE); return std::make_shared(); } if (types.size() == 1) { - if (throw_if_result_is_nothing && typeid_cast(types[0].get())) + if (throw_on_no_common_type && typeid_cast(types[0].get())) throw Exception("There is no common type for type Nothing", ErrorCodes::NO_COMMON_TYPE); return types[0]; } @@ -118,7 +118,7 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_noth if (!all_arrays) return get_nothing_or_throw(" because some of them are Array and some of them are not"); - return std::make_shared(getMostSubtype(nested_types, false, force_support_conversion)); + return std::make_shared(getMostSubtype(nested_types, throw_on_no_common_type, force_support_conversion)); } } @@ -161,7 +161,7 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_noth DataTypes common_tuple_types(tuple_size); for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) common_tuple_types[elem_idx] = - getMostSubtype(nested_types[elem_idx], throw_if_result_is_nothing, force_support_conversion); + getMostSubtype(nested_types[elem_idx], throw_on_no_common_type, force_support_conversion); return std::make_shared(common_tuple_types); } @@ -192,9 +192,9 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_noth if (have_nullable) { if (all_nullable || force_support_conversion) - return std::make_shared(getMostSubtype(nested_types, false, force_support_conversion)); + return std::make_shared(getMostSubtype(nested_types, throw_on_no_common_type, force_support_conversion)); - return getMostSubtype(nested_types, throw_if_result_is_nothing, force_support_conversion); + return getMostSubtype(nested_types, throw_on_no_common_type, force_support_conversion); } } @@ -309,8 +309,10 @@ DataTypePtr getMostSubtype(const DataTypes & 
types, bool throw_if_result_is_noth else if (min_mantissa_bits_of_floating <= 53) return std::make_shared(); else - throw Exception("Logical error: " + getExceptionMessagePrefix(types) - + " but as all data types are floats, we must have found maximum float type", ErrorCodes::NO_COMMON_TYPE); + throw Exception( + ErrorCodes::NO_COMMON_TYPE, + "Logical error: " + + getExceptionMessage(types, " but as all data types are floats, we must have found maximum float type")); } /// If there are signed and unsigned types of same bit-width, the result must be unsigned number. @@ -326,8 +328,10 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_noth else if (min_bits_of_unsigned_integer <= 64) return std::make_shared(); else - throw Exception("Logical error: " + getExceptionMessagePrefix(types) - + " but as all data types are integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); + throw Exception( + ErrorCodes::NO_COMMON_TYPE, + "Logical error: " + + getExceptionMessage(types, " but as all data types are integers, we must have found maximum unsigned integer type")); } /// All signed. @@ -341,8 +345,10 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_noth else if (min_bits_of_signed_integer <= 64) return std::make_shared(); else - throw Exception("Logical error: " + getExceptionMessagePrefix(types) - + " but as all data types are integers, we must have found maximum signed integer type", ErrorCodes::NO_COMMON_TYPE); + throw Exception( + ErrorCodes::NO_COMMON_TYPE, + "Logical error: " + + getExceptionMessage(types, " but as all data types are integers, we must have found maximum signed integer type")); } } } diff --git a/src/DataTypes/getMostSubtype.h b/src/DataTypes/getMostSubtype.h index c46cf4e2054..c5765f7c370 100644 --- a/src/DataTypes/getMostSubtype.h +++ b/src/DataTypes/getMostSubtype.h @@ -12,8 +12,9 @@ namespace DB * Examples: most common subtype for Array(UInt8), Int8 is Nothing * * If force_support_conversion is true, returns type which may be used to convert each argument to. - * Example: most common subtype for Array(UInt8) and Array(Nullable(Int32)) is Array(Nullable(UInt8) if force_support_conversion is true. + * Example: most common subtype for Array(UInt8) and Array(Nullable(Int32)) is Array(Nullable(UInt8)) if force_support_conversion is true. */ -DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_nothing = false, bool force_support_conversion = false); +DataTypePtr +getMostSubtype(const DataTypes & types, bool throw_on_no_common_type = true, bool force_support_conversion = false); } diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index d15cbe42063..5ce0e7baca6 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -7,13 +7,14 @@ #include #include +#include + #include #include #include #include - namespace DB { @@ -22,6 +23,23 @@ namespace ErrorCodes extern const int TYPE_MISMATCH; } +namespace +{ + +std::string formatTypeMap(const TableJoin::NameToTypeMap & target, const TableJoin::NameToTypeMap & source) +{ + std::vector text; + for (const auto & [k, v] : target) + { + auto src_type_it = source.find(k); + std::string src_type_name = src_type_it != source.end() ? 
src_type_it->second->getName() : ""; + text.push_back(fmt::format("{} : {} -> {}", k, src_type_name, v->getName())); + } + return fmt::format("{}", fmt::join(text, ", ")); +} + +} + TableJoin::TableJoin(const Settings & settings, VolumePtr tmp_volume_) : size_limits(SizeLimits{settings.max_rows_in_join, settings.max_bytes_in_join, settings.join_overflow_mode}) , default_max_bytes(settings.default_max_bytes_in_join) @@ -319,18 +337,22 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc bool TableJoin::applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns) { - bool need_convert = needConvert(); - if (!need_convert && !hasUsing()) - { - /// For `USING` we already inferred common type an syntax analyzer stage - NamesAndTypesList left_list; - NamesAndTypesList right_list; - for (const auto & col : left_sample_columns) - left_list.emplace_back(col.name, col.type); - for (const auto & col : right_sample_columns) - right_list.emplace_back(col.name, col.type); - need_convert = inferJoinKeyCommonType(left_list, right_list); + auto to_name_type_list = [](const ColumnsWithTypeAndName & columns) + { + NamesAndTypesList name_type_list; + for (const auto & col : columns) + name_type_list.emplace_back(col.name, col.type); + return name_type_list; + }; + + bool need_convert = needConvert(); + if (!need_convert) + { + need_convert = inferJoinKeyCommonType( + to_name_type_list(left_sample_columns), + to_name_type_list(right_sample_columns), + true); } if (need_convert) @@ -344,13 +366,13 @@ bool TableJoin::applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_c bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right) { - std::unordered_map left_types; + NameToTypeMap left_types; for (const auto & col : left) { left_types[col.name] = col.type; } - std::unordered_map right_types; + NameToTypeMap right_types; for (const auto & col : right) { if (auto it = renames.find(col.name); it != renames.end()) @@ -374,37 +396,27 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam if (JoinCommon::typesEqualUpToNullability(ltype->second, rtype->second)) continue; - DataTypePtr supertype; - try + auto common_type = to_supertype ? DB::getLeastSupertype({ltype->second, rtype->second}, false) + : DB::getMostSubtype({ltype->second, rtype->second}, false); + if (common_type == nullptr || isNothing(common_type)) { - supertype = DB::getLeastSupertype({ltype->second, rtype->second}); + LOG_DEBUG(&Poco::Logger::get("TableJoin"), + "Can't infer supertype for joined columns: {}: {} at left, {}: {} at right.", + key_names_left[i], ltype->second->getName(), + key_names_right[i], rtype->second->getName()); + continue; } - catch (DB::Exception & ex) - { - throw Exception( - "Type mismatch of columns to JOIN by: " + - key_names_left[i] + ": " + ltype->second->getName() + " at left, " + - key_names_right[i] + ": " + rtype->second->getName() + " at right. 
" + - "Can't get supertype: " + ex.message(), - ErrorCodes::TYPE_MISMATCH); - } - left_type_map[key_names_left[i]] = right_type_map[key_names_right[i]] = supertype; + + left_type_map[key_names_left[i]] = right_type_map[key_names_right[i]] = common_type; } if (!left_type_map.empty() || !right_type_map.empty()) { - auto format_type_map = [](NameToTypeMap mapping) -> std::string - { - std::vector text; - for (const auto & [k, v] : mapping) - text.push_back(k + ": " + v->getName()); - return fmt::format("{}", fmt::join(text, ", ")); - }; LOG_TRACE( &Poco::Logger::get("TableJoin"), "Infer supertype for joined columns. Left: [{}], Right: [{}]", - format_type_map(left_type_map), - format_type_map(right_type_map)); + formatTypeMap(left_type_map, left_types), + formatTypeMap(right_type_map, right_types)); } return !left_type_map.empty(); diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 5d26a14113c..d766cad3cb8 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -193,7 +193,7 @@ public: void addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability = true) const; /// Calculates common supertypes for corresponding join key columns. - bool inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right); + bool inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right, bool to_supertype); /// Calculate converting actions, rename key columns in required /// For `USING` join we will convert key columns inplace and affect into types in the result table From 30e112d6a64560399247d8b31e04b840863395e5 Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 26 Jun 2021 17:07:59 +0300 Subject: [PATCH 042/177] Do not convert join key columns for storage/dict join --- src/Interpreters/ExpressionAnalyzer.cpp | 102 ++++++++++++------------ src/Interpreters/TableJoin.cpp | 2 +- src/Interpreters/TreeRewriter.cpp | 3 +- 3 files changed, 53 insertions(+), 54 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index c21103c1e7f..b84546727b1 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -921,67 +921,65 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( /// Use StorageJoin if any. JoinPtr join = tryGetStorageJoin(syntax->analyzed_join); - if (!join) + if (join) + return join; + + /// Actions which need to be calculated on joined block. + auto joined_block_actions = createJoinedBlockActions(getContext(), analyzedJoin()); + Names original_right_columns; + + NamesWithAliases required_columns_with_aliases = analyzedJoin().getRequiredColumns( + Block(joined_block_actions->getResultColumns()), joined_block_actions->getRequiredColumns().getNames()); + for (auto & pr : required_columns_with_aliases) + original_right_columns.push_back(pr.first); + + /** For GLOBAL JOINs (in the case, for example, of the push method for executing GLOBAL subqueries), the following occurs + * - in the addExternalStorage function, the JOIN (SELECT ...) subquery is replaced with JOIN _data1, + * in the subquery_for_set object this subquery is exposed as source and the temporary table _data1 as the `table`. + * - this function shows the expression JOIN _data1. + */ + auto interpreter = interpretSubquery( + join_element.table_expression, getContext(), original_right_columns, query_options.copy().setWithAllColumns()); { - /// Actions which need to be calculated on joined block. 
- auto joined_block_actions = createJoinedBlockActions(getContext(), analyzedJoin()); + joined_plan = std::make_unique(); + interpreter->buildQueryPlan(*joined_plan); - Names original_right_columns; - - NamesWithAliases required_columns_with_aliases = analyzedJoin().getRequiredColumns( - Block(joined_block_actions->getResultColumns()), joined_block_actions->getRequiredColumns().getNames()); - for (auto & pr : required_columns_with_aliases) - original_right_columns.push_back(pr.first); - - /** For GLOBAL JOINs (in the case, for example, of the push method for executing GLOBAL subqueries), the following occurs - * - in the addExternalStorage function, the JOIN (SELECT ...) subquery is replaced with JOIN _data1, - * in the subquery_for_set object this subquery is exposed as source and the temporary table _data1 as the `table`. - * - this function shows the expression JOIN _data1. - */ - auto interpreter = interpretSubquery( - join_element.table_expression, getContext(), original_right_columns, query_options.copy().setWithAllColumns()); + auto sample_block = interpreter->getSampleBlock(); + auto rename_dag = std::make_unique(sample_block.getColumnsWithTypeAndName()); + for (const auto & name_with_alias : required_columns_with_aliases) { - joined_plan = std::make_unique(); - interpreter->buildQueryPlan(*joined_plan); - - auto sample_block = interpreter->getSampleBlock(); - - auto rename_dag = std::make_unique(sample_block.getColumnsWithTypeAndName()); - for (const auto & name_with_alias : required_columns_with_aliases) + if (sample_block.has(name_with_alias.first)) { - if (sample_block.has(name_with_alias.first)) - { - auto pos = sample_block.getPositionByName(name_with_alias.first); - const auto & alias = rename_dag->addAlias(*rename_dag->getInputs()[pos], name_with_alias.second); - rename_dag->getIndex()[pos] = &alias; - } + auto pos = sample_block.getPositionByName(name_with_alias.first); + const auto & alias = rename_dag->addAlias(*rename_dag->getInputs()[pos], name_with_alias.second); + rename_dag->getIndex()[pos] = &alias; } - - auto rename_step = std::make_unique(joined_plan->getCurrentDataStream(), std::move(rename_dag)); - rename_step->setStepDescription("Rename joined columns"); - joined_plan->addStep(std::move(rename_step)); } - auto joined_actions_step = std::make_unique(joined_plan->getCurrentDataStream(), std::move(joined_block_actions)); - joined_actions_step->setStepDescription("Joined actions"); - joined_plan->addStep(std::move(joined_actions_step)); - - const ColumnsWithTypeAndName & right_sample_columns = joined_plan->getCurrentDataStream().header.getColumnsWithTypeAndName(); - bool need_convert = syntax->analyzed_join->applyJoinKeyConvert(left_sample_columns, right_sample_columns); - if (need_convert) - { - auto converting_step = std::make_unique(joined_plan->getCurrentDataStream(), syntax->analyzed_join->rightConvertingActions()); - converting_step->setStepDescription("Convert joined columns"); - joined_plan->addStep(std::move(converting_step)); - } - - join = makeJoin(syntax->analyzed_join, joined_plan->getCurrentDataStream().header, getContext()); - - /// Do not make subquery for join over dictionary. 
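Taken together, inferJoinKeyCommonType and the "Convert joined columns" step above amount to: infer a common type for each pair of join key columns and cast both sides to it before matching. A self-contained illustration with plain C++ containers follows (no Blocks or DataTypes; UInt16 and Int64 are chosen only because their least supertype is Int64):

    // Minimal model of converting join keys to a common type before probing; the real code
    // builds column-level converting actions, this just casts scalar keys.
    #include <cstdint>
    #include <iostream>
    #include <unordered_map>
    #include <vector>

    int main()
    {
        std::vector<uint16_t> left_keys  = {1, 2, 3};   /// left key column: UInt16
        std::vector<int64_t>  right_keys = {2, 3, 4};   /// right key column: Int64

        /// Build the hash table over the right side using the common type (Int64)...
        std::unordered_map<int64_t, size_t> right_index;
        for (size_t i = 0; i < right_keys.size(); ++i)
            right_index.emplace(right_keys[i], i);

        /// ...and convert left keys to the same type when probing, so keys 2 and 3 match.
        for (uint16_t k : left_keys)
            if (auto it = right_index.find(static_cast<int64_t>(k)); it != right_index.end())
                std::cout << "matched key " << it->first << '\n';
        return 0;
    }

With the change above, a key pair that has no common type is skipped and logged instead of failing the whole query with TYPE_MISMATCH.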
- if (syntax->analyzed_join->dictionary_reader) - joined_plan.reset(); + auto rename_step = std::make_unique(joined_plan->getCurrentDataStream(), std::move(rename_dag)); + rename_step->setStepDescription("Rename joined columns"); + joined_plan->addStep(std::move(rename_step)); } + auto joined_actions_step = std::make_unique(joined_plan->getCurrentDataStream(), std::move(joined_block_actions)); + joined_actions_step->setStepDescription("Joined actions"); + joined_plan->addStep(std::move(joined_actions_step)); + + const ColumnsWithTypeAndName & right_sample_columns = joined_plan->getCurrentDataStream().header.getColumnsWithTypeAndName(); + bool need_convert = syntax->analyzed_join->applyJoinKeyConvert(left_sample_columns, right_sample_columns); + if (need_convert) + { + auto converting_step = std::make_unique(joined_plan->getCurrentDataStream(), syntax->analyzed_join->rightConvertingActions()); + converting_step->setStepDescription("Convert joined columns"); + joined_plan->addStep(std::move(converting_step)); + } + + join = makeJoin(syntax->analyzed_join, joined_plan->getCurrentDataStream().header, getContext()); + + /// Do not make subquery for join over dictionary. + if (syntax->analyzed_join->dictionary_reader) + joined_plan.reset(); + return join; } diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 5ce0e7baca6..047380a8436 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -347,7 +347,7 @@ bool TableJoin::applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_c }; bool need_convert = needConvert(); - if (!need_convert) + if (!need_convert && joined_storage == nullptr) { need_convert = inferJoinKeyCommonType( to_name_type_list(left_sample_columns), diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 5a9e2bfbd17..0128d4bdc2a 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -525,7 +525,8 @@ void collectJoinedColumns(TableJoin & analyzed_join, const ASTTableJoin & table_ /// We can infer common type on syntax stage for `USING` because join is performed only by columns (not expressions) /// We need to know changed types in result tables because some analysis (e.g. 
analyzeAggregation) performed before join /// For `JOIN ON expr1 == expr2` we will infer common type later in ExpressionAnalyzer, when types of expression will be known - analyzed_join.inferJoinKeyCommonType(tables[0].columns, tables[1].columns); + if (analyzed_join.joined_storage == nullptr) + analyzed_join.inferJoinKeyCommonType(tables[0].columns, tables[1].columns, true); } else if (table_join.on_expression) { From 3443b3b5c474a8ef8ad3a1b00ce3f47199f65197 Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 26 Jun 2021 17:27:44 +0300 Subject: [PATCH 043/177] Separate functoin buildJoinedPlan, minor refactoring --- src/Interpreters/ExpressionAnalyzer.cpp | 52 +++++++++------ src/Interpreters/ExpressionAnalyzer.h | 2 +- src/Interpreters/InterpreterSelectQuery.cpp | 2 - src/Interpreters/TreeRewriter.cpp | 66 +++++++++---------- src/Interpreters/TreeRewriter.h | 3 +- ..._transform_query_for_external_database.cpp | 2 +- 6 files changed, 66 insertions(+), 61 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index b84546727b1..ce0eaacb339 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -43,7 +43,6 @@ #include #include -#include #include #include @@ -910,25 +909,18 @@ static std::shared_ptr makeJoin(std::shared_ptr analyzed_join, return std::make_shared(analyzed_join, sample_block); } -JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( - const ASTTablesInSelectQueryElement & join_element, const ColumnsWithTypeAndName & left_sample_columns) +std::unique_ptr buildJoinedPlan( + ContextPtr context, + const ASTTablesInSelectQueryElement & join_element, + const ColumnsWithTypeAndName & left_sample_columns, + TableJoin & analyzed_join, + SelectQueryOptions query_options) { - /// Two JOINs are not supported with the same subquery, but different USINGs. - - if (joined_plan) - throw Exception(ErrorCodes::LOGICAL_ERROR, "Table join was already created for query"); - - /// Use StorageJoin if any. - JoinPtr join = tryGetStorageJoin(syntax->analyzed_join); - - if (join) - return join; - /// Actions which need to be calculated on joined block. 
- auto joined_block_actions = createJoinedBlockActions(getContext(), analyzedJoin()); + auto joined_block_actions = createJoinedBlockActions(context, analyzed_join); Names original_right_columns; - NamesWithAliases required_columns_with_aliases = analyzedJoin().getRequiredColumns( + NamesWithAliases required_columns_with_aliases = analyzed_join.getRequiredColumns( Block(joined_block_actions->getResultColumns()), joined_block_actions->getRequiredColumns().getNames()); for (auto & pr : required_columns_with_aliases) original_right_columns.push_back(pr.first); @@ -940,10 +932,9 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( */ auto interpreter = interpretSubquery( join_element.table_expression, getContext(), original_right_columns, query_options.copy().setWithAllColumns()); + auto joined_plan = std::make_unique(); + interpreter->buildQueryPlan(*joined_plan); { - joined_plan = std::make_unique(); - interpreter->buildQueryPlan(*joined_plan); - auto sample_block = interpreter->getSampleBlock(); auto rename_dag = std::make_unique(sample_block.getColumnsWithTypeAndName()); for (const auto & name_with_alias : required_columns_with_aliases) @@ -966,14 +957,33 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( joined_plan->addStep(std::move(joined_actions_step)); const ColumnsWithTypeAndName & right_sample_columns = joined_plan->getCurrentDataStream().header.getColumnsWithTypeAndName(); - bool need_convert = syntax->analyzed_join->applyJoinKeyConvert(left_sample_columns, right_sample_columns); + bool need_convert = analyzed_join.applyJoinKeyConvert(left_sample_columns, right_sample_columns); if (need_convert) { - auto converting_step = std::make_unique(joined_plan->getCurrentDataStream(), syntax->analyzed_join->rightConvertingActions()); + auto converting_step = std::make_unique(joined_plan->getCurrentDataStream(), analyzed_join.rightConvertingActions()); converting_step->setStepDescription("Convert joined columns"); joined_plan->addStep(std::move(converting_step)); } + return joined_plan; +} + +JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( + const ASTTablesInSelectQueryElement & join_element, const ColumnsWithTypeAndName & left_sample_columns) +{ + /// Two JOINs are not supported with the same subquery, but different USINGs. + + if (joined_plan) + throw Exception(ErrorCodes::LOGICAL_ERROR, "Table join was already created for query"); + + /// Use StorageJoin if any. + JoinPtr join = tryGetStorageJoin(syntax->analyzed_join); + + if (join) + return join; + + joined_plan = buildJoinedPlan(getContext(), join_element, left_sample_columns, *syntax->analyzed_join, query_options); + join = makeJoin(syntax->analyzed_join, joined_plan->getCurrentDataStream().header, getContext()); /// Do not make subquery for join over dictionary. 
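The refactoring above does not change behaviour: the StorageJoin lookup becomes an early return and all construction of the right-side QueryPlan moves into the free function buildJoinedPlan, leaving makeTableJoin as a short orchestrator. A minimal sketch of that control-flow shape, with stand-in types and function names rather than the real signatures:

    // Stand-in types only; this illustrates the reshaping, not ClickHouse's actual interfaces.
    #include <memory>

    struct Join {};
    struct QueryPlan {};
    using JoinPtr = std::shared_ptr<Join>;

    JoinPtr tryGetCachedJoin() { return nullptr; }  /// plays the role of tryGetStorageJoin
    std::unique_ptr<QueryPlan> buildRightSidePlan() { return std::make_unique<QueryPlan>(); }  /// plays the role of buildJoinedPlan
    JoinPtr makeJoinFromPlan(const QueryPlan &) { return std::make_shared<Join>(); }

    JoinPtr makeTableJoinSketch(std::unique_ptr<QueryPlan> & joined_plan)
    {
        if (auto join = tryGetCachedJoin())
            return join;                    /// early return: no extra nesting for the common path

        joined_plan = buildRightSidePlan(); /// all plan construction lives in one helper now
        return makeJoinFromPlan(*joined_plan);
    }

    int main()
    {
        std::unique_ptr<QueryPlan> plan;
        auto join = makeTableJoinSketch(plan);
        return join ? 0 : 1;
    }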
diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 2d0041bd96b..fdcdcd51cda 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -92,7 +92,7 @@ private: const SizeLimits size_limits_for_set; const UInt64 distributed_group_by_no_merge; - ExtractedSettings(const Settings & settings_); + explicit ExtractedSettings(const Settings & settings_); }; public: diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index 6f64c83f8bf..d4d5affc798 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -1132,8 +1132,6 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu } /// Optional step to convert key columns to common supertype. - /// Columns with changed types will be returned to user, - /// so its only suitable for `USING` join. if (expressions.converting_join_columns) { QueryPlanStepPtr convert_join_step = std::make_unique( diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 0128d4bdc2a..0b3d8e82706 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -422,46 +422,44 @@ void executeScalarSubqueries(ASTPtr & query, ContextPtr context, size_t subquery void getArrayJoinedColumns(ASTPtr & query, TreeRewriterResult & result, const ASTSelectQuery * select_query, const NamesAndTypesList & source_columns, const NameSet & source_columns_set) { - if (ASTPtr array_join_expression_list = select_query->arrayJoinExpressionList()) + if (!select_query->arrayJoinExpressionList()) + return; + + ArrayJoinedColumnsVisitor::Data visitor_data{ + result.aliases, result.array_join_name_to_alias, result.array_join_alias_to_name, result.array_join_result_to_source}; + ArrayJoinedColumnsVisitor(visitor_data).visit(query); + + /// If the result of ARRAY JOIN is not used, it is necessary to ARRAY-JOIN any column, + /// to get the correct number of rows. + if (result.array_join_result_to_source.empty()) { - ArrayJoinedColumnsVisitor::Data visitor_data{result.aliases, - result.array_join_name_to_alias, - result.array_join_alias_to_name, - result.array_join_result_to_source}; - ArrayJoinedColumnsVisitor(visitor_data).visit(query); + if (select_query->arrayJoinExpressionList()->children.empty()) + throw DB::Exception("ARRAY JOIN requires an argument", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - /// If the result of ARRAY JOIN is not used, it is necessary to ARRAY-JOIN any column, - /// to get the correct number of rows. - if (result.array_join_result_to_source.empty()) + ASTPtr expr = select_query->arrayJoinExpressionList()->children.at(0); + String source_name = expr->getColumnName(); + String result_name = expr->getAliasOrColumnName(); + + /// This is an array. + if (!expr->as() || source_columns_set.count(source_name)) { - if (select_query->arrayJoinExpressionList()->children.empty()) - throw DB::Exception("ARRAY JOIN requires an argument", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - - ASTPtr expr = select_query->arrayJoinExpressionList()->children.at(0); - String source_name = expr->getColumnName(); - String result_name = expr->getAliasOrColumnName(); - - /// This is an array. - if (!expr->as() || source_columns_set.count(source_name)) + result.array_join_result_to_source[result_name] = source_name; + } + else /// This is a nested table. 
+ { + bool found = false; + for (const auto & column : source_columns) { - result.array_join_result_to_source[result_name] = source_name; - } - else /// This is a nested table. - { - bool found = false; - for (const auto & column : source_columns) + auto split = Nested::splitName(column.name); + if (split.first == source_name && !split.second.empty()) { - auto split = Nested::splitName(column.name); - if (split.first == source_name && !split.second.empty()) - { - result.array_join_result_to_source[Nested::concatenateName(result_name, split.second)] = column.name; - found = true; - break; - } + result.array_join_result_to_source[Nested::concatenateName(result_name, split.second)] = column.name; + found = true; + break; } - if (!found) - throw Exception("No columns in nested table " + source_name, ErrorCodes::EMPTY_NESTED_TABLE); } + if (!found) + throw Exception("No columns in nested table " + source_name, ErrorCodes::EMPTY_NESTED_TABLE); } } } @@ -523,7 +521,7 @@ void collectJoinedColumns(TableJoin & analyzed_join, const ASTTableJoin & table_ /// `USING` semantic allows to have columns with changed types in result table. /// `JOIN ON` should preserve types from original table /// We can infer common type on syntax stage for `USING` because join is performed only by columns (not expressions) - /// We need to know changed types in result tables because some analysis (e.g. analyzeAggregation) performed before join + /// We need to know changed types in result tables because some analysis (e.g. analyzeAggregation) performed before join /// For `JOIN ON expr1 == expr2` we will infer common type later in ExpressionAnalyzer, when types of expression will be known if (analyzed_join.joined_storage == nullptr) analyzed_join.inferJoinKeyCommonType(tables[0].columns, tables[1].columns, true); diff --git a/src/Interpreters/TreeRewriter.h b/src/Interpreters/TreeRewriter.h index 0dca00c285e..98201f35216 100644 --- a/src/Interpreters/TreeRewriter.h +++ b/src/Interpreters/TreeRewriter.h @@ -73,7 +73,7 @@ struct TreeRewriterResult /// Results of scalar sub queries Scalars scalars; - TreeRewriterResult( + explicit TreeRewriterResult( const NamesAndTypesList & source_columns_, ConstStoragePtr storage_ = {}, const StorageMetadataPtr & metadata_snapshot_ = {}, @@ -84,7 +84,6 @@ struct TreeRewriterResult Names requiredSourceColumns() const { return required_source_columns.getNames(); } const Names & requiredSourceColumnsForAccessCheck() const { return required_source_columns_before_expanding_alias_columns; } NameSet getArrayJoinSourceNameSet() const; - Names getExpandedAliases() const { return {expanded_aliases.begin(), expanded_aliases.end()}; } const Scalars & getScalars() const { return scalars; } }; diff --git a/src/Storages/tests/gtest_transform_query_for_external_database.cpp b/src/Storages/tests/gtest_transform_query_for_external_database.cpp index 1d4cad576fa..b2621445147 100644 --- a/src/Storages/tests/gtest_transform_query_for_external_database.cpp +++ b/src/Storages/tests/gtest_transform_query_for_external_database.cpp @@ -104,7 +104,7 @@ static void check( SelectQueryInfo query_info; SelectQueryOptions select_options; query_info.syntax_analyzer_result - = TreeRewriter(state.context).analyzeSelect(ast, state.getColumns(), select_options, state.getTables(table_num)); + = TreeRewriter(state.context).analyzeSelect(ast, DB::TreeRewriterResult(state.getColumns()), select_options, state.getTables(table_num)); query_info.query = ast; std::string transformed_query = transformQueryForExternalDatabase( 
query_info, state.getColumns(), IdentifierQuotingStyle::DoubleQuotes, "test", "table", state.context); From 34b9bd7d339b7da98bb3b87b967e895938dfb05a Mon Sep 17 00:00:00 2001 From: vdimir Date: Sat, 26 Jun 2021 17:28:53 +0300 Subject: [PATCH 044/177] Fix build: missing argument in inferJoinKeyCommonType --- src/Interpreters/TableJoin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 047380a8436..bbcfb56dbc6 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -364,7 +364,7 @@ bool TableJoin::applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_c return need_convert; } -bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right) +bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right, bool to_supertype) { NameToTypeMap left_types; for (const auto & col : left) From e8e26463bf42327f4ce67041846ad9d98a1ea05a Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 28 Jun 2021 12:53:54 +0300 Subject: [PATCH 045/177] Change signature of ASTSelectQuery::arrayJoinExpressionList --- src/Interpreters/ArrayJoinedColumnsVisitor.h | 2 +- src/Interpreters/ExpressionAnalyzer.cpp | 3 +-- src/Interpreters/InterpreterSelectQuery.cpp | 8 +++----- src/Interpreters/PredicateExpressionsOptimizer.cpp | 2 +- src/Interpreters/TreeRewriter.cpp | 6 +++--- src/Parsers/ASTSelectQuery.cpp | 14 +++----------- src/Parsers/ASTSelectQuery.h | 4 ++-- src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp | 2 +- 8 files changed, 15 insertions(+), 26 deletions(-) diff --git a/src/Interpreters/ArrayJoinedColumnsVisitor.h b/src/Interpreters/ArrayJoinedColumnsVisitor.h index 94f6bdaf138..b64a170cb00 100644 --- a/src/Interpreters/ArrayJoinedColumnsVisitor.h +++ b/src/Interpreters/ArrayJoinedColumnsVisitor.h @@ -60,7 +60,7 @@ public: private: static void visit(const ASTSelectQuery & node, ASTPtr &, Data & data) { - ASTPtr array_join_expression_list = node.arrayJoinExpressionList(); + auto [array_join_expression_list, _] = node.arrayJoinExpressionList(); if (!array_join_expression_list) throw Exception("Logical error: no ARRAY JOIN", ErrorCodes::LOGICAL_ERROR); diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index ce0eaacb339..028362e392b 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -803,8 +803,7 @@ ArrayJoinActionPtr SelectQueryExpressionAnalyzer::appendArrayJoin(ExpressionActi { const auto * select_query = getSelectQuery(); - bool is_array_join_left; - ASTPtr array_join_expression_list = select_query->arrayJoinExpressionList(is_array_join_left); + auto [array_join_expression_list, is_array_join_left] = select_query->arrayJoinExpressionList(); if (!array_join_expression_list) return nullptr; diff --git a/src/Interpreters/InterpreterSelectQuery.cpp b/src/Interpreters/InterpreterSelectQuery.cpp index d4d5affc798..5fe9948f857 100644 --- a/src/Interpreters/InterpreterSelectQuery.cpp +++ b/src/Interpreters/InterpreterSelectQuery.cpp @@ -854,7 +854,7 @@ static std::pair getLimitLengthAndOffset(const ASTSelectQuery & static UInt64 getLimitForSorting(const ASTSelectQuery & query, ContextPtr context) { /// Partial sort can be done if there is LIMIT but no DISTINCT or LIMIT BY, neither ARRAY JOIN. 
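The arrayJoinExpressionList change above swaps the bool& out-parameter for a returned pair, which is what allows call sites in this patch to write `auto [array_join_expression_list, is_array_join_left] = ...` or to just test `.first`. A stand-alone sketch of that API shape with illustrative types (not the real AST classes):

    // Illustrative types only; the point is the pair-returning signature and its call sites.
    #include <memory>
    #include <utility>

    struct ExpressionList {};
    using ASTPtr = std::shared_ptr<ExpressionList>;

    /// Returns {expression_list, is_left}; a default-constructed pair {nullptr, false} means "no ARRAY JOIN".
    std::pair<ASTPtr, bool> arrayJoinExpressionListSketch(bool has_array_join, bool is_left)
    {
        if (!has_array_join)
            return {};
        return {std::make_shared<ExpressionList>(), is_left};
    }

    int main()
    {
        /// Call sites that need both values use structured bindings...
        auto [expression_list, is_left] = arrayJoinExpressionListSketch(true, true);
        (void)expression_list;
        (void)is_left;

        /// ...while existence-only checks simply read .first, as several call sites in this patch do.
        if (!arrayJoinExpressionListSketch(false, false).first)
            return 0;
        return 0;
    }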
- if (!query.distinct && !query.limitBy() && !query.limit_with_ties && !query.arrayJoinExpressionList() && query.limitLength()) + if (!query.distinct && !query.limitBy() && !query.limit_with_ties && !query.arrayJoinExpressionList().first && query.limitLength()) { auto [limit_length, limit_offset] = getLimitLengthAndOffset(query, context); if (limit_length > std::numeric_limits::max() - limit_offset) @@ -1352,17 +1352,15 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu bool apply_prelimit = apply_limit && query.limitLength() && !query.limit_with_ties && !hasWithTotalsInAnySubqueryInFromClause(query) && - !query.arrayJoinExpressionList() && + !query.arrayJoinExpressionList().first && !query.distinct && !expressions.hasLimitBy() && !settings.extremes && !has_withfill; bool apply_offset = options.to_stage != QueryProcessingStage::WithMergeableStateAfterAggregationAndLimit; - bool limit_applied = false; if (apply_prelimit) { executePreLimit(query_plan, /* do_not_skip_offset= */!apply_offset); - limit_applied = true; } /** If there was more than one stream, @@ -1384,7 +1382,6 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu if (query.limit_with_ties && apply_offset) { executeLimit(query_plan); - limit_applied = true; } /// Projection not be done on the shards, since then initiator will not find column in blocks. @@ -1398,6 +1395,7 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu /// Extremes are calculated before LIMIT, but after LIMIT BY. This is Ok. executeExtremes(query_plan); + bool limit_applied = apply_prelimit || (query.limit_with_ties && apply_offset); /// Limit is no longer needed if there is prelimit. /// /// NOTE: that LIMIT cannot be applied if OFFSET should not be applied, diff --git a/src/Interpreters/PredicateExpressionsOptimizer.cpp b/src/Interpreters/PredicateExpressionsOptimizer.cpp index f2e55441fb6..adc164e0596 100644 --- a/src/Interpreters/PredicateExpressionsOptimizer.cpp +++ b/src/Interpreters/PredicateExpressionsOptimizer.cpp @@ -39,7 +39,7 @@ bool PredicateExpressionsOptimizer::optimize(ASTSelectQuery & select_query) if (!select_query.tables() || select_query.tables()->children.empty()) return false; - if ((!select_query.where() && !select_query.prewhere()) || select_query.arrayJoinExpressionList()) + if ((!select_query.where() && !select_query.prewhere()) || select_query.arrayJoinExpressionList().first) return false; const auto & tables_predicates = extractTablesPredicates(select_query.where(), select_query.prewhere()); diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 0b3d8e82706..073b38eae57 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -422,7 +422,7 @@ void executeScalarSubqueries(ASTPtr & query, ContextPtr context, size_t subquery void getArrayJoinedColumns(ASTPtr & query, TreeRewriterResult & result, const ASTSelectQuery * select_query, const NamesAndTypesList & source_columns, const NameSet & source_columns_set) { - if (!select_query->arrayJoinExpressionList()) + if (!select_query->arrayJoinExpressionList().first) return; ArrayJoinedColumnsVisitor::Data visitor_data{ @@ -433,10 +433,10 @@ void getArrayJoinedColumns(ASTPtr & query, TreeRewriterResult & result, const AS /// to get the correct number of rows. 
if (result.array_join_result_to_source.empty()) { - if (select_query->arrayJoinExpressionList()->children.empty()) + if (select_query->arrayJoinExpressionList().first->children.empty()) throw DB::Exception("ARRAY JOIN requires an argument", ErrorCodes::NUMBER_OF_ARGUMENTS_DOESNT_MATCH); - ASTPtr expr = select_query->arrayJoinExpressionList()->children.at(0); + ASTPtr expr = select_query->arrayJoinExpressionList().first->children.at(0); String source_name = expr->getColumnName(); String result_name = expr->getAliasOrColumnName(); diff --git a/src/Parsers/ASTSelectQuery.cpp b/src/Parsers/ASTSelectQuery.cpp index 7699d380623..1c5a4310f1b 100644 --- a/src/Parsers/ASTSelectQuery.cpp +++ b/src/Parsers/ASTSelectQuery.cpp @@ -319,24 +319,16 @@ bool ASTSelectQuery::withFill() const } -ASTPtr ASTSelectQuery::arrayJoinExpressionList(bool & is_left) const +std::pair ASTSelectQuery::arrayJoinExpressionList() const { const ASTArrayJoin * array_join = getFirstArrayJoin(*this); if (!array_join) return {}; - is_left = (array_join->kind == ASTArrayJoin::Kind::Left); - return array_join->expression_list; + bool is_left = (array_join->kind == ASTArrayJoin::Kind::Left); + return {array_join->expression_list, is_left}; } - -ASTPtr ASTSelectQuery::arrayJoinExpressionList() const -{ - bool is_left; - return arrayJoinExpressionList(is_left); -} - - const ASTTablesInSelectQueryElement * ASTSelectQuery::join() const { return getFirstTableJoin(*this); diff --git a/src/Parsers/ASTSelectQuery.h b/src/Parsers/ASTSelectQuery.h index 2babc2f75c8..2b004e9e040 100644 --- a/src/Parsers/ASTSelectQuery.h +++ b/src/Parsers/ASTSelectQuery.h @@ -123,8 +123,8 @@ public: /// Compatibility with old parser of tables list. TODO remove ASTPtr sampleSize() const; ASTPtr sampleOffset() const; - ASTPtr arrayJoinExpressionList(bool & is_left) const; - ASTPtr arrayJoinExpressionList() const; + std::pair arrayJoinExpressionList() const; + const ASTTablesInSelectQueryElement * join() const; bool final() const; bool withFill() const; diff --git a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp index 806c861cf00..1f33f3eb092 100644 --- a/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp +++ b/src/Storages/MergeTree/MergeTreeWhereOptimizer.cpp @@ -373,7 +373,7 @@ bool MergeTreeWhereOptimizer::cannotBeMoved(const ASTPtr & ptr, bool is_final) c void MergeTreeWhereOptimizer::determineArrayJoinedNames(ASTSelectQuery & select) { - auto array_join_expression_list = select.arrayJoinExpressionList(); + auto [array_join_expression_list, _] = select.arrayJoinExpressionList(); /// much simplified code from ExpressionAnalyzer::getArrayJoinedColumns() if (!array_join_expression_list) From e39f9aabdee2233b602cdc478290158338a268e7 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 31 Aug 2021 12:29:51 +0000 Subject: [PATCH 046/177] delete useless file --- src/Storages/MergeTree/MergeTask.h | 253 ----------------------------- 1 file changed, 253 deletions(-) delete mode 100644 src/Storages/MergeTree/MergeTask.h diff --git a/src/Storages/MergeTree/MergeTask.h b/src/Storages/MergeTree/MergeTask.h deleted file mode 100644 index 5f016c7ea08..00000000000 --- a/src/Storages/MergeTree/MergeTask.h +++ /dev/null @@ -1,253 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include -#include - -namespace DB -{ - -class MergeTask; -using MergeTaskPtr = std::shared_ptr; - -/** - * Overview of the merge algorithm - * - * Each merge is 
executed sequentially block by block. - * The main idea is to make a merge not a subroutine which is executed - * in a thread pool and may occupy a thread for a period of time, - * but to make a merge a coroutine which can suspend the execution - * in some points and then resume the execution from this point. - * - * A perfect point where to suspend the execution is after the work over a block is finished. - * The task itself will be executed via BackgroundJobAssignee. - * - * The interface of the task is simple. - * The main method is `execute()` which will return true, if the task wants to be executed again and false otherwise. - * - * With this kind of task we can give a merge a priority. - * A priority is simple - the lower the size of the merge, the higher priority. - * So, if ClickHouse wants to merge some really big parts into a bigger part, - * then it will be executed for a long time, because the result of the merge is not really needed immediately. - * It is better to merge small parts as soon as possible. -*/ -class MergeTask -{ -public: - - MergeTask( - FutureMergedMutatedPartPtr future_part_, - StorageMetadataPtr metadata_snapshot_, - MergeList::Entry & merge_entry_, - TableLockHolder & holder_, - time_t time_of_merge_, - ContextPtr context_, - ReservationSharedPtr space_reservation_, - bool deduplicate_, - Names deduplicate_by_columns_, - MergeTreeData::MergingParams merging_params_, - MergeTreeDataPartPtr parent_part_, - String prefix_, - MergeTreeData & data_, - ActionBlocker & merges_blocker_, - ActionBlocker & ttl_merges_blocker_) - : future_part(future_part_) - , metadata_snapshot(metadata_snapshot_) - , merge_entry(merge_entry_) - , holder(holder_) - , time_of_merge(time_of_merge_) - , context(context_) - , space_reservation(std::move(space_reservation_)) - , deduplicate(deduplicate_) - , deduplicate_by_columns(std::move(deduplicate_by_columns_)) - , merging_params(merging_params_) - , parent_part(parent_part_) - , prefix(prefix_) - , data(data_) - , merges_blocker(merges_blocker_) - , ttl_merges_blocker(ttl_merges_blocker_) - {} - - std::future getFuture() - { - return promise.get_future(); - } - - bool execute(); - - void prepare(); - -private: - void createMergedStream(); - - MergeAlgorithm chooseMergeAlgorithm() const; - - bool executeHorizontalForBlock(); - void finalizeHorizontalPartOfTheMerge(); - void prepareVertical(); - bool executeVerticalMergeForAllColumns(); - - void prepareVerticalMergeForOneColumn(); - bool executeVerticalMergeForOneColumn(); - void finalizeVerticalMergeForOneColumn(); - - void finalizeVerticalMergeForAllColumns(); - - void mergeMinMaxIndex(); - - void prepareProjections(); - bool executeProjections(); - void finalizeProjections(); - - void finalize(); - - - std::promise promise; - - /** - * States of MergeTask state machine. - * Transitions are from up to down. - * But for vertical merge there are horizontal part of the merge and vertical part. - * For horizontal there is horizontal part only. 
- */ - enum class MergeTaskState - { - NEED_PREPARE, - NEED_EXECUTE_HORIZONTAL, - NEED_FINALIZE_HORIZONTAL, - NEED_PREPARE_VERTICAL, - NEED_EXECUTE_VERTICAL, - NEED_FINISH_VERTICAL, - NEED_MERGE_MIN_MAX_INDEX, - - NEED_PREPARE_PROJECTIONS, - NEED_EXECUTE_PROJECTIONS, - NEED_FINISH_PROJECTIONS, - - NEED_FINISH - }; - MergeTaskState state{MergeTaskState::NEED_PREPARE}; - - enum class VecticalMergeOneColumnState - { - NEED_PREPARE, - NEED_EXECUTE, - NEED_FINISH - }; - VecticalMergeOneColumnState vertical_merge_one_column_state{VecticalMergeOneColumnState::NEED_PREPARE}; - - FutureMergedMutatedPartPtr future_part; - StorageMetadataPtr metadata_snapshot; - MergeList::Entry & merge_entry; - TableLockHolder & holder; - time_t time_of_merge; - ContextPtr context; - /// It is necessary, because of projections presense - ReservationSharedPtr space_reservation; - bool deduplicate; - Names deduplicate_by_columns; - MergeTreeData::MergingParams merging_params; - MergeTreeDataPartPtr parent_part; - String prefix; - - /// From MergeTreeDataMergerMutator - - MergeTreeData & data; - Poco::Logger * log{&Poco::Logger::get("MergeTask")}; - - ActionBlocker & merges_blocker; - ActionBlocker & ttl_merges_blocker; - - - /// Previously stack located variables - - NamesAndTypesList gathering_columns; - NamesAndTypesList merging_columns; - Names gathering_column_names; - Names merging_column_names; - - NamesAndTypesList storage_columns; - Names all_column_names; - - String new_part_tmp_path; - - size_t sum_input_rows_upper_bound{0}; - - bool need_remove_expired_values{false}; - bool force_ttl{false}; - - DiskPtr tmp_disk{nullptr}; - DiskPtr disk{nullptr}; - - std::unique_ptr horizontal_stage_progress{nullptr}; - std::unique_ptr column_progress{nullptr}; - - std::unique_ptr rows_sources_file; - std::unique_ptr rows_sources_uncompressed_write_buf{nullptr}; - std::unique_ptr rows_sources_write_buf{nullptr}; - std::optional column_sizes; - - SyncGuardPtr sync_guard{nullptr}; - MergeTreeData::MutableDataPartPtr new_data_part; - CompressionCodecPtr compression_codec; - - MergeAlgorithm chosen_merge_algorithm{MergeAlgorithm::Undecided}; - - std::shared_ptr to; - BlockInputStreamPtr merged_stream; - - bool blocks_are_granules_size{false}; - - /// Variables that are needed for horizontal merge execution - - size_t rows_written{0}; - size_t initial_reservation{0}; - UInt64 watch_prev_elapsed{0}; - - std::function is_cancelled; - - bool need_sync{false}; - bool read_with_direct_io{false}; - - - MergeTreeData::DataPart::Checksums checksums_gathered_columns; - - std::list::const_iterator it_name_and_type; - size_t column_num_for_vertical_merge{0}; - size_t gathering_column_names_size{0}; - - /// This class has no default constructor, so we wrap it with unique_ptr - std::unique_ptr rows_sources_read_buf{nullptr}; - IMergedBlockOutputStream::WrittenOffsetColumns written_offset_columns; - std::unique_ptr column_to; - - BlockInputStreams column_part_streams; - std::unique_ptr column_gathered_stream; - - size_t column_elems_written = 0; - Float64 progress_before = 0; - - - using MergeTasks = std::deque; - MergeTasks tasks_for_projections; - MergeTasks::iterator projections_iterator; -}; - -/// FIXME -[[ maybe_unused]] static MergeTreeData::MutableDataPartPtr executeHere(MergeTaskPtr task) -{ - while (task->execute()) {} - return task->getFuture().get(); -} - -} From 109d2f63d03ee974948151f4253886c2d66f8d0b Mon Sep 17 00:00:00 2001 From: Vasily Nemkov Date: Mon, 30 Aug 2021 20:37:07 +0300 Subject: [PATCH 047/177] Fixed tests and minor 
style issues --- programs/local/LocalServer.cpp | 3 +-- programs/server/Server.cpp | 4 ---- src/Core/MySQL/Authentication.cpp | 1 - src/Core/PostgreSQLProtocol.h | 1 - src/Interpreters/Session.cpp | 2 +- src/Interpreters/SystemLog.cpp | 7 +------ tests/queries/skip_list.json | 3 +-- 7 files changed, 4 insertions(+), 17 deletions(-) diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index 258743c7e16..278101e2c1d 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -9,7 +9,6 @@ #include #include #include -#include #include #include #include @@ -394,7 +393,7 @@ void LocalServer::processQueries() auto context = session.makeQueryContext(); context->makeSessionContext(); /// initial_create_query requires a session context to be set. context->setCurrentQueryId(""); - + applyCmdSettings(context); /// Use the same query_id (and thread group) for all queries diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 09b6add62d0..bf4e2f947dc 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -54,10 +54,6 @@ #include #include #include -#include -#include -#include -#include #include #include #include diff --git a/src/Core/MySQL/Authentication.cpp b/src/Core/MySQL/Authentication.cpp index 76fb6bad833..0eb080892c1 100644 --- a/src/Core/MySQL/Authentication.cpp +++ b/src/Core/MySQL/Authentication.cpp @@ -2,7 +2,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Core/PostgreSQLProtocol.h b/src/Core/PostgreSQLProtocol.h index 6fc69d2d5b2..f0de4bbb843 100644 --- a/src/Core/PostgreSQLProtocol.h +++ b/src/Core/PostgreSQLProtocol.h @@ -7,7 +7,6 @@ #include #include #include -#include #include #include #include diff --git a/src/Interpreters/Session.cpp b/src/Interpreters/Session.cpp index b1f7f4349f4..d8480f3858e 100644 --- a/src/Interpreters/Session.cpp +++ b/src/Interpreters/Session.cpp @@ -309,7 +309,7 @@ void Session::authenticate(const Credentials & credentials_, const Poco::Net::So { user_id = global_context->getAccessControlManager().login(credentials_, address.host()); } - catch(const Exception & e) + catch (const Exception & e) { if (auto session_log = getSessionLog()) session_log->addLoginFailure(session_id, *prepared_client_info, credentials_.getUserName(), e); diff --git a/src/Interpreters/SystemLog.cpp b/src/Interpreters/SystemLog.cpp index dfc16dae49c..2ccb84e1ffa 100644 --- a/src/Interpreters/SystemLog.cpp +++ b/src/Interpreters/SystemLog.cpp @@ -6,14 +6,9 @@ #include #include #include -#include +#include #include #include -#include -#include -#include -#include -#include #include #include diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 91fca7eb5d5..6442ac5ed12 100644 --- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -513,8 +513,7 @@ "01530_drop_database_atomic_sync", /// creates database "02001_add_default_database_to_system_users", ///create user "02002_row_level_filter_bug", ///create user - "02015_system_views" - "02002_row_level_filter_bug", ///create user + "02015_system_views", "01747_system_session_log_long" // Reads from system.session_log and can't be run in parallel with any other test (since almost any other test writes to session_log) ] } From dbc950caa4a3213edd394dca1b65f658b5a8722d Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 31 Aug 2021 14:54:24 +0000 Subject: [PATCH 048/177] added a test --- .../MergeTree/MergeMutateExecutor.cpp | 3 +- src/Storages/MergeTree/MergeMutateExecutor.h | 
12 +++ .../MergeTree/tests/gtest_executor.cpp | 85 +++++++++++++++++++ src/Storages/StorageMergeTree.h | 6 +- src/Storages/StorageReplicatedMergeTree.cpp | 4 +- src/Storages/StorageReplicatedMergeTree.h | 7 +- 6 files changed, 109 insertions(+), 8 deletions(-) create mode 100644 src/Storages/MergeTree/tests/gtest_executor.cpp diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index df73bcef5b1..b8cbc5531d5 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -71,6 +71,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() { ExecutableTaskPtr current; auto current_promise = std::make_shared>(); + { std::unique_lock lock(mutex); has_tasks.wait(lock, [this](){ return !tasks.empty() || shutdown_suspend; }); @@ -90,7 +91,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() currently_executing.emplace(current, current_promise->get_future()); } - bool res = pool.trySchedule([this, task = current, promise = current_promise] () mutable + bool res = pool.trySchedule([this, task = current, promise = current_promise] () { auto metric_decrementor = std::make_shared(metric); metric_decrementor->alarm(); diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index 22296e391ba..af6d4a824e2 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -25,6 +25,7 @@ public: bool execute() override { res = inner(); + inner = {}; return false; } @@ -112,6 +113,17 @@ public: pool.wait(); } + size_t active() + { + return pool.active(); + } + + size_t pending() + { + std::lock_guard lock(mutex); + return tasks.size(); + } + private: void updatePoolConfiguration() diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp new file mode 100644 index 00000000000..7e82716d474 --- /dev/null +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -0,0 +1,85 @@ +#include + +#include +#include + +#include +#include + +using namespace DB; + +namespace CurrentMetrics +{ + extern const Metric BackgroundPoolTask; +} + +class FakeExecutableTask : public ExecutableTask +{ +public: + explicit FakeExecutableTask(String name_, std::function on_completed_) : name(name_), on_completed(on_completed_) + { + } + + bool execute() override + { + std::this_thread::sleep_for(std::chrono::milliseconds(5)); + return false; + } + + StorageID getStorageID() override + { + return {"test", name}; + } + + void onCompleted() override + { + on_completed(); + } + +private: + + String name; + std::function on_completed; +}; + + +TEST(Executor, Simple) +{ + auto executor = DB::MergeTreeBackgroundExecutor::create(); + + const size_t tasks_kinds = 25; + const size_t batch = 100; + + executor->setThreadsCount([]() { return 25; }); + executor->setTasksCount([] () { return tasks_kinds * batch; }); + executor->setMetric(CurrentMetrics::BackgroundPoolTask); + + for (size_t i = 0; i < 4; ++i) + { + for (size_t j = 0; j < tasks_kinds; ++j) + { + bool res = executor->trySchedule(std::make_shared(std::to_string(j), [](){})); + ASSERT_TRUE(res); + } + } + + std::vector threads(batch); + + for (auto & thread : threads) + thread = std::thread([&] () + { + for (size_t j = 0; j < tasks_kinds; ++j) + executor->removeTasksCorrespondingToStorage({"test", std::to_string(j)}); + + }); + + for (auto & thread : threads) + thread.join(); + + ASSERT_EQ(executor->active(), 0); + 
ASSERT_EQ(executor->pending(), 0); + ASSERT_EQ(CurrentMetrics::values[CurrentMetrics::BackgroundPoolTask], 0); + + executor->wait(); + +} diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index a0d0583429f..4fd38c3618f 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -122,8 +122,6 @@ private: MergeTreeDataSelectExecutor reader; MergeTreeDataWriter writer; MergeTreeDataMergerMutator merger_mutator; - BackgroundJobAssignee background_executor; - BackgroundJobAssignee background_moves_executor; std::unique_ptr deduplication_log; @@ -150,6 +148,10 @@ private: std::atomic shutdown_called {false}; + /// Must be the last to be destroyed first + BackgroundJobAssignee background_executor; + BackgroundJobAssignee background_moves_executor; + void loadMutations(); /// Load and initialize deduplication logs. Even if deduplication setting diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index cfe71f63ede..351bc4646f9 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -285,8 +285,6 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( , merge_strategy_picker(*this) , queue(*this, merge_strategy_picker) , fetcher(*this) - , background_executor(*this, BackgroundJobAssignee::Type::DataProcessing, getContext()) - , background_moves_executor(*this, BackgroundJobAssignee::Type::Moving, getContext()) , cleanup_thread(*this) , part_check_thread(*this) , restarting_thread(*this) @@ -295,6 +293,8 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( , replicated_fetches_pool_size(getContext()->getSettingsRef().background_fetches_pool_size) , replicated_fetches_throttler(std::make_shared(getSettings()->max_replicated_fetches_network_bandwidth, getContext()->getReplicatedFetchesThrottler())) , replicated_sends_throttler(std::make_shared(getSettings()->max_replicated_sends_network_bandwidth, getContext()->getReplicatedSendsThrottler())) + , background_executor(*this, BackgroundJobAssignee::Type::DataProcessing, getContext()) + , background_moves_executor(*this, BackgroundJobAssignee::Type::Moving, getContext()) { queue_updating_task = getContext()->getSchedulePool().createTask( getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::queueUpdatingTask)", [this]{ queueUpdatingTask(); }); diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index 3b7267f0a23..ff0843ba064 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -370,9 +370,6 @@ private: int metadata_version = 0; /// Threads. - BackgroundJobAssignee background_executor; - BackgroundJobAssignee background_moves_executor; - /// A task that keeps track of the updates in the logs of all replicas and loads them into the queue. 
bool queue_update_in_progress = false; BackgroundSchedulePool::TaskHolder queue_updating_task; @@ -411,6 +408,10 @@ private: ThrottlerPtr replicated_fetches_throttler; ThrottlerPtr replicated_sends_throttler; + /// Must be the last to be destroyed first + BackgroundJobAssignee background_executor; + BackgroundJobAssignee background_moves_executor; + template void foreachCommittedParts(Func && func, bool select_sequential_consistency) const; From 28eff0e369d1c99af7294113039b9881849b3b11 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 31 Aug 2021 18:07:24 +0000 Subject: [PATCH 049/177] save --- .../MergeTree/MergeMutateExecutor.cpp | 54 ++++--------------- src/Storages/MergeTree/MergeMutateExecutor.h | 21 ++++++-- .../MergeTree/tests/gtest_executor.cpp | 2 +- 3 files changed, 28 insertions(+), 49 deletions(-) diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index b8cbc5531d5..01f1f7cbcc7 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -7,31 +7,6 @@ namespace DB { -/// This is a RAII class which only decrements metric. -/// It is added because after all other fixes a bug non-executing merges was occurred again. -/// Last hypothesis: task was successfully added to pool, however, was not executed because of internal exception in it. -class ParanoidMetricDecrementor -{ -public: - explicit ParanoidMetricDecrementor(CurrentMetrics::Metric metric_) : metric(metric_) {} - void alarm() { is_alarmed = true; } - void decrement() - { - if (is_alarmed.exchange(false)) - { - CurrentMetrics::values[metric]--; - } - } - - ~ParanoidMetricDecrementor() { decrement(); } - -private: - - CurrentMetrics::Metric metric; - std::atomic_bool is_alarmed = false; -}; - - void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id) { std::lock_guard remove_lock(remove_mutex); @@ -69,9 +44,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() { while (true) { - ExecutableTaskPtr current; - auto current_promise = std::make_shared>(); - + ItemPtr item; { std::unique_lock lock(mutex); has_tasks.wait(lock, [this](){ return !tasks.empty() || shutdown_suspend; }); @@ -79,7 +52,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (shutdown_suspend) break; - current = std::move(tasks.front()); + item = std::move(tasks.front()); tasks.pop_front(); /// This is needed to increase / decrease the number of threads at runtime @@ -88,20 +61,17 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() { std::lock_guard lock(currently_executing_mutex); - currently_executing.emplace(current, current_promise->get_future()); + currently_executing.emplace(item); } - bool res = pool.trySchedule([this, task = current, promise = current_promise] () + bool res = pool.trySchedule([this, item] () { - auto metric_decrementor = std::make_shared(metric); - metric_decrementor->alarm(); - auto on_exit = [&] () { - promise->set_value(); + item->promise.set_value(); { std::lock_guard lock(currently_executing_mutex); - currently_executing.erase(task); + currently_executing.erase(item); } }; @@ -109,26 +79,24 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() try { - bool result = task->execute(); + bool result = item->task->execute(); if (result) { std::lock_guard guard(mutex); - tasks.emplace_back(task); + tasks.emplace_back(item); has_tasks.notify_one(); return; } - metric_decrementor->decrement(); - task->onCompleted(); + item->task->onCompleted(); 
std::lock_guard guard(mutex); has_tasks.notify_one(); } catch(...) { - metric_decrementor->decrement(); - task->onCompleted(); + item->task->onCompleted(); std::lock_guard guard(mutex); has_tasks.notify_one(); tryLogCurrentException(__PRETTY_FUNCTION__); @@ -138,7 +106,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (!res) { std::lock_guard guard(mutex); - tasks.emplace_back(current); + tasks.emplace_back(item); } } } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index af6d4a824e2..504b6e829f8 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -90,9 +90,7 @@ public: if (value.load() >= static_cast(max_task_count_getter())) return false; - CurrentMetrics::add(metric); - - tasks.emplace_back(task); + tasks.emplace_back(std::make_shared(std::move(task), metric)); has_tasks.notify_one(); return true; } @@ -141,12 +139,25 @@ private: CountGetter max_task_count_getter; CurrentMetrics::Metric metric; - std::deque tasks; + struct Item + { + explicit Item(ExecutableTaskPtr && task_, CurrentMetrics::Metric && metric_) + : task(std::move(task_)), increment(std::move(metric_)) {} + + ExecutableTaskPtr task; + CurrentMetrics::Increment increment; + + std::promise promise; + }; + + using ItemPtr = std::shared_ptr; + + std::deque tasks; std::mutex remove_mutex; std::mutex currently_executing_mutex; - std::map> currently_executing; + std::set currently_executing; std::mutex mutex; std::condition_variable has_tasks; diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index 7e82716d474..6811c77cb1d 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -54,7 +54,7 @@ TEST(Executor, Simple) executor->setTasksCount([] () { return tasks_kinds * batch; }); executor->setMetric(CurrentMetrics::BackgroundPoolTask); - for (size_t i = 0; i < 4; ++i) + for (size_t i = 0; i < batch; ++i) { for (size_t j = 0; j < tasks_kinds; ++j) { From ac5316ce98bce552065d62ffb5e348fdec56bf61 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 31 Aug 2021 23:20:23 +0000 Subject: [PATCH 050/177] Stress test + reworker executor --- .../MergeTree/MergeMutateExecutor.cpp | 100 ++++++++-------- src/Storages/MergeTree/MergeMutateExecutor.h | 34 +++--- .../MergeTree/tests/gtest_executor.cpp | 107 ++++++++++++++---- 3 files changed, 158 insertions(+), 83 deletions(-) diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index 01f1f7cbcc7..1cf84333dbe 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -11,32 +11,33 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id { std::lock_guard remove_lock(remove_mutex); - /// First stop the scheduler thread + std::vector tasks_to_wait; { - std::unique_lock lock(mutex); - shutdown_suspend = true; - has_tasks.notify_one(); + std::lock_guard lock(mutex); + + /// Mark this StorageID as deleting + currently_deleting.emplace(id); + + std::erase_if(pending, [&] (auto item) -> bool { return item->task->getStorageID() == id; }); + + /// Find pending to wait + for (auto & item : active) + if (item->task->getStorageID() == id) + tasks_to_wait.emplace_back(item); } - scheduler.join(); - /// Remove tasks + for (auto & item : tasks_to_wait) { - std::lock_guard lock(currently_executing_mutex); - - for 
(auto & [task, future] : currently_executing) - { - if (task->getStorageID() == id) - future.wait(); - } - - /// Remove tasks from original queue - size_t erased_count = std::erase_if(tasks, [id = std::move(id)] (auto task) -> bool { return task->getStorageID() == id; }); - CurrentMetrics::sub(metric, erased_count); + assert(item->future.valid()); + item->future.wait(); } - shutdown_suspend = false; - scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); + + { + std::lock_guard lock(mutex); + currently_deleting.erase(id); + } } @@ -44,47 +45,54 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() { while (true) { - ItemPtr item; - { - std::unique_lock lock(mutex); - has_tasks.wait(lock, [this](){ return !tasks.empty() || shutdown_suspend; }); + std::unique_lock lock(mutex); - if (shutdown_suspend) - break; + has_tasks.wait(lock, [this](){ return !pending.empty() || shutdown_suspend; }); - item = std::move(tasks.front()); - tasks.pop_front(); + if (shutdown_suspend) + break; - /// This is needed to increase / decrease the number of threads at runtime - updatePoolConfiguration(); - } + auto item = std::move(pending.front()); + pending.pop_front(); - { - std::lock_guard lock(currently_executing_mutex); - currently_executing.emplace(item); - } + active.emplace(item); + + /// This is needed to increase / decrease the number of threads at runtime + updatePoolConfiguration(); bool res = pool.trySchedule([this, item] () { - auto on_exit = [&] () + auto check_if_deleting = [&] () -> bool { - item->promise.set_value(); + active.erase(item); + + for (auto & id : currently_deleting) { - std::lock_guard lock(currently_executing_mutex); - currently_executing.erase(item); + if (item->task->getStorageID() == id) + { + item->promise.set_value(); + return true; + } } + + return false; }; - SCOPE_EXIT({ on_exit(); }); + SCOPE_EXIT({ + std::lock_guard guard(mutex); + check_if_deleting(); + }); try { - bool result = item->task->execute(); - - if (result) + if (item->task->execute()) { std::lock_guard guard(mutex); - tasks.emplace_back(item); + + if (check_if_deleting()) + return; + + pending.emplace_back(item); has_tasks.notify_one(); return; } @@ -101,13 +109,15 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() has_tasks.notify_one(); tryLogCurrentException(__PRETTY_FUNCTION__); } + }); if (!res) { - std::lock_guard guard(mutex); - tasks.emplace_back(item); + active.erase(item); + pending.emplace_back(item); } + } } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index 504b6e829f8..09de42ccbaf 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -90,7 +90,7 @@ public: if (value.load() >= static_cast(max_task_count_getter())) return false; - tasks.emplace_back(std::make_shared(std::move(task), metric)); + pending.emplace_back(std::make_shared(std::move(task), metric)); has_tasks.notify_one(); return true; } @@ -111,15 +111,16 @@ public: pool.wait(); } - size_t active() - { - return pool.active(); - } - - size_t pending() + size_t activeCount() { std::lock_guard lock(mutex); - return tasks.size(); + return active.size(); + } + + size_t pendingCount() + { + std::lock_guard lock(mutex); + return pending.size(); } private: @@ -141,24 +142,27 @@ private: struct Item { - explicit Item(ExecutableTaskPtr && task_, CurrentMetrics::Metric && metric_) - : task(std::move(task_)), increment(std::move(metric_)) {} + explicit Item(ExecutableTaskPtr && task_, 
CurrentMetrics::Metric metric_) + : task(std::move(task_)) + , increment(std::move(metric_)) + , future(promise.get_future()) + { + } ExecutableTaskPtr task; CurrentMetrics::Increment increment; std::promise promise; + std::future future; }; using ItemPtr = std::shared_ptr; - std::deque tasks; + std::deque pending; + std::set active; + std::set currently_deleting; std::mutex remove_mutex; - - std::mutex currently_executing_mutex; - std::set currently_executing; - std::mutex mutex; std::condition_variable has_tasks; diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index 6811c77cb1d..931d8ac8bef 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -1,7 +1,9 @@ #include #include +#include #include +#include #include #include @@ -13,16 +15,24 @@ namespace CurrentMetrics extern const Metric BackgroundPoolTask; } +std::random_device device; + class FakeExecutableTask : public ExecutableTask { public: - explicit FakeExecutableTask(String name_, std::function on_completed_) : name(name_), on_completed(on_completed_) + explicit FakeExecutableTask(String name_) : generator(device()), distribution(0, 5), name(name_) { } bool execute() override { - std::this_thread::sleep_for(std::chrono::milliseconds(5)); + auto sleep_time = distribution(generator); + std::this_thread::sleep_for(std::chrono::milliseconds(5 * sleep_time)); + + auto choice = distribution(generator); + if (choice == 0) + throw std::runtime_error("Unlucky..."); + return false; } @@ -31,55 +41,106 @@ public: return {"test", name}; } - void onCompleted() override - { - on_completed(); - } + void onCompleted() override {} private: + std::mt19937 generator; + std::uniform_int_distribution<> distribution; String name; std::function on_completed; }; -TEST(Executor, Simple) +TEST(Executor, RemoveTasks) { auto executor = DB::MergeTreeBackgroundExecutor::create(); const size_t tasks_kinds = 25; const size_t batch = 100; - executor->setThreadsCount([]() { return 25; }); + executor->setThreadsCount([]() { return tasks_kinds; }); executor->setTasksCount([] () { return tasks_kinds * batch; }); executor->setMetric(CurrentMetrics::BackgroundPoolTask); for (size_t i = 0; i < batch; ++i) - { for (size_t j = 0; j < tasks_kinds; ++j) - { - bool res = executor->trySchedule(std::make_shared(std::to_string(j), [](){})); - ASSERT_TRUE(res); - } - } + ASSERT_TRUE( + executor->trySchedule(std::make_shared(std::to_string(j))) + ); std::vector threads(batch); - for (auto & thread : threads) - thread = std::thread([&] () - { - for (size_t j = 0; j < tasks_kinds; ++j) - executor->removeTasksCorrespondingToStorage({"test", std::to_string(j)}); + auto remover_routine = [&] () + { + for (size_t j = 0; j < tasks_kinds; ++j) + executor->removeTasksCorrespondingToStorage({"test", std::to_string(j)}); + }; - }); + for (auto & thread : threads) + thread = std::thread(remover_routine); for (auto & thread : threads) thread.join(); - ASSERT_EQ(executor->active(), 0); - ASSERT_EQ(executor->pending(), 0); + ASSERT_EQ(executor->activeCount(), 0); + ASSERT_EQ(executor->pendingCount(), 0); + ASSERT_EQ(CurrentMetrics::values[CurrentMetrics::BackgroundPoolTask], 0); + + executor->wait(); +} + + +TEST(Executor, RemoveTasksStress) +{ + auto executor = DB::MergeTreeBackgroundExecutor::create(); + + const size_t tasks_kinds = 25; + const size_t batch = 100; + const size_t schedulers_count = 5; + const size_t removers_count = 5; + + executor->setThreadsCount([]() 
{ return tasks_kinds; }); + executor->setTasksCount([] () { return tasks_kinds * batch * (schedulers_count + removers_count); }); + executor->setMetric(CurrentMetrics::BackgroundPoolTask); + + std::barrier barrier(schedulers_count + removers_count); + + auto scheduler_routine = [&] () + { + barrier.arrive_and_wait(); + for (size_t i = 0; i < batch; ++i) + for (size_t j = 0; j < tasks_kinds; ++j) + executor->trySchedule(std::make_shared(std::to_string(j))); + }; + + auto remover_routine = [&] () + { + barrier.arrive_and_wait(); + for (size_t j = 0; j < tasks_kinds; ++j) + executor->removeTasksCorrespondingToStorage({"test", std::to_string(j)}); + }; + + std::vector schedulers(schedulers_count); + for (auto & scheduler : schedulers) + scheduler = std::thread(scheduler_routine); + + std::vector removers(removers_count); + for (auto & remover : removers) + remover = std::thread(remover_routine); + + for (auto & scheduler : schedulers) + scheduler.join(); + + for (auto & remover : removers) + remover.join(); + + for (size_t j = 0; j < tasks_kinds; ++j) + executor->removeTasksCorrespondingToStorage({"test", std::to_string(j)}); + + ASSERT_EQ(executor->activeCount(), 0); + ASSERT_EQ(executor->pendingCount(), 0); ASSERT_EQ(CurrentMetrics::values[CurrentMetrics::BackgroundPoolTask], 0); executor->wait(); - } From 33ebdb283c7026244385e9349d2d23b22f6d43b0 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 28 Jun 2021 12:54:48 +0300 Subject: [PATCH 051/177] Split ExpressionAnalyzer::analyzeAggregation --- src/Interpreters/ExpressionAnalyzer.cpp | 236 ++++++++++++------------ src/Interpreters/ExpressionAnalyzer.h | 7 +- 2 files changed, 124 insertions(+), 119 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 028362e392b..4b06f356338 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -151,6 +151,9 @@ ExpressionAnalyzer::ExpressionAnalyzer( /// Replaces global subqueries with the generated names of temporary tables that will be sent to remote servers. initGlobalSubqueriesAndExternalTables(do_global); + auto temp_actions = std::make_shared(sourceColumns()); + columns_after_array_join = getColumnsAfterArrayJoin(temp_actions, sourceColumns()); + columns_after_join = analyzeJoin(temp_actions, columns_after_array_join); /// has_aggregation, aggregation_keys, aggregate_descriptions, aggregated_columns. /// This analysis should be performed after processing global subqueries, because otherwise, /// if the aggregate function contains a global subquery, then `analyzeAggregation` method will save @@ -158,7 +161,7 @@ ExpressionAnalyzer::ExpressionAnalyzer( /// global subquery. Then, when you call `initGlobalSubqueriesAndExternalTables` method, this /// the global subquery will be replaced with a temporary table, resulting in aggregate_descriptions /// will contain out-of-date information, which will lead to an error when the query is executed. 
- analyzeAggregation(); + analyzeAggregation(temp_actions); } static ASTPtr checkPositionalArgument(ASTPtr argument, const ASTSelectQuery * select_query, ASTSelectQuery::Expression expression) @@ -192,7 +195,64 @@ static ASTPtr checkPositionalArgument(ASTPtr argument, const ASTSelectQuery * se return nullptr; } -void ExpressionAnalyzer::analyzeAggregation() +NamesAndTypesList ExpressionAnalyzer::getColumnsAfterArrayJoin(ActionsDAGPtr & actions, const NamesAndTypesList & src_columns) +{ + const auto * select_query = query->as(); + if (!select_query) + return {}; + + auto [array_join_expression_list, is_array_join_left] = select_query->arrayJoinExpressionList(); + + if (!array_join_expression_list) + return src_columns; + + getRootActionsNoMakeSet(array_join_expression_list, true, actions, false); + + auto array_join = addMultipleArrayJoinAction(actions, is_array_join_left); + auto sample_columns = actions->getResultColumns(); + array_join->prepare(sample_columns); + actions = std::make_shared(sample_columns); + + NamesAndTypesList new_columns_after_array_join; + NameSet added_columns; + + for (auto & column : actions->getResultColumns()) + { + if (syntax->array_join_result_to_source.count(column.name)) + { + new_columns_after_array_join.emplace_back(column.name, column.type); + added_columns.emplace(column.name); + } + } + + for (const auto & column : src_columns) + if (added_columns.count(column.name) == 0) + new_columns_after_array_join.emplace_back(column.name, column.type); + + return new_columns_after_array_join; +} + +NamesAndTypesList ExpressionAnalyzer::analyzeJoin(ActionsDAGPtr & actions, const NamesAndTypesList & src_columns) +{ + const auto * select_query = query->as(); + if (!select_query) + return {}; + + const ASTTablesInSelectQueryElement * join = select_query->join(); + if (join) + { + getRootActionsNoMakeSet(analyzedJoin().leftKeysList(), true, actions, false); + auto sample_columns = actions->getNamesAndTypesList(); + analyzedJoin().addJoinedColumnsAndCorrectTypes(sample_columns); + actions = std::make_shared(sample_columns); + } + + NamesAndTypesList result_columns = src_columns; + analyzedJoin().addJoinedColumnsAndCorrectTypes(result_columns, false); + return result_columns; +} + +void ExpressionAnalyzer::analyzeAggregation(ActionsDAGPtr & temp_actions) { /** Find aggregation keys (aggregation_keys), information about aggregate functions (aggregate_descriptions), * as well as a set of columns obtained after the aggregation, if any, @@ -203,146 +263,90 @@ void ExpressionAnalyzer::analyzeAggregation() auto * select_query = query->as(); - auto temp_actions = std::make_shared(sourceColumns()); + makeAggregateDescriptions(temp_actions, aggregate_descriptions); + has_aggregation = !aggregate_descriptions.empty() || (select_query && (select_query->groupBy() || select_query->having())); - if (select_query) + if (!has_aggregation) { - NamesAndTypesList array_join_columns; - columns_after_array_join = sourceColumns(); - - bool is_array_join_left; - if (ASTPtr array_join_expression_list = select_query->arrayJoinExpressionList(is_array_join_left)) - { - getRootActionsNoMakeSet(array_join_expression_list, true, temp_actions, false); - - auto array_join = addMultipleArrayJoinAction(temp_actions, is_array_join_left); - auto sample_columns = temp_actions->getResultColumns(); - array_join->prepare(sample_columns); - temp_actions = std::make_shared(sample_columns); - - NamesAndTypesList new_columns_after_array_join; - NameSet added_columns; - - for (auto & column : 
temp_actions->getResultColumns()) - { - if (syntax->array_join_result_to_source.count(column.name)) - { - new_columns_after_array_join.emplace_back(column.name, column.type); - added_columns.emplace(column.name); - } - } - - for (auto & column : columns_after_array_join) - if (added_columns.count(column.name) == 0) - new_columns_after_array_join.emplace_back(column.name, column.type); - - columns_after_array_join.swap(new_columns_after_array_join); - } - - columns_after_array_join.insert(columns_after_array_join.end(), array_join_columns.begin(), array_join_columns.end()); - - const ASTTablesInSelectQueryElement * join = select_query->join(); - if (join) - { - getRootActionsNoMakeSet(analyzedJoin().leftKeysList(), true, temp_actions, false); - auto sample_columns = temp_actions->getNamesAndTypesList(); - analyzedJoin().addJoinedColumnsAndCorrectTypes(sample_columns); - temp_actions = std::make_shared(sample_columns); - } - - columns_after_join = columns_after_array_join; - analyzedJoin().addJoinedColumnsAndCorrectTypes(columns_after_join, false); + aggregated_columns = temp_actions->getNamesAndTypesList(); + return; } - has_aggregation = makeAggregateDescriptions(temp_actions); - if (select_query && (select_query->groupBy() || select_query->having())) - has_aggregation = true; - - if (has_aggregation) + /// Find out aggregation keys. + if (select_query) { - /// Find out aggregation keys. - if (select_query) + if (ASTPtr group_by_ast = select_query->groupBy()) { - if (select_query->groupBy()) + NameSet unique_keys; + ASTs & group_asts = group_by_ast->children; + for (ssize_t i = 0; i < ssize_t(group_asts.size()); ++i) { - NameSet unique_keys; - ASTs & group_asts = select_query->groupBy()->children; + ssize_t size = group_asts.size(); + getRootActionsNoMakeSet(group_asts[i], true, temp_actions, false); - for (ssize_t i = 0; i < ssize_t(group_asts.size()); ++i) + if (getContext()->getSettingsRef().enable_positional_arguments) { - ssize_t size = group_asts.size(); - getRootActionsNoMakeSet(group_asts[i], true, temp_actions, false); + auto new_argument = checkPositionalArgument(group_asts[i], select_query, ASTSelectQuery::Expression::GROUP_BY); + if (new_argument) + group_asts[i] = new_argument; + } - if (getContext()->getSettingsRef().enable_positional_arguments) + const auto & column_name = group_asts[i]->getColumnName(); + const auto * node = temp_actions->tryFindInIndex(column_name); + if (!node) + throw Exception("Unknown identifier (in GROUP BY): " + column_name, ErrorCodes::UNKNOWN_IDENTIFIER); + + /// Only removes constant keys if it's an initiator or distributed_group_by_no_merge is enabled. + if (getContext()->getClientInfo().distributed_depth == 0 || settings.distributed_group_by_no_merge > 0) + { + /// Constant expressions have non-null column pointer at this stage. + if (node->column && isColumnConst(*node->column)) { - auto new_argument = checkPositionalArgument(group_asts[i], select_query, ASTSelectQuery::Expression::GROUP_BY); - if (new_argument) - group_asts[i] = new_argument; - } + select_query->group_by_with_constant_keys = true; - const auto & column_name = group_asts[i]->getColumnName(); - const auto * node = temp_actions->tryFindInIndex(column_name); - - if (!node) - throw Exception("Unknown identifier (in GROUP BY): " + column_name, ErrorCodes::UNKNOWN_IDENTIFIER); - - /// Only removes constant keys if it's an initiator or distributed_group_by_no_merge is enabled. 
- if (getContext()->getClientInfo().distributed_depth == 0 || settings.distributed_group_by_no_merge > 0) - { - /// Constant expressions have non-null column pointer at this stage. - if (node->column && isColumnConst(*node->column)) + /// But don't remove last key column if no aggregate functions, otherwise aggregation will not work. + if (!aggregate_descriptions.empty() || size > 1) { - select_query->group_by_with_constant_keys = true; + if (i + 1 < static_cast(size)) + group_asts[i] = std::move(group_asts.back()); - /// But don't remove last key column if no aggregate functions, otherwise aggregation will not work. - if (!aggregate_descriptions.empty() || size > 1) - { - if (i + 1 < static_cast(size)) - group_asts[i] = std::move(group_asts.back()); + group_asts.pop_back(); - group_asts.pop_back(); - - --i; - continue; - } + --i; + continue; } } - - NameAndTypePair key{column_name, node->result_type}; - - /// Aggregation keys are uniqued. - if (!unique_keys.count(key.name)) - { - unique_keys.insert(key.name); - aggregation_keys.push_back(key); - - /// Key is no longer needed, therefore we can save a little by moving it. - aggregated_columns.push_back(std::move(key)); - } } - if (group_asts.empty()) + NameAndTypePair key{column_name, node->result_type}; + + /// Aggregation keys are uniqued. + if (!unique_keys.count(key.name)) { - select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, {}); - has_aggregation = select_query->having() || !aggregate_descriptions.empty(); + unique_keys.insert(key.name); + aggregation_keys.push_back(key); + + /// Key is no longer needed, therefore we can save a little by moving it. + aggregated_columns.push_back(std::move(key)); } } + + if (group_asts.empty()) + { + select_query->setExpression(ASTSelectQuery::Expression::GROUP_BY, {}); + has_aggregation = select_query->having() || !aggregate_descriptions.empty(); + } } - else - aggregated_columns = temp_actions->getNamesAndTypesList(); /// Constant expressions are already removed during first 'analyze' run. /// So for second `analyze` information is taken from select_query. - if (select_query) - has_const_aggregation_keys = select_query->group_by_with_constant_keys; - - for (const auto & desc : aggregate_descriptions) - aggregated_columns.emplace_back(desc.column_name, desc.function->getReturnType()); + has_const_aggregation_keys = select_query->group_by_with_constant_keys; } else - { aggregated_columns = temp_actions->getNamesAndTypesList(); - } + + for (const auto & desc : aggregate_descriptions) + aggregated_columns.emplace_back(desc.column_name, desc.function->getReturnType()); } @@ -486,7 +490,7 @@ void ExpressionAnalyzer::getRootActionsForHaving(const ASTPtr & ast, bool no_sub } -bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions) +void ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions, AggregateDescriptions & descriptions) { for (const ASTFunction * node : aggregates()) { @@ -519,10 +523,8 @@ bool ExpressionAnalyzer::makeAggregateDescriptions(ActionsDAGPtr & actions) aggregate.parameters = (node->parameters) ? 
getAggregateFunctionParametersArray(node->parameters, "", getContext()) : Array(); aggregate.function = AggregateFunctionFactory::instance().get(node->name, types, aggregate.parameters, properties); - aggregate_descriptions.push_back(aggregate); + descriptions.push_back(aggregate); } - - return !aggregates().empty(); } void makeWindowDescriptionFromAST(const Context & context, diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index fdcdcd51cda..14f0e0023dd 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -188,12 +188,15 @@ protected: * or after all the actions that are normally performed before aggregation. * Set has_aggregation = true if there is GROUP BY or at least one aggregate function. */ - void analyzeAggregation(); - bool makeAggregateDescriptions(ActionsDAGPtr & actions); + void analyzeAggregation(ActionsDAGPtr & temp_actions); + void makeAggregateDescriptions(ActionsDAGPtr & actions, AggregateDescriptions & descriptions); const ASTSelectQuery * getSelectQuery() const; bool isRemoteStorage() const { return syntax->is_remote_storage; } + + NamesAndTypesList getColumnsAfterArrayJoin(ActionsDAGPtr & actions, const NamesAndTypesList & src_columns); + NamesAndTypesList analyzeJoin(ActionsDAGPtr & actions, const NamesAndTypesList & src_columns); }; class SelectQueryExpressionAnalyzer; From c4ffc2879f55d61032319c0c8a664defbbfb67b8 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 28 Jun 2021 16:44:19 +0300 Subject: [PATCH 052/177] Perform join type inference for USING at ExpressionAnalyzer instead of TreeRewriter --- src/Interpreters/ExpressionActions.cpp | 2 +- src/Interpreters/ExpressionAnalyzer.cpp | 17 ++- src/Interpreters/TableJoin.cpp | 111 ++++++++++-------- src/Interpreters/TableJoin.h | 19 +-- src/Interpreters/TreeRewriter.cpp | 8 -- .../0_stateless/00561_storage_join.sql | 19 +-- .../00855_join_with_array_join.reference | 9 ++ .../00855_join_with_array_join.sql | 16 +++ 8 files changed, 114 insertions(+), 87 deletions(-) diff --git a/src/Interpreters/ExpressionActions.cpp b/src/Interpreters/ExpressionActions.cpp index f3c51487a05..96da6e54802 100644 --- a/src/Interpreters/ExpressionActions.cpp +++ b/src/Interpreters/ExpressionActions.cpp @@ -1042,7 +1042,7 @@ ExpressionActionsChain::JoinStep::JoinStep( required_columns.emplace_back(column.name, column.type); NamesAndTypesList result_names_and_types = required_columns; - analyzed_join->addJoinedColumnsAndCorrectTypes(result_names_and_types); + analyzed_join->addJoinedColumnsAndCorrectTypes(result_names_and_types, true); for (const auto & [name, type] : result_names_and_types) /// `column` is `nullptr` because we don't care on constness here, it may be changed in join result_columns.emplace_back(nullptr, type, name); diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 4b06f356338..2e7f785e665 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -243,12 +243,12 @@ NamesAndTypesList ExpressionAnalyzer::analyzeJoin(ActionsDAGPtr & actions, const { getRootActionsNoMakeSet(analyzedJoin().leftKeysList(), true, actions, false); auto sample_columns = actions->getNamesAndTypesList(); - analyzedJoin().addJoinedColumnsAndCorrectTypes(sample_columns); + syntax->analyzed_join->addJoinedColumnsAndCorrectTypes(sample_columns, true); actions = std::make_shared(sample_columns); } NamesAndTypesList result_columns = src_columns; - 
analyzedJoin().addJoinedColumnsAndCorrectTypes(result_columns, false); + syntax->analyzed_join->addJoinedColumnsAndCorrectTypes(result_columns,false); return result_columns; } @@ -837,9 +837,9 @@ JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain const ColumnsWithTypeAndName & left_sample_columns = chain.getLastStep().getResultColumns(); JoinPtr table_join = makeTableJoin(*syntax->ast_join, left_sample_columns); - if (syntax->analyzed_join->needConvert()) + if (auto left_actions = syntax->analyzed_join->leftConvertingActions()) { - chain.steps.push_back(std::make_unique(syntax->analyzed_join->leftConvertingActions())); + chain.steps.push_back(std::make_unique(left_actions)); chain.addStep(); } @@ -958,8 +958,10 @@ std::unique_ptr buildJoinedPlan( joined_plan->addStep(std::move(joined_actions_step)); const ColumnsWithTypeAndName & right_sample_columns = joined_plan->getCurrentDataStream().header.getColumnsWithTypeAndName(); - bool need_convert = analyzed_join.applyJoinKeyConvert(left_sample_columns, right_sample_columns); - if (need_convert) + + analyzed_join.createConvertingActions(left_sample_columns, right_sample_columns); + + if (auto right_actions = analyzed_join.rightConvertingActions()) { auto converting_step = std::make_unique(joined_plan->getCurrentDataStream(), analyzed_join.rightConvertingActions()); converting_step->setStepDescription("Convert joined columns"); @@ -981,7 +983,10 @@ JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( JoinPtr join = tryGetStorageJoin(syntax->analyzed_join); if (join) + { + syntax->analyzed_join->createConvertingActions(left_sample_columns, {}); return join; + } joined_plan = buildJoinedPlan(getContext(), join_element, left_sample_columns, *syntax->analyzed_join, query_options); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index bbcfb56dbc6..39bb552b4e7 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -233,36 +233,54 @@ bool TableJoin::rightBecomeNullable(const DataTypePtr & column_type) const void TableJoin::addJoinedColumn(const NameAndTypePair & joined_column) { - DataTypePtr type = joined_column.type; - - if (hasUsing()) - { - if (auto it = right_type_map.find(joined_column.name); it != right_type_map.end()) - type = it->second; - } - - if (rightBecomeNullable(type)) - type = JoinCommon::convertTypeToNullable(type); - - columns_added_by_join.emplace_back(joined_column.name, type); + columns_added_by_join.emplace_back(joined_column); } -void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability) const +NamesAndTypesList TableJoin::correctedColumnsAddedByJoin() const { - for (auto & col : names_and_types) + NamesAndTypesList result; + for (const auto & col : columns_added_by_join) + { + DataTypePtr type = col.type; + if (hasUsing()) + { + if (auto it = right_type_map.find(col.name); it != right_type_map.end()) + type = it->second; + } + + if (rightBecomeNullable(type)) + type = JoinCommon::convertTypeToNullable(type); + result.emplace_back(col.name, type); + } + + return result; +} + +void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & left_columns, bool correct_nullability) +{ + for (auto & col : left_columns) { if (hasUsing()) { + /* + * Join with `USING` semantic allows to have columns with changed types in result table. + * But `JOIN ON` should preserve types from original table. + * So we need to know changed types in result tables before further analysis (e.g. 
analyzeAggregation) + * For `JOIN ON expr1 == expr2` we will infer common type later in makeTableJoin, + * when part of plan built and types of expression will be known. + */ + inferJoinKeyCommonType(left_columns, columns_from_joined_table, joined_storage != nullptr); + if (auto it = left_type_map.find(col.name); it != left_type_map.end()) col.type = it->second; } + if (correct_nullability && leftBecomeNullable(col.type)) col.type = JoinCommon::convertTypeToNullable(col.type); } - /// Types in columns_added_by_join already converted and set nullable if needed - for (const auto & col : columns_added_by_join) - names_and_types.emplace_back(col.name, col.type); + for (const auto & col : correctedColumnsAddedByJoin()) + left_columns.emplace_back(col.name, col.type); } bool TableJoin::sameStrictnessAndKind(ASTTableJoin::Strictness strictness_, ASTTableJoin::Kind kind_) const @@ -335,51 +353,30 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc return true; } -bool TableJoin::applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns) +bool TableJoin::createConvertingActions(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns) { + bool need_convert = false; + need_convert = inferJoinKeyCommonType(left_sample_columns, right_sample_columns, joined_storage != nullptr); - auto to_name_type_list = [](const ColumnsWithTypeAndName & columns) - { - NamesAndTypesList name_type_list; - for (const auto & col : columns) - name_type_list.emplace_back(col.name, col.type); - return name_type_list; - }; - - bool need_convert = needConvert(); - if (!need_convert && joined_storage == nullptr) - { - need_convert = inferJoinKeyCommonType( - to_name_type_list(left_sample_columns), - to_name_type_list(right_sample_columns), - true); - } - - if (need_convert) - { - left_converting_actions = applyKeyConvertToTable(left_sample_columns, left_type_map, key_names_left); - right_converting_actions = applyKeyConvertToTable(right_sample_columns, right_type_map, key_names_right); - } + left_converting_actions = applyKeyConvertToTable(left_sample_columns, left_type_map, key_names_left); + right_converting_actions = applyKeyConvertToTable(right_sample_columns, right_type_map, key_names_right); return need_convert; } -bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right, bool to_supertype) +template +bool TableJoin::inferJoinKeyCommonType(const LeftNamesAndTypes & left, const RightNamesAndTypes & right, bool allow_right) { + if (!left_type_map.empty() || !right_type_map.empty()) + return true; + NameToTypeMap left_types; for (const auto & col : left) - { left_types[col.name] = col.type; - } NameToTypeMap right_types; for (const auto & col : right) - { - if (auto it = renames.find(col.name); it != renames.end()) - right_types[it->second] = col.type; - else - right_types[col.name] = col.type; - } + right_types[renamedRightColumnName(col.name)] = col.type; for (size_t i = 0; i < key_names_left.size(); ++i) { @@ -396,8 +393,8 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam if (JoinCommon::typesEqualUpToNullability(ltype->second, rtype->second)) continue; - auto common_type = to_supertype ? 
DB::getLeastSupertype({ltype->second, rtype->second}, false) - : DB::getMostSubtype({ltype->second, rtype->second}, false); + /// TODO(vdimir): use getMostSubtype if possible + auto common_type = DB::getLeastSupertype({ltype->second, rtype->second}, false); if (common_type == nullptr || isNothing(common_type)) { LOG_DEBUG(&Poco::Logger::get("TableJoin"), @@ -407,6 +404,13 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam continue; } + if (!allow_right && !common_type->equals(*rtype->second)) + { + LOG_DEBUG(&Poco::Logger::get("TableJoin"), + "Can't change type for right table: {}: {} -> {}.", + key_names_right[i], rtype->second->getName(), common_type->getName()); + continue; + } left_type_map[key_names_left[i]] = right_type_map[key_names_right[i]] = common_type; } @@ -425,15 +429,20 @@ bool TableJoin::inferJoinKeyCommonType(const NamesAndTypesList & left, const Nam ActionsDAGPtr TableJoin::applyKeyConvertToTable( const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) const { + bool has_some_to_do = false; + ColumnsWithTypeAndName cols_dst = cols_src; for (auto & col : cols_dst) { if (auto it = type_mapping.find(col.name); it != type_mapping.end()) { + has_some_to_do = true; col.type = it->second; col.column = nullptr; } } + if (!has_some_to_do) + return nullptr; NameToNameMap key_column_rename; /// Returns converting actions for tables that need to be performed before join diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index d766cad3cb8..081b9638ca0 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -86,7 +86,8 @@ private: /// All columns which can be read from joined table. Duplicating names are qualified. NamesAndTypesList columns_from_joined_table; /// Columns will be added to block by JOIN. - /// It's a subset of columns_from_joined_table with corrected Nullability and type (if inplace type conversion is required) + /// It's a subset of columns_from_joined_table + /// Note: without corrected Nullability or type, see correctedColumnsAddedByJoin NamesAndTypesList columns_added_by_join; /// Target type to convert key columns before join @@ -109,6 +110,12 @@ private: ActionsDAGPtr applyKeyConvertToTable( const ColumnsWithTypeAndName & cols_src, const NameToTypeMap & type_mapping, Names & names_to_rename) const; + /// Calculates common supertypes for corresponding join key columns. + template + bool inferJoinKeyCommonType(const LeftNamesAndTypes & left, const RightNamesAndTypes & right, bool to_supertype); + + NamesAndTypesList correctedColumnsAddedByJoin() const; + public: TableJoin() = default; TableJoin(const Settings &, VolumePtr tmp_volume); @@ -190,17 +197,12 @@ public: bool rightBecomeNullable(const DataTypePtr & column_type) const; void addJoinedColumn(const NameAndTypePair & joined_column); - void addJoinedColumnsAndCorrectTypes(NamesAndTypesList & names_and_types, bool correct_nullability = true) const; - - /// Calculates common supertypes for corresponding join key columns. - bool inferJoinKeyCommonType(const NamesAndTypesList & left, const NamesAndTypesList & right, bool to_supertype); + void addJoinedColumnsAndCorrectTypes(NamesAndTypesList & left_columns, bool correct_nullability); /// Calculate converting actions, rename key columns in required /// For `USING` join we will convert key columns inplace and affect into types in the result table /// For `JOIN ON` we will create new columns with converted keys to join by. 
- bool applyJoinKeyConvert(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns); - - bool needConvert() const { return !left_type_map.empty(); } + bool createConvertingActions(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns); /// Key columns should be converted before join. ActionsDAGPtr leftConvertingActions() const { return left_converting_actions; } @@ -215,6 +217,7 @@ public: const Names & keyNamesLeft() const { return key_names_left; } const Names & keyNamesRight() const { return key_names_right; } const NamesAndTypesList & columnsFromJoinedTable() const { return columns_from_joined_table; } + Names columnsAddedByJoin() const { Names res; diff --git a/src/Interpreters/TreeRewriter.cpp b/src/Interpreters/TreeRewriter.cpp index 073b38eae57..51df6cd022b 100644 --- a/src/Interpreters/TreeRewriter.cpp +++ b/src/Interpreters/TreeRewriter.cpp @@ -517,14 +517,6 @@ void collectJoinedColumns(TableJoin & analyzed_join, const ASTTableJoin & table_ const auto & keys = table_join.using_expression_list->as(); for (const auto & key : keys.children) analyzed_join.addUsingKey(key); - - /// `USING` semantic allows to have columns with changed types in result table. - /// `JOIN ON` should preserve types from original table - /// We can infer common type on syntax stage for `USING` because join is performed only by columns (not expressions) - /// We need to know changed types in result tables because some analysis (e.g. analyzeAggregation) performed before join - /// For `JOIN ON expr1 == expr2` we will infer common type later in ExpressionAnalyzer, when types of expression will be known - if (analyzed_join.joined_storage == nullptr) - analyzed_join.inferJoinKeyCommonType(tables[0].columns, tables[1].columns, true); } else if (table_join.on_expression) { diff --git a/tests/queries/0_stateless/00561_storage_join.sql b/tests/queries/0_stateless/00561_storage_join.sql index 01e66d0c380..fa5b7a9493e 100644 --- a/tests/queries/0_stateless/00561_storage_join.sql +++ b/tests/queries/0_stateless/00561_storage_join.sql @@ -25,24 +25,17 @@ insert into joinbug_join (id, id2, val, val2, created) select id, id2, val, val2, created from joinbug; -/* expected */ -select * -from joinbug; +select * from joinbug; -/* wtf */ select id, id2, val, val2, created -from ( - SELECT toUInt64(arrayJoin(range(50))) AS id2 -) js1 +from ( SELECT toUInt64(arrayJoin(range(50))) AS id2 ) js1 SEMI LEFT JOIN joinbug_join using id2; -/* type conversion */ -SELECT * FROM -( - SELECT toUInt32(11) AS id2 -) AS js1 -SEMI LEFT JOIN joinbug_join USING (id2); +-- type conversion +SELECT * FROM ( SELECT toUInt32(11) AS id2 ) AS js1 SEMI LEFT JOIN joinbug_join USING (id2); +-- can't convert right side in case on storage join +SELECT * FROM ( SELECT toInt64(11) AS id2 ) AS js1 LEFT JOIN joinbug_join USING (id2); -- { serverError 53 } DROP TABLE joinbug; DROP TABLE joinbug_join; diff --git a/tests/queries/0_stateless/00855_join_with_array_join.reference b/tests/queries/0_stateless/00855_join_with_array_join.reference index 88f9253500c..acb0240fbd2 100644 --- a/tests/queries/0_stateless/00855_join_with_array_join.reference +++ b/tests/queries/0_stateless/00855_join_with_array_join.reference @@ -9,3 +9,12 @@ 2 2 a2 1 0 2 2 a2 +0 0 +0 0 +0 0 +0 0 +0 +0 +0 +0 +0 Int64 Int64 diff --git a/tests/queries/0_stateless/00855_join_with_array_join.sql b/tests/queries/0_stateless/00855_join_with_array_join.sql index 506d9479110..642f90d4c6c 100644 --- 
a/tests/queries/0_stateless/00855_join_with_array_join.sql +++ b/tests/queries/0_stateless/00855_join_with_array_join.sql @@ -30,6 +30,22 @@ SELECT did, id, name FROM f LEFT ARRAY JOIN d_ids as did LEFT JOIN d ON d.id = d -- name clash, doesn't work yet SELECT id, name FROM f LEFT ARRAY JOIN d_ids as id LEFT JOIN d ON d.id = id ORDER BY id; -- { serverError 403 } +SELECT * FROM ( SELECT [dummy, dummy] AS dummy FROM system.one ) AS x ARRAY JOIN dummy +JOIN system.one AS y ON x.dummy == y.dummy; + +SELECT * FROM ( SELECT [dummy, dummy] AS dummy FROM system.one ) AS x ARRAY JOIN dummy +JOIN system.one AS y ON x.dummy + 1 == y.dummy + 1; + +SELECT * FROM ( SELECT [dummy, dummy] AS dummy FROM system.one ) AS x ARRAY JOIN dummy +JOIN system.one AS y USING dummy; + +SELECT * FROM ( SELECT [toUInt32(dummy), toUInt32(dummy)] AS dummy FROM system.one ) AS x ARRAY JOIN dummy +JOIN (select toInt32(dummy) as dummy from system.one ) AS y USING dummy; + +SELECT dummy > 0, toTypeName(any(dummy)), any(toTypeName(dummy)) +FROM ( SELECT [toUInt32(dummy), toUInt32(dummy)] AS dummy FROM system.one ) AS x ARRAY JOIN dummy +JOIN ( SELECT toInt32(dummy) AS dummy FROM system.one ) AS y USING dummy GROUP BY (dummy > 0); + DROP TABLE IF EXISTS f; DROP TABLE IF EXISTS d; From 13bf141e1d18b6fb29191ed103e5342339468ba7 Mon Sep 17 00:00:00 2001 From: vdimir Date: Mon, 28 Jun 2021 17:12:15 +0300 Subject: [PATCH 053/177] Fix storage_join type inference --- src/Interpreters/TableJoin.cpp | 4 ++-- tests/queries/0_stateless/00561_storage_join.sql | 2 +- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index 39bb552b4e7..e8b9641d815 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -7,13 +7,13 @@ #include #include -#include #include #include #include #include +#include namespace DB { @@ -356,7 +356,7 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc bool TableJoin::createConvertingActions(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns) { bool need_convert = false; - need_convert = inferJoinKeyCommonType(left_sample_columns, right_sample_columns, joined_storage != nullptr); + need_convert = inferJoinKeyCommonType(left_sample_columns, right_sample_columns, joined_storage == nullptr); left_converting_actions = applyKeyConvertToTable(left_sample_columns, left_type_map, key_names_left); right_converting_actions = applyKeyConvertToTable(right_sample_columns, right_type_map, key_names_right); diff --git a/tests/queries/0_stateless/00561_storage_join.sql b/tests/queries/0_stateless/00561_storage_join.sql index fa5b7a9493e..ad4fab09c20 100644 --- a/tests/queries/0_stateless/00561_storage_join.sql +++ b/tests/queries/0_stateless/00561_storage_join.sql @@ -35,7 +35,7 @@ SEMI LEFT JOIN joinbug_join using id2; SELECT * FROM ( SELECT toUInt32(11) AS id2 ) AS js1 SEMI LEFT JOIN joinbug_join USING (id2); -- can't convert right side in case on storage join -SELECT * FROM ( SELECT toInt64(11) AS id2 ) AS js1 LEFT JOIN joinbug_join USING (id2); -- { serverError 53 } +SELECT * FROM ( SELECT toInt64(11) AS id2 ) AS js1 SEMI LEFT JOIN joinbug_join USING (id2); -- { serverError 53 } DROP TABLE joinbug; DROP TABLE joinbug_join; From ed8c156190b7103acdc332702cf824e526122983 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 29 Jun 2021 12:22:53 +0300 Subject: [PATCH 054/177] Clean up TableJoin storage join --- src/Interpreters/ExpressionAnalyzer.cpp | 60 
+++----------------- src/Interpreters/HashJoin.cpp | 5 +- src/Interpreters/JoinedTables.cpp | 13 +++-- src/Interpreters/TableJoin.cpp | 73 +++++++++++++++++++++++-- src/Interpreters/TableJoin.h | 21 +++++-- 5 files changed, 104 insertions(+), 68 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 2e7f785e665..5d3b52372d6 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -850,14 +850,6 @@ JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain return table_join; } -static JoinPtr tryGetStorageJoin(std::shared_ptr analyzed_join) -{ - if (auto * table = analyzed_join->joined_storage.get()) - if (auto * storage_join = dynamic_cast(table)) - return storage_join->getJoinLocked(analyzed_join); - return {}; -} - static ActionsDAGPtr createJoinedBlockActions(ContextPtr context, const TableJoin & analyzed_join) { ASTPtr expression_list = analyzed_join.rightKeysList(); @@ -865,44 +857,13 @@ static ActionsDAGPtr createJoinedBlockActions(ContextPtr context, const TableJoi return ExpressionAnalyzer(expression_list, syntax_result, context).getActionsDAG(true, false); } -static bool allowDictJoin(StoragePtr joined_storage, ContextPtr context, String & dict_name, String & key_name) +static std::shared_ptr chooseJoinAlgorithm(std::shared_ptr analyzed_join, const Block & sample_block, ContextPtr context) { - if (!joined_storage->isDictionary()) - return false; - - StorageDictionary & storage_dictionary = static_cast(*joined_storage); - dict_name = storage_dictionary.getDictionaryName(); - auto dictionary = context->getExternalDictionariesLoader().getDictionary(dict_name, context); - if (!dictionary) - return false; - - const DictionaryStructure & structure = dictionary->getStructure(); - if (structure.id) - { - key_name = structure.id->name; - return true; - } - return false; -} - -static std::shared_ptr makeJoin(std::shared_ptr analyzed_join, const Block & sample_block, ContextPtr context) -{ - bool allow_merge_join = analyzed_join->allowMergeJoin(); - /// HashJoin with Dictionary optimisation - String dict_name; - String key_name; - if (analyzed_join->joined_storage && allowDictJoin(analyzed_join->joined_storage, context, dict_name, key_name)) - { - Names original_names; - NamesAndTypesList result_columns; - if (analyzed_join->allowDictJoin(key_name, sample_block, original_names, result_columns)) - { - analyzed_join->dictionary_reader = std::make_shared(dict_name, original_names, result_columns, context); - return std::make_shared(analyzed_join, sample_block); - } - } + if (analyzed_join->tryInitDictJoin(sample_block, context)) + return std::make_shared(analyzed_join, sample_block); + bool allow_merge_join = analyzed_join->allowMergeJoin(); if (analyzed_join->forceHashJoin() || (analyzed_join->preferMergeJoin() && !allow_merge_join)) return std::make_shared(analyzed_join, sample_block); else if (analyzed_join->forceMergeJoin() || (analyzed_join->preferMergeJoin() && allow_merge_join)) @@ -963,7 +924,7 @@ std::unique_ptr buildJoinedPlan( if (auto right_actions = analyzed_join.rightConvertingActions()) { - auto converting_step = std::make_unique(joined_plan->getCurrentDataStream(), analyzed_join.rightConvertingActions()); + auto converting_step = std::make_unique(joined_plan->getCurrentDataStream(), right_actions); converting_step->setStepDescription("Convert joined columns"); joined_plan->addStep(std::move(converting_step)); } @@ -979,21 +940,18 @@ JoinPtr 
SelectQueryExpressionAnalyzer::makeTableJoin( if (joined_plan) throw Exception(ErrorCodes::LOGICAL_ERROR, "Table join was already created for query"); - /// Use StorageJoin if any. - JoinPtr join = tryGetStorageJoin(syntax->analyzed_join); - - if (join) + if (auto storage = syntax->analyzed_join->getStorageJoin()) { syntax->analyzed_join->createConvertingActions(left_sample_columns, {}); - return join; + return storage->getJoinLocked(syntax->analyzed_join); } joined_plan = buildJoinedPlan(getContext(), join_element, left_sample_columns, *syntax->analyzed_join, query_options); - join = makeJoin(syntax->analyzed_join, joined_plan->getCurrentDataStream().header, getContext()); + JoinPtr join = chooseJoinAlgorithm(syntax->analyzed_join, joined_plan->getCurrentDataStream().header, getContext()); /// Do not make subquery for join over dictionary. - if (syntax->analyzed_join->dictionary_reader) + if (syntax->analyzed_join->getDictionaryReader()) joined_plan.reset(); return join; diff --git a/src/Interpreters/HashJoin.cpp b/src/Interpreters/HashJoin.cpp index 46f8c9aac79..1c450d531d6 100644 --- a/src/Interpreters/HashJoin.cpp +++ b/src/Interpreters/HashJoin.cpp @@ -211,7 +211,7 @@ HashJoin::HashJoin(std::shared_ptr table_join_, const Block & right_s if (nullable_right_side) JoinCommon::convertColumnsToNullable(sample_block_with_columns_to_add); - if (table_join->dictionary_reader) + if (table_join->getDictionaryReader()) { LOG_DEBUG(log, "Performing join over dict"); data->type = Type::DICT; @@ -331,7 +331,8 @@ public: KeyGetterForDict(const TableJoin & table_join, const ColumnRawPtrs & key_columns) { - table_join.dictionary_reader->readKeys(*key_columns[0], read_result, found, positions); + assert(table_join.getDictionaryReader()); + table_join.getDictionaryReader()->readKeys(*key_columns[0], read_result, found, positions); for (ColumnWithTypeAndName & column : read_result) if (table_join.rightBecomeNullable(column.type)) diff --git a/src/Interpreters/JoinedTables.cpp b/src/Interpreters/JoinedTables.cpp index 099fb5c2f44..271d7371425 100644 --- a/src/Interpreters/JoinedTables.cpp +++ b/src/Interpreters/JoinedTables.cpp @@ -299,16 +299,17 @@ std::shared_ptr JoinedTables::makeTableJoin(const ASTSelectQuery & se if (table_to_join.database_and_table_name) { auto joined_table_id = context->resolveStorageID(table_to_join.database_and_table_name); - StoragePtr table = DatabaseCatalog::instance().tryGetTable(joined_table_id, context); - if (table) + StoragePtr storage = DatabaseCatalog::instance().tryGetTable(joined_table_id, context); + if (storage) { - if (dynamic_cast(table.get()) || - dynamic_cast(table.get())) - table_join->joined_storage = table; + if (auto storage_join = std::dynamic_pointer_cast(storage); storage_join) + table_join->setStorageJoin(storage_join); + else if (auto storage_dict = std::dynamic_pointer_cast(storage); storage_dict) + table_join->setStorageJoin(storage_dict); } } - if (!table_join->joined_storage && + if (!table_join->isSpecialStorage() && settings.enable_optimize_predicate_expression) replaceJoinedTable(select_query); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index e8b9641d815..a0f227faf78 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -1,5 +1,6 @@ #include + #include #include @@ -8,12 +9,23 @@ #include +#include + +#include +#include +#include + +#include #include #include #include -#include #include +#include +#include + +#include + namespace DB { @@ -21,6 +33,7 @@ namespace DB namespace 
ErrorCodes { extern const int TYPE_MISMATCH; + extern const int LOGICAL_ERROR; } namespace @@ -269,7 +282,7 @@ void TableJoin::addJoinedColumnsAndCorrectTypes(NamesAndTypesList & left_columns * For `JOIN ON expr1 == expr2` we will infer common type later in makeTableJoin, * when part of plan built and types of expression will be known. */ - inferJoinKeyCommonType(left_columns, columns_from_joined_table, joined_storage != nullptr); + inferJoinKeyCommonType(left_columns, columns_from_joined_table, !isSpecialStorage()); if (auto it = left_type_map.find(col.name); it != left_type_map.end()) col.type = it->second; @@ -318,7 +331,18 @@ bool TableJoin::needStreamWithNonJoinedRows() const return isRightOrFull(kind()); } -bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_block, Names & src_names, NamesAndTypesList & dst_columns) const +static std::optional getDictKeyName(const String & dict_name , ContextPtr context) +{ + auto dictionary = context->getExternalDictionariesLoader().getDictionary(dict_name, context); + if (!dictionary) + return {}; + + if (const auto & structure = dictionary->getStructure(); structure.id) + return structure.id->name; + return {}; +} + +bool TableJoin::tryInitDictJoin(const Block & sample_block, ContextPtr context) { /// Support ALL INNER, [ANY | ALL | SEMI | ANTI] LEFT if (!isLeft(kind()) && !(isInner(kind()) && strictness() == ASTTableJoin::Strictness::All)) @@ -333,9 +357,17 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc if (it_key == original_names.end()) return false; - if (dict_key != it_key->second) + if (!right_storage_dictionary) + return false; + + auto dict_name = right_storage_dictionary->getName(); + + auto dict_key = getDictKeyName(dict_name, context); + if (!dict_key.has_value() || *dict_key != it_key->second) return false; /// JOIN key != Dictionary key + Names src_names; + NamesAndTypesList dst_columns; for (const auto & col : sample_block) { if (col.name == right_keys[0]) @@ -349,6 +381,7 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc dst_columns.push_back({col.name, col.type}); } } + dictionary_reader = std::make_shared(dict_name, src_names, dst_columns, context); return true; } @@ -356,7 +389,7 @@ bool TableJoin::allowDictJoin(const String & dict_key, const Block & sample_bloc bool TableJoin::createConvertingActions(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns) { bool need_convert = false; - need_convert = inferJoinKeyCommonType(left_sample_columns, right_sample_columns, joined_storage == nullptr); + need_convert = inferJoinKeyCommonType(left_sample_columns, right_sample_columns, !isSpecialStorage()); left_converting_actions = applyKeyConvertToTable(left_sample_columns, left_type_map, key_names_left); right_converting_actions = applyKeyConvertToTable(right_sample_columns, right_type_map, key_names_right); @@ -458,6 +491,26 @@ ActionsDAGPtr TableJoin::applyKeyConvertToTable( return dag; } + +void TableJoin::setStorageJoin(std::shared_ptr storage) +{ + if (right_storage_dictionary) + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "StorageJoin and Dictionary join are mutually exclusive"); + right_storage_join = storage; +} + +void TableJoin::setStorageJoin(std::shared_ptr storage) +{ + if (right_storage_join) + throw DB::Exception(ErrorCodes::LOGICAL_ERROR, "StorageJoin and Dictionary join are mutually exclusive"); + right_storage_dictionary = storage; +} + +std::shared_ptr 
TableJoin::getStorageJoin() +{ + return right_storage_join; +} + String TableJoin::renamedRightColumnName(const String & name) const { if (const auto it = renames.find(name); it != renames.end()) @@ -527,4 +580,14 @@ std::pair TableJoin::joinConditionColumnNames() const return res; } +bool TableJoin::isSpecialStorage() const +{ + return right_storage_dictionary || right_storage_join; +} + +const DictionaryReader * TableJoin::getDictionaryReader() const +{ + return dictionary_reader.get(); +} + } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 081b9638ca0..5a562280f38 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -24,6 +24,8 @@ class ASTSelectQuery; struct DatabaseAndTableWithAlias; class Block; class DictionaryReader; +class StorageJoin; +class StorageDictionary; struct ColumnWithTypeAndName; using ColumnsWithTypeAndName = std::vector; @@ -104,6 +106,11 @@ private: VolumePtr tmp_volume; + std::shared_ptr right_storage_join; + + std::shared_ptr right_storage_dictionary; + std::shared_ptr dictionary_reader; + Names requiredJoinedNames() const; /// Create converting actions and change key column names if required @@ -133,16 +140,12 @@ public: table_join.strictness = strictness; } - StoragePtr joined_storage; - std::shared_ptr dictionary_reader; - ASTTableJoin::Kind kind() const { return table_join.kind; } ASTTableJoin::Strictness strictness() const { return table_join.strictness; } bool sameStrictnessAndKind(ASTTableJoin::Strictness, ASTTableJoin::Kind) const; const SizeLimits & sizeLimits() const { return size_limits; } VolumePtr getTemporaryVolume() { return tmp_volume; } bool allowMergeJoin() const; - bool allowDictJoin(const String & dict_key, const Block & sample_block, Names &, NamesAndTypesList &) const; bool preferMergeJoin() const { return join_algorithm == JoinAlgorithm::PREFER_PARTIAL_MERGE; } bool forceMergeJoin() const { return join_algorithm == JoinAlgorithm::PARTIAL_MERGE; } bool forceHashJoin() const @@ -233,6 +236,16 @@ public: String renamedRightColumnName(const String & name) const; std::unordered_map leftToRightKeyRemap() const; + + void setStorageJoin(std::shared_ptr storage); + void setStorageJoin(std::shared_ptr storage); + + std::shared_ptr getStorageJoin(); + + bool tryInitDictJoin(const Block & sample_block, ContextPtr context); + + bool isSpecialStorage() const; + const DictionaryReader * getDictionaryReader() const; }; } From df873866c9bf9e739f9cc75bd1bf3619023caca1 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 29 Jun 2021 12:52:19 +0300 Subject: [PATCH 055/177] Don't store converting actions in TableJoin --- src/Interpreters/ExpressionAnalyzer.cpp | 54 +++++++++++++------------ src/Interpreters/ExpressionAnalyzer.h | 6 ++- src/Interpreters/TableJoin.cpp | 30 +++----------- src/Interpreters/TableJoin.h | 16 +++----- 4 files changed, 43 insertions(+), 63 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 5d3b52372d6..33dc919d035 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -832,14 +832,14 @@ bool SelectQueryExpressionAnalyzer::appendJoinLeftKeys(ExpressionActionsChain & return true; } -JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain) +JoinPtr SelectQueryExpressionAnalyzer::appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & converting_join_columns) { const ColumnsWithTypeAndName & left_sample_columns = chain.getLastStep().getResultColumns(); 
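/// A minimal sketch of the new appendJoin() contract introduced by the hunk below: the
/// left-side converting actions are handed back through an out-parameter instead of being
/// stored in TableJoin. Names are assumed from this patch; this is not an additional call site.
ActionsDAGPtr converting_join_columns;
JoinPtr join = query_analyzer.appendJoin(chain, converting_join_columns);
/// When converting_join_columns is non-null, appendJoin has already pushed an
/// ExpressionActionsChain step that applies it to the left-side columns.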
- JoinPtr table_join = makeTableJoin(*syntax->ast_join, left_sample_columns); + JoinPtr table_join = makeTableJoin(*syntax->ast_join, left_sample_columns, converting_join_columns); - if (auto left_actions = syntax->analyzed_join->leftConvertingActions()) + if (converting_join_columns) { - chain.steps.push_back(std::make_unique(left_actions)); + chain.steps.push_back(std::make_unique(converting_join_columns)); chain.addStep(); } @@ -871,10 +871,9 @@ static std::shared_ptr chooseJoinAlgorithm(std::shared_ptr ana return std::make_shared(analyzed_join, sample_block); } -std::unique_ptr buildJoinedPlan( +static std::unique_ptr buildJoinedPlan( ContextPtr context, const ASTTablesInSelectQueryElement & join_element, - const ColumnsWithTypeAndName & left_sample_columns, TableJoin & analyzed_join, SelectQueryOptions query_options) { @@ -918,40 +917,44 @@ std::unique_ptr buildJoinedPlan( joined_actions_step->setStepDescription("Joined actions"); joined_plan->addStep(std::move(joined_actions_step)); - const ColumnsWithTypeAndName & right_sample_columns = joined_plan->getCurrentDataStream().header.getColumnsWithTypeAndName(); - - analyzed_join.createConvertingActions(left_sample_columns, right_sample_columns); - - if (auto right_actions = analyzed_join.rightConvertingActions()) - { - auto converting_step = std::make_unique(joined_plan->getCurrentDataStream(), right_actions); - converting_step->setStepDescription("Convert joined columns"); - joined_plan->addStep(std::move(converting_step)); - } - return joined_plan; } JoinPtr SelectQueryExpressionAnalyzer::makeTableJoin( - const ASTTablesInSelectQueryElement & join_element, const ColumnsWithTypeAndName & left_sample_columns) + const ASTTablesInSelectQueryElement & join_element, + const ColumnsWithTypeAndName & left_columns, + ActionsDAGPtr & left_convert_actions) { /// Two JOINs are not supported with the same subquery, but different USINGs. if (joined_plan) throw Exception(ErrorCodes::LOGICAL_ERROR, "Table join was already created for query"); - if (auto storage = syntax->analyzed_join->getStorageJoin()) + ActionsDAGPtr right_convert_actions = nullptr; + + const auto & analyzed_join = syntax->analyzed_join; + + if (auto storage = analyzed_join->getStorageJoin()) { - syntax->analyzed_join->createConvertingActions(left_sample_columns, {}); - return storage->getJoinLocked(syntax->analyzed_join); + std::tie(left_convert_actions, right_convert_actions) = analyzed_join->createConvertingActions(left_columns, {}); + return storage->getJoinLocked(analyzed_join); } - joined_plan = buildJoinedPlan(getContext(), join_element, left_sample_columns, *syntax->analyzed_join, query_options); + joined_plan = buildJoinedPlan(getContext(), join_element, *analyzed_join, query_options); - JoinPtr join = chooseJoinAlgorithm(syntax->analyzed_join, joined_plan->getCurrentDataStream().header, getContext()); + const ColumnsWithTypeAndName & right_columns = joined_plan->getCurrentDataStream().header.getColumnsWithTypeAndName(); + std::tie(left_convert_actions, right_convert_actions) = analyzed_join->createConvertingActions(left_columns, right_columns); + if (right_convert_actions) + { + auto converting_step = std::make_unique(joined_plan->getCurrentDataStream(), right_convert_actions); + converting_step->setStepDescription("Convert joined columns"); + joined_plan->addStep(std::move(converting_step)); + } + + JoinPtr join = chooseJoinAlgorithm(analyzed_join, joined_plan->getCurrentDataStream().header, getContext()); /// Do not make subquery for join over dictionary. 
- if (syntax->analyzed_join->getDictionaryReader()) + if (analyzed_join->getDictionaryReader()) joined_plan.reset(); return join; @@ -1544,8 +1547,7 @@ ExpressionAnalysisResult::ExpressionAnalysisResult( { query_analyzer.appendJoinLeftKeys(chain, only_types || !first_stage); before_join = chain.getLastActions(); - join = query_analyzer.appendJoin(chain); - converting_join_columns = query_analyzer.analyzedJoin().leftConvertingActions(); + join = query_analyzer.appendJoin(chain, converting_join_columns); chain.addStep(); } diff --git a/src/Interpreters/ExpressionAnalyzer.h b/src/Interpreters/ExpressionAnalyzer.h index 14f0e0023dd..5e3a7af8e9e 100644 --- a/src/Interpreters/ExpressionAnalyzer.h +++ b/src/Interpreters/ExpressionAnalyzer.h @@ -341,7 +341,8 @@ private: JoinPtr makeTableJoin( const ASTTablesInSelectQueryElement & join_element, - const ColumnsWithTypeAndName & left_sample_columns); + const ColumnsWithTypeAndName & left_columns, + ActionsDAGPtr & left_convert_actions); const ASTSelectQuery * getAggregatingQuery() const; @@ -362,7 +363,8 @@ private: /// Before aggregation: ArrayJoinActionPtr appendArrayJoin(ExpressionActionsChain & chain, ActionsDAGPtr & before_array_join, bool only_types); bool appendJoinLeftKeys(ExpressionActionsChain & chain, bool only_types); - JoinPtr appendJoin(ExpressionActionsChain & chain); + JoinPtr appendJoin(ExpressionActionsChain & chain, ActionsDAGPtr & converting_join_columns); + /// remove_filter is set in ExpressionActionsChain::finalize(); /// Columns in `additional_required_columns` will not be removed (they can be used for e.g. sampling or FINAL modifier). ActionsDAGPtr appendPrewhere(ExpressionActionsChain & chain, bool only_types, const Names & additional_required_columns); diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index a0f227faf78..e61318788a2 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -80,8 +80,6 @@ void TableJoin::resetCollected() renames.clear(); left_type_map.clear(); right_type_map.clear(); - left_converting_actions = nullptr; - right_converting_actions = nullptr; } void TableJoin::addUsingKey(const ASTPtr & ast) @@ -386,15 +384,15 @@ bool TableJoin::tryInitDictJoin(const Block & sample_block, ContextPtr context) return true; } -bool TableJoin::createConvertingActions(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns) +std::pair +TableJoin::createConvertingActions(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns) { - bool need_convert = false; - need_convert = inferJoinKeyCommonType(left_sample_columns, right_sample_columns, !isSpecialStorage()); + inferJoinKeyCommonType(left_sample_columns, right_sample_columns, !isSpecialStorage()); - left_converting_actions = applyKeyConvertToTable(left_sample_columns, left_type_map, key_names_left); - right_converting_actions = applyKeyConvertToTable(right_sample_columns, right_type_map, key_names_right); + auto left_converting_actions = applyKeyConvertToTable(left_sample_columns, left_type_map, key_names_left); + auto right_converting_actions = applyKeyConvertToTable(right_sample_columns, right_type_map, key_names_right); - return need_convert; + return {left_converting_actions, right_converting_actions}; } template @@ -491,7 +489,6 @@ ActionsDAGPtr TableJoin::applyKeyConvertToTable( return dag; } - void TableJoin::setStorageJoin(std::shared_ptr storage) { if (right_storage_dictionary) @@ -506,11 +503,6 @@ void 
TableJoin::setStorageJoin(std::shared_ptr storage) right_storage_dictionary = storage; } -std::shared_ptr TableJoin::getStorageJoin() -{ - return right_storage_join; -} - String TableJoin::renamedRightColumnName(const String & name) const { if (const auto it = renames.find(name); it != renames.end()) @@ -580,14 +572,4 @@ std::pair TableJoin::joinConditionColumnNames() const return res; } -bool TableJoin::isSpecialStorage() const -{ - return right_storage_dictionary || right_storage_join; -} - -const DictionaryReader * TableJoin::getDictionaryReader() const -{ - return dictionary_reader.get(); -} - } diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 5a562280f38..8607cb53dc3 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -96,9 +96,6 @@ private: NameToTypeMap left_type_map; NameToTypeMap right_type_map; - ActionsDAGPtr left_converting_actions; - ActionsDAGPtr right_converting_actions; - /// Name -> original name. Names are the same as in columns_from_joined_table list. std::unordered_map original_names; /// Original name -> name. Only renamed columns. @@ -205,11 +202,8 @@ public: /// Calculate converting actions, rename key columns in required /// For `USING` join we will convert key columns inplace and affect into types in the result table /// For `JOIN ON` we will create new columns with converted keys to join by. - bool createConvertingActions(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns); - - /// Key columns should be converted before join. - ActionsDAGPtr leftConvertingActions() const { return left_converting_actions; } - ActionsDAGPtr rightConvertingActions() const { return right_converting_actions; } + std::pair + createConvertingActions(const ColumnsWithTypeAndName & left_sample_columns, const ColumnsWithTypeAndName & right_sample_columns); void setAsofInequality(ASOF::Inequality inequality) { asof_inequality = inequality; } ASOF::Inequality getAsofInequality() { return asof_inequality; } @@ -240,12 +234,12 @@ public: void setStorageJoin(std::shared_ptr storage); void setStorageJoin(std::shared_ptr storage); - std::shared_ptr getStorageJoin(); + std::shared_ptr getStorageJoin() { return right_storage_join; } bool tryInitDictJoin(const Block & sample_block, ContextPtr context); - bool isSpecialStorage() const; - const DictionaryReader * getDictionaryReader() const; + bool isSpecialStorage() const { return right_storage_dictionary || right_storage_join; } + const DictionaryReader * getDictionaryReader() const { return dictionary_reader.get(); } }; } From 450e49fd28e7cd574a2eb6ec4e8ed1295d45611c Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 29 Jun 2021 12:55:34 +0300 Subject: [PATCH 056/177] Throw type mismatch exception from inferJoinKeyCommonType --- src/Interpreters/TableJoin.cpp | 16 +++++++--------- 1 file changed, 7 insertions(+), 9 deletions(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index e61318788a2..a39aa9e6731 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -428,19 +428,17 @@ bool TableJoin::inferJoinKeyCommonType(const LeftNamesAndTypes & left, const Rig auto common_type = DB::getLeastSupertype({ltype->second, rtype->second}, false); if (common_type == nullptr || isNothing(common_type)) { - LOG_DEBUG(&Poco::Logger::get("TableJoin"), - "Can't infer supertype for joined columns: {}: {} at left, {}: {} at right.", - key_names_left[i], ltype->second->getName(), - key_names_right[i], 
rtype->second->getName()); - continue; + throw DB::Exception(ErrorCodes::TYPE_MISMATCH, + "Can't infer supertype for joined columns: {}: {} at left, {}: {} at right.", + key_names_left[i], ltype->second->getName(), + key_names_right[i], rtype->second->getName()); } if (!allow_right && !common_type->equals(*rtype->second)) { - LOG_DEBUG(&Poco::Logger::get("TableJoin"), - "Can't change type for right table: {}: {} -> {}.", - key_names_right[i], rtype->second->getName(), common_type->getName()); - continue; + throw DB::Exception(ErrorCodes::TYPE_MISMATCH, + "Can't change type for right table: {}: {} -> {}.", + key_names_right[i], rtype->second->getName(), common_type->getName()); } left_type_map[key_names_left[i]] = right_type_map[key_names_right[i]] = common_type; } From 2a1c7d7336301fcb6f90f2690b033527e0d36573 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 29 Jun 2021 13:11:01 +0300 Subject: [PATCH 057/177] fix typo in tryInitDictJoin --- src/Interpreters/TableJoin.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index a39aa9e6731..a24e1a9cc91 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -358,7 +358,7 @@ bool TableJoin::tryInitDictJoin(const Block & sample_block, ContextPtr context) if (!right_storage_dictionary) return false; - auto dict_name = right_storage_dictionary->getName(); + auto dict_name = right_storage_dictionary->getDictionaryName(); auto dict_key = getDictKeyName(dict_name, context); if (!dict_key.has_value() || *dict_key != it_key->second) From ac17c0c817edb67332fd0cd9a7a61bdd2b4163ae Mon Sep 17 00:00:00 2001 From: vdimir Date: Wed, 30 Jun 2021 12:40:35 +0300 Subject: [PATCH 058/177] Minor style src/Interpreters/TableJoin.cpp --- src/Interpreters/TableJoin.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index a24e1a9cc91..a2761c84569 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -213,7 +213,7 @@ Block TableJoin::getRequiredRightKeys(const Block & right_table_keys, std::vecto { const Names & left_keys = keyNamesLeft(); const Names & right_keys = keyNamesRight(); - NameSet required_keys(requiredRightKeys().begin(), requiredRightKeys().end()); + NameSet required_keys = requiredRightKeys(); Block required_right_keys; for (size_t i = 0; i < right_keys.size(); ++i) @@ -231,7 +231,6 @@ Block TableJoin::getRequiredRightKeys(const Block & right_table_keys, std::vecto return required_right_keys; } - bool TableJoin::leftBecomeNullable(const DataTypePtr & column_type) const { return forceNullableLeft() && JoinCommon::canBecomeNullable(column_type); From 6172cc167618b460246367c73d9b77d4d33f58e1 Mon Sep 17 00:00:00 2001 From: vdimir Date: Thu, 1 Jul 2021 10:51:32 +0300 Subject: [PATCH 059/177] Fix build, inferJoinKeyCommonType definition --- src/Interpreters/ExpressionAnalyzer.cpp | 2 +- src/Interpreters/TableJoin.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Interpreters/ExpressionAnalyzer.cpp b/src/Interpreters/ExpressionAnalyzer.cpp index 33dc919d035..d5739b51659 100644 --- a/src/Interpreters/ExpressionAnalyzer.cpp +++ b/src/Interpreters/ExpressionAnalyzer.cpp @@ -892,7 +892,7 @@ static std::unique_ptr buildJoinedPlan( * - this function shows the expression JOIN _data1. 
*/ auto interpreter = interpretSubquery( - join_element.table_expression, getContext(), original_right_columns, query_options.copy().setWithAllColumns()); + join_element.table_expression, context, original_right_columns, query_options.copy().setWithAllColumns()); auto joined_plan = std::make_unique(); interpreter->buildQueryPlan(*joined_plan); { diff --git a/src/Interpreters/TableJoin.h b/src/Interpreters/TableJoin.h index 8607cb53dc3..462dd540a00 100644 --- a/src/Interpreters/TableJoin.h +++ b/src/Interpreters/TableJoin.h @@ -116,7 +116,7 @@ private: /// Calculates common supertypes for corresponding join key columns. template - bool inferJoinKeyCommonType(const LeftNamesAndTypes & left, const RightNamesAndTypes & right, bool to_supertype); + bool inferJoinKeyCommonType(const LeftNamesAndTypes & left, const RightNamesAndTypes & right, bool allow_right); NamesAndTypesList correctedColumnsAddedByJoin() const; From bf9cb8482ba037d7f8b24ba75ffa4d65cc6c0a63 Mon Sep 17 00:00:00 2001 From: vdimir Date: Tue, 31 Aug 2021 15:53:51 +0300 Subject: [PATCH 060/177] Revert "Add option to getMostSubtype/getLeastSupertype to not throw exception" This reverts commit c4654d6b4557e98857c83857ca60132b43e56b21. --- src/DataTypes/getLeastSupertype.cpp | 98 +++++++++++------------------ src/DataTypes/getLeastSupertype.h | 5 +- src/DataTypes/getMostSubtype.cpp | 44 ++++++------- src/DataTypes/getMostSubtype.h | 5 +- src/Interpreters/TableJoin.cpp | 17 ++--- 5 files changed, 71 insertions(+), 98 deletions(-) diff --git a/src/DataTypes/getLeastSupertype.cpp b/src/DataTypes/getLeastSupertype.cpp index 2f06ff8b954..a950d18b50a 100644 --- a/src/DataTypes/getLeastSupertype.cpp +++ b/src/DataTypes/getLeastSupertype.cpp @@ -47,47 +47,19 @@ namespace return res.str(); } - - template - DataTypePtr wrapWithType(DataTypePtr type) - { - if (type == nullptr) - return nullptr; - return std::make_shared(type); - } - - template - DataTypePtr wrapWithType(DataTypePtr type1, DataTypePtr type2) - { - if (type1 == nullptr || type2 == nullptr) - return nullptr; - return std::make_shared(type1, type2); - } - - template - DataTypePtr wrapWithType(const DataTypes & types) - { - for (const auto & type : types) - { - if (type == nullptr) - return nullptr; - } - return std::make_shared(types); - } } -DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_type) + +DataTypePtr getLeastSupertype(const DataTypes & types) { - auto on_error = [throw_on_no_common_type](const String & msg) -> DataTypePtr - { - if (throw_on_no_common_type) - throw Exception(msg, ErrorCodes::NO_COMMON_TYPE); - return nullptr; - }; + /// Trivial cases if (types.empty()) return std::make_shared(); + if (types.size() == 1) + return types[0]; + /// All types are equal { bool all_equal = true; @@ -116,7 +88,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t non_nothing_types.emplace_back(type); if (non_nothing_types.size() < types.size()) - return getLeastSupertype(non_nothing_types, throw_on_no_common_type); + return getLeastSupertype(non_nothing_types); } /// For Arrays @@ -141,9 +113,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t if (have_array) { if (!all_arrays) - return on_error(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not"); + throw Exception(getExceptionMessagePrefix(types) + " because some of them are Array and some of them are not", ErrorCodes::NO_COMMON_TYPE); - return wrapWithType(getLeastSupertype(nested_types, 
throw_on_no_common_type)); + return std::make_shared(getLeastSupertype(nested_types)); } } @@ -167,7 +139,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t nested_types[elem_idx].reserve(types.size()); } else if (tuple_size != type_tuple->getElements().size()) - return on_error(getExceptionMessagePrefix(types) + " because Tuples have different sizes"); + throw Exception(getExceptionMessagePrefix(types) + " because Tuples have different sizes", ErrorCodes::NO_COMMON_TYPE); have_tuple = true; @@ -181,13 +153,13 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t if (have_tuple) { if (!all_tuples) - return on_error(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not"); + throw Exception(getExceptionMessagePrefix(types) + " because some of them are Tuple and some of them are not", ErrorCodes::NO_COMMON_TYPE); DataTypes common_tuple_types(tuple_size); for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) - common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx], throw_on_no_common_type); + common_tuple_types[elem_idx] = getLeastSupertype(nested_types[elem_idx]); - return wrapWithType(common_tuple_types); + return std::make_shared(common_tuple_types); } } @@ -215,9 +187,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t if (have_maps) { if (!all_maps) - return on_error(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not"); + throw Exception(getExceptionMessagePrefix(types) + " because some of them are Maps and some of them are not", ErrorCodes::NO_COMMON_TYPE); - return wrapWithType(getLeastSupertype(key_types, throw_on_no_common_type), getLeastSupertype(value_types, throw_on_no_common_type)); + return std::make_shared(getLeastSupertype(key_types), getLeastSupertype(value_types)); } } @@ -248,9 +220,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t if (have_low_cardinality) { if (have_not_low_cardinality) - return getLeastSupertype(nested_types, throw_on_no_common_type); + return getLeastSupertype(nested_types); else - return wrapWithType(getLeastSupertype(nested_types, throw_on_no_common_type)); + return std::make_shared(getLeastSupertype(nested_types)); } } @@ -276,7 +248,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t if (have_nullable) { - return wrapWithType(getLeastSupertype(nested_types, throw_on_no_common_type)); + return std::make_shared(getLeastSupertype(nested_types)); } } @@ -296,7 +268,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t { bool all_strings = type_ids.size() == (have_string + have_fixed_string); if (!all_strings) - return on_error(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and some of them are not"); + throw Exception(getExceptionMessagePrefix(types) + " because some of them are String/FixedString and some of them are not", ErrorCodes::NO_COMMON_TYPE); return std::make_shared(); } @@ -312,7 +284,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t { bool all_date_or_datetime = type_ids.size() == (have_date + have_datetime + have_datetime64); if (!all_date_or_datetime) - return on_error(getExceptionMessagePrefix(types) + " because some of them are Date/DateTime/DateTime64 and some of them are not"); + throw Exception(getExceptionMessagePrefix(types) + " because some of them are 
Date/DateTime/DateTime64 and some of them are not", + ErrorCodes::NO_COMMON_TYPE); if (have_datetime64 == 0) { @@ -372,7 +345,8 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t } if (num_supported != type_ids.size()) - return on_error(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal"); + throw Exception(getExceptionMessagePrefix(types) + " because some of them have no lossless conversion to Decimal", + ErrorCodes::NO_COMMON_TYPE); UInt32 max_scale = 0; for (const auto & type : types) @@ -394,8 +368,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t } if (min_precision > DataTypeDecimal::maxPrecision()) - return on_error(getExceptionMessagePrefix(types) + " because the least supertype is Decimal(" - + toString(min_precision) + ',' + toString(max_scale) + ')'); + throw Exception(getExceptionMessagePrefix(types) + " because the least supertype is Decimal(" + + toString(min_precision) + ',' + toString(max_scale) + ')', + ErrorCodes::NO_COMMON_TYPE); if (have_decimal128 || min_precision > DataTypeDecimal::maxPrecision()) return std::make_shared>(DataTypeDecimal::maxPrecision(), max_scale); @@ -456,7 +431,7 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t if (max_bits_of_signed_integer || max_bits_of_unsigned_integer || max_mantissa_bits_of_floating) { if (!all_numbers) - return on_error(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not"); + throw Exception(getExceptionMessagePrefix(types) + " because some of them are numbers and some of them are not", ErrorCodes::NO_COMMON_TYPE); /// If there are signed and unsigned types of same bit-width, the result must be signed number with at least one more bit. /// Example, common of Int32, UInt32 = Int64. @@ -471,10 +446,11 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t if (min_bit_width_of_integer != 64) ++min_bit_width_of_integer; else - return on_error( + throw Exception( getExceptionMessagePrefix(types) + " because some of them are signed integers and some are unsigned integers," - " but there is no signed integer type, that can exactly represent all required unsigned integer values"); + " but there is no signed integer type, that can exactly represent all required unsigned integer values", + ErrorCodes::NO_COMMON_TYPE); } /// If the result must be floating. @@ -486,9 +462,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t else if (min_mantissa_bits <= 53) return std::make_shared(); else - return on_error(getExceptionMessagePrefix(types) + throw Exception(getExceptionMessagePrefix(types) + " because some of them are integers and some are floating point," - " but there is no floating point type, that can exactly represent all required integers"); + " but there is no floating point type, that can exactly represent all required integers", ErrorCodes::NO_COMMON_TYPE); } /// If the result must be signed integer. 
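/// A minimal sketch of the caller-side pattern this revert implies: getLeastSupertype()
/// no longer has a "return nullptr" mode, so callers that used to test the result must
/// catch NO_COMMON_TYPE instead (this mirrors the TableJoin::inferJoinKeyCommonType change
/// later in the same patch; ltype and rtype are assumed placeholders).
DataTypePtr common_type;
try
{
    common_type = DB::getLeastSupertype({ltype, rtype});
}
catch (DB::Exception & e)
{
    /// e.code() is ErrorCodes::NO_COMMON_TYPE here; rethrow or translate it into a
    /// domain-specific error such as TYPE_MISMATCH.
    throw;
}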
@@ -507,9 +483,9 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - return on_error(getExceptionMessagePrefix(types) + throw Exception(getExceptionMessagePrefix(types) + " because some of them are signed integers and some are unsigned integers," - " but there is no signed integer type, that can exactly represent all required unsigned integer values"); + " but there is no signed integer type, that can exactly represent all required unsigned integer values", ErrorCodes::NO_COMMON_TYPE); } /// All unsigned. @@ -527,14 +503,14 @@ DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_t else if (min_bit_width_of_integer <= 256) return std::make_shared(); else - return on_error("Logical error: " + getExceptionMessagePrefix(types) - + " but as all data types are unsigned integers, we must have found maximum unsigned integer type"); + throw Exception("Logical error: " + getExceptionMessagePrefix(types) + + " but as all data types are unsigned integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); } } } /// All other data types (UUID, AggregateFunction, Enum...) are compatible only if they are the same (checked in trivial cases). - return on_error(getExceptionMessagePrefix(types)); + throw Exception(getExceptionMessagePrefix(types), ErrorCodes::NO_COMMON_TYPE); } } diff --git a/src/DataTypes/getLeastSupertype.h b/src/DataTypes/getLeastSupertype.h index 4edecf49d17..57e011a0529 100644 --- a/src/DataTypes/getLeastSupertype.h +++ b/src/DataTypes/getLeastSupertype.h @@ -2,15 +2,16 @@ #include + namespace DB { /** Get data type that covers all possible values of passed data types. - * If there is no such data type, throws an exception or return nullptr. + * If there is no such data type, throws an exception. * * Examples: least common supertype for UInt8, Int8 - Int16. * Examples: there is no least common supertype for Array(UInt8), Int8. 
*/ -DataTypePtr getLeastSupertype(const DataTypes & types, bool throw_on_no_common_type = true); +DataTypePtr getLeastSupertype(const DataTypes & types); } diff --git a/src/DataTypes/getMostSubtype.cpp b/src/DataTypes/getMostSubtype.cpp index 4b968415521..fcf966780eb 100644 --- a/src/DataTypes/getMostSubtype.cpp +++ b/src/DataTypes/getMostSubtype.cpp @@ -23,8 +23,7 @@ extern const int NO_COMMON_TYPE; namespace { - -String getExceptionMessage(const DataTypes & types, const std::string & reason) +String getExceptionMessagePrefix(const DataTypes & types) { WriteBufferFromOwnString res; res << "There is no subtype for types "; @@ -38,18 +37,19 @@ String getExceptionMessage(const DataTypes & types, const std::string & reason) res << type->getName(); } - res << reason; + return res.str(); } } -DataTypePtr getMostSubtype(const DataTypes & types, bool throw_on_no_common_type, bool force_support_conversion) + +DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_nothing, bool force_support_conversion) { - auto get_nothing_or_throw = [throw_on_no_common_type, & types](const std::string & reason) -> DataTypePtr + auto get_nothing_or_throw = [throw_if_result_is_nothing, & types](const std::string & reason) { - if (throw_on_no_common_type) - throw Exception(getExceptionMessage(types, reason), ErrorCodes::NO_COMMON_TYPE); + if (throw_if_result_is_nothing) + throw Exception(getExceptionMessagePrefix(types) + reason, ErrorCodes::NO_COMMON_TYPE); return std::make_shared(); }; @@ -57,14 +57,14 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_on_no_common_type if (types.empty()) { - if (throw_on_no_common_type) + if (throw_if_result_is_nothing) throw Exception("There is no common type for empty type list", ErrorCodes::NO_COMMON_TYPE); return std::make_shared(); } if (types.size() == 1) { - if (throw_on_no_common_type && typeid_cast(types[0].get())) + if (throw_if_result_is_nothing && typeid_cast(types[0].get())) throw Exception("There is no common type for type Nothing", ErrorCodes::NO_COMMON_TYPE); return types[0]; } @@ -118,7 +118,7 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_on_no_common_type if (!all_arrays) return get_nothing_or_throw(" because some of them are Array and some of them are not"); - return std::make_shared(getMostSubtype(nested_types, throw_on_no_common_type, force_support_conversion)); + return std::make_shared(getMostSubtype(nested_types, false, force_support_conversion)); } } @@ -161,7 +161,7 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_on_no_common_type DataTypes common_tuple_types(tuple_size); for (size_t elem_idx = 0; elem_idx < tuple_size; ++elem_idx) common_tuple_types[elem_idx] = - getMostSubtype(nested_types[elem_idx], throw_on_no_common_type, force_support_conversion); + getMostSubtype(nested_types[elem_idx], throw_if_result_is_nothing, force_support_conversion); return std::make_shared(common_tuple_types); } @@ -192,9 +192,9 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_on_no_common_type if (have_nullable) { if (all_nullable || force_support_conversion) - return std::make_shared(getMostSubtype(nested_types, throw_on_no_common_type, force_support_conversion)); + return std::make_shared(getMostSubtype(nested_types, false, force_support_conversion)); - return getMostSubtype(nested_types, throw_on_no_common_type, force_support_conversion); + return getMostSubtype(nested_types, throw_if_result_is_nothing, force_support_conversion); } } @@ -309,10 +309,8 @@ DataTypePtr 
getMostSubtype(const DataTypes & types, bool throw_on_no_common_type else if (min_mantissa_bits_of_floating <= 53) return std::make_shared(); else - throw Exception( - ErrorCodes::NO_COMMON_TYPE, - "Logical error: " + - getExceptionMessage(types, " but as all data types are floats, we must have found maximum float type")); + throw Exception("Logical error: " + getExceptionMessagePrefix(types) + + " but as all data types are floats, we must have found maximum float type", ErrorCodes::NO_COMMON_TYPE); } /// If there are signed and unsigned types of same bit-width, the result must be unsigned number. @@ -328,10 +326,8 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_on_no_common_type else if (min_bits_of_unsigned_integer <= 64) return std::make_shared(); else - throw Exception( - ErrorCodes::NO_COMMON_TYPE, - "Logical error: " + - getExceptionMessage(types, " but as all data types are integers, we must have found maximum unsigned integer type")); + throw Exception("Logical error: " + getExceptionMessagePrefix(types) + + " but as all data types are integers, we must have found maximum unsigned integer type", ErrorCodes::NO_COMMON_TYPE); } /// All signed. @@ -345,10 +341,8 @@ DataTypePtr getMostSubtype(const DataTypes & types, bool throw_on_no_common_type else if (min_bits_of_signed_integer <= 64) return std::make_shared(); else - throw Exception( - ErrorCodes::NO_COMMON_TYPE, - "Logical error: " + - getExceptionMessage(types, " but as all data types are integers, we must have found maximum signed integer type")); + throw Exception("Logical error: " + getExceptionMessagePrefix(types) + + " but as all data types are integers, we must have found maximum signed integer type", ErrorCodes::NO_COMMON_TYPE); } } } diff --git a/src/DataTypes/getMostSubtype.h b/src/DataTypes/getMostSubtype.h index c5765f7c370..c46cf4e2054 100644 --- a/src/DataTypes/getMostSubtype.h +++ b/src/DataTypes/getMostSubtype.h @@ -12,9 +12,8 @@ namespace DB * Examples: most common subtype for Array(UInt8), Int8 is Nothing * * If force_support_conversion is true, returns type which may be used to convert each argument to. - * Example: most common subtype for Array(UInt8) and Array(Nullable(Int32)) is Array(Nullable(UInt8)) if force_support_conversion is true. + * Example: most common subtype for Array(UInt8) and Array(Nullable(Int32)) is Array(Nullable(UInt8) if force_support_conversion is true. 
*/ -DataTypePtr -getMostSubtype(const DataTypes & types, bool throw_on_no_common_type = true, bool force_support_conversion = false); +DataTypePtr getMostSubtype(const DataTypes & types, bool throw_if_result_is_nothing = false, bool force_support_conversion = false); } diff --git a/src/Interpreters/TableJoin.cpp b/src/Interpreters/TableJoin.cpp index a2761c84569..9194de3073a 100644 --- a/src/Interpreters/TableJoin.cpp +++ b/src/Interpreters/TableJoin.cpp @@ -13,9 +13,7 @@ #include #include -#include -#include #include #include #include @@ -423,14 +421,19 @@ bool TableJoin::inferJoinKeyCommonType(const LeftNamesAndTypes & left, const Rig if (JoinCommon::typesEqualUpToNullability(ltype->second, rtype->second)) continue; - /// TODO(vdimir): use getMostSubtype if possible - auto common_type = DB::getLeastSupertype({ltype->second, rtype->second}, false); - if (common_type == nullptr || isNothing(common_type)) + DataTypePtr common_type; + try + { + /// TODO(vdimir): use getMostSubtype if possible + common_type = DB::getLeastSupertype({ltype->second, rtype->second}); + } + catch (DB::Exception & ex) { throw DB::Exception(ErrorCodes::TYPE_MISMATCH, - "Can't infer supertype for joined columns: {}: {} at left, {}: {} at right.", + "Can't infer common type for joined columns: {}: {} at left, {}: {} at right. {}", key_names_left[i], ltype->second->getName(), - key_names_right[i], rtype->second->getName()); + key_names_right[i], rtype->second->getName(), + ex.message()); } if (!allow_right && !common_type->equals(*rtype->second)) From ceab6feb2ab1a8b87bf4d866030f2d7bb6cbfce8 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 1 Sep 2021 12:35:37 +0000 Subject: [PATCH 061/177] Fix test --- src/Interpreters/Context.cpp | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index f8f9b71abd9..d32d1e97183 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2372,6 +2372,13 @@ void Context::shutdown() } } + if (merge_mutate_executor) + merge_mutate_executor->wait(); + if (fetch_executor) + fetch_executor->wait(); + if (moves_executor) + moves_executor->wait(); + shared->shutdown(); } From 7f21cd7f3d8c5c3e2066584f4243f9a42c2b7ac7 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 2 Sep 2021 10:39:27 +0000 Subject: [PATCH 062/177] save --- src/Interpreters/Context.cpp | 49 ++++++++++--------- src/Interpreters/Context.h | 6 --- .../MergeTree/BackgroundJobsExecutor.cpp | 7 ++- .../MergeTree/MergeMutateExecutor.cpp | 20 +++++++- src/Storages/MergeTree/MergeMutateExecutor.h | 15 +++++- .../MergeTree/tests/gtest_executor.cpp | 4 +- 6 files changed, 63 insertions(+), 38 deletions(-) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index d32d1e97183..c1e92f1be0e 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -229,6 +229,11 @@ struct ContextSharedPart std::optional storage_s3_settings; /// Settings of S3 storage std::vector warnings; /// Store warning messages about server configuration. 
+ /// Background executors for *MergeTree tables + MergeTreeBackgroundExecutorPtr merge_mutate_executor; + MergeTreeBackgroundExecutorPtr moves_executor; + MergeTreeBackgroundExecutorPtr fetch_executor; + RemoteHostFilter remote_host_filter; /// Allowed URL from config.xml std::optional trace_collector; /// Thread collecting traces from threads executing queries @@ -298,6 +303,13 @@ struct ContextSharedPart DatabaseCatalog::shutdown(); + if (merge_mutate_executor) + merge_mutate_executor->wait(); + if (fetch_executor) + fetch_executor->wait(); + if (moves_executor) + moves_executor->wait(); + std::unique_ptr delete_system_logs; { auto lock = std::lock_guard(mutex); @@ -2372,13 +2384,6 @@ void Context::shutdown() } } - if (merge_mutate_executor) - merge_mutate_executor->wait(); - if (fetch_executor) - fetch_executor->wait(); - if (moves_executor) - moves_executor->wait(); - shared->shutdown(); } @@ -2724,37 +2729,37 @@ PartUUIDsPtr Context::getIgnoredPartUUIDs() const void Context::initializeBackgroundExecutors() { - merge_mutate_executor = MergeTreeBackgroundExecutor::create(); - moves_executor = MergeTreeBackgroundExecutor::create(); - fetch_executor = MergeTreeBackgroundExecutor::create(); + shared->merge_mutate_executor = MergeTreeBackgroundExecutor::create(MergeTreeBackgroundExecutor::Type::MERGE_MUTATE); + shared->moves_executor = MergeTreeBackgroundExecutor::create(MergeTreeBackgroundExecutor::Type::MOVE); + shared->fetch_executor = MergeTreeBackgroundExecutor::create(MergeTreeBackgroundExecutor::Type::FETCH); - merge_mutate_executor->setThreadsCount([this] () { return getSettingsRef().background_pool_size; }); - merge_mutate_executor->setTasksCount([this] () { return getSettingsRef().background_pool_size; }); - merge_mutate_executor->setMetric(CurrentMetrics::BackgroundPoolTask); + shared->merge_mutate_executor->setThreadsCount([this] () { return getSettingsRef().background_pool_size; }); + shared->merge_mutate_executor->setTasksCount([this] () { return getSettingsRef().background_pool_size; }); + shared->merge_mutate_executor->setMetric(CurrentMetrics::BackgroundPoolTask); - moves_executor->setThreadsCount([this] () { return getSettingsRef().background_move_pool_size; }); - moves_executor->setTasksCount([this] () { return getSettingsRef().background_move_pool_size; }); - moves_executor->setMetric(CurrentMetrics::BackgroundMovePoolTask); + shared->moves_executor->setThreadsCount([this] () { return getSettingsRef().background_move_pool_size; }); + shared->moves_executor->setTasksCount([this] () { return getSettingsRef().background_move_pool_size; }); + shared->moves_executor->setMetric(CurrentMetrics::BackgroundMovePoolTask); - fetch_executor->setThreadsCount([this] () { return getSettingsRef().background_fetches_pool_size; }); - fetch_executor->setTasksCount([this] () { return getSettingsRef().background_fetches_pool_size; }); - fetch_executor->setMetric(CurrentMetrics::BackgroundFetchesPoolTask); + shared->fetch_executor->setThreadsCount([this] () { return getSettingsRef().background_fetches_pool_size; }); + shared->fetch_executor->setTasksCount([this] () { return getSettingsRef().background_fetches_pool_size; }); + shared->fetch_executor->setMetric(CurrentMetrics::BackgroundFetchesPoolTask); } MergeTreeBackgroundExecutorPtr Context::getMergeMutateExecutor() const { - return merge_mutate_executor; + return shared->merge_mutate_executor; } MergeTreeBackgroundExecutorPtr Context::getMovesExecutor() const { - return moves_executor; + return shared->moves_executor; } 
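/// A minimal sketch of how per-table code is expected to reach the executors now that they
/// live in ContextSharedPart (mirrors BackgroundJobsExecutor.cpp further down in this patch;
/// storage_id is an assumed placeholder for the table being detached or dropped).
auto storage_id = data.getStorageID();
getContext()->getMergeMutateExecutor()->removeTasksCorrespondingToStorage(storage_id);
getContext()->getMovesExecutor()->removeTasksCorrespondingToStorage(storage_id);
getContext()->getFetchesExecutor()->removeTasksCorrespondingToStorage(storage_id);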
MergeTreeBackgroundExecutorPtr Context::getFetchesExecutor() const { - return fetch_executor; + return shared->fetch_executor; } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index dd890f9bdf5..8179ebfd1c2 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -281,12 +281,6 @@ private: /// A flag, used to distinguish between user query and internal query to a database engine (MaterializePostgreSQL). bool is_internal_query = false; - /// Background executors for *MergeTree tables - /// Must be in global context - MergeTreeBackgroundExecutorPtr merge_mutate_executor; - MergeTreeBackgroundExecutorPtr moves_executor; - MergeTreeBackgroundExecutorPtr fetch_executor; - public: // Top-level OpenTelemetry trace context for the query. Makes sense only for a query context. OpenTelemetryTraceContext query_trace_context; diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp index ed6f353c219..6d66b3e5a77 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp @@ -105,12 +105,11 @@ void BackgroundJobAssignee::finish() { holder->deactivate(); - auto context = getContext(); auto storage_id = data.getStorageID(); - context->getMovesExecutor()->removeTasksCorrespondingToStorage(storage_id); - context->getFetchesExecutor()->removeTasksCorrespondingToStorage(storage_id); - context->getMergeMutateExecutor()->removeTasksCorrespondingToStorage(storage_id); + getContext()->getMovesExecutor()->removeTasksCorrespondingToStorage(storage_id); + getContext()->getFetchesExecutor()->removeTasksCorrespondingToStorage(storage_id); + getContext()->getMergeMutateExecutor()->removeTasksCorrespondingToStorage(storage_id); } } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index 1cf84333dbe..c7b025a74f3 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -7,6 +7,20 @@ namespace DB { +String MergeTreeBackgroundExecutor::toString(Type type) +{ + switch (type) + { + case Type::MERGE_MUTATE: + return "MergeMutate"; + case Type::FETCH: + return "Fetch"; + case Type::MOVE: + return "Move"; + } +} + + void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id) { std::lock_guard remove_lock(remove_mutex); @@ -21,13 +35,13 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id std::erase_if(pending, [&] (auto item) -> bool { return item->task->getStorageID() == id; }); /// Find pending to wait - for (auto & item : active) + for (const auto & item : active) if (item->task->getStorageID() == id) tasks_to_wait.emplace_back(item); } - for (auto & item : tasks_to_wait) + for (const auto & item : tasks_to_wait) { assert(item->future.valid()); item->future.wait(); @@ -62,6 +76,8 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() bool res = pool.trySchedule([this, item] () { + setThreadName(name.c_str()); + auto check_if_deleting = [&] () -> bool { active.erase(item); diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index 09de42ccbaf..db1e89dd1bd 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -8,6 +8,7 @@ #include #include +#include #include #include @@ -53,9 +54,16 @@ public: using CountGetter = std::function; using Callback = std::function; - - 
MergeTreeBackgroundExecutor() + enum class Type { + MERGE_MUTATE, + FETCH, + MOVE + }; + + explicit MergeTreeBackgroundExecutor(Type type_) : type(type_) + { + name = toString(type); scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); } @@ -135,7 +143,10 @@ private: void schedulerThreadFunction(); + static String toString(Type type); + Type type; + String name; CountGetter threads_count_getter; CountGetter max_task_count_getter; CurrentMetrics::Metric metric; diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index 931d8ac8bef..0a330a0af76 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -54,7 +54,7 @@ private: TEST(Executor, RemoveTasks) { - auto executor = DB::MergeTreeBackgroundExecutor::create(); + auto executor = DB::MergeTreeBackgroundExecutor::create(DB::MergeTreeBackgroundExecutor::Type::MERGE_MUTATE); const size_t tasks_kinds = 25; const size_t batch = 100; @@ -93,7 +93,7 @@ TEST(Executor, RemoveTasks) TEST(Executor, RemoveTasksStress) { - auto executor = DB::MergeTreeBackgroundExecutor::create(); + auto executor = DB::MergeTreeBackgroundExecutor::create(DB::MergeTreeBackgroundExecutor::Type::MERGE_MUTATE); const size_t tasks_kinds = 25; const size_t batch = 100; From abda2a636ebdd5ece5e16aefd86d3e5b3e9cfcd5 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Thu, 2 Sep 2021 14:53:20 +0300 Subject: [PATCH 063/177] Added ExecutablePool storage --- src/DataStreams/ShellCommandSource.h | 181 ++++++++++++++++- .../ExecutableDictionarySource.cpp | 4 +- src/Dictionaries/ExecutableDictionarySource.h | 7 +- .../ExecutablePoolDictionarySource.cpp | 188 ++++-------------- .../ExecutablePoolDictionarySource.h | 13 +- src/Storages/ExecutablePoolSettings.cpp | 42 ++++ src/Storages/ExecutablePoolSettings.h | 24 +++ src/Storages/StorageExecutable.cpp | 107 +++++++++- src/Storages/StorageExecutable.h | 29 ++- .../test_executable_table_function/test.py | 12 ++ .../user_scripts/test_input_process_pool.sh | 3 + .../test_input_process_pool_multiple_pipes.sh | 10 + 12 files changed, 435 insertions(+), 185 deletions(-) create mode 100644 src/Storages/ExecutablePoolSettings.cpp create mode 100644 src/Storages/ExecutablePoolSettings.h create mode 100755 tests/integration/test_executable_table_function/user_scripts/test_input_process_pool.sh create mode 100755 tests/integration/test_executable_table_function/user_scripts/test_input_process_pool_multiple_pipes.sh diff --git a/src/DataStreams/ShellCommandSource.h b/src/DataStreams/ShellCommandSource.h index 4b8e3036f38..1559a96b456 100644 --- a/src/DataStreams/ShellCommandSource.h +++ b/src/DataStreams/ShellCommandSource.h @@ -3,8 +3,11 @@ #include #include +#include + #include #include + #include #include #include @@ -17,8 +20,9 @@ namespace DB { -/** A stream, that runs child process and sends data to its stdin in background thread, - * and receives data from its stdout. +/** A stream, that get child process and sends data tasks. + * For each send data task background thread is created, send data tasks must send data to process input pipes. + * ShellCommandSource receives data from process stdout. 
*/ class ShellCommandSource final : public SourceWithProgress { @@ -29,7 +33,7 @@ public: ContextPtr context, const std::string & format, const Block & sample_block, - std::unique_ptr command_, + std::unique_ptr && command_, Poco::Logger * log_, std::vector && send_data_tasks, size_t max_block_size = DEFAULT_BLOCK_SIZE) @@ -48,7 +52,7 @@ public: ContextPtr context, const std::string & format, const Block & sample_block, - std::unique_ptr command_, + std::unique_ptr && command_, Poco::Logger * log_, size_t max_block_size = DEFAULT_BLOCK_SIZE) : SourceWithProgress(sample_block) @@ -107,4 +111,173 @@ private: Poco::Logger * log; }; +/** A stream, that get child process and sends data tasks. + * For each send data task background thread is created, send data tasks must send data to process input pipes. + * ShellCommandPoolSource receives data from process stdout. + * + * Main difference with ShellCommandSource is that ShellCommandPoolSource initialized with process_pool and rows_to_read. + * Rows to read are necessary because processes in pool are not destroyed and work in read write loop. + * Source need to finish generating new chunks after rows_to_read rows are generated from process. + * + * If rows_to_read are not specified it is expected that script will output rows_to_read before other data. + * + * After source is destroyed process is returned to pool. + */ + +using ProcessPool = BorrowedObjectPool>; + +class ShellCommandPoolSource final : public SourceWithProgress +{ +public: + using SendDataTask = std::function; + + ShellCommandPoolSource( + ContextPtr context, + const std::string & format, + const Block & sample_block, + std::shared_ptr process_pool_, + std::unique_ptr && command_, + size_t rows_to_read_, + Poco::Logger * log_, + std::vector && send_data_tasks) + : SourceWithProgress(sample_block) + , process_pool(process_pool_) + , command(std::move(command_)) + , rows_to_read(rows_to_read_) + , log(log_) + { + for (auto && send_data_task : send_data_tasks) + { + send_data_threads.emplace_back([task = std::move(send_data_task), this]() + { + try + { + task(); + } + catch (...) + { + std::lock_guard lock(send_data_lock); + exception_during_send_data = std::current_exception(); + } + }); + } + + pipeline.init(Pipe(FormatFactory::instance().getInput(format, command->out, sample_block, context, rows_to_read))); + executor = std::make_unique(pipeline); + } + + ShellCommandPoolSource( + ContextPtr context, + const std::string & format, + const Block & sample_block, + std::shared_ptr process_pool_, + std::unique_ptr && command_, + Poco::Logger * log_, + std::vector && send_data_tasks) + : SourceWithProgress(sample_block) + , process_pool(process_pool_) + , command(std::move(command_)) + , log(log_) + { + for (auto && send_data_task : send_data_tasks) + { + send_data_threads.emplace_back([task = std::move(send_data_task), this]() + { + try + { + task(); + } + catch (...) 
+ { + std::lock_guard lock(send_data_lock); + exception_during_send_data = std::current_exception(); + } + }); + } + + readText(rows_to_read, command->out); + pipeline.init(Pipe(FormatFactory::instance().getInput(format, command->out, sample_block, context, rows_to_read))); + executor = std::make_unique(pipeline); + } + + + ~ShellCommandPoolSource() override + { + for (auto & thread : send_data_threads) + if (thread.joinable()) + thread.join(); + + if (command) + process_pool->returnObject(std::move(command)); + } + +protected: + Chunk generate() override + { + rethrowExceptionDuringReadIfNeeded(); + + if (current_read_rows == rows_to_read) + return {}; + + Chunk chunk; + + try + { + if (!executor->pull(chunk)) + return {}; + + current_read_rows += chunk.getNumRows(); + } + catch (...) + { + tryLogCurrentException(log); + command = nullptr; + throw; + } + + return chunk; + } + +public: + Status prepare() override + { + auto status = SourceWithProgress::prepare(); + + if (status == Status::Finished) + { + for (auto & thread : send_data_threads) + if (thread.joinable()) + thread.join(); + + rethrowExceptionDuringReadIfNeeded(); + } + + return status; + } + + void rethrowExceptionDuringReadIfNeeded() + { + std::lock_guard lock(send_data_lock); + if (exception_during_send_data) + { + command = nullptr; + std::rethrow_exception(exception_during_send_data); + } + } + + String getName() const override { return "ShellCommandPoolSource"; } + + std::shared_ptr process_pool; + std::unique_ptr command; + QueryPipeline pipeline; + std::unique_ptr executor; + size_t rows_to_read = 0; + Poco::Logger * log; + std::vector send_data_threads; + + size_t current_read_rows = 0; + + std::mutex send_data_lock; + std::exception_ptr exception_during_send_data; +}; } diff --git a/src/Dictionaries/ExecutableDictionarySource.cpp b/src/Dictionaries/ExecutableDictionarySource.cpp index 0362ca9ea9d..a274e820e65 100644 --- a/src/Dictionaries/ExecutableDictionarySource.cpp +++ b/src/Dictionaries/ExecutableDictionarySource.cpp @@ -1,6 +1,5 @@ #include "ExecutableDictionarySource.h" -#include #include #include #include @@ -125,8 +124,7 @@ Pipe ExecutableDictionarySource::getStreamForBlock(const Block & block) formatBlock(output_stream, block); out.close(); }}; - - std::vector tasks = {task}; + std::vector tasks = {std::move(task)}; Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process), log, std::move(tasks))); diff --git a/src/Dictionaries/ExecutableDictionarySource.h b/src/Dictionaries/ExecutableDictionarySource.h index f2d295b9b18..3133bc12b09 100644 --- a/src/Dictionaries/ExecutableDictionarySource.h +++ b/src/Dictionaries/ExecutableDictionarySource.h @@ -1,11 +1,12 @@ #pragma once -#include "DictionaryStructure.h" -#include "IDictionarySource.h" +#include + #include #include -namespace Poco { class Logger; } +#include +#include namespace DB diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.cpp b/src/Dictionaries/ExecutablePoolDictionarySource.cpp index 24bd31acd85..e3cad41c856 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.cpp +++ b/src/Dictionaries/ExecutablePoolDictionarySource.cpp @@ -1,24 +1,21 @@ #include "ExecutablePoolDictionarySource.h" -#include -#include -#include -#include -#include +#include +#include +#include + +#include +#include + #include + #include #include #include -#include -#include -#include -#include -#include -#include -#include "DictionarySourceFactory.h" -#include "DictionarySourceHelpers.h" -#include "DictionaryStructure.h" 
-#include "registerDictionaries.h" + +#include +#include +#include namespace DB @@ -37,13 +34,13 @@ ExecutablePoolDictionarySource::ExecutablePoolDictionarySource( const Configuration & configuration_, Block & sample_block_, ContextPtr context_) - : log(&Poco::Logger::get("ExecutablePoolDictionarySource")) - , dict_struct{dict_struct_} - , configuration{configuration_} - , sample_block{sample_block_} - , context{context_} + : dict_struct(dict_struct_) + , configuration(configuration_) + , sample_block(sample_block_) + , context(context_) /// If pool size == 0 then there is no size restrictions. Poco max size of semaphore is integer type. - , process_pool{std::make_shared(configuration.pool_size == 0 ? std::numeric_limits::max() : configuration.pool_size)} + , process_pool(std::make_shared(configuration.pool_size == 0 ? std::numeric_limits::max() : configuration.pool_size)) + , log(&Poco::Logger::get("ExecutablePoolDictionarySource")) { /// Remove keys from sample_block for implicit_key dictionary because /// these columns will not be returned from source @@ -62,13 +59,12 @@ ExecutablePoolDictionarySource::ExecutablePoolDictionarySource( } ExecutablePoolDictionarySource::ExecutablePoolDictionarySource(const ExecutablePoolDictionarySource & other) - : log(&Poco::Logger::get("ExecutablePoolDictionarySource")) - , update_time{other.update_time} - , dict_struct{other.dict_struct} - , configuration{other.configuration} - , sample_block{other.sample_block} - , context{Context::createCopy(other.context)} - , process_pool{std::make_shared(configuration.pool_size)} + : dict_struct(other.dict_struct) + , configuration(other.configuration) + , sample_block(other.sample_block) + , context(Context::createCopy(other.context)) + , process_pool(std::make_shared(configuration.pool_size)) + , log(&Poco::Logger::get("ExecutablePoolDictionarySource")) { } @@ -82,123 +78,6 @@ Pipe ExecutablePoolDictionarySource::loadUpdatedAll() throw Exception(ErrorCodes::UNSUPPORTED_METHOD, "ExecutablePoolDictionarySource does not support loadUpdatedAll method"); } -namespace -{ - /** A stream, that runs child process and sends data to its stdin in background thread, - * and receives data from its stdout. - */ - class PoolSourceWithBackgroundThread final : public SourceWithProgress - { - public: - PoolSourceWithBackgroundThread( - std::shared_ptr process_pool_, - std::unique_ptr && command_, - Pipe pipe, - size_t read_rows_, - Poco::Logger * log_, - std::function && send_data_) - : SourceWithProgress(pipe.getHeader()) - , process_pool(process_pool_) - , command(std::move(command_)) - , rows_to_read(read_rows_) - , log(log_) - , send_data(std::move(send_data_)) - , thread([this] - { - try - { - send_data(command->in); - } - catch (...) - { - std::lock_guard lck(exception_during_read_lock); - exception_during_read = std::current_exception(); - } - }) - { - pipeline.init(std::move(pipe)); - executor = std::make_unique(pipeline); - } - - ~PoolSourceWithBackgroundThread() override - { - if (thread.joinable()) - thread.join(); - - if (command) - process_pool->returnObject(std::move(command)); - } - - protected: - Chunk generate() override - { - rethrowExceptionDuringReadIfNeeded(); - - if (current_read_rows == rows_to_read) - return {}; - - Chunk chunk; - - try - { - if (!executor->pull(chunk)) - return {}; - - current_read_rows += chunk.getNumRows(); - } - catch (...) 
- { - tryLogCurrentException(log); - command = nullptr; - throw; - } - - return chunk; - } - - public: - Status prepare() override - { - auto status = SourceWithProgress::prepare(); - - if (status == Status::Finished) - { - if (thread.joinable()) - thread.join(); - - rethrowExceptionDuringReadIfNeeded(); - } - - return status; - } - - void rethrowExceptionDuringReadIfNeeded() - { - std::lock_guard lck(exception_during_read_lock); - if (exception_during_read) - { - command = nullptr; - std::rethrow_exception(exception_during_read); - } - } - - String getName() const override { return "PoolWithBackgroundThread"; } - - std::shared_ptr process_pool; - std::unique_ptr command; - QueryPipeline pipeline; - std::unique_ptr executor; - size_t rows_to_read; - Poco::Logger * log; - std::function send_data; - ThreadFromGlobalPool thread; - size_t current_read_rows = 0; - std::mutex exception_during_read_lock; - std::exception_ptr exception_during_read; - }; - -} - Pipe ExecutablePoolDictionarySource::loadIds(const std::vector & ids) { LOG_TRACE(log, "loadIds {} size = {}", toString(), ids.size()); @@ -228,19 +107,20 @@ Pipe ExecutablePoolDictionarySource::getStreamForBlock(const Block & block) if (!result) throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, - "Could not get process from pool, max command execution timeout exceeded ({}) seconds", + "Could not get process from pool, max command execution timeout exceeded {} seconds", configuration.max_command_execution_time); size_t rows_to_read = block.rows(); - auto format = FormatFactory::instance().getInput(configuration.format, process->out, sample_block, context, rows_to_read); + auto * process_in = &process->in; + ShellCommandPoolSource::SendDataTask task = [process_in, block, this]() mutable + { + auto & out = *process_in; + auto output_stream = context->getOutputStream(configuration.format, out, block.cloneEmpty()); + formatBlock(output_stream, block); + }; + std::vector tasks = {std::move(task)}; - Pipe pipe(std::make_unique( - process_pool, std::move(process), Pipe(std::move(format)), rows_to_read, log, - [block, this](WriteBufferFromFile & out) mutable - { - auto output_stream = context->getOutputStream(configuration.format, out, block.cloneEmpty()); - formatBlock(output_stream, block); - })); + Pipe pipe(std::make_unique(context, configuration.format, sample_block, process_pool, std::move(process), rows_to_read, log, std::move(tasks))); if (configuration.implicit_key) pipe.addTransform(std::make_shared(block, pipe.getHeader())); diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.h b/src/Dictionaries/ExecutablePoolDictionarySource.h index 22d7b713a24..b80122fb56f 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.h +++ b/src/Dictionaries/ExecutablePoolDictionarySource.h @@ -1,20 +1,18 @@ #pragma once -#include +#include #include #include -#include "IDictionarySource.h" -#include "DictionaryStructure.h" - -namespace Poco { class Logger; } +#include +#include +#include namespace DB { -using ProcessPool = BorrowedObjectPool>; /** ExecutablePoolDictionarySource allows loading data from pool of processes. 
* When client requests ids or keys source get process from ProcessPool @@ -73,14 +71,13 @@ public: Pipe getStreamForBlock(const Block & block); private: - Poco::Logger * log; - time_t update_time = 0; const DictionaryStructure dict_struct; const Configuration configuration; Block sample_block; ContextPtr context; std::shared_ptr process_pool; + Poco::Logger * log; }; } diff --git a/src/Storages/ExecutablePoolSettings.cpp b/src/Storages/ExecutablePoolSettings.cpp new file mode 100644 index 00000000000..8951c8edabf --- /dev/null +++ b/src/Storages/ExecutablePoolSettings.cpp @@ -0,0 +1,42 @@ +#include "ExecutablePoolSettings.h" + +#include + +#include +#include +#include + +namespace DB +{ + +namespace ErrorCodes +{ + extern const int UNKNOWN_SETTING; +} + +IMPLEMENT_SETTINGS_TRAITS(ExecutablePoolSettingsTraits, LIST_OF_EXECUTABLE_POOL_SETTINGS); + +void ExecutablePoolSettings::loadFromQuery(ASTStorage & storage_def) +{ + if (storage_def.settings) + { + try + { + applyChanges(storage_def.settings->changes); + } + catch (Exception & e) + { + if (e.code() == ErrorCodes::UNKNOWN_SETTING) + e.addMessage("for storage " + storage_def.engine->name); + throw; + } + } + else + { + auto settings_ast = std::make_shared(); + settings_ast->is_standalone = false; + storage_def.set(storage_def.settings, settings_ast); + } +} + +} diff --git a/src/Storages/ExecutablePoolSettings.h b/src/Storages/ExecutablePoolSettings.h new file mode 100644 index 00000000000..6de9b0f0e6c --- /dev/null +++ b/src/Storages/ExecutablePoolSettings.h @@ -0,0 +1,24 @@ +#pragma once + +#include +#include + +namespace DB +{ + +class ASTStorage; + +#define LIST_OF_EXECUTABLE_POOL_SETTINGS(M) \ + M(UInt64, pool_size, 16, "Processes pool size. If size == 0, then there is no size restrictions", 0) \ + M(UInt64, max_command_execution_time, 10, "Max command execution time in seconds.", 0) \ + M(UInt64, command_termination_timeout, 10, "Command termination timeout in seconds.", 0) \ + +DECLARE_SETTINGS_TRAITS(ExecutablePoolSettingsTraits, LIST_OF_EXECUTABLE_POOL_SETTINGS) + +/// Settings for ExecutablePool engine. +struct ExecutablePoolSettings : public BaseSettings +{ + void loadFromQuery(ASTStorage & storage_def); +}; + +} diff --git a/src/Storages/StorageExecutable.cpp b/src/Storages/StorageExecutable.cpp index a71cf20119b..6d81a4eff40 100644 --- a/src/Storages/StorageExecutable.cpp +++ b/src/Storages/StorageExecutable.cpp @@ -4,16 +4,18 @@ #include #include + #include #include #include +#include + +#include #include #include #include #include #include -#include -#include namespace DB @@ -24,6 +26,7 @@ namespace ErrorCodes extern const int UNSUPPORTED_METHOD; extern const int LOGICAL_ERROR; extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH; + extern const int TIMEOUT_EXCEEDED; } StorageExecutable::StorageExecutable( @@ -47,6 +50,31 @@ StorageExecutable::StorageExecutable( setInMemoryMetadata(storage_metadata); } +StorageExecutable::StorageExecutable( + const StorageID & table_id_, + const String & script_name_, + const std::vector & arguments_, + const String & format_, + const std::vector & input_queries_, + const ExecutablePoolSettings & pool_settings_, + const ColumnsDescription & columns, + const ConstraintsDescription & constraints) + : IStorage(table_id_) + , script_name(script_name_) + , arguments(arguments_) + , format(format_) + , input_queries(input_queries_) + , pool_settings(pool_settings_) + /// If pool size == 0 then there is no size restrictions. Poco max size of semaphore is integer type. 
+ , process_pool(std::make_shared(pool_settings.pool_size == 0 ? std::numeric_limits::max() : pool_settings.pool_size)) + , log(&Poco::Logger::get("StorageExecutablePool")) +{ + StorageInMemoryMetadata storage_metadata; + storage_metadata.setColumns(columns); + storage_metadata.setConstraints(constraints); + setInMemoryMetadata(storage_metadata); +} + Pipe StorageExecutable::read( const Names & /*column_names*/, const StorageMetadataPtr & metadata_snapshot, @@ -56,6 +84,8 @@ Pipe StorageExecutable::read( size_t max_block_size, unsigned /*threads*/) { + std::cerr << getName() << "::read" << std::endl; + auto user_scripts_path = context->getUserScriptsPath(); auto script_path = user_scripts_path + '/' + script_name; if (!std::filesystem::exists(std::filesystem::path(script_path))) @@ -79,7 +109,27 @@ Pipe StorageExecutable::read( for (size_t i = 1; i < inputs.size(); ++i) config.write_fds.emplace_back(i + 2); - auto process = ShellCommand::executeDirect(config); + std::unique_ptr process; + + if (process_pool) + { + std::cerr << getName() <<"::read create process" << std::endl; + bool result = process_pool->tryBorrowObject(process, [&config, this]() + { + config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, pool_settings.command_termination_timeout }; + auto shell_command = ShellCommand::execute(config); + return shell_command; + }, pool_settings.max_command_execution_time * 10000); + + if (!result) + throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, + "Could not get process from pool, max command execution timeout exceeded {} seconds", + pool_settings.max_command_execution_time); + } + else + { + process = ShellCommand::executeDirect(config); + } std::vector tasks; tasks.reserve(inputs.size()); @@ -123,13 +173,22 @@ Pipe StorageExecutable::read( } auto sample_block = metadata_snapshot->getSampleBlock(); - Pipe pipe(std::make_unique(context, format, sample_block, std::move(process), log, std::move(tasks), max_block_size)); - return pipe; + + if (process_pool) + { + Pipe pipe(std::make_unique(context, format, std::move(sample_block), process_pool, std::move(process), log, std::move(tasks))); + return pipe; + } + else + { + Pipe pipe(std::make_unique(context, format, std::move(sample_block), std::move(process), log, std::move(tasks), max_block_size)); + return pipe; + } } void registerStorageExecutable(StorageFactory & factory) { - factory.registerStorage("Executable", [](const StorageFactory::Arguments & args) + auto register_storage = [](const StorageFactory::Arguments & args, bool is_executable_pool) -> StoragePtr { auto local_context = args.getLocalContext(); @@ -143,7 +202,7 @@ void registerStorageExecutable(StorageFactory & factory) auto scipt_name_with_arguments_value = args.engine_args[0]->as().value.safeGet(); std::vector script_name_with_arguments; - boost::split(script_name_with_arguments, scipt_name_with_arguments_value, [](char c){ return c == ' '; }); + boost::split(script_name_with_arguments, scipt_name_with_arguments_value, [](char c) { return c == ' '; }); auto script_name = script_name_with_arguments[0]; script_name_with_arguments.erase(script_name_with_arguments.begin()); @@ -154,8 +213,8 @@ void registerStorageExecutable(StorageFactory & factory) { ASTPtr query = args.engine_args[i]->children.at(0); if (!query->as()) - throw Exception(ErrorCodes::UNSUPPORTED_METHOD, - "StorageExecutable argument is invalid input query {}", + throw Exception( + ErrorCodes::UNSUPPORTED_METHOD, "StorageExecutable argument is invalid input 
query {}", query->formatForErrorMessage()); input_queries.emplace_back(std::move(query)); @@ -164,7 +223,35 @@ void registerStorageExecutable(StorageFactory & factory) const auto & columns = args.columns; const auto & constraints = args.constraints; - return StorageExecutable::create(args.table_id, script_name, script_name_with_arguments, format, input_queries, columns, constraints); + if (is_executable_pool) + { + size_t max_command_execution_time = 10; + + size_t max_execution_time_seconds = static_cast(args.getContext()->getSettings().max_execution_time.totalSeconds()); + if (max_execution_time_seconds != 0 && max_command_execution_time > max_execution_time_seconds) + max_command_execution_time = max_execution_time_seconds; + + ExecutablePoolSettings pool_settings; + pool_settings.max_command_execution_time = max_command_execution_time; + if (args.storage_def->settings) + pool_settings.loadFromQuery(*args.storage_def); + + return StorageExecutable::create(args.table_id, script_name, script_name_with_arguments, format, input_queries, pool_settings, columns, constraints); + } + else + { + return StorageExecutable::create(args.table_id, script_name, script_name_with_arguments, format, input_queries, columns, constraints); + } + }; + + factory.registerStorage("Executable", [&](const StorageFactory::Arguments & args) + { + return register_storage(args, false /*is_executable_pool*/); + }); + + factory.registerStorage("ExecutablePool", [&](const StorageFactory::Arguments & args) + { + return register_storage(args, true /*is_executable_pool*/); }); } diff --git a/src/Storages/StorageExecutable.h b/src/Storages/StorageExecutable.h index 033a9a35e7e..8578ec3fee7 100644 --- a/src/Storages/StorageExecutable.h +++ b/src/Storages/StorageExecutable.h @@ -3,19 +3,30 @@ #include #include #include -#include +#include +#include namespace DB { + /** * This class represents table engine for external executable files. + * Executable storage that will start process for read. + * ExecutablePool storage maintain pool of processes and take process from pool for read. 
*/ class StorageExecutable final : public shared_ptr_helper, public IStorage { friend struct shared_ptr_helper; public: - String getName() const override { return "Executable"; } + + String getName() const override + { + if (process_pool) + return "ExecutablePool"; + else + return "Executable"; + } Pipe read( const Names & column_names, @@ -36,12 +47,24 @@ protected: const ColumnsDescription & columns, const ConstraintsDescription & constraints); + StorageExecutable( + const StorageID & table_id, + const String & script_name_, + const std::vector & arguments_, + const String & format_, + const std::vector & input_queries_, + const ExecutablePoolSettings & pool_settings_, + const ColumnsDescription & columns, + const ConstraintsDescription & constraints); + private: String script_name; std::vector arguments; String format; std::vector input_queries; + ExecutablePoolSettings pool_settings; + std::shared_ptr process_pool; Poco::Logger * log; }; -} +} diff --git a/tests/integration/test_executable_table_function/test.py b/tests/integration/test_executable_table_function/test.py index 24bc01a300c..3106cde90c0 100644 --- a/tests/integration/test_executable_table_function/test.py +++ b/tests/integration/test_executable_table_function/test.py @@ -55,3 +55,15 @@ def test_executable_storage_argument(started_cluster): node.query("CREATE TABLE test_table (value String) ENGINE=Executable('test_argument.sh 1', 'TabSeparated')") assert node.query("SELECT * FROM test_table") == 'Key 1\n' node.query("DROP TABLE test_table") + +def test_executable_pool_storage(started_cluster): + node.query("DROP TABLE IF EXISTS test_table") + node.query("CREATE TABLE test_table (value String) ENGINE=ExecutablePool('test_input_process_pool.sh', 'TabSeparated', (SELECT 1))") + assert node.query("SELECT * FROM test_table") == 'Key 1\n' + node.query("DROP TABLE test_table") + +def test_executable_pool_storage_multiple_pipes(started_cluster): + node.query("DROP TABLE IF EXISTS test_table") + node.query("CREATE TABLE test_table (value String) ENGINE=ExecutablePool('test_input_process_pool_multiple_pipes.sh', 'TabSeparated', (SELECT 1), (SELECT 2), (SELECT 3))") + assert node.query("SELECT * FROM test_table") == 'Key from 4 fd 3\nKey from 3 fd 2\nKey from 0 fd 1\n' + node.query("DROP TABLE test_table") diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool.sh b/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool.sh new file mode 100755 index 00000000000..f569b2dbbaa --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool.sh @@ -0,0 +1,3 @@ +#!/bin/bash + +while read read_data; do printf '1'; printf "Key $read_data\n"; done diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool_multiple_pipes.sh b/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool_multiple_pipes.sh new file mode 100755 index 00000000000..6fcc412f86a --- /dev/null +++ b/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool_multiple_pipes.sh @@ -0,0 +1,10 @@ +#!/bin/bash + +read -t 250 -u 4 read_data_from_4_fd; +read -t 250 -u 3 read_data_from_3_fd; +read -t 250 read_data_from_0_df; + +printf '3'; +printf "Key from 4 fd $read_data_from_4_fd\n"; +printf "Key from 3 fd $read_data_from_3_fd\n"; +printf "Key from 0 fd $read_data_from_0_df\n"; From f1bc3d77207fbbc48918b037fc7d8d2ea73b4564 Mon Sep 17 00:00:00 2001 From: alesapin Date: 
Thu, 2 Sep 2021 16:54:30 +0300 Subject: [PATCH 064/177] Add test for keeper 2 node configuration --- .../test_keeper_two_nodes_cluster/__init__.py | 1 + .../configs/enable_keeper1.xml | 33 ++++ .../configs/enable_keeper2.xml | 33 ++++ .../configs/use_keeper.xml | 12 ++ .../test_keeper_two_nodes_cluster/test.py | 169 ++++++++++++++++++ 5 files changed, 248 insertions(+) create mode 100644 tests/integration/test_keeper_two_nodes_cluster/__init__.py create mode 100644 tests/integration/test_keeper_two_nodes_cluster/configs/enable_keeper1.xml create mode 100644 tests/integration/test_keeper_two_nodes_cluster/configs/enable_keeper2.xml create mode 100644 tests/integration/test_keeper_two_nodes_cluster/configs/use_keeper.xml create mode 100644 tests/integration/test_keeper_two_nodes_cluster/test.py diff --git a/tests/integration/test_keeper_two_nodes_cluster/__init__.py b/tests/integration/test_keeper_two_nodes_cluster/__init__.py new file mode 100644 index 00000000000..e5a0d9b4834 --- /dev/null +++ b/tests/integration/test_keeper_two_nodes_cluster/__init__.py @@ -0,0 +1 @@ +#!/usr/bin/env python3 diff --git a/tests/integration/test_keeper_two_nodes_cluster/configs/enable_keeper1.xml b/tests/integration/test_keeper_two_nodes_cluster/configs/enable_keeper1.xml new file mode 100644 index 00000000000..21601ff4cc0 --- /dev/null +++ b/tests/integration/test_keeper_two_nodes_cluster/configs/enable_keeper1.xml @@ -0,0 +1,33 @@ + + + 9181 + 1 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + + diff --git a/tests/integration/test_keeper_two_nodes_cluster/configs/enable_keeper2.xml b/tests/integration/test_keeper_two_nodes_cluster/configs/enable_keeper2.xml new file mode 100644 index 00000000000..baee6b578a0 --- /dev/null +++ b/tests/integration/test_keeper_two_nodes_cluster/configs/enable_keeper2.xml @@ -0,0 +1,33 @@ + + + 9181 + 2 + /var/lib/clickhouse/coordination/log + /var/lib/clickhouse/coordination/snapshots + + + 5000 + 10000 + 75 + trace + + + + + 1 + node1 + 44444 + true + 3 + + + 2 + node2 + 44444 + true + true + 2 + + + + diff --git a/tests/integration/test_keeper_two_nodes_cluster/configs/use_keeper.xml b/tests/integration/test_keeper_two_nodes_cluster/configs/use_keeper.xml new file mode 100644 index 00000000000..740b2afaab9 --- /dev/null +++ b/tests/integration/test_keeper_two_nodes_cluster/configs/use_keeper.xml @@ -0,0 +1,12 @@ + + + + node1 + 9181 + + + node2 + 9181 + + + diff --git a/tests/integration/test_keeper_two_nodes_cluster/test.py b/tests/integration/test_keeper_two_nodes_cluster/test.py new file mode 100644 index 00000000000..5b71ddf7087 --- /dev/null +++ b/tests/integration/test_keeper_two_nodes_cluster/test.py @@ -0,0 +1,169 @@ +#!/usr/bin/env python3 + +import pytest +from helpers.cluster import ClickHouseCluster +import random +import string +import os +import time +from multiprocessing.dummy import Pool +from helpers.network import PartitionManager +from helpers.test_tools import assert_eq_with_retry + +cluster = ClickHouseCluster(__file__) +node1 = cluster.add_instance('node1', main_configs=['configs/enable_keeper1.xml', 'configs/use_keeper.xml'], stay_alive=True) +node2 = cluster.add_instance('node2', main_configs=['configs/enable_keeper2.xml', 'configs/use_keeper.xml'], stay_alive=True) + +from kazoo.client import KazooClient, KazooState + +@pytest.fixture(scope="module") +def started_cluster(): + try: + cluster.start() 
+
+        yield cluster
+
+    finally:
+        cluster.shutdown()
+
+def smaller_exception(ex):
+    return '\n'.join(str(ex).split('\n')[0:2])
+
+def wait_node(node):
+    for _ in range(100):
+        zk = None
+        try:
+            node.query("SELECT * FROM system.zookeeper WHERE path = '/'")
+            zk = get_fake_zk(node.name, timeout=30.0)
+            zk.create("/test", sequence=True)
+            print("node", node.name, "ready")
+            break
+        except Exception as ex:
+            time.sleep(0.2)
+            print("Waiting until", node.name, "will be ready, exception", ex)
+        finally:
+            if zk:
+                zk.stop()
+                zk.close()
+    else:
+        raise Exception("Can't wait node", node.name, "to become ready")
+
+def wait_nodes():
+    for node in [node1, node2]:
+        wait_node(node)
+
+
+def get_fake_zk(nodename, timeout=30.0):
+    _fake_zk_instance = KazooClient(hosts=cluster.get_instance_ip(nodename) + ":9181", timeout=timeout)
+    _fake_zk_instance.start()
+    return _fake_zk_instance
+
+def test_read_write_two_nodes(started_cluster):
+    try:
+        wait_nodes()
+        node1_zk = get_fake_zk("node1")
+        node2_zk = get_fake_zk("node2")
+
+        node1_zk.create("/test_read_write_multinode_node1", b"somedata1")
+        node2_zk.create("/test_read_write_multinode_node2", b"somedata2")
+
+        # stale reads are allowed
+        while node1_zk.exists("/test_read_write_multinode_node2") is None:
+            time.sleep(0.1)
+
+        # stale reads are allowed
+        while node2_zk.exists("/test_read_write_multinode_node1") is None:
+            time.sleep(0.1)
+
+        assert node2_zk.get("/test_read_write_multinode_node1")[0] == b"somedata1"
+        assert node1_zk.get("/test_read_write_multinode_node1")[0] == b"somedata1"
+
+        assert node2_zk.get("/test_read_write_multinode_node2")[0] == b"somedata2"
+        assert node1_zk.get("/test_read_write_multinode_node2")[0] == b"somedata2"
+
+    finally:
+        try:
+            for zk_conn in [node1_zk, node2_zk]:
+                zk_conn.stop()
+                zk_conn.close()
+        except:
+            pass
+
+def test_read_write_two_nodes_with_blocade(started_cluster):
+    try:
+        wait_nodes()
+        node1_zk = get_fake_zk("node1")
+        node2_zk = get_fake_zk("node2")
+
+        print("Blocking nodes")
+        with PartitionManager() as pm:
+            pm.partition_instances(node2, node1)
+
+            # The server will respond with connection loss, but will process this query
+            # after the blockade is removed
+            with pytest.raises(Exception):
+                node1_zk.create("/test_read_write_blocked_node1", b"somedata1")
+
+            # This node is not leader and will not process anything
+            with pytest.raises(Exception):
+                node2_zk.create("/test_read_write_blocked_node2", b"somedata2")
+
+
+        print("Nodes unblocked")
+        for i in range(10):
+            try:
+                node1_zk = get_fake_zk("node1")
+                node2_zk = get_fake_zk("node2")
+                break
+            except:
+                time.sleep(0.5)
+
+
+        for i in range(100):
+            try:
+                node1_zk.create("/test_after_block1", b"somedata12")
+                break
+            except:
+                time.sleep(0.1)
+        else:
+            raise Exception("node1 cannot recover after blockade")
+
+        print("Node1 created its value")
+
+        for i in range(100):
+            try:
+                node2_zk.create("/test_after_block2", b"somedata12")
+                break
+            except:
+                time.sleep(0.1)
+        else:
+            raise Exception("node2 cannot recover after blockade")
+
+        print("Node2 created its value")
+
+        assert node1_zk.exists("/test_read_write_blocked_node1") is not None
+        assert node2_zk.exists("/test_read_write_blocked_node1") is not None
+
+        assert node1_zk.exists("/test_read_write_blocked_node2") is None
+        assert node2_zk.exists("/test_read_write_blocked_node2") is None
+
+        # stale reads are allowed
+        while node1_zk.exists("/test_after_block2") is None:
+            time.sleep(0.1)
+
+        # stale reads are allowed
+        while node2_zk.exists("/test_after_block1") is None:
+            time.sleep(0.1)
+
+        assert
node1_zk.exists("/test_after_block1") is not None + assert node1_zk.exists("/test_after_block2") is not None + assert node2_zk.exists("/test_after_block1") is not None + assert node2_zk.exists("/test_after_block2") is not None + + finally: + try: + for zk_conn in [node1_zk, node2_zk, node3_zk]: + zk_conn.stop() + zk_conn.close() + except: + pass From a6fe91ca47be99ab91dafc76de966619592dc0cf Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 2 Sep 2021 17:40:29 +0000 Subject: [PATCH 065/177] get rid of half of allocations --- src/Common/PlainMultiSet.h | 114 ++++++++++++++ src/Common/RingBuffer.h | 122 +++++++++++++++ src/Common/ThreadPool.h | 1 - src/Common/tests/gtest_plain_multiset.cpp | 22 +++ src/Common/tests/gtest_ringbuffer.cpp | 142 ++++++++++++++++++ src/Interpreters/Context.cpp | 33 ++-- .../MergeTree/MergeMutateExecutor.cpp | 35 +++-- src/Storages/MergeTree/MergeMutateExecutor.h | 74 +++++---- .../MergeTree/tests/gtest_executor.cpp | 24 +-- 9 files changed, 505 insertions(+), 62 deletions(-) create mode 100644 src/Common/PlainMultiSet.h create mode 100644 src/Common/RingBuffer.h create mode 100644 src/Common/tests/gtest_plain_multiset.cpp create mode 100644 src/Common/tests/gtest_ringbuffer.cpp diff --git a/src/Common/PlainMultiSet.h b/src/Common/PlainMultiSet.h new file mode 100644 index 00000000000..d5b646a53db --- /dev/null +++ b/src/Common/PlainMultiSet.h @@ -0,0 +1,114 @@ +#pragma once + +#include + +namespace DB +{ + + +/** + * Class with O(n) complexity for all methods + * Size has to be fixed. + * The main reason to use this is to get rid of any allocations. + * Used is some executors, where the number of elements is really small. + */ +template +class PlainMultiSet +{ +public: + + explicit PlainMultiSet(size_t capacity_) + { + buffer.resize(capacity_); + } + + + bool tryPush(T element) + { + for (auto & item : buffer) + { + if (item.state == State::EMPTY) + { + item.state = State::FILLED; + item.value = std::move(element); + ++count; + return true; + } + } + + + return false; + } + + bool has(T element) + { + for (auto & item : buffer) + if (item.state == State::FILLED && item.value == element) + return true; + + return false; + } + + + template + std::vector getAll(Predicate && predicate) + { + std::vector suitable; + for (auto & item : buffer) + if (item.state == State::FILLED && predicate(item.value)) + suitable.emplace_back(item.value); + + return suitable; + } + + + bool tryErase(const T & element) + { + for (auto & item : buffer) + { + if (item.state == State::FILLED && item.value == element) + { + item.state = State::EMPTY; + item.value = T{}; + --count; + return true; + } + } + + return false; + } + + size_t size() + { + return count; + } + + void reserve(size_t new_capacity) + { + if (buffer.size() >= new_capacity) + return; + + std::vector new_buffer(std::move(buffer)); + new_buffer.reserve(new_capacity); + + std::swap(new_buffer, buffer); + } + +private: + enum class State + { + EMPTY, + FILLED + }; + + struct Item + { + T value; + State state{State::EMPTY}; + }; + + size_t count{0}; + std::vector buffer; +}; + +} diff --git a/src/Common/RingBuffer.h b/src/Common/RingBuffer.h new file mode 100644 index 00000000000..a619ae2dab6 --- /dev/null +++ b/src/Common/RingBuffer.h @@ -0,0 +1,122 @@ +#pragma once + +#include +#include + +namespace DB +{ + +/** + * A ring buffer of fixed size. + * With an ability to expand / narrow. + * When narrowing only first N elements remain. 
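+ *
+ * Rough usage sketch (mirrors the unit tests added in this commit; the element type is just an example):
+ *
+ *     RingBuffer<int> buffer(2);
+ *     buffer.tryPush(0);      /// true
+ *     buffer.tryPush(1);      /// true
+ *     buffer.tryPush(2);      /// false -- the buffer is full
+ *
+ *     int value;
+ *     buffer.tryPop(&value);  /// true, value == 0
+ *
+ *     buffer.resize(4);       /// capacity can be changed at runtime (expand / narrow)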
+ */ + +template +class RingBuffer { +public: + explicit RingBuffer(size_t capacity_) : capacity(capacity_) + { + buffer.assign(capacity, {}); + } + + size_t size() const + { + return count; + } + + bool empty() const + { + return count == 0; + } + + bool tryPush(T element) + { + if (count == capacity) { + return false; + } + buffer[(position + count) % capacity] = element; + ++count; + return true; + } + + bool tryPop(T * element) + { + if (empty()) { + return false; + } + *element = std::move(buffer[position]); + --count; + position = (position + 1) % capacity; + return true; + } + + template + void removeElements(Predicate && predicate) + { + /// Shift all elements to the beginning of the buffer + std::rotate(buffer.begin(), buffer.begin() + position, buffer.end()); + + /// Remove elements + auto end_removed = std::remove_if(buffer.begin(), buffer.begin() + count, predicate); + + size_t new_count = std::distance(buffer.begin(), end_removed); + + for (size_t i = new_count; i < count; ++i) + buffer[i] = T{}; + + count = new_count; + position = 0; + } + + + void resize(size_t new_capacity) + { + if (new_capacity > capacity) + expand(new_capacity); + else if (new_capacity < capacity) + narrow(new_capacity); + } + +private: + + void expand(size_t new_capacity) + { + bool overflow = (position + count) > capacity; + buffer.resize(new_capacity); + + if (overflow) + { + size_t count_before_end = capacity - position; + for (size_t i = 0; i < count_before_end; ++i) + buffer[new_capacity - i] = buffer[capacity - i]; + position = new_capacity - count_before_end; + } + + capacity = new_capacity; + } + + void narrow(size_t new_capacity) + { + std::vector new_buffer(new_capacity); + + count = std::min(new_capacity, count); + for (size_t i = 0; i < count; ++i) + new_buffer[i] = buffer[(position + i) % capacity]; + + std::swap(buffer, new_buffer); + + position = 0; + capacity = new_capacity; + } + + + std::vector buffer; + size_t position{0}; + size_t count{0}; + size_t capacity{0}; +}; + + + +} diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index 1da5d25eef0..1cac87abb70 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -107,7 +107,6 @@ private: std::list threads; std::exception_ptr first_exception; - template ReturnType scheduleImpl(Job job, int priority, std::optional wait_microseconds); diff --git a/src/Common/tests/gtest_plain_multiset.cpp b/src/Common/tests/gtest_plain_multiset.cpp new file mode 100644 index 00000000000..2018053d6a7 --- /dev/null +++ b/src/Common/tests/gtest_plain_multiset.cpp @@ -0,0 +1,22 @@ +#include + +#include + +#include + +using namespace DB; + + +TEST(PlainMultiSet, Simple) +{ + PlainMultiSet set(10); + + ASSERT_TRUE(set.tryPush(1)); + ASSERT_TRUE(set.tryPush(1)); + ASSERT_TRUE(set.tryPush(2)); + ASSERT_TRUE(set.tryPush(3)); + + ASSERT_TRUE(set.has(1)); + ASSERT_TRUE(set.has(2)); + ASSERT_TRUE(set.has(3)); +} diff --git a/src/Common/tests/gtest_ringbuffer.cpp b/src/Common/tests/gtest_ringbuffer.cpp new file mode 100644 index 00000000000..e56739b21cc --- /dev/null +++ b/src/Common/tests/gtest_ringbuffer.cpp @@ -0,0 +1,142 @@ +#include + +#include + +#include + +using namespace DB; + +TEST(RingBuffer, Empty) +{ + RingBuffer buffer(1); + + ASSERT_TRUE(buffer.size() == 0u); // NOLINT + ASSERT_TRUE(buffer.empty()); +} + +TEST(RingBuffer, PushAndPop) +{ + RingBuffer buffer(2); + + int i; + ASSERT_TRUE(true == buffer.tryPush(0)); + ASSERT_TRUE(true == buffer.tryPush(1)); + ASSERT_TRUE(false == buffer.tryPush(2)); + + ASSERT_TRUE(2u == 
buffer.size()); + ASSERT_TRUE(false == buffer.empty()); + + ASSERT_TRUE(true == buffer.tryPop(&i)); + ASSERT_TRUE(0 == i); + ASSERT_TRUE(true == buffer.tryPop(&i)); + ASSERT_TRUE(1 == i); + + ASSERT_TRUE(false == buffer.tryPop(&i)); + ASSERT_TRUE(buffer.empty()); + ASSERT_TRUE(true == buffer.empty()); +} + +TEST(RingBuffer, Random) +{ + std::random_device device; + std::mt19937 generator(device()); + + std::uniform_int_distribution<> distribution(0, 1); + + RingBuffer buffer(10); + + int next_element = 0; + int next_received_element = 0; + for (int i = 0; i < 100000; ++i) { + if (distribution(generator) == 0) + { + if (buffer.tryPush(next_element)) + next_element++; + } + else + { + int element; + if (buffer.tryPop(&element)) + { + ASSERT_TRUE(next_received_element == element); + next_received_element++; + } + } + } +} + + +TEST(RingBuffer, Resize) +{ + RingBuffer buffer(10); + + for (size_t i = 0; i < 10; ++i) + ASSERT_TRUE(buffer.tryPush(i)); + + buffer.resize(0); + + ASSERT_TRUE(buffer.empty()); + ASSERT_EQ(buffer.size(), 0u); + + ASSERT_FALSE(buffer.tryPush(42)); + + int value; + ASSERT_FALSE(buffer.tryPop(&value)); + + buffer.resize(1); + + ASSERT_TRUE(buffer.tryPush(42)); + ASSERT_TRUE(buffer.tryPop(&value)); + ASSERT_EQ(value, 42); + + buffer.resize(42); + + for (size_t i = 0; i < 42; ++i) + ASSERT_TRUE(buffer.tryPush(i)); + + buffer.resize(56); + + for (size_t i = 0; i < 42; ++i) + { + ASSERT_TRUE(buffer.tryPop(&value)); + ASSERT_EQ(value, i); + } + + for (size_t i = 0; i < 56; ++i) + ASSERT_TRUE(buffer.tryPush(i)); + + buffer.resize(13); + + for (size_t i = 0; i < 13; ++i) + { + ASSERT_TRUE(buffer.tryPop(&value)); + ASSERT_EQ(value, i); + } +} + + +TEST(RingBuffer, removeElements) +{ + RingBuffer buffer(10); + + for (size_t i = 0; i < 10; ++i) + ASSERT_TRUE(buffer.tryPush(i)); + + int value; + ASSERT_TRUE(buffer.tryPop(&value)); + ASSERT_TRUE(buffer.tryPop(&value)); + ASSERT_TRUE(buffer.tryPop(&value)); + + buffer.removeElements([](int current) { return current % 2 == 0; }); + + ASSERT_EQ(buffer.size(), 4); + + ASSERT_TRUE(buffer.tryPop(&value)); + ASSERT_EQ(value, 3); + ASSERT_TRUE(buffer.tryPop(&value)); + ASSERT_EQ(value, 5); + ASSERT_TRUE(buffer.tryPop(&value)); + ASSERT_EQ(value, 7); + ASSERT_TRUE(buffer.tryPop(&value)); + ASSERT_EQ(value, 9); +} diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index c1e92f1be0e..2c7bf9e6eca 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2729,21 +2729,30 @@ PartUUIDsPtr Context::getIgnoredPartUUIDs() const void Context::initializeBackgroundExecutors() { - shared->merge_mutate_executor = MergeTreeBackgroundExecutor::create(MergeTreeBackgroundExecutor::Type::MERGE_MUTATE); - shared->moves_executor = MergeTreeBackgroundExecutor::create(MergeTreeBackgroundExecutor::Type::MOVE); - shared->fetch_executor = MergeTreeBackgroundExecutor::create(MergeTreeBackgroundExecutor::Type::FETCH); + shared->merge_mutate_executor = MergeTreeBackgroundExecutor::create + ( + MergeTreeBackgroundExecutor::Type::MERGE_MUTATE, + [this] () { return getSettingsRef().background_pool_size; }, + [this] () { return getSettingsRef().background_pool_size; }, + CurrentMetrics::BackgroundPoolTask + ); - shared->merge_mutate_executor->setThreadsCount([this] () { return getSettingsRef().background_pool_size; }); - shared->merge_mutate_executor->setTasksCount([this] () { return getSettingsRef().background_pool_size; }); - shared->merge_mutate_executor->setMetric(CurrentMetrics::BackgroundPoolTask); + shared->moves_executor = 
MergeTreeBackgroundExecutor::create + ( + MergeTreeBackgroundExecutor::Type::MOVE, + [this] () { return getSettingsRef().background_move_pool_size; }, + [this] () { return getSettingsRef().background_move_pool_size; }, + CurrentMetrics::BackgroundMovePoolTask + ); - shared->moves_executor->setThreadsCount([this] () { return getSettingsRef().background_move_pool_size; }); - shared->moves_executor->setTasksCount([this] () { return getSettingsRef().background_move_pool_size; }); - shared->moves_executor->setMetric(CurrentMetrics::BackgroundMovePoolTask); - shared->fetch_executor->setThreadsCount([this] () { return getSettingsRef().background_fetches_pool_size; }); - shared->fetch_executor->setTasksCount([this] () { return getSettingsRef().background_fetches_pool_size; }); - shared->fetch_executor->setMetric(CurrentMetrics::BackgroundFetchesPoolTask); + shared->fetch_executor = MergeTreeBackgroundExecutor::create + ( + MergeTreeBackgroundExecutor::Type::FETCH, + [this] () { return getSettingsRef().background_fetches_pool_size; }, + [this] () { return getSettingsRef().background_fetches_pool_size; }, + CurrentMetrics::BackgroundFetchesPoolTask + ); } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index c7b025a74f3..5aecf85a7a4 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -32,12 +32,10 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id /// Mark this StorageID as deleting currently_deleting.emplace(id); - std::erase_if(pending, [&] (auto item) -> bool { return item->task->getStorageID() == id; }); + pending.removeElements([&] (auto item) -> bool { return item->task->getStorageID() == id; }); /// Find pending to wait - for (const auto & item : active) - if (item->task->getStorageID() == id) - tasks_to_wait.emplace_back(item); + tasks_to_wait = active.getAll([&] (auto item) -> bool { return item->task->getStorageID() == id; }); } @@ -66,13 +64,24 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (shutdown_suspend) break; - auto item = std::move(pending.front()); - pending.pop_front(); + ItemPtr item; + if (!pending.tryPop(&item)) + continue; - active.emplace(item); + active.tryPush(item); + + + try + { + /// This is needed to increase / decrease the number of threads at runtime + if (update_timer.compareAndRestartDeferred(1.)) + updateConfiguration(); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } - /// This is needed to increase / decrease the number of threads at runtime - updatePoolConfiguration(); bool res = pool.trySchedule([this, item] () { @@ -80,7 +89,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() auto check_if_deleting = [&] () -> bool { - active.erase(item); + active.tryErase(item); for (auto & id : currently_deleting) { @@ -108,7 +117,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (check_if_deleting()) return; - pending.emplace_back(item); + pending.tryPush(item); has_tasks.notify_one(); return; } @@ -130,8 +139,8 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (!res) { - active.erase(item); - pending.emplace_back(item); + active.tryErase(item); + pending.tryPush(item); } } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index db1e89dd1bd..accb9bf2e0f 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -5,10 +5,13 @@ #include #include #include +#include #include #include -#include +#include +#include +#include #include #include @@ -61,9 +64,19 @@ public: MOVE }; - explicit MergeTreeBackgroundExecutor(Type type_) : type(type_) + MergeTreeBackgroundExecutor( + Type type_, + CountGetter && threads_count_getter_, + CountGetter && max_task_count_getter_, + CurrentMetrics::Metric metric_) + : type(type_) + , threads_count_getter(threads_count_getter_) + , max_task_count_getter(max_task_count_getter_) + , metric(metric_) { name = toString(type); + + updateConfiguration(); scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); } @@ -72,21 +85,6 @@ public: wait(); } - void setThreadsCount(CountGetter && getter) - { - threads_count_getter = getter; - } - - void setTasksCount(CountGetter && getter) - { - max_task_count_getter = getter; - } - - void setMetric(CurrentMetrics::Metric metric_) - { - metric = metric_; - } - bool trySchedule(ExecutableTaskPtr task) { std::lock_guard lock(mutex); @@ -95,10 +93,13 @@ public: return false; auto & value = CurrentMetrics::values[metric]; - if (value.load() >= static_cast(max_task_count_getter())) + if (value.load() >= static_cast(max_tasks_count)) return false; - pending.emplace_back(std::make_shared(std::move(task), metric)); + if (!pending.tryPush(std::make_shared(std::move(task), metric))) + return false; + + has_tasks.notify_one(); return true; } @@ -133,12 +134,27 @@ public: private: - void updatePoolConfiguration() + void updateConfiguration() { - const auto max_threads = threads_count_getter(); - pool.setMaxFreeThreads(0); - pool.setMaxThreads(max_threads); - pool.setQueueSize(max_threads); + auto new_threads_count = threads_count_getter(); + auto new_max_tasks_count = max_task_count_getter(); + + try + { + pending.resize(new_max_tasks_count); + active.reserve(new_max_tasks_count); + + pool.setMaxFreeThreads(0); + pool.setMaxThreads(new_threads_count); + pool.setQueueSize(new_max_tasks_count); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + threads_count = new_threads_count; + max_tasks_count = new_max_tasks_count; } void schedulerThreadFunction(); @@ -151,6 +167,11 @@ private: CountGetter max_task_count_getter; CurrentMetrics::Metric metric; + size_t threads_count{0}; + size_t max_tasks_count{0}; + + AtomicStopwatch update_timer; + struct Item { explicit Item(ExecutableTaskPtr && task_, CurrentMetrics::Metric metric_) @@ -169,8 +190,9 @@ private: using ItemPtr = std::shared_ptr; - std::deque pending; - std::set active; + /// Initially it will be empty + RingBuffer pending{0}; + PlainMultiSet active{0}; std::set currently_deleting; std::mutex remove_mutex; diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index 0a330a0af76..664b09aa82f 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -54,14 +54,16 @@ private: TEST(Executor, RemoveTasks) { - auto executor = DB::MergeTreeBackgroundExecutor::create(DB::MergeTreeBackgroundExecutor::Type::MERGE_MUTATE); - const size_t tasks_kinds = 25; const size_t batch = 100; - executor->setThreadsCount([]() { return tasks_kinds; }); - executor->setTasksCount([] () { return tasks_kinds * batch; }); - executor->setMetric(CurrentMetrics::BackgroundPoolTask); + auto executor = DB::MergeTreeBackgroundExecutor::create + ( + DB::MergeTreeBackgroundExecutor::Type::MERGE_MUTATE, + [] () { return tasks_kinds; }, + [] () { return tasks_kinds * batch; }, + CurrentMetrics::BackgroundPoolTask + ); for (size_t i = 0; i < batch; ++i) for (size_t j = 0; j < tasks_kinds; ++j) @@ -93,16 +95,18 @@ TEST(Executor, RemoveTasks) TEST(Executor, RemoveTasksStress) { - auto executor = DB::MergeTreeBackgroundExecutor::create(DB::MergeTreeBackgroundExecutor::Type::MERGE_MUTATE); - const size_t tasks_kinds = 25; const size_t batch = 100; const size_t schedulers_count = 5; const size_t removers_count = 5; - executor->setThreadsCount([]() { return tasks_kinds; }); - executor->setTasksCount([] () { return tasks_kinds * batch * (schedulers_count + removers_count); }); - executor->setMetric(CurrentMetrics::BackgroundPoolTask); + auto executor = DB::MergeTreeBackgroundExecutor::create + ( + DB::MergeTreeBackgroundExecutor::Type::MERGE_MUTATE, + [] () { return tasks_kinds; }, + [] () { return tasks_kinds * batch * (schedulers_count + removers_count); }, + CurrentMetrics::BackgroundPoolTask + ); std::barrier barrier(schedulers_count + removers_count); From 6624fa12bae5b7a2c9914b164f25216c226f1e99 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 2 Sep 2021 18:49:37 +0000 Subject: [PATCH 066/177] deleted set --- src/Common/PlainMultiSet.h | 114 ------------------ src/Common/RingBuffer.h | 42 ++++++- src/Common/tests/gtest_plain_multiset.cpp | 22 ---- src/Common/tests/gtest_ringbuffer.cpp | 2 +- .../MergeTree/MergeMutateExecutor.cpp | 11 +- src/Storages/MergeTree/MergeMutateExecutor.h | 5 +- 6 files changed, 43 insertions(+), 153 deletions(-) delete mode 100644 src/Common/PlainMultiSet.h delete mode 100644 src/Common/tests/gtest_plain_multiset.cpp diff --git a/src/Common/PlainMultiSet.h b/src/Common/PlainMultiSet.h deleted file mode 100644 index d5b646a53db..00000000000 --- a/src/Common/PlainMultiSet.h +++ /dev/null @@ -1,114 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - - -/** - * Class with O(n) complexity for all methods - * Size has to be fixed. - * The main reason to use this is to get rid of any allocations. 
- * Used is some executors, where the number of elements is really small. - */ -template -class PlainMultiSet -{ -public: - - explicit PlainMultiSet(size_t capacity_) - { - buffer.resize(capacity_); - } - - - bool tryPush(T element) - { - for (auto & item : buffer) - { - if (item.state == State::EMPTY) - { - item.state = State::FILLED; - item.value = std::move(element); - ++count; - return true; - } - } - - - return false; - } - - bool has(T element) - { - for (auto & item : buffer) - if (item.state == State::FILLED && item.value == element) - return true; - - return false; - } - - - template - std::vector getAll(Predicate && predicate) - { - std::vector suitable; - for (auto & item : buffer) - if (item.state == State::FILLED && predicate(item.value)) - suitable.emplace_back(item.value); - - return suitable; - } - - - bool tryErase(const T & element) - { - for (auto & item : buffer) - { - if (item.state == State::FILLED && item.value == element) - { - item.state = State::EMPTY; - item.value = T{}; - --count; - return true; - } - } - - return false; - } - - size_t size() - { - return count; - } - - void reserve(size_t new_capacity) - { - if (buffer.size() >= new_capacity) - return; - - std::vector new_buffer(std::move(buffer)); - new_buffer.reserve(new_capacity); - - std::swap(new_buffer, buffer); - } - -private: - enum class State - { - EMPTY, - FILLED - }; - - struct Item - { - T value; - State state{State::EMPTY}; - }; - - size_t count{0}; - std::vector buffer; -}; - -} diff --git a/src/Common/RingBuffer.h b/src/Common/RingBuffer.h index a619ae2dab6..fb208307c3b 100644 --- a/src/Common/RingBuffer.h +++ b/src/Common/RingBuffer.h @@ -35,7 +35,7 @@ public: if (count == capacity) { return false; } - buffer[(position + count) % capacity] = element; + buffer[advance(count)] = element; ++count; return true; } @@ -47,21 +47,19 @@ public: } *element = std::move(buffer[position]); --count; - position = (position + 1) % capacity; + position = advance(); return true; } template - void removeElements(Predicate && predicate) + void eraseAll(Predicate && predicate) { /// Shift all elements to the beginning of the buffer std::rotate(buffer.begin(), buffer.begin() + position, buffer.end()); - /// Remove elements auto end_removed = std::remove_if(buffer.begin(), buffer.begin() + count, predicate); size_t new_count = std::distance(buffer.begin(), end_removed); - for (size_t i = new_count; i < count; ++i) buffer[i] = T{}; @@ -69,6 +67,31 @@ public: position = 0; } + template + std::vector getAll(Predicate && predicate) + { + std::vector suitable; + + for (size_t i = 0; i < count; ++i) + { + auto item = buffer[advance(i)]; + if (predicate(item)) + suitable.emplace_back(item); + } + + return suitable; + } + + template + bool has(Predicate && predicate) + { + for (size_t i = 0; i < count; ++i) + if (predicate(buffer[advance(i)])) + return true; + + return false; + } + void resize(size_t new_capacity) { @@ -80,6 +103,13 @@ public: private: + size_t advance(size_t amount = 1) + { + if (position + amount >= capacity) + return position + amount - capacity; + return position + amount; + } + void expand(size_t new_capacity) { bool overflow = (position + count) > capacity; @@ -102,7 +132,7 @@ private: count = std::min(new_capacity, count); for (size_t i = 0; i < count; ++i) - new_buffer[i] = buffer[(position + i) % capacity]; + new_buffer[i] = buffer[advance(i)]; std::swap(buffer, new_buffer); diff --git a/src/Common/tests/gtest_plain_multiset.cpp b/src/Common/tests/gtest_plain_multiset.cpp deleted file mode 100644 
index 2018053d6a7..00000000000 --- a/src/Common/tests/gtest_plain_multiset.cpp +++ /dev/null @@ -1,22 +0,0 @@ -#include - -#include - -#include - -using namespace DB; - - -TEST(PlainMultiSet, Simple) -{ - PlainMultiSet set(10); - - ASSERT_TRUE(set.tryPush(1)); - ASSERT_TRUE(set.tryPush(1)); - ASSERT_TRUE(set.tryPush(2)); - ASSERT_TRUE(set.tryPush(3)); - - ASSERT_TRUE(set.has(1)); - ASSERT_TRUE(set.has(2)); - ASSERT_TRUE(set.has(3)); -} diff --git a/src/Common/tests/gtest_ringbuffer.cpp b/src/Common/tests/gtest_ringbuffer.cpp index e56739b21cc..394922e4e20 100644 --- a/src/Common/tests/gtest_ringbuffer.cpp +++ b/src/Common/tests/gtest_ringbuffer.cpp @@ -127,7 +127,7 @@ TEST(RingBuffer, removeElements) ASSERT_TRUE(buffer.tryPop(&value)); ASSERT_TRUE(buffer.tryPop(&value)); - buffer.removeElements([](int current) { return current % 2 == 0; }); + buffer.eraseAll([](int current) { return current % 2 == 0; }); ASSERT_EQ(buffer.size(), 4); diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index 5aecf85a7a4..e430c488d30 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -32,9 +32,8 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id /// Mark this StorageID as deleting currently_deleting.emplace(id); - pending.removeElements([&] (auto item) -> bool { return item->task->getStorageID() == id; }); - - /// Find pending to wait + /// Erase storage related tasks from pending and select active tasks to wait for + pending.eraseAll([&] (auto item) -> bool { return item->task->getStorageID() == id; }); tasks_to_wait = active.getAll([&] (auto item) -> bool { return item->task->getStorageID() == id; }); } @@ -45,7 +44,6 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id item->future.wait(); } - { std::lock_guard lock(mutex); currently_deleting.erase(id); @@ -70,7 +68,6 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() active.tryPush(item); - try { /// This is needed to increase / decrease the number of threads at runtime @@ -89,7 +86,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() auto check_if_deleting = [&] () -> bool { - active.tryErase(item); + active.eraseAll([&] (auto x) { return x == item; }); for (auto & id : currently_deleting) { @@ -139,7 +136,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (!res) { - active.tryErase(item); + active.eraseAll([&] (auto x) { return x == item; }); pending.tryPush(item); } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index accb9bf2e0f..807730b8dc6 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -11,7 +11,6 @@ #include #include #include -#include #include #include @@ -142,7 +141,7 @@ private: try { pending.resize(new_max_tasks_count); - active.reserve(new_max_tasks_count); + active.resize(new_max_tasks_count); pool.setMaxFreeThreads(0); pool.setMaxThreads(new_threads_count); @@ -192,7 +191,7 @@ private: /// Initially it will be empty RingBuffer pending{0}; - PlainMultiSet active{0}; + RingBuffer active{0}; std::set currently_deleting; std::mutex remove_mutex; From ed70ed6f71e60adfb9f1ee5d48cb158e4642c7cb Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 2 Sep 2021 00:52:38 +0300 Subject: [PATCH 067/177] Introduce connection_no_wait setting for MySQL engine. 
This will allow to avoid superfluous sleep during query execution, since
this is not only undesired behaviour, but may also hang the server, since
if you execute enough queries that use the MySQL database but the server
will not allow enough connections (or your MySQL server is too slow), then
you may run out of threads in the global thread pool.

Also note that right now it is possible to get a deadlock when the mysql
pool is full, consider the following scenario:

- you have m1 and m2 mysql tables
- you have q1 and q2 queries, both queries join m1 and m2
- q1 allocated connection for m1 but cannot allocate connection for m2
- q2 allocated connection for m2 but cannot allocate connection for m1
- but to resolve the lock one should give up on the locking, which is
  not possible right now...

And then you get no free threads and this:

    # grep -h ^202 /proc/$(pgrep clickhouse-serv)/task/*/syscall | cut -d' ' -f2 | sort | uniq -c | sort -nr | head
       1554 0x7ffb60b92fe8 # mutex in mysqlxx::PoolWithFailover::get
       1375 0x7ffb9f1c4748 # mutex in ::PoolEntryHelper::~PoolEntryHelper from DB::MultiplexedConnections::invalidateReplica
       1160 0x7ffb612918b8 # mutex in mysqlxx::PoolWithFailover::get
         42 0x7ffb9f057984 # mutex in ThreadPoolImpl::worker

    *NOTE: 202 is a `futex` with WAIT*

(Went with `syscall` because debugging 10k+ threads is not easy, and
eventually it may TRAP)
---
 base/mysqlxx/Pool.cpp                         | 29 +++++++++++++++++--
 base/mysqlxx/Pool.h                           |  2 +-
 base/mysqlxx/PoolWithFailover.cpp             | 12 +++++---
 base/mysqlxx/PoolWithFailover.h               |  6 +++-
 .../table-engines/integrations/mysql.md       |  1 +
 .../MySQL/MaterializedMySQLSyncThread.cpp     |  4 +--
 src/Storages/MySQL/MySQLSettings.h            |  1 +
 src/Storages/StorageMySQL.cpp                 | 10 +++++--
 8 files changed, 51 insertions(+), 14 deletions(-)

diff --git a/base/mysqlxx/Pool.cpp b/base/mysqlxx/Pool.cpp
index 2f47aa67356..cee386311d4 100644
--- a/base/mysqlxx/Pool.cpp
+++ b/base/mysqlxx/Pool.cpp
@@ -7,10 +7,22 @@
 #endif

 #include
-
 #include
-
 #include
+#include
+
+
+namespace
+{
+
+inline uint64_t clock_gettime_ns(clockid_t clock_type = CLOCK_MONOTONIC)
+{
+    struct timespec ts;
+    clock_gettime(clock_type, &ts);
+    return uint64_t(ts.tv_sec * 1000000000LL + ts.tv_nsec);
+}
+
+}

 namespace mysqlxx
@@ -124,10 +136,15 @@ Pool::~Pool()
 }


-Pool::Entry Pool::get()
+Pool::Entry Pool::get(uint64_t wait_timeout)
 {
     std::unique_lock lock(mutex);

+    uint64_t deadline = 0;
+    /// UINT64_MAX -- wait indefinitely
+    if (wait_timeout && wait_timeout != UINT64_MAX)
+        deadline = clock_gettime_ns() + wait_timeout * 1'000'000'000;
+
     initialize();
     for (;;)
     {
@@ -153,6 +170,12 @@ Pool::Entry Pool::get()
             logger.trace("(%s): Unable to create a new connection: Max number of connections has been reached.", getDescription());
         }

+        if (!wait_timeout)
+            throw Poco::Exception("mysqlxx::Pool is full (wait is disabled, see connection_wait_timeout setting)");
+
+        if (deadline && clock_gettime_ns() >= deadline)
+            throw Poco::Exception("mysqlxx::Pool is full (connection_wait_timeout is exceeded)");
+
         lock.unlock();
         logger.trace("(%s): Sleeping for %d seconds.", getDescription(), MYSQLXX_POOL_SLEEP_ON_CONNECT_FAIL);
         sleepForSeconds(MYSQLXX_POOL_SLEEP_ON_CONNECT_FAIL);
diff --git a/base/mysqlxx/Pool.h b/base/mysqlxx/Pool.h
index 530e2c78cf2..08d8b85b4ac 100644
--- a/base/mysqlxx/Pool.h
+++ b/base/mysqlxx/Pool.h
@@ -189,7 +189,7 @@ public:
     ~Pool();

     /// Allocates connection.
-    Entry get();
+    Entry get(uint64_t wait_timeout);

     /// Allocates connection.
     /// If database is not accessible, returns empty Entry object.
diff --git a/base/mysqlxx/PoolWithFailover.cpp b/base/mysqlxx/PoolWithFailover.cpp index e317ab7f228..14c0db9ecd5 100644 --- a/base/mysqlxx/PoolWithFailover.cpp +++ b/base/mysqlxx/PoolWithFailover.cpp @@ -21,8 +21,9 @@ PoolWithFailover::PoolWithFailover( const unsigned max_connections_, const size_t max_tries_) : max_tries(max_tries_) + , shareable(config_.getBool(config_name_ + ".share_connection", false)) + , wait_timeout(UINT64_MAX) { - shareable = config_.getBool(config_name_ + ".share_connection", false); if (config_.has(config_name_ + ".replica")) { Poco::Util::AbstractConfiguration::Keys replica_keys; @@ -80,9 +81,11 @@ PoolWithFailover::PoolWithFailover( const std::string & password, unsigned default_connections_, unsigned max_connections_, - size_t max_tries_) + size_t max_tries_, + uint64_t wait_timeout_) : max_tries(max_tries_) , shareable(false) + , wait_timeout(wait_timeout_) { /// Replicas have the same priority, but traversed replicas are moved to the end of the queue. for (const auto & [host, port] : addresses) @@ -101,6 +104,7 @@ PoolWithFailover::PoolWithFailover( PoolWithFailover::PoolWithFailover(const PoolWithFailover & other) : max_tries{other.max_tries} , shareable{other.shareable} + , wait_timeout(other.wait_timeout) { if (shareable) { @@ -140,7 +144,7 @@ PoolWithFailover::Entry PoolWithFailover::get() try { - Entry entry = shareable ? pool->get() : pool->tryGet(); + Entry entry = shareable ? pool->get(wait_timeout) : pool->tryGet(); if (!entry.isNull()) { @@ -172,7 +176,7 @@ PoolWithFailover::Entry PoolWithFailover::get() if (full_pool) { app.logger().error("All connections failed, trying to wait on a full pool " + (*full_pool)->getDescription()); - return (*full_pool)->get(); + return (*full_pool)->get(wait_timeout); } std::stringstream message; diff --git a/base/mysqlxx/PoolWithFailover.h b/base/mysqlxx/PoolWithFailover.h index 1c7a63e76c0..2bd5ec9f30a 100644 --- a/base/mysqlxx/PoolWithFailover.h +++ b/base/mysqlxx/PoolWithFailover.h @@ -80,6 +80,8 @@ namespace mysqlxx std::mutex mutex; /// Can the Pool be shared bool shareable; + /// Timeout for waiting free connection. + uint64_t wait_timeout = 0; public: using Entry = Pool::Entry; @@ -96,6 +98,7 @@ namespace mysqlxx * default_connections Number of connection in pool to each replica at start. * max_connections Maximum number of connections in pool to each replica. * max_tries_ Max number of connection tries. + * wait_timeout_ Timeout for waiting free connection. 
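+     *                          (0 means fail immediately when the pool is full; UINT64_MAX, the
+     *                          default, keeps the old behaviour of waiting indefinitely -- see
+     *                          Pool::get() earlier in this patch.)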
*/ PoolWithFailover( const std::string & config_name_, @@ -117,7 +120,8 @@ namespace mysqlxx const std::string & password, unsigned default_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS, unsigned max_connections_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_CONNECTIONS, - size_t max_tries_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES); + size_t max_tries_ = MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES, + uint64_t wait_timeout_ = UINT64_MAX); PoolWithFailover(const PoolWithFailover & other); diff --git a/docs/en/engines/table-engines/integrations/mysql.md b/docs/en/engines/table-engines/integrations/mysql.md index a6402e00bc9..7eac159a645 100644 --- a/docs/en/engines/table-engines/integrations/mysql.md +++ b/docs/en/engines/table-engines/integrations/mysql.md @@ -19,6 +19,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster] SETTINGS [connection_pool_size=16, ] [connection_max_tries=3, ] + [connection_wait_timeout=5, ] /* 0 -- do not wait */ [connection_auto_close=true ] ; ``` diff --git a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp index 53495aa3cb1..560d2d716c9 100644 --- a/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp +++ b/src/Databases/MySQL/MaterializedMySQLSyncThread.cpp @@ -247,7 +247,7 @@ void MaterializedMySQLSyncThread::assertMySQLAvailable() { try { - checkMySQLVariables(pool.get(), getContext()->getSettingsRef()); + checkMySQLVariables(pool.get(/* wait_timeout= */ UINT64_MAX), getContext()->getSettingsRef()); } catch (const mysqlxx::ConnectionFailed & e) { @@ -729,7 +729,7 @@ void MaterializedMySQLSyncThread::onEvent(Buffers & buffers, const BinlogEventPt { /// Some behaviors(such as changing the value of "binlog_checksum") rotate the binlog file. /// To ensure that the synchronization continues, we need to handle these events - metadata.fetchMasterVariablesValue(pool.get()); + metadata.fetchMasterVariablesValue(pool.get(/* wait_timeout= */ UINT64_MAX)); client.setBinlogChecksum(metadata.binlog_checksum); } else if (receive_event->header.type != HEARTBEAT_EVENT) diff --git a/src/Storages/MySQL/MySQLSettings.h b/src/Storages/MySQL/MySQLSettings.h index da8723c2ea6..872b0607e20 100644 --- a/src/Storages/MySQL/MySQLSettings.h +++ b/src/Storages/MySQL/MySQLSettings.h @@ -17,6 +17,7 @@ class ASTStorage; #define LIST_OF_MYSQL_SETTINGS(M) \ M(UInt64, connection_pool_size, 16, "Size of connection pool (if all connections are in use, the query will wait until some connection will be freed).", 0) \ M(UInt64, connection_max_tries, 3, "Number of retries for pool with failover", 0) \ + M(UInt64, connection_wait_timeout, 5, "Timeout (in seconds) for waiting for free connection (in case of there is already connection_pool_size active connections), 0 - do not wait.", 0) \ M(Bool, connection_auto_close, true, "Auto-close connection after query execution, i.e. 
disable connection reuse.", 0) \ DECLARE_SETTINGS_TRAITS(MySQLSettingsTraits, LIST_OF_MYSQL_SETTINGS) diff --git a/src/Storages/StorageMySQL.cpp b/src/Storages/StorageMySQL.cpp index 79bb1f59cc7..7f458ef82af 100644 --- a/src/Storages/StorageMySQL.cpp +++ b/src/Storages/StorageMySQL.cpp @@ -267,11 +267,15 @@ void registerStorageMySQL(StorageFactory & factory) throw Exception("connection_pool_size cannot be zero.", ErrorCodes::BAD_ARGUMENTS); auto addresses = parseRemoteDescriptionForExternalDatabase(host_port, max_addresses, 3306); - mysqlxx::PoolWithFailover pool(remote_database, addresses, - username, password, + mysqlxx::PoolWithFailover pool( + remote_database, + addresses, + username, + password, MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS, mysql_settings.connection_pool_size, - mysql_settings.connection_max_tries); + mysql_settings.connection_max_tries, + mysql_settings.connection_wait_timeout); bool replace_query = false; std::string on_duplicate_clause; From 6d5f01a56bb1715c47de8444bfc85b39228f3081 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 2 Sep 2021 22:32:32 +0300 Subject: [PATCH 068/177] Cover MySQL setting connection_wait_timeout --- tests/integration/test_storage_mysql/test.py | 48 ++++++++++++++++++++ 1 file changed, 48 insertions(+) diff --git a/tests/integration/test_storage_mysql/test.py b/tests/integration/test_storage_mysql/test.py index a044528cacf..c7ede8dede4 100644 --- a/tests/integration/test_storage_mysql/test.py +++ b/tests/integration/test_storage_mysql/test.py @@ -3,7 +3,10 @@ from contextlib import contextmanager ## sudo -H pip install PyMySQL import pymysql.cursors import pytest +import time +import threading from helpers.cluster import ClickHouseCluster +from helpers.client import QueryRuntimeException cluster = ClickHouseCluster(__file__) @@ -319,6 +322,51 @@ CREATE TABLE {}(id UInt32, name String, age UInt32, money UInt32) ENGINE = MySQL conn.close() +# Check that limited connection_wait_timeout (via connection_pool_size=1) will throw. 
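Taken together, the new table-level setting travels from `MySQLSettings` into the `mysqlxx::PoolWithFailover` constructor shown above. The sketch below condenses that wiring; the helper name is hypothetical and the address type is abbreviated to plain host/port pairs. The integration test that follows (introduced by the comment just above) exercises the timeout end to end.

```cpp
#include <mysqlxx/PoolWithFailover.h>
#include <Storages/MySQL/MySQLSettings.h>
#include <string>
#include <utility>
#include <vector>

/// Hypothetical helper mirroring registerStorageMySQL() after this patch:
/// connection_wait_timeout (seconds, 0 means "do not wait") is forwarded
/// as the last constructor argument.
mysqlxx::PoolWithFailover makePoolSketch(
    const std::string & remote_database,
    const std::vector<std::pair<std::string, uint16_t>> & addresses,
    const std::string & username,
    const std::string & password,
    const DB::MySQLSettings & mysql_settings)
{
    return mysqlxx::PoolWithFailover(
        remote_database,
        addresses,
        username,
        password,
        MYSQLXX_POOL_WITH_FAILOVER_DEFAULT_START_CONNECTIONS,
        mysql_settings.connection_pool_size,
        mysql_settings.connection_max_tries,
        mysql_settings.connection_wait_timeout);
}
```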
+def test_settings_connection_wait_timeout(started_cluster): + table_name = 'test_settings_connection_wait_timeout' + node1.query(f'DROP TABLE IF EXISTS {table_name}') + wait_timeout = 2 + + conn = get_mysql_conn(started_cluster, cluster.mysql_ip) + drop_mysql_table(conn, table_name) + create_mysql_table(conn, table_name) + + node1.query(''' + CREATE TABLE {} + ( + id UInt32, + name String, + age UInt32, + money UInt32 + ) + ENGINE = MySQL('mysql57:3306', 'clickhouse', '{}', 'root', 'clickhouse') + SETTINGS connection_wait_timeout={}, connection_pool_size=1 + '''.format(table_name, table_name, wait_timeout) + ) + + node1.query("INSERT INTO {} (id, name) SELECT number, concat('name_', toString(number)) from numbers(10) ".format(table_name)) + + def worker(): + node1.query("SELECT sleepEachRow(1) FROM {}".format(table_name)) + + worker_thread = threading.Thread(target=worker) + worker_thread.start() + + # ensure that first query started in worker_thread + time.sleep(1) + + started = time.time() + with pytest.raises(QueryRuntimeException, match=r"Exception: mysqlxx::Pool is full \(connection_wait_timeout is exceeded\)"): + node1.query("SELECT sleepEachRow(1) FROM {}".format(table_name)) + ended = time.time() + assert (ended - started) >= wait_timeout + + worker_thread.join() + + drop_mysql_table(conn, table_name) + conn.close() + if __name__ == '__main__': with contextmanager(started_cluster)() as cluster: for name, instance in list(cluster.instances.items()): From cc7c221fadbb70f69067366f4bde52d371e21421 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 2 Sep 2021 21:31:32 +0000 Subject: [PATCH 069/177] Own PriorityQueue + prettifying the code --- src/Common/PriorityQueue.h | 53 +++++++++++++++++++ src/Common/ThreadPool.cpp | 14 ++--- src/Common/ThreadPool.h | 3 +- src/Common/tests/gtest_priority_queue.cpp | 25 +++++++++ src/Common/tests/gtest_ringbuffer.cpp | 2 +- .../MergeTree/BackgroundJobsExecutor.cpp | 5 +- .../MergeTree/BackgroundJobsExecutor.h | 4 +- src/Storages/MergeTree/ExecutableTask.h | 29 ++++++++++ .../MergeTree/MergeMutateExecutor.cpp | 11 +++- src/Storages/MergeTree/MergeMutateExecutor.h | 35 +----------- src/Storages/MergeTree/MergeTreeData.cpp | 20 ++++++- src/Storages/MergeTree/MergeTreeData.h | 14 +++-- src/Storages/StorageMergeTree.cpp | 11 ++-- src/Storages/StorageMergeTree.h | 18 ------- src/Storages/StorageReplicatedMergeTree.cpp | 6 +-- src/Storages/StorageReplicatedMergeTree.h | 18 ------- 16 files changed, 167 insertions(+), 101 deletions(-) create mode 100644 src/Common/PriorityQueue.h create mode 100644 src/Common/tests/gtest_priority_queue.cpp diff --git a/src/Common/PriorityQueue.h b/src/Common/PriorityQueue.h new file mode 100644 index 00000000000..b845fc0caa8 --- /dev/null +++ b/src/Common/PriorityQueue.h @@ -0,0 +1,53 @@ +#pragma once + +#include + +namespace DB +{ + + +template > +class PriorityQueue +{ +public: + + T pop() + { + assert(!buffer.empty()); + std::pop_heap(buffer.begin(), buffer.end(), comparator); + auto element = std::move(buffer.back()); + buffer.pop_back(); + return element; + } + + void push(T element) + { + buffer.push_back(std::move(element)); + std::push_heap(buffer.begin(), buffer.end(), comparator); + } + + template< class... Args > + void emplace(Args &&... 
args) + { + buffer.emplace_back(std::forward(args)...); + std::push_heap(buffer.begin(), buffer.end(), comparator); + } + + bool empty() { return buffer.empty(); } + size_t size() { return buffer.size(); } + void reserve(size_t count) { buffer.reserve(count); } + void resize(size_t count) + { + buffer.resize(count); + std::make_heap(buffer.begin(), buffer.end(), comparator); + } + +private: + + Comparator comparator; + std::vector buffer; + + +}; + +} diff --git a/src/Common/ThreadPool.cpp b/src/Common/ThreadPool.cpp index 4cb1df3ff65..a50a5dce20a 100644 --- a/src/Common/ThreadPool.cpp +++ b/src/Common/ThreadPool.cpp @@ -74,6 +74,8 @@ void ThreadPoolImpl::setQueueSize(size_t value) { std::lock_guard lock(mutex); queue_size = value; + /// Reserve memory to get rid of allocations + jobs.reserve(queue_size); } @@ -246,17 +248,9 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ need_shutdown = shutdown; if (!jobs.empty()) - { - /// std::priority_queue does not provide interface for getting non-const reference to an element - /// to prevent us from modifying its priority. We have to use const_cast to force move semantics on JobWithPriority::job. - job = std::move(const_cast(jobs.top().job)); - jobs.pop(); - } + job = std::move(jobs.pop().job); else - { - /// shutdown is true, simply finish the thread. - return; - } + return; /// shutdown is true, simply finish the thread. } if (!need_shutdown) diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index 1cac87abb70..25d86a2fabc 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -11,6 +11,7 @@ #include #include +#include #include /** Very simple thread pool similar to boost::threadpool. @@ -103,7 +104,7 @@ private: } }; - std::priority_queue jobs; + DB::PriorityQueue jobs; std::list threads; std::exception_ptr first_exception; diff --git a/src/Common/tests/gtest_priority_queue.cpp b/src/Common/tests/gtest_priority_queue.cpp new file mode 100644 index 00000000000..c2c943ea4cc --- /dev/null +++ b/src/Common/tests/gtest_priority_queue.cpp @@ -0,0 +1,25 @@ +#include + +#include + +#include + +using namespace DB; + +TEST(PriorityQueue, Simple) +{ + PriorityQueue my; + std::priority_queue original; + + for (int i = 0; i < 1000; ++i) + { + my.push(i); + original.emplace(i); + } + + for (int i = 0; i < 1000; ++i) + { + ASSERT_EQ(my.pop(), original.top()); + original.pop(); + } +} diff --git a/src/Common/tests/gtest_ringbuffer.cpp b/src/Common/tests/gtest_ringbuffer.cpp index 394922e4e20..0b4e2c32375 100644 --- a/src/Common/tests/gtest_ringbuffer.cpp +++ b/src/Common/tests/gtest_ringbuffer.cpp @@ -41,7 +41,7 @@ TEST(RingBuffer, Random) std::random_device device; std::mt19937 generator(device()); - std::uniform_int_distribution<> distribution(0, 1); + std::uniform_int_distribution<> distribution(0, 3); RingBuffer buffer(10); diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp index 6d66b3e5a77..b2465add5b3 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp @@ -111,6 +111,8 @@ void BackgroundJobAssignee::finish() getContext()->getFetchesExecutor()->removeTasksCorrespondingToStorage(storage_id); getContext()->getMergeMutateExecutor()->removeTasksCorrespondingToStorage(storage_id); } + + finished = true; } @@ -139,7 +141,8 @@ catch (...) /// Catch any exception to avoid thread termination. 
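A short usage sketch for the `DB::PriorityQueue` introduced above (include path as in the patch): unlike `std::priority_queue`, `pop()` returns the element by value, which is what lets `ThreadPool::worker()` move the job out without the old `const_cast`. The payload values here are illustrative.

```cpp
#include <Common/PriorityQueue.h>
#include <cassert>
#include <string>

void priorityQueueSketch()
{
    DB::PriorityQueue<std::string> queue;   /// max-heap with the default std::less comparator
    queue.push("merge");
    queue.emplace("fetch");
    queue.push("mutate");

    assert(queue.size() == 3);
    std::string top = queue.pop();          /// returns "mutate", the largest element, by value
    assert(top == "mutate");
    assert(queue.size() == 2);
}
```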
BackgroundJobAssignee::~BackgroundJobAssignee() { - finish(); + if (!finished) + throw Exception(ErrorCodes::LOGICAL_ERROR, "finish() method must be called before destructor") } } diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.h b/src/Storages/MergeTree/BackgroundJobsExecutor.h index aba30aa06e7..3d776e56a35 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.h +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.h @@ -1,6 +1,5 @@ #pragma once -#include #include #include #include @@ -28,6 +27,7 @@ struct ExecutableTaskSchedulingSettings double task_sleep_seconds_when_no_work_min = 10; }; +class MergeTreeData; class BackgroundJobAssignee : protected WithContext { @@ -48,6 +48,8 @@ private: /// Mutex for thread safety std::mutex holder_mutex; + bool finished{false}; + public: enum class Type { diff --git a/src/Storages/MergeTree/ExecutableTask.h b/src/Storages/MergeTree/ExecutableTask.h index 5834ec04bfa..2b89a98cac1 100644 --- a/src/Storages/MergeTree/ExecutableTask.h +++ b/src/Storages/MergeTree/ExecutableTask.h @@ -1,7 +1,9 @@ #pragma once #include +#include +#include #include namespace DB @@ -18,4 +20,31 @@ public: using ExecutableTaskPtr = std::shared_ptr; + +class LambdaAdapter : public shared_ptr_helper, public ExecutableTask +{ +public: + + template + explicit LambdaAdapter(InnerJob && inner_, Callback && callback_, StorageID id_) + : inner(inner_), callback(callback_), id(id_) {} + + bool execute() override + { + res = inner(); + return false; + } + + void onCompleted() override { callback(!res); } + + StorageID getStorageID() override { return id; } + +private: + bool res = false; + std::function inner; + std::function callback; + StorageID id; +}; + + } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index e430c488d30..9178ba85452 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -1,5 +1,6 @@ #include +#include #include @@ -53,6 +54,10 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id void MergeTreeBackgroundExecutor::schedulerThreadFunction() { + DENY_ALLOCATIONS_IN_SCOPE; + + bool status; + while (true) { std::unique_lock lock(mutex); @@ -66,7 +71,8 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (!pending.tryPop(&item)) continue; - active.tryPush(item); + status = active.tryPush(item); + assert(status); try { @@ -137,7 +143,8 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (!res) { active.eraseAll([&] (auto x) { return x == item; }); - pending.tryPush(item); + status = pending.tryPush(item); + assert(status); } } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index 807730b8dc6..3470a352b8b 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -4,51 +4,20 @@ #include #include #include +#include #include -#include +#include #include #include #include #include #include -#include namespace DB { -class LambdaAdapter : public shared_ptr_helper, public ExecutableTask -{ -public: - - template - explicit LambdaAdapter(T && inner_, MergeTreeData & data_) : inner(inner_), data(data_) {} - - bool execute() override - { - res = inner(); - inner = {}; - return false; - } - - void onCompleted() override - { - data.triggerBackgroundOperationTask(!res); - } - - StorageID getStorageID() override - { - return data.getStorageID(); - } - -private: - bool res = false; - 
std::function inner; - MergeTreeData & data; -}; - - class MergeTreeBackgroundExecutor : public shared_ptr_helper { public: diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f57bba9682b..fea7e8b5bed 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -199,6 +199,8 @@ MergeTreeData::MergeTreeData( , data_parts_by_info(data_parts_indexes.get()) , data_parts_by_state_and_info(data_parts_indexes.get()) , parts_mover(this) + , background_executor(*this, BackgroundJobAssignee::Type::DataProcessing, getContext()) + , background_moves_executor(*this, BackgroundJobAssignee::Type::Moving, getContext()) { const auto settings = getSettings(); allow_nullable_key = attach || settings->allow_nullable_key; @@ -304,6 +306,22 @@ MergeTreeData::MergeTreeData( if (!canUsePolymorphicParts(*settings, &reason) && !reason.empty()) LOG_WARNING(log, "{} Settings 'min_rows_for_wide_part', 'min_bytes_for_wide_part', " "'min_rows_for_compact_part' and 'min_bytes_for_compact_part' will be ignored.", reason); + + common_assignee_trigger = [this] (bool delay) noexcept + { + if (delay) + background_executor.postpone(); + else + background_executor.trigger(); + }; + + moves_assignee_trigger = [this] (bool delay) noexcept + { + if (delay) + background_moves_executor.postpone(); + else + background_moves_executor.trigger(); + }; } StoragePolicyPtr MergeTreeData::getStoragePolicy() const @@ -4906,7 +4924,7 @@ bool MergeTreeData::scheduleDataMovingJob(BackgroundJobAssignee & executor) [this, moving_tagger] () mutable { return moveParts(moving_tagger); - }, *this)); + }, moves_assignee_trigger, getStorageID())); return true; } diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 8186c9f9937..7bc62f7959e 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -3,6 +3,7 @@ #include #include #include +#include #include #include #include @@ -57,7 +58,6 @@ class ExpressionActions; using ExpressionActionsPtr = std::shared_ptr; using ManyExpressionActions = std::vector; class MergeTreeDeduplicationLog; -class BackgroundJobAssignee; namespace ErrorCodes { @@ -849,9 +849,6 @@ public: /// Mutex for currently_submerging_parts and currently_emerging_parts mutable std::mutex currently_submerging_emerging_mutex; - /// Trigger merge scheduling task - virtual void triggerBackgroundOperationTask(bool delay) = 0; - protected: friend class IMergeTreeDataPart; @@ -923,6 +920,15 @@ protected: MergeTreePartsMover parts_mover; + /// Executors are common for both ReplicatedMergeTree and plain MergeTree + /// but they are being started and finished in derived classes, so let them be protected. + BackgroundJobAssignee background_executor; + BackgroundJobAssignee background_moves_executor; + + /// Every task that is finished will ask to assign a new one into an executor. 
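The relocated `LambdaAdapter` now takes the completion callback and the `StorageID` explicitly instead of holding a `MergeTreeData &`; the trigger members declared just below are what the storages pass in. A sketch of how a task is assembled (the job and trigger bodies are illustrative; which `schedule*` helper of the assignee the result is handed to depends on the task type):

```cpp
#include <Storages/MergeTree/ExecutableTask.h>
#include <Interpreters/StorageID.h>
#include <utility>

/// Sketch only: wrap a job into the LambdaAdapter from ExecutableTask.h.
/// The job returns true when it actually did some work; onCompleted() then
/// calls the trigger with delay = !did_work, so an idle storage gets its
/// next scheduling attempt postponed.
DB::ExecutableTaskPtr makeExampleTask(DB::StorageID storage_id)
{
    auto job = [] () -> bool
    {
        return true;   /// pretend some work was done
    };

    auto trigger = [] (bool delay) noexcept
    {
        (void)delay;   /// a real trigger would postpone() or trigger() an assignee here
    };

    return DB::LambdaAdapter::create(std::move(job), std::move(trigger), storage_id);
}
```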
+ std::function common_assignee_trigger; + std::function moves_assignee_trigger; + using DataPartIteratorByInfo = DataPartsIndexes::index::type::iterator; using DataPartIteratorByStateAndInfo = DataPartsIndexes::index::type::iterator; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index f0ad289ae1c..92b8be63826 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -82,9 +82,6 @@ StorageMergeTree::StorageMergeTree( , reader(*this) , writer(*this) , merger_mutator(*this, getContext()->getSettingsRef().background_pool_size) - , background_executor(*this, BackgroundJobAssignee::Type::DataProcessing, getContext()) - , background_moves_executor(*this, BackgroundJobAssignee::Type::Moving, getContext()) - { loadDataParts(has_force_restore_data_flag); @@ -1080,7 +1077,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee & executo [this, metadata_snapshot, merge_entry, share_lock] () mutable { return mergeSelectedParts(metadata_snapshot, false, {}, *merge_entry, share_lock); - }, *this)); + }, common_assignee_trigger, getStorageID())); return true; } if (mutate_entry) @@ -1089,7 +1086,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee & executo [this, metadata_snapshot, merge_entry, mutate_entry, share_lock] () mutable { return mutateSelectedPart(metadata_snapshot, *mutate_entry, share_lock); - }, *this)); + }, common_assignee_trigger, getStorageID())); return true; } bool executed = false; @@ -1100,7 +1097,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee & executo { clearOldTemporaryDirectories(getSettings()->temporary_directories_lifetime.totalSeconds()); return true; - }, *this)); + }, common_assignee_trigger, getStorageID())); executed = true; } if (time_after_previous_cleanup_parts.compareAndRestartDeferred(getContext()->getSettingsRef().merge_tree_clear_old_parts_interval_seconds)) @@ -1115,7 +1112,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee & executo clearOldMutations(); clearEmptyParts(); return true; - }, *this)); + }, common_assignee_trigger, getStorageID())); executed = true; } diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 4fd38c3618f..c05e4cf44ab 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -16,7 +16,6 @@ #include #include -#include namespace DB @@ -100,19 +99,6 @@ public: MergeTreeDeduplicationLog * getDeduplicationLog() { return deduplication_log.get(); } - void triggerBackgroundOperationTask(bool delay) override - { - if (delay) - background_executor.postpone(); - else - background_executor.trigger(); - - if (delay) - background_moves_executor.postpone(); - else - background_moves_executor.trigger(); - } - private: /// Mutex and condvar for synchronous mutations wait @@ -148,10 +134,6 @@ private: std::atomic shutdown_called {false}; - /// Must be the last to be destroyed first - BackgroundJobAssignee background_executor; - BackgroundJobAssignee background_moves_executor; - void loadMutations(); /// Load and initialize deduplication logs. 
Even if deduplication setting diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 351bc4646f9..df648817641 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -293,8 +293,6 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree( , replicated_fetches_pool_size(getContext()->getSettingsRef().background_fetches_pool_size) , replicated_fetches_throttler(std::make_shared(getSettings()->max_replicated_fetches_network_bandwidth, getContext()->getReplicatedFetchesThrottler())) , replicated_sends_throttler(std::make_shared(getSettings()->max_replicated_sends_network_bandwidth, getContext()->getReplicatedSendsThrottler())) - , background_executor(*this, BackgroundJobAssignee::Type::DataProcessing, getContext()) - , background_moves_executor(*this, BackgroundJobAssignee::Type::Moving, getContext()) { queue_updating_task = getContext()->getSchedulePool().createTask( getStorageID().getFullTableName() + " (StorageReplicatedMergeTree::queueUpdatingTask)", [this]{ queueUpdatingTask(); }); @@ -3230,7 +3228,7 @@ bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee [this, selected_entry] () mutable { return processQueueEntry(selected_entry); - }, *this)); + }, common_assignee_trigger, getStorageID())); return true; } else @@ -3239,7 +3237,7 @@ bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee [this, selected_entry] () mutable { return processQueueEntry(selected_entry); - }, *this)); + }, common_assignee_trigger, getStorageID())); return true; } } diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index ff0843ba064..8701b45002e 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -262,20 +262,6 @@ public: bool createEmptyPartInsteadOfLost(zkutil::ZooKeeperPtr zookeeper, const String & lost_part_name); - - void triggerBackgroundOperationTask(bool delay) override - { - if (delay) - background_executor.postpone(); - else - background_executor.trigger(); - - if (delay) - background_moves_executor.postpone(); - else - background_moves_executor.trigger(); - } - private: std::atomic_bool are_restoring_replica {false}; @@ -408,10 +394,6 @@ private: ThrottlerPtr replicated_fetches_throttler; ThrottlerPtr replicated_sends_throttler; - /// Must be the last to be destroyed first - BackgroundJobAssignee background_executor; - BackgroundJobAssignee background_moves_executor; - template void foreachCommittedParts(Func && func, bool select_sequential_consistency) const; From 7baad6adde239e1a65737355227073ea873c6033 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 2 Sep 2021 22:56:03 +0000 Subject: [PATCH 070/177] Fix build --- src/Storages/MergeTree/BackgroundJobsExecutor.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp index b2465add5b3..b67156f195f 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp @@ -141,8 +141,7 @@ catch (...) /// Catch any exception to avoid thread termination. 
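The hunk that follows (patch 070) downgrades the destructor check added earlier from a throwing branch to `assert(finished)`; the assumed contract is simply that the owning storage calls `finish()` during shutdown before the assignee member is destroyed. A trivial sketch, with an illustrative owner type:

```cpp
#include <Storages/MergeTree/BackgroundJobsExecutor.h>

/// Sketch of the intended shutdown order (assumes finish() is callable by the owner).
struct OwnerSketch
{
    DB::BackgroundJobAssignee & background_executor;

    void shutdown()
    {
        /// Must run before ~BackgroundJobAssignee(), otherwise the
        /// destructor's assert(finished) fires in debug builds.
        background_executor.finish();
    }
};
```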
BackgroundJobAssignee::~BackgroundJobAssignee() { - if (!finished) - throw Exception(ErrorCodes::LOGICAL_ERROR, "finish() method must be called before destructor") + assert(finished); } } From 9257188512ad886f91c32de98e005cd9fb15fee5 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 3 Sep 2021 00:28:29 +0000 Subject: [PATCH 071/177] Comments --- src/Common/RingBuffer.h | 4 ++- .../MergeTree/MergeMutateExecutor.cpp | 29 +++++++++++++++---- 2 files changed, 26 insertions(+), 7 deletions(-) diff --git a/src/Common/RingBuffer.h b/src/Common/RingBuffer.h index fb208307c3b..d8e0ae74156 100644 --- a/src/Common/RingBuffer.h +++ b/src/Common/RingBuffer.h @@ -13,7 +13,8 @@ namespace DB */ template -class RingBuffer { +class RingBuffer +{ public: explicit RingBuffer(size_t capacity_) : capacity(capacity_) { @@ -51,6 +52,7 @@ public: return true; } + /// In case of T = std::shared_ptr it won't cause any allocations template void eraseAll(Predicate && predicate) { diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index 9178ba85452..e2c65a0916b 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -77,7 +77,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() try { /// This is needed to increase / decrease the number of threads at runtime - if (update_timer.compareAndRestartDeferred(1.)) + if (update_timer.compareAndRestartDeferred(10.)) updateConfiguration(); } catch (...) @@ -90,11 +90,16 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() { setThreadName(name.c_str()); - auto check_if_deleting = [&] () -> bool + /// Storage may want to destroy and it needs to finish all task related to it. + /// But checking causes some interaction with storage methods, for example it calls getStorageID. + /// So, we must execute this checking once, signal another thread that we are finished and be destroyed. + /// Not doing any complex stuff, especially interaction with Storage... + /// Calling this check twice may cause segfault. + auto check_if_currently_deleting = [&] () -> bool { active.eraseAll([&] (auto x) { return x == item; }); - for (auto & id : currently_deleting) + for (const auto & id : currently_deleting) { if (item->task->getStorageID() == id) { @@ -106,9 +111,14 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() return false; }; + bool checked{false}; + SCOPE_EXIT({ + if (checked) + return; + checked = true; std::lock_guard guard(mutex); - check_if_deleting(); + check_if_currently_deleting(); }); try @@ -117,7 +127,7 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() { std::lock_guard guard(mutex); - if (check_if_deleting()) + if (check_if_currently_deleting()) return; pending.tryPush(item); @@ -125,6 +135,9 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() return; } + /// In a situation of a lack of memory this method can throw an exception, + /// because it may interact somehow with BackgroundSchedulePool, which may allocate memory + /// But it is rather safe, because we have try...catch block here, and another one in ThreadPool. item->task->onCompleted(); std::lock_guard guard(mutex); @@ -132,9 +145,13 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() } catch(...) { - item->task->onCompleted(); std::lock_guard guard(mutex); has_tasks.notify_one(); + try + { + item->task->onCompleted(); + } + catch (...) 
{} tryLogCurrentException(__PRETTY_FUNCTION__); } From 60765b8c6ed796a23fa7bd1048f50dec386dd878 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 3 Sep 2021 00:45:19 +0000 Subject: [PATCH 072/177] Better --- src/Common/RingBuffer.h | 2 +- src/Storages/MergeTree/MergeMutateExecutor.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/RingBuffer.h b/src/Common/RingBuffer.h index d8e0ae74156..b002a353fcc 100644 --- a/src/Common/RingBuffer.h +++ b/src/Common/RingBuffer.h @@ -36,7 +36,7 @@ public: if (count == capacity) { return false; } - buffer[advance(count)] = element; + buffer[advance(count)] = std::move(element); ++count; return true; } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index 3470a352b8b..3da17310918 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -112,7 +112,7 @@ private: pending.resize(new_max_tasks_count); active.resize(new_max_tasks_count); - pool.setMaxFreeThreads(0); + pool.setMaxFreeThreads(new_threads_count); pool.setMaxThreads(new_threads_count); pool.setQueueSize(new_max_tasks_count); } From d777c6cd70f7c73d34c8247b69c3281b25071181 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 3 Sep 2021 00:51:12 +0000 Subject: [PATCH 073/177] Better [2] --- src/Storages/MergeTree/MergeMutateExecutor.cpp | 11 ----------- src/Storages/MergeTree/MergeMutateExecutor.h | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index e2c65a0916b..439af59afec 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -74,17 +74,6 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() status = active.tryPush(item); assert(status); - try - { - /// This is needed to increase / decrease the number of threads at runtime - if (update_timer.compareAndRestartDeferred(10.)) - updateConfiguration(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - bool res = pool.trySchedule([this, item] () { diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index 3da17310918..aaca2b8dc1c 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -60,6 +60,17 @@ public: if (shutdown_suspend) return false; + try + { + /// This is needed to increase / decrease the number of threads at runtime + if (update_timer.compareAndRestartDeferred(10.)) + updateConfiguration(); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + auto & value = CurrentMetrics::values[metric]; if (value.load() >= static_cast(max_tasks_count)) return false; From a182e621f06b9b6c777bad1bce68e5ada02fdaa3 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 3 Sep 2021 11:53:26 +0300 Subject: [PATCH 074/177] Maybe it help --- tests/integration/test_keeper_two_nodes_cluster/test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/integration/test_keeper_two_nodes_cluster/test.py b/tests/integration/test_keeper_two_nodes_cluster/test.py index 5b71ddf7087..9c7fa25ab31 100644 --- a/tests/integration/test_keeper_two_nodes_cluster/test.py +++ b/tests/integration/test_keeper_two_nodes_cluster/test.py @@ -92,8 +92,8 @@ def test_read_write_two_nodes(started_cluster): def test_read_write_two_nodes_with_blocade(started_cluster): try: wait_nodes() - node1_zk = get_fake_zk("node1") - node2_zk = get_fake_zk("node2") + node1_zk = get_fake_zk("node1", timeout=5.0) + node2_zk = get_fake_zk("node2", timeout=5.0) print("Blocking nodes") with PartitionManager() as pm: From af7220b6aa3d5802479572afba2e39fae2d3a71e Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Fri, 3 Sep 2021 13:00:40 +0300 Subject: [PATCH 075/177] Updated ShellCommandSource --- src/DataStreams/ShellCommandSource.h | 253 ++++++------------ .../ExecutablePoolDictionarySource.cpp | 14 +- src/Storages/StorageExecutable.cpp | 29 +- src/Storages/StorageExecutable.h | 2 + .../user_scripts/test_input_process_pool.sh | 2 +- .../test_input_process_pool_multiple_pipes.sh | 2 +- 6 files changed, 105 insertions(+), 197 deletions(-) diff --git a/src/DataStreams/ShellCommandSource.h b/src/DataStreams/ShellCommandSource.h index 1559a96b456..befdbc796ba 100644 --- a/src/DataStreams/ShellCommandSource.h +++ b/src/DataStreams/ShellCommandSource.h @@ -20,14 +20,32 @@ namespace DB { -/** A stream, that get child process and sends data tasks. - * For each send data task background thread is created, send data tasks must send data to process input pipes. - * ShellCommandSource receives data from process stdout. +/** A stream, that get child process and sends data using tasks in background threads. + * For each send data task background thread is created. Send data task must send data to process input pipes. + * ShellCommandPoolSource receives data from process stdout. + * + * If process_pool is passed in constructor then after source is destroyed process is returned to pool. 
*/ + +using ProcessPool = BorrowedObjectPool>; + +struct ShellCommandSourceConfiguration +{ + /// Read fixed number of rows from command output + bool read_fixed_number_of_rows = false; + /// Valid only if read_fixed_number_of_rows = true + bool read_number_of_rows_from_process_output = false; + /// Valid only if read_fixed_number_of_rows = true + size_t number_of_rows_to_read = 0; + /// Max block size + size_t max_block_size = DBMS_DEFAULT_BUFFER_SIZE; +}; + class ShellCommandSource final : public SourceWithProgress { public: - using SendDataTask = std::function; + + using SendDataTask = std::function; ShellCommandSource( ContextPtr context, @@ -35,30 +53,52 @@ public: const Block & sample_block, std::unique_ptr && command_, Poco::Logger * log_, - std::vector && send_data_tasks, - size_t max_block_size = DEFAULT_BLOCK_SIZE) + std::vector && send_data_tasks = {}, + const ShellCommandSourceConfiguration & configuration_ = {}, + std::shared_ptr process_pool_ = nullptr) : SourceWithProgress(sample_block) , command(std::move(command_)) + , configuration(configuration_) , log(log_) + , process_pool(process_pool_) { for (auto && send_data_task : send_data_tasks) - send_data_threads.emplace_back([task = std::move(send_data_task)]() { task(); }); + { + send_data_threads.emplace_back([task = std::move(send_data_task), this]() + { + try + { + task(); + } + catch (...) + { + std::lock_guard lock(send_data_lock); + exception_during_send_data = std::current_exception(); + } + }); + } - pipeline.init(Pipe(FormatFactory::instance().getInput(format, command->out, sample_block, context, max_block_size))); - executor = std::make_unique(pipeline); - } + size_t max_block_size = configuration.max_block_size; + + if (configuration.read_fixed_number_of_rows) + { + /** Currently parallel parsing input format cannot read exactly max_block_size rows from input, + * so it will be blocked on ReadBufferFromFileDescriptor because this file descriptor represent pipe that does not have eof. 
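The two read modes of `ShellCommandSourceConfiguration` defined above map onto the call sites changed later in this patch: the `executable_pool` dictionary reads exactly as many rows as keys it sent, while a pooled `StorageExecutable` lets the script announce the row count on its first output line. A sketch of both configurations (helper names are illustrative):

```cpp
#include <DataStreams/ShellCommandSource.h>

/// executable_pool dictionary: the number of rows to read equals the number
/// of requested keys, known up front.
DB::ShellCommandSourceConfiguration dictionaryConfigurationSketch(size_t rows_in_request)
{
    DB::ShellCommandSourceConfiguration configuration;
    configuration.read_fixed_number_of_rows = true;
    configuration.number_of_rows_to_read = rows_in_request;
    return configuration;
}

/// Pooled StorageExecutable: the script prints the row count first, so the
/// source reads it from the process output before parsing the block.
DB::ShellCommandSourceConfiguration storageConfigurationSketch(size_t max_block_size)
{
    DB::ShellCommandSourceConfiguration configuration;
    configuration.read_fixed_number_of_rows = true;
    configuration.read_number_of_rows_from_process_output = true;
    configuration.max_block_size = max_block_size;
    return configuration;
}
```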
+ */ + auto context_for_reading = Context::createCopy(context); + context_for_reading->setSetting("input_format_parallel_parsing", false); + context = context_for_reading; + + if (configuration.read_number_of_rows_from_process_output) + { + readText(configuration.number_of_rows_to_read, command->out); + char dummy; + readChar(dummy, command->out); + } + + max_block_size = configuration.number_of_rows_to_read; + } - ShellCommandSource( - ContextPtr context, - const std::string & format, - const Block & sample_block, - std::unique_ptr && command_, - Poco::Logger * log_, - size_t max_block_size = DEFAULT_BLOCK_SIZE) - : SourceWithProgress(sample_block) - , command(std::move(command_)) - , log(log_) - { pipeline.init(Pipe(FormatFactory::instance().getInput(format, command->out, sample_block, context, max_block_size))); executor = std::make_unique(pipeline); } @@ -68,155 +108,18 @@ public: for (auto & thread : send_data_threads) if (thread.joinable()) thread.join(); - } -protected: - Chunk generate() override - { - Chunk chunk; - executor->pull(chunk); - return chunk; - } - -public: - Status prepare() override - { - auto status = SourceWithProgress::prepare(); - - if (status == Status::Finished) - { - std::string err; - readStringUntilEOF(err, command->err); - if (!err.empty()) - LOG_ERROR(log, "Having stderr: {}", err); - - for (auto & thread : send_data_threads) - if (thread.joinable()) - thread.join(); - - command->wait(); - } - - return status; - } - - String getName() const override { return "ShellCommandSource"; } - -private: - - QueryPipeline pipeline; - std::unique_ptr executor; - std::unique_ptr command; - std::vector send_data_threads; - Poco::Logger * log; -}; - -/** A stream, that get child process and sends data tasks. - * For each send data task background thread is created, send data tasks must send data to process input pipes. - * ShellCommandPoolSource receives data from process stdout. - * - * Main difference with ShellCommandSource is that ShellCommandPoolSource initialized with process_pool and rows_to_read. - * Rows to read are necessary because processes in pool are not destroyed and work in read write loop. - * Source need to finish generating new chunks after rows_to_read rows are generated from process. - * - * If rows_to_read are not specified it is expected that script will output rows_to_read before other data. - * - * After source is destroyed process is returned to pool. - */ - -using ProcessPool = BorrowedObjectPool>; - -class ShellCommandPoolSource final : public SourceWithProgress -{ -public: - using SendDataTask = std::function; - - ShellCommandPoolSource( - ContextPtr context, - const std::string & format, - const Block & sample_block, - std::shared_ptr process_pool_, - std::unique_ptr && command_, - size_t rows_to_read_, - Poco::Logger * log_, - std::vector && send_data_tasks) - : SourceWithProgress(sample_block) - , process_pool(process_pool_) - , command(std::move(command_)) - , rows_to_read(rows_to_read_) - , log(log_) - { - for (auto && send_data_task : send_data_tasks) - { - send_data_threads.emplace_back([task = std::move(send_data_task), this]() - { - try - { - task(); - } - catch (...) 
- { - std::lock_guard lock(send_data_lock); - exception_during_send_data = std::current_exception(); - } - }); - } - - pipeline.init(Pipe(FormatFactory::instance().getInput(format, command->out, sample_block, context, rows_to_read))); - executor = std::make_unique(pipeline); - } - - ShellCommandPoolSource( - ContextPtr context, - const std::string & format, - const Block & sample_block, - std::shared_ptr process_pool_, - std::unique_ptr && command_, - Poco::Logger * log_, - std::vector && send_data_tasks) - : SourceWithProgress(sample_block) - , process_pool(process_pool_) - , command(std::move(command_)) - , log(log_) - { - for (auto && send_data_task : send_data_tasks) - { - send_data_threads.emplace_back([task = std::move(send_data_task), this]() - { - try - { - task(); - } - catch (...) - { - std::lock_guard lock(send_data_lock); - exception_during_send_data = std::current_exception(); - } - }); - } - - readText(rows_to_read, command->out); - pipeline.init(Pipe(FormatFactory::instance().getInput(format, command->out, sample_block, context, rows_to_read))); - executor = std::make_unique(pipeline); - } - - - ~ShellCommandPoolSource() override - { - for (auto & thread : send_data_threads) - if (thread.joinable()) - thread.join(); - - if (command) + if (command && process_pool) process_pool->returnObject(std::move(command)); } protected: + Chunk generate() override { - rethrowExceptionDuringReadIfNeeded(); + rethrowExceptionDuringSendDataIfNeeded(); - if (current_read_rows == rows_to_read) + if (configuration.read_fixed_number_of_rows && configuration.number_of_rows_to_read == current_read_rows) return {}; Chunk chunk; @@ -238,7 +141,6 @@ protected: return chunk; } -public: Status prepare() override { auto status = SourceWithProgress::prepare(); @@ -249,13 +151,17 @@ public: if (thread.joinable()) thread.join(); - rethrowExceptionDuringReadIfNeeded(); + rethrowExceptionDuringSendDataIfNeeded(); } return status; } - void rethrowExceptionDuringReadIfNeeded() + String getName() const override { return "ShellCommandSource"; } + +private: + + void rethrowExceptionDuringSendDataIfNeeded() { std::lock_guard lock(send_data_lock); if (exception_during_send_data) @@ -265,18 +171,19 @@ public: } } - String getName() const override { return "ShellCommandPoolSource"; } - - std::shared_ptr process_pool; std::unique_ptr command; - QueryPipeline pipeline; - std::unique_ptr executor; - size_t rows_to_read = 0; - Poco::Logger * log; - std::vector send_data_threads; + ShellCommandSourceConfiguration configuration; size_t current_read_rows = 0; + Poco::Logger * log; + + std::shared_ptr process_pool; + + QueryPipeline pipeline; + std::unique_ptr executor; + + std::vector send_data_threads; std::mutex send_data_lock; std::exception_ptr exception_during_send_data; }; diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.cpp b/src/Dictionaries/ExecutablePoolDictionarySource.cpp index e3cad41c856..10164fe7310 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.cpp +++ b/src/Dictionaries/ExecutablePoolDictionarySource.cpp @@ -112,15 +112,18 @@ Pipe ExecutablePoolDictionarySource::getStreamForBlock(const Block & block) size_t rows_to_read = block.rows(); auto * process_in = &process->in; - ShellCommandPoolSource::SendDataTask task = [process_in, block, this]() mutable + ShellCommandSource::SendDataTask task = [process_in, block, this]() mutable { auto & out = *process_in; auto output_stream = context->getOutputStream(configuration.format, out, block.cloneEmpty()); formatBlock(output_stream, block); 
}; - std::vector tasks = {std::move(task)}; + std::vector tasks = {std::move(task)}; - Pipe pipe(std::make_unique(context, configuration.format, sample_block, process_pool, std::move(process), rows_to_read, log, std::move(tasks))); + ShellCommandSourceConfiguration command_configuration; + command_configuration.read_fixed_number_of_rows = true; + command_configuration.number_of_rows_to_read = rows_to_read; + Pipe pipe(std::make_unique(context, configuration.format, sample_block, std::move(process), log, std::move(tasks), command_configuration, process_pool)); if (configuration.implicit_key) pipe.addTransform(std::make_shared(block, pipe.getHeader())); @@ -174,11 +177,6 @@ void registerDictionarySourceExecutablePool(DictionarySourceFactory & factory) ContextMutablePtr context = copyContextAndApplySettingsFromDictionaryConfig(global_context, config, config_prefix); - /** Currently parallel parsing input format cannot read exactly max_block_size rows from input, - * so it will be blocked on ReadBufferFromFileDescriptor because this file descriptor represent pipe that does not have eof. - */ - context->setSetting("input_format_parallel_parsing", false); - String settings_config_prefix = config_prefix + ".executable_pool"; size_t max_command_execution_time = config.getUInt64(settings_config_prefix + ".max_command_execution_time", 10); diff --git a/src/Storages/StorageExecutable.cpp b/src/Storages/StorageExecutable.cpp index 6d81a4eff40..4b0aaf6caea 100644 --- a/src/Storages/StorageExecutable.cpp +++ b/src/Storages/StorageExecutable.cpp @@ -84,8 +84,6 @@ Pipe StorageExecutable::read( size_t max_block_size, unsigned /*threads*/) { - std::cerr << getName() << "::read" << std::endl; - auto user_scripts_path = context->getUserScriptsPath(); auto script_path = user_scripts_path + '/' + script_name; if (!std::filesystem::exists(std::filesystem::path(script_path))) @@ -111,9 +109,9 @@ Pipe StorageExecutable::read( std::unique_ptr process; - if (process_pool) + bool is_executable_pool = (process_pool != nullptr); + if (is_executable_pool) { - std::cerr << getName() <<"::read create process" << std::endl; bool result = process_pool->tryBorrowObject(process, [&config, this]() { config.terminate_in_destructor_strategy = ShellCommand::DestructorStrategy{ true /*terminate_in_destructor*/, pool_settings.command_termination_timeout }; @@ -153,7 +151,7 @@ Pipe StorageExecutable::read( write_buffer = &it->second; } - ShellCommandSource::SendDataTask task = [input_stream, write_buffer, context, this]() + ShellCommandSource::SendDataTask task = [input_stream, write_buffer, context, is_executable_pool, this]() { auto output_stream = context->getOutputStream(format, *write_buffer, input_stream->getHeader().cloneEmpty()); input_stream->readPrefix(); @@ -166,7 +164,9 @@ Pipe StorageExecutable::read( output_stream->writeSuffix(); output_stream->flush(); - write_buffer->close(); + + if (!is_executable_pool) + write_buffer->close(); }; tasks.emplace_back(std::move(task)); @@ -174,16 +174,17 @@ Pipe StorageExecutable::read( auto sample_block = metadata_snapshot->getSampleBlock(); - if (process_pool) + ShellCommandSourceConfiguration configuration; + configuration.max_block_size = max_block_size; + + if (is_executable_pool) { - Pipe pipe(std::make_unique(context, format, std::move(sample_block), process_pool, std::move(process), log, std::move(tasks))); - return pipe; - } - else - { - Pipe pipe(std::make_unique(context, format, std::move(sample_block), std::move(process), log, std::move(tasks), max_block_size)); - return 
pipe; + configuration.read_fixed_number_of_rows = true; + configuration.read_number_of_rows_from_process_output = true; } + + Pipe pipe(std::make_unique(context, format, std::move(sample_block), std::move(process), log, std::move(tasks), configuration, process_pool)); + return pipe; } void registerStorageExecutable(StorageFactory & factory) diff --git a/src/Storages/StorageExecutable.h b/src/Storages/StorageExecutable.h index 8578ec3fee7..dd986ee3956 100644 --- a/src/Storages/StorageExecutable.h +++ b/src/Storages/StorageExecutable.h @@ -18,6 +18,7 @@ namespace DB class StorageExecutable final : public shared_ptr_helper, public IStorage { friend struct shared_ptr_helper; + public: String getName() const override @@ -38,6 +39,7 @@ public: unsigned threads) override; protected: + StorageExecutable( const StorageID & table_id, const String & script_name_, diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool.sh b/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool.sh index f569b2dbbaa..ed40a0d5291 100755 --- a/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool.sh +++ b/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool.sh @@ -1,3 +1,3 @@ #!/bin/bash -while read read_data; do printf '1'; printf "Key $read_data\n"; done +while read read_data; do printf "1\n"; printf "Key $read_data\n"; done diff --git a/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool_multiple_pipes.sh b/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool_multiple_pipes.sh index 6fcc412f86a..4408ccae756 100755 --- a/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool_multiple_pipes.sh +++ b/tests/integration/test_executable_table_function/user_scripts/test_input_process_pool_multiple_pipes.sh @@ -4,7 +4,7 @@ read -t 250 -u 4 read_data_from_4_fd; read -t 250 -u 3 read_data_from_3_fd; read -t 250 read_data_from_0_df; -printf '3'; +printf "3\n"; printf "Key from 4 fd $read_data_from_4_fd\n"; printf "Key from 3 fd $read_data_from_3_fd\n"; printf "Key from 0 fd $read_data_from_0_df\n"; From 497c2252037f01af170078d513276dbf96d8c459 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 3 Sep 2021 13:07:40 +0300 Subject: [PATCH 076/177] Test log level for CI --- base/common/logger_useful.h | 1 + contrib/poco | 2 +- programs/server/config.xml | 1 + src/Common/tests/gtest_log.cpp | 30 +++++++++++++++++++++++++ src/Coordination/KeeperStateMachine.cpp | 4 ++++ src/Core/SettingsEnums.cpp | 4 ++-- src/Core/SettingsEnums.h | 1 + tests/config/config.d/logger.xml | 11 +++++++++ tests/config/install.sh | 1 + 9 files changed, 52 insertions(+), 3 deletions(-) create mode 100644 tests/config/config.d/logger.xml diff --git a/base/common/logger_useful.h b/base/common/logger_useful.h index d3b4d38d546..e2290a727b4 100644 --- a/base/common/logger_useful.h +++ b/base/common/logger_useful.h @@ -42,6 +42,7 @@ namespace } while (false) +#define LOG_TEST(logger, ...) LOG_IMPL(logger, DB::LogsLevel::test, Poco::Message::PRIO_TEST, __VA_ARGS__) #define LOG_TRACE(logger, ...) LOG_IMPL(logger, DB::LogsLevel::trace, Poco::Message::PRIO_TRACE, __VA_ARGS__) #define LOG_DEBUG(logger, ...) LOG_IMPL(logger, DB::LogsLevel::debug, Poco::Message::PRIO_DEBUG, __VA_ARGS__) #define LOG_INFO(logger, ...) 
LOG_IMPL(logger, DB::LogsLevel::information, Poco::Message::PRIO_INFORMATION, __VA_ARGS__) diff --git a/contrib/poco b/contrib/poco index 7351c4691b5..15883876a75 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 7351c4691b5d401f59e3959adfc5b4fa263b32da +Subproject commit 15883876a758bf6f407b22ea4ad0ad2f9465bee6 diff --git a/programs/server/config.xml b/programs/server/config.xml index 510a5e230f8..18bb23c6227 100644 --- a/programs/server/config.xml +++ b/programs/server/config.xml @@ -18,6 +18,7 @@ - information - debug - trace + - test (not for production usage) [1]: https://github.com/pocoproject/poco/blob/poco-1.9.4-release/Foundation/include/Poco/Logger.h#L105-L114 --> diff --git a/src/Common/tests/gtest_log.cpp b/src/Common/tests/gtest_log.cpp index 9f4ef41f642..a7c49497c35 100644 --- a/src/Common/tests/gtest_log.cpp +++ b/src/Common/tests/gtest_log.cpp @@ -6,6 +6,8 @@ #include #include #include +#include +#include TEST(Logger, Log) @@ -17,3 +19,31 @@ TEST(Logger, Log) /// This test checks that we don't pass this string to fmtlib, because it is the only argument. EXPECT_NO_THROW(LOG_INFO(log, "Hello {} World")); } + +TEST(Logger, TestLog) +{ + { /// Test logs visible for test level + Poco::Logger::root().setLevel("test"); + std::ostringstream oss; + Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::StreamChannel(oss))); + Poco::Logger * log = &Poco::Logger::get("Log"); + LOG_TEST(log, "Hello World"); + + EXPECT_EQ(oss.str(), "Hello World\n"); + } + + { /// Test logs invisible for other levels + for (const auto & level : {"trace", "debug", "information", "warning", "fatal"}) + { + Poco::Logger::root().setLevel(level); + std::ostringstream oss; + Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::StreamChannel(oss))); + Poco::Logger * log = &Poco::Logger::get("Log"); + + LOG_TEST(log, "Hello World"); + + EXPECT_EQ(oss.str(), ""); + } + } + +} diff --git a/src/Coordination/KeeperStateMachine.cpp b/src/Coordination/KeeperStateMachine.cpp index ffbac0656b9..ed808c15708 100644 --- a/src/Coordination/KeeperStateMachine.cpp +++ b/src/Coordination/KeeperStateMachine.cpp @@ -122,6 +122,10 @@ nuraft::ptr KeeperStateMachine::commit(const uint64_t log_idx, n } else { + LOG_TEST(log, "Commit request for session {} with type {}, log id {}{}", + request_for_session.session_id, toString(request_for_session.request->getOpNum()), log_idx, + request_for_session.request->getPath().empty() ? 
"" : ", path " + request_for_session.request->getPath()); + std::lock_guard lock(storage_and_responses_lock); KeeperStorage::ResponsesForSessions responses_for_sessions = storage->processRequest(request_for_session.request, request_for_session.session_id, log_idx); for (auto & response_for_session : responses_for_sessions) diff --git a/src/Core/SettingsEnums.cpp b/src/Core/SettingsEnums.cpp index 213d365ad96..8e588b62326 100644 --- a/src/Core/SettingsEnums.cpp +++ b/src/Core/SettingsEnums.cpp @@ -79,8 +79,8 @@ IMPLEMENT_SETTING_ENUM(LogsLevel, ErrorCodes::BAD_ARGUMENTS, {"warning", LogsLevel::warning}, {"information", LogsLevel::information}, {"debug", LogsLevel::debug}, - {"trace", LogsLevel::trace}}) - + {"trace", LogsLevel::trace}, + {"test", LogsLevel::test}}) IMPLEMENT_SETTING_ENUM_WITH_RENAME(LogQueriesType, ErrorCodes::BAD_ARGUMENTS, {{"QUERY_START", QUERY_START}, diff --git a/src/Core/SettingsEnums.h b/src/Core/SettingsEnums.h index d1dc71f621f..a699da3062c 100644 --- a/src/Core/SettingsEnums.h +++ b/src/Core/SettingsEnums.h @@ -94,6 +94,7 @@ enum class LogsLevel information, debug, trace, + test, }; DECLARE_SETTING_ENUM(LogsLevel) diff --git a/tests/config/config.d/logger.xml b/tests/config/config.d/logger.xml new file mode 100644 index 00000000000..5b6f976b5f8 --- /dev/null +++ b/tests/config/config.d/logger.xml @@ -0,0 +1,11 @@ + + + test + /var/log/clickhouse-server/clickhouse-server.log + /var/log/clickhouse-server/clickhouse-server.err.log + 1000M + 10 + /var/log/clickhouse-server/stderr.log + /var/log/clickhouse-server/stdout.log + + diff --git a/tests/config/install.sh b/tests/config/install.sh index 7dd2559e505..67cc4ef99d6 100755 --- a/tests/config/install.sh +++ b/tests/config/install.sh @@ -38,6 +38,7 @@ ln -sf $SRC_PATH/config.d/top_level_domains_lists.xml $DEST_SERVER_PATH/config.d ln -sf $SRC_PATH/config.d/top_level_domains_path.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/encryption.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/config.d/zookeeper_log.xml $DEST_SERVER_PATH/config.d/ +ln -sf $SRC_PATH/config.d/logger.xml $DEST_SERVER_PATH/config.d/ ln -sf $SRC_PATH/users.d/log_queries.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/readonly.xml $DEST_SERVER_PATH/users.d/ ln -sf $SRC_PATH/users.d/access_management.xml $DEST_SERVER_PATH/users.d/ From bb87d01579ec965478c2d05f0b98b7bbec4b5713 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Fri, 3 Sep 2021 18:49:01 +0800 Subject: [PATCH 077/177] Fix wrong header of minmax_count projection --- src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp | 2 +- .../0_stateless/01710_minmax_count_projection.reference | 1 + tests/queries/0_stateless/01710_minmax_count_projection.sql | 3 +++ 3 files changed, 5 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 734a67da758..92529b00faa 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -174,7 +174,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( if (query_info.projection->desc->is_minmax_count_projection) { Pipe pipe(std::make_shared( - query_info.minmax_count_projection_block, + query_info.minmax_count_projection_block.cloneEmpty(), Chunk(query_info.minmax_count_projection_block.getColumns(), query_info.minmax_count_projection_block.rows()))); auto read_from_pipe = std::make_unique(std::move(pipe)); projection_plan->addStep(std::move(read_from_pipe)); diff --git 
a/tests/queries/0_stateless/01710_minmax_count_projection.reference b/tests/queries/0_stateless/01710_minmax_count_projection.reference index 882d808069e..ad9b87b998d 100644 --- a/tests/queries/0_stateless/01710_minmax_count_projection.reference +++ b/tests/queries/0_stateless/01710_minmax_count_projection.reference @@ -2,3 +2,4 @@ 0 9998 5000 1 9999 5000 0 9998 5000 +1 diff --git a/tests/queries/0_stateless/01710_minmax_count_projection.sql b/tests/queries/0_stateless/01710_minmax_count_projection.sql index 3ee19fe8c2e..58af11f01f7 100644 --- a/tests/queries/0_stateless/01710_minmax_count_projection.sql +++ b/tests/queries/0_stateless/01710_minmax_count_projection.sql @@ -11,4 +11,7 @@ select min(i), max(i), count() from d group by _partition_id order by _partition select min(i), max(i), count() from d where _partition_value.1 = 0 group by _partition_id order by _partition_id; select min(i), max(i), count() from d where _partition_value.1 = 10 group by _partition_id order by _partition_id; +-- fuzz crash +select min(i) from d where 1 = _partition_value.1; + drop table d; From 9c480c0de6502322a65bd35b8b0a29ee5b07a13b Mon Sep 17 00:00:00 2001 From: Artur <613623@mail.ru> Date: Fri, 3 Sep 2021 10:56:18 +0000 Subject: [PATCH 078/177] remove recursion in ZstdInflatingReadBuffer --- src/IO/ZstdInflatingReadBuffer.cpp | 59 ++++++++++++++++-------------- 1 file changed, 32 insertions(+), 27 deletions(-) diff --git a/src/IO/ZstdInflatingReadBuffer.cpp b/src/IO/ZstdInflatingReadBuffer.cpp index 6c03ea420a9..5694b333912 100644 --- a/src/IO/ZstdInflatingReadBuffer.cpp +++ b/src/IO/ZstdInflatingReadBuffer.cpp @@ -28,41 +28,46 @@ ZstdInflatingReadBuffer::~ZstdInflatingReadBuffer() bool ZstdInflatingReadBuffer::nextImpl() { - if (eof) - return false; - - if (input.pos >= input.size) + do { - in->nextIfAtEnd(); - input.src = reinterpret_cast(in->position()); - input.pos = 0; - input.size = in->buffer().end() - in->position(); - } + // If it is known that end of file was reached, return false + if (eof) + return false; - output.dst = reinterpret_cast(internal_buffer.begin()); - output.size = internal_buffer.size(); - output.pos = 0; + /// If end was reached, get next part + if (input.pos >= input.size) + { + in->nextIfAtEnd(); + input.src = reinterpret_cast(in->position()); + input.pos = 0; + input.size = in->buffer().end() - in->position(); + } - size_t ret = ZSTD_decompressStream(dctx, &output, &input); - if (ZSTD_isError(ret)) - throw Exception( - ErrorCodes::ZSTD_DECODER_FAILED, "Zstd stream decoding failed: error code: {}; zstd version: {}", ret, ZSTD_VERSION_STRING); + /// fill output + output.dst = reinterpret_cast(internal_buffer.begin()); + output.size = internal_buffer.size(); + output.pos = 0; - in->position() = in->buffer().begin() + input.pos; - working_buffer.resize(output.pos); + /// Decompress data and check errors. 
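Returning briefly to the minmax_count projection fix above (the reworked ZSTD loop continues below): the source's header must describe only the block structure, while the actual minmax/count values travel in the chunk, hence the `cloneEmpty()`. A condensed sketch of that construction, simplified and with the template arguments restored:

```cpp
#include <Processors/Pipe.h>
#include <Processors/Sources/SourceFromSingleChunk.h>
#include <memory>

/// The header is an empty clone of the block; the data itself is wrapped into a Chunk.
DB::Pipe makeMinMaxCountPipeSketch(const DB::Block & minmax_count_projection_block)
{
    auto chunk = DB::Chunk(
        minmax_count_projection_block.getColumns(),
        minmax_count_projection_block.rows());

    return DB::Pipe(std::make_shared<DB::SourceFromSingleChunk>(
        minmax_count_projection_block.cloneEmpty(),
        std::move(chunk)));
}
```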
+ size_t ret = ZSTD_decompressStream(dctx, &output, &input); + if (ZSTD_isError(ret)) + throw Exception( + ErrorCodes::ZSTD_DECODER_FAILED, "Zstd stream decoding failed: error code: {}; zstd version: {}", ret, ZSTD_VERSION_STRING); - if (in->eof()) - { - eof = true; - return !working_buffer.empty(); - } - else if (output.pos == 0) - { + /// move position to the end of read data + in->position() = in->buffer().begin() + input.pos; + working_buffer.resize(output.pos); + + /// If end of file is reached, fill eof variable and return true if there is some data in buffer, otherwise return false + if (in->eof()) + { + eof = true; + return !working_buffer.empty(); + } /// It is possible, that input buffer is not at eof yet, but nothing was decompressed in current iteration. /// But there are cases, when such behaviour is not allowed - i.e. if input buffer is not eof, then /// it has to be guaranteed that working_buffer is not empty. So if it is empty, continue. - return nextImpl(); - } + } while (output.pos == 0); return true; } From 69e7e1dff7dd2a14bea037fb8facfad2ffbe8084 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 3 Sep 2021 14:00:57 +0300 Subject: [PATCH 079/177] Update Dockerfile --- docker/packager/deb/Dockerfile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docker/packager/deb/Dockerfile b/docker/packager/deb/Dockerfile index b6f23c55aa6..22bba94f250 100644 --- a/docker/packager/deb/Dockerfile +++ b/docker/packager/deb/Dockerfile @@ -37,9 +37,7 @@ RUN curl -O https://clickhouse-datasets.s3.yandex.net/utils/1/dpkg-deb \ RUN apt-get update \ && apt-get install \ alien \ - clang-11 \ clang-12 \ - clang-tidy-11 \ clang-tidy-12 \ cmake \ debhelper \ @@ -47,10 +45,7 @@ RUN apt-get update \ gdb \ git \ gperf \ - lld-11 \ lld-12 \ - llvm-11 \ - llvm-11-dev \ llvm-12 \ llvm-12-dev \ moreutils \ From 4fe3909d74425b0ddf168369c176889a2931caad Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 3 Sep 2021 14:01:47 +0300 Subject: [PATCH 080/177] Update Dockerfile --- docker/packager/binary/Dockerfile | 5 ----- 1 file changed, 5 deletions(-) diff --git a/docker/packager/binary/Dockerfile b/docker/packager/binary/Dockerfile index 05834fec493..f5d496ce97f 100644 --- a/docker/packager/binary/Dockerfile +++ b/docker/packager/binary/Dockerfile @@ -39,8 +39,6 @@ RUN apt-get update \ bash \ build-essential \ ccache \ - clang-11 \ - clang-tidy-11 \ cmake \ curl \ g++-10 \ @@ -50,9 +48,6 @@ RUN apt-get update \ gperf \ libicu-dev \ libreadline-dev \ - lld-11 \ - llvm-11 \ - llvm-11-dev \ clang-12 \ clang-tidy-12 \ lld-12 \ From 04e5e52dbdd4074b396b898902ad71000425ab61 Mon Sep 17 00:00:00 2001 From: Artur <613623@mail.ru> Date: Fri, 3 Sep 2021 11:18:19 +0000 Subject: [PATCH 081/177] prevent endless loop --- src/IO/ZstdInflatingReadBuffer.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/IO/ZstdInflatingReadBuffer.cpp b/src/IO/ZstdInflatingReadBuffer.cpp index 5694b333912..da6768f7c4a 100644 --- a/src/IO/ZstdInflatingReadBuffer.cpp +++ b/src/IO/ZstdInflatingReadBuffer.cpp @@ -54,6 +54,9 @@ bool ZstdInflatingReadBuffer::nextImpl() throw Exception( ErrorCodes::ZSTD_DECODER_FAILED, "Zstd stream decoding failed: error code: {}; zstd version: {}", ret, ZSTD_VERSION_STRING); + /// Check that something has changed after decompress (input or output position) + assert(output.pos > 0 || in->position() < in->buffer().begin() + input.pos); + /// move position to the end of read data in->position() = in->buffer().begin() + input.pos; working_buffer.resize(output.pos); From 
8b8b0043c1493bf2b506f0d0d91356190780cf56 Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 3 Sep 2021 14:20:09 +0300 Subject: [PATCH 082/177] Fix logger conf --- tests/config/config.d/logger.xml | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/config/config.d/logger.xml b/tests/config/config.d/logger.xml index 5b6f976b5f8..6a52c8d68a2 100644 --- a/tests/config/config.d/logger.xml +++ b/tests/config/config.d/logger.xml @@ -1,11 +1,5 @@ test - /var/log/clickhouse-server/clickhouse-server.log - /var/log/clickhouse-server/clickhouse-server.err.log - 1000M - 10 - /var/log/clickhouse-server/stderr.log - /var/log/clickhouse-server/stdout.log From 9345b3c45ed40be2e45c381f096fd7c32267649f Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 3 Sep 2021 14:59:21 +0300 Subject: [PATCH 083/177] Fixup flaky --- tests/integration/test_keeper_two_nodes_cluster/test.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/tests/integration/test_keeper_two_nodes_cluster/test.py b/tests/integration/test_keeper_two_nodes_cluster/test.py index 9c7fa25ab31..e6e3eb37af2 100644 --- a/tests/integration/test_keeper_two_nodes_cluster/test.py +++ b/tests/integration/test_keeper_two_nodes_cluster/test.py @@ -141,12 +141,6 @@ def test_read_write_two_nodes_with_blocade(started_cluster): print("Node2 created it's value") - assert node1_zk.exists("/test_read_write_blocked_node1") is not None - assert node2_zk.exists("/test_read_write_blocked_node1") is not None - - assert node1_zk.exists("/test_read_write_blocked_node2") is None - assert node2_zk.exists("/test_read_write_blocked_node2") is None - # stale reads are allowed while node1_zk.exists("/test_after_block2") is None: time.sleep(0.1) From ca7adb76b979544bd35166b2086d7a48be910d55 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 3 Sep 2021 12:27:49 +0000 Subject: [PATCH 084/177] Owning future --- src/Common/RingBuffer.h | 9 +- .../MergeTree/MergeMutateExecutor.cpp | 160 +++++++++--------- src/Storages/MergeTree/MergeMutateExecutor.h | 18 +- 3 files changed, 98 insertions(+), 89 deletions(-) diff --git a/src/Common/RingBuffer.h b/src/Common/RingBuffer.h index b002a353fcc..9c65ac3dff1 100644 --- a/src/Common/RingBuffer.h +++ b/src/Common/RingBuffer.h @@ -54,19 +54,24 @@ public: /// In case of T = std::shared_ptr it won't cause any allocations template - void eraseAll(Predicate && predicate) + bool eraseAll(Predicate && predicate) { /// Shift all elements to the beginning of the buffer std::rotate(buffer.begin(), buffer.begin() + position, buffer.end()); + position = 0; + /// Remove elements auto end_removed = std::remove_if(buffer.begin(), buffer.begin() + count, predicate); + if (end_removed == buffer.begin() + count) + return false; + size_t new_count = std::distance(buffer.begin(), end_removed); for (size_t i = new_count; i < count; ++i) buffer[i] = T{}; count = new_count; - position = 0; + return true; } template diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index 439af59afec..85ae35e8f32 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -26,7 +26,7 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id { std::lock_guard remove_lock(remove_mutex); - std::vector tasks_to_wait; + std::vector tasks_to_wait; { std::lock_guard lock(mutex); @@ -35,23 +35,91 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id /// Erase storage related tasks from pending and select 
active tasks to wait for pending.eraseAll([&] (auto item) -> bool { return item->task->getStorageID() == id; }); - tasks_to_wait = active.getAll([&] (auto item) -> bool { return item->task->getStorageID() == id; }); + tasks_to_wait = active.getAll([&] (auto elem) -> bool { return elem.item->task->getStorageID() == id; }); } - for (const auto & item : tasks_to_wait) + for (auto & [item, future] : tasks_to_wait) { - assert(item->future.valid()); - item->future.wait(); + assert(future.valid()); + try + { + future.wait(); + } + catch (...) {} } { std::lock_guard lock(mutex); + + for (auto & [item, future] : tasks_to_wait) + { + assert(item.use_count() == 1); + item.reset(); + } + + currently_deleting.erase(id); } } +void MergeTreeBackgroundExecutor::routine(ItemPtr item) +{ + setThreadName(name.c_str()); + + bool checked{false}; + + auto check_if_currently_deleting = [&] () + { + checked = true; + return active.eraseAll([&] (auto & x) { return x.item == item; }); + }; + + + SCOPE_EXIT({ + if (checked) + return; + std::lock_guard guard(mutex); + check_if_currently_deleting(); + }); + + try + { + if (item->task->execute()) + { + std::lock_guard guard(mutex); + + if (check_if_currently_deleting()) + return; + + pending.tryPush(item); + has_tasks.notify_one(); + return; + } + + /// In a situation of a lack of memory this method can throw an exception, + /// because it may interact somehow with BackgroundSchedulePool, which may allocate memory + /// But it is rather safe, because we have try...catch block here, and another one in ThreadPool. + item->task->onCompleted(); + + std::lock_guard guard(mutex); + has_tasks.notify_one(); + } + catch(...) + { + std::lock_guard guard(mutex); + has_tasks.notify_one(); + try + { + item->task->onCompleted(); + } + catch (...) {} + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} + + void MergeTreeBackgroundExecutor::schedulerThreadFunction() { DENY_ALLOCATIONS_IN_SCOPE; @@ -71,88 +139,20 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (!pending.tryPop(&item)) continue; - status = active.tryPush(item); - assert(status); - - - bool res = pool.trySchedule([this, item] () - { - setThreadName(name.c_str()); - - /// Storage may want to destroy and it needs to finish all task related to it. - /// But checking causes some interaction with storage methods, for example it calls getStorageID. - /// So, we must execute this checking once, signal another thread that we are finished and be destroyed. - /// Not doing any complex stuff, especially interaction with Storage... - /// Calling this check twice may cause segfault. - auto check_if_currently_deleting = [&] () -> bool - { - active.eraseAll([&] (auto x) { return x == item; }); - - for (const auto & id : currently_deleting) - { - if (item->task->getStorageID() == id) - { - item->promise.set_value(); - return true; - } - } - - return false; - }; - - bool checked{false}; - - SCOPE_EXIT({ - if (checked) - return; - checked = true; - std::lock_guard guard(mutex); - check_if_currently_deleting(); - }); - - try - { - if (item->task->execute()) - { - std::lock_guard guard(mutex); - - if (check_if_currently_deleting()) - return; - - pending.tryPush(item); - has_tasks.notify_one(); - return; - } - - /// In a situation of a lack of memory this method can throw an exception, - /// because it may interact somehow with BackgroundSchedulePool, which may allocate memory - /// But it is rather safe, because we have try...catch block here, and another one in ThreadPool. 
- item->task->onCompleted(); - - std::lock_guard guard(mutex); - has_tasks.notify_one(); - } - catch(...) - { - std::lock_guard guard(mutex); - has_tasks.notify_one(); - try - { - item->task->onCompleted(); - } - catch (...) {} - tryLogCurrentException(__PRETTY_FUNCTION__); - } - - }); + auto thread_pool_job = std::make_shared>([this, item] { routine(item); }); + auto future = thread_pool_job->get_future(); + bool res = pool.trySchedule([thread_pool_job] { (*thread_pool_job)(); }); if (!res) { - active.eraseAll([&] (auto x) { return x == item; }); + active.eraseAll([&] (auto x) { return x.item == item; }); status = pending.tryPush(item); assert(status); + continue; } + status = active.tryPush({std::move(item), std::move(future)}); + assert(status); } } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index aaca2b8dc1c..976a17b785d 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -136,8 +136,6 @@ private: max_tasks_count = new_max_tasks_count; } - void schedulerThreadFunction(); - static String toString(Type type); Type type; @@ -156,22 +154,28 @@ private: explicit Item(ExecutableTaskPtr && task_, CurrentMetrics::Metric metric_) : task(std::move(task_)) , increment(std::move(metric_)) - , future(promise.get_future()) { } ExecutableTaskPtr task; CurrentMetrics::Increment increment; - - std::promise promise; - std::future future; }; using ItemPtr = std::shared_ptr; + void routine(ItemPtr item); + void schedulerThreadFunction(); + /// Initially it will be empty RingBuffer pending{0}; - RingBuffer active{0}; + + struct ActiveMeta + { + ItemPtr item; + std::shared_future future; + }; + + RingBuffer active{0}; std::set currently_deleting; std::mutex remove_mutex; From b867eabe73aaea971cd87a023e422c32d30ade24 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 3 Sep 2021 13:02:49 +0000 Subject: [PATCH 085/177] Use Poco::Event --- .../MergeTree/MergeMutateExecutor.cpp | 33 ++++++++----------- src/Storages/MergeTree/MergeMutateExecutor.h | 21 +++++++----- .../MergeTree/tests/gtest_executor.cpp | 7 +++- 3 files changed, 33 insertions(+), 28 deletions(-) diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index 85ae35e8f32..2c3590dc134 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -26,7 +26,7 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id { std::lock_guard remove_lock(remove_mutex); - std::vector tasks_to_wait; + std::vector tasks_to_wait; { std::lock_guard lock(mutex); @@ -35,30 +35,22 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id /// Erase storage related tasks from pending and select active tasks to wait for pending.eraseAll([&] (auto item) -> bool { return item->task->getStorageID() == id; }); - tasks_to_wait = active.getAll([&] (auto elem) -> bool { return elem.item->task->getStorageID() == id; }); + tasks_to_wait = active.getAll([&] (auto item) -> bool { return item->task->getStorageID() == id; }); } - for (auto & [item, future] : tasks_to_wait) - { - assert(future.valid()); - try - { - future.wait(); - } - catch (...) 
{} - } + for (auto & item : tasks_to_wait) + item->is_done.wait(); { std::lock_guard lock(mutex); - for (auto & [item, future] : tasks_to_wait) + for (auto & item : tasks_to_wait) { assert(item.use_count() == 1); item.reset(); } - currently_deleting.erase(id); } } @@ -73,7 +65,7 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) auto check_if_currently_deleting = [&] () { checked = true; - return active.eraseAll([&] (auto & x) { return x.item == item; }); + return active.eraseAll([&] (auto & x) { return x == item; }); }; @@ -94,6 +86,7 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) return; pending.tryPush(item); + item->is_done.reset(); has_tasks.notify_one(); return; } @@ -139,19 +132,21 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() if (!pending.tryPop(&item)) continue; - auto thread_pool_job = std::make_shared>([this, item] { routine(item); }); - auto future = thread_pool_job->get_future(); - bool res = pool.trySchedule([thread_pool_job] { (*thread_pool_job)(); }); + bool res = pool.trySchedule([this, item] + { + routine(item); + item->is_done.set(); + }); if (!res) { - active.eraseAll([&] (auto x) { return x.item == item; }); + active.eraseAll([&] (auto x) { return x == item; }); status = pending.tryPush(item); assert(status); continue; } - status = active.tryPush({std::move(item), std::move(future)}); + status = active.tryPush(std::move(item)); assert(status); } } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index 976a17b785d..82da33893e6 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -75,6 +76,16 @@ public: if (value.load() >= static_cast(max_tasks_count)) return false; + if (!scheduler.joinable()) + { + LOG_ERROR(&Poco::Logger::get("MergeTreeBackgroundExecutor"), "Scheduler thread is dead. Trying to alive.."); + scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); + + if (!scheduler.joinable()) + LOG_FATAL(&Poco::Logger::get("MergeTreeBackgroundExecutor"), "Scheduler thread is dead permanently. 
Restart is needed"); + } + + if (!pending.tryPush(std::make_shared(std::move(task), metric))) return false; @@ -159,6 +170,7 @@ private: ExecutableTaskPtr task; CurrentMetrics::Increment increment; + Poco::Event is_done; }; using ItemPtr = std::shared_ptr; @@ -168,14 +180,7 @@ private: /// Initially it will be empty RingBuffer pending{0}; - - struct ActiveMeta - { - ItemPtr item; - std::shared_future future; - }; - - RingBuffer active{0}; + RingBuffer active{0}; std::set currently_deleting; std::mutex remove_mutex; diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index 664b09aa82f..ece859fbb0f 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -41,7 +41,12 @@ public: return {"test", name}; } - void onCompleted() override {} + void onCompleted() override + { + auto choice = distribution(generator); + if (choice == 0) + throw std::runtime_error("Unlucky..."); + } private: std::mt19937 generator; From 26ab6ebc0ad6803f12661fbfbc37b943cddb4fa6 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 3 Sep 2021 14:20:34 +0000 Subject: [PATCH 086/177] finalize in destructor --- src/Common/RingBuffer.h | 10 ++-------- src/Storages/MergeTree/BackgroundJobsExecutor.cpp | 11 ++++++++--- src/Storages/MergeTree/BackgroundJobsExecutor.h | 2 -- 3 files changed, 10 insertions(+), 13 deletions(-) diff --git a/src/Common/RingBuffer.h b/src/Common/RingBuffer.h index 9c65ac3dff1..f87bdbea3ac 100644 --- a/src/Common/RingBuffer.h +++ b/src/Common/RingBuffer.h @@ -21,15 +21,9 @@ public: buffer.assign(capacity, {}); } - size_t size() const - { - return count; - } + size_t size() const { return count; } - bool empty() const - { - return count == 0; - } + bool empty() const { return count == 0; } bool tryPush(T element) { diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp index b67156f195f..f863ce91e72 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp @@ -111,8 +111,6 @@ void BackgroundJobAssignee::finish() getContext()->getFetchesExecutor()->removeTasksCorrespondingToStorage(storage_id); getContext()->getMergeMutateExecutor()->removeTasksCorrespondingToStorage(storage_id); } - - finished = true; } @@ -141,7 +139,14 @@ catch (...) /// Catch any exception to avoid thread termination. BackgroundJobAssignee::~BackgroundJobAssignee() { - assert(finished); + try + { + finish(); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } } } diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.h b/src/Storages/MergeTree/BackgroundJobsExecutor.h index 3d776e56a35..82bd6882b96 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.h +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.h @@ -48,8 +48,6 @@ private: /// Mutex for thread safety std::mutex holder_mutex; - bool finished{false}; - public: enum class Type { From 65ff5fd052a0a58a674efec944aa4e6fb64bc99b Mon Sep 17 00:00:00 2001 From: alesapin Date: Fri, 3 Sep 2021 17:57:29 +0300 Subject: [PATCH 087/177] Rewrite test --- src/Common/tests/gtest_log.cpp | 23 +++++++++++++---------- 1 file changed, 13 insertions(+), 10 deletions(-) diff --git a/src/Common/tests/gtest_log.cpp b/src/Common/tests/gtest_log.cpp index a7c49497c35..e60d87a30c0 100644 --- a/src/Common/tests/gtest_log.cpp +++ b/src/Common/tests/gtest_log.cpp @@ -23,26 +23,29 @@ TEST(Logger, Log) TEST(Logger, TestLog) { { /// Test logs visible for test level - Poco::Logger::root().setLevel("test"); - std::ostringstream oss; - Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::StreamChannel(oss))); - Poco::Logger * log = &Poco::Logger::get("Log"); + + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + auto my_channel = Poco::AutoPtr(new Poco::StreamChannel(oss)); + auto log = &Poco::Logger::create("TestLogger", my_channel.get()); + log->setLevel("test"); LOG_TEST(log, "Hello World"); EXPECT_EQ(oss.str(), "Hello World\n"); + Poco::Logger::destroy("TestLogger"); } { /// Test logs invisible for other levels - for (const auto & level : {"trace", "debug", "information", "warning", "fatal"}) + for (const auto & level : {"trace", "debug", "information", "warning", "error", "fatal"}) { - Poco::Logger::root().setLevel(level); - std::ostringstream oss; - Poco::Logger::root().setChannel(Poco::AutoPtr(new Poco::StreamChannel(oss))); - Poco::Logger * log = &Poco::Logger::get("Log"); - + std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM + auto my_channel = Poco::AutoPtr(new Poco::StreamChannel(oss)); + auto log = &Poco::Logger::create(std::string{level} + "_Logger", my_channel.get()); + log->setLevel(level); LOG_TEST(log, "Hello World"); EXPECT_EQ(oss.str(), ""); + + Poco::Logger::destroy(std::string{level} + "_Logger"); } } From 3eec8a3c2f5d97a632e6b67dde152e3e78e2e206 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Fri, 3 Sep 2021 22:15:20 +0000 Subject: [PATCH 088/177] Use boost::circular_buffer --- contrib/boost | 2 +- contrib/boost-cmake/CMakeLists.txt | 13 +- src/CMakeLists.txt | 2 + src/Common/RingBuffer.h | 153 ------------------ src/Common/tests/gtest_ringbuffer.cpp | 142 ---------------- .../MergeTree/BackgroundJobsExecutor.cpp | 15 +- .../MergeTree/MergeMutateExecutor.cpp | 151 ++++++++++++----- src/Storages/MergeTree/MergeMutateExecutor.h | 104 +++--------- 8 files changed, 157 insertions(+), 425 deletions(-) delete mode 100644 src/Common/RingBuffer.h delete mode 100644 src/Common/tests/gtest_ringbuffer.cpp diff --git a/contrib/boost b/contrib/boost index 9cf09dbfd55..4b98e2befd3 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 9cf09dbfd55a5c6202dedbdf40781a51b02c2675 +Subproject commit 4b98e2befd3f3265b0db0acb5d20c4812ef8d88e diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 675931d319f..9a2234ac9f6 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -14,11 +14,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) 
context coroutine graph + circular_buffer ) - if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_FILESYSTEM_LIBRARY AND + if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND - Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY) + Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY AND Boost_CIRCULAR_BUFFER_LIBRARY) set(EXTERNAL_BOOST_FOUND 1) @@ -34,6 +35,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (_boost_context INTERFACE) add_library (_boost_coroutine INTERFACE) add_library (_boost_graph INTERFACE) + add_library (_boost_circular_buffer INTERFACE) target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY}) target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY}) @@ -43,6 +45,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY}) target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY}) target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY}) + target_link_libraries (_boost_circular_buffer INTERFACE ${Boost_CIRCULAR_BUFFER_LIBRARY}) add_library (boost::filesystem ALIAS _boost_filesystem) add_library (boost::iostreams ALIAS _boost_iostreams) @@ -52,6 +55,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (boost::context ALIAS _boost_context) add_library (boost::coroutine ALIAS _boost_coroutine) add_library (boost::graph ALIAS _boost_graph) + add_library (boost::circular_buffer ALIAS _boost_circular_buffer) else() set(EXTERNAL_BOOST_FOUND 0) message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost") @@ -238,4 +242,9 @@ if (NOT EXTERNAL_BOOST_FOUND) target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR}) target_link_libraries(_boost_graph PRIVATE _boost_regex) + + add_library(_boost_circular_buffer INTERFACE) + add_library(boost::circular_buffer ALIAS _boost_circular_buffer) + target_include_directories(_boost_circular_buffer SYSTEM BEFORE INTERFACE ${LIBRARY_DIR}) + endif () diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 40b8526e103..54e73389d3f 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -297,6 +297,7 @@ target_link_libraries(clickhouse_common_io PUBLIC boost::program_options boost::system + boost::circular_buffer ${CITYHASH_LIBRARIES} ${ZLIB_LIBRARIES} pcg_random @@ -335,6 +336,7 @@ dbms_target_link_libraries ( PRIVATE boost::filesystem boost::program_options + boost::circular_buffer clickhouse_common_config clickhouse_common_zookeeper clickhouse_dictionaries_embedded diff --git a/src/Common/RingBuffer.h b/src/Common/RingBuffer.h deleted file mode 100644 index f87bdbea3ac..00000000000 --- a/src/Common/RingBuffer.h +++ /dev/null @@ -1,153 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -/** - * A ring buffer of fixed size. - * With an ability to expand / narrow. - * When narrowing only first N elements remain. 
- */ - -template -class RingBuffer -{ -public: - explicit RingBuffer(size_t capacity_) : capacity(capacity_) - { - buffer.assign(capacity, {}); - } - - size_t size() const { return count; } - - bool empty() const { return count == 0; } - - bool tryPush(T element) - { - if (count == capacity) { - return false; - } - buffer[advance(count)] = std::move(element); - ++count; - return true; - } - - bool tryPop(T * element) - { - if (empty()) { - return false; - } - *element = std::move(buffer[position]); - --count; - position = advance(); - return true; - } - - /// In case of T = std::shared_ptr it won't cause any allocations - template - bool eraseAll(Predicate && predicate) - { - /// Shift all elements to the beginning of the buffer - std::rotate(buffer.begin(), buffer.begin() + position, buffer.end()); - position = 0; - - /// Remove elements - auto end_removed = std::remove_if(buffer.begin(), buffer.begin() + count, predicate); - - if (end_removed == buffer.begin() + count) - return false; - - size_t new_count = std::distance(buffer.begin(), end_removed); - for (size_t i = new_count; i < count; ++i) - buffer[i] = T{}; - - count = new_count; - return true; - } - - template - std::vector getAll(Predicate && predicate) - { - std::vector suitable; - - for (size_t i = 0; i < count; ++i) - { - auto item = buffer[advance(i)]; - if (predicate(item)) - suitable.emplace_back(item); - } - - return suitable; - } - - template - bool has(Predicate && predicate) - { - for (size_t i = 0; i < count; ++i) - if (predicate(buffer[advance(i)])) - return true; - - return false; - } - - - void resize(size_t new_capacity) - { - if (new_capacity > capacity) - expand(new_capacity); - else if (new_capacity < capacity) - narrow(new_capacity); - } - -private: - - size_t advance(size_t amount = 1) - { - if (position + amount >= capacity) - return position + amount - capacity; - return position + amount; - } - - void expand(size_t new_capacity) - { - bool overflow = (position + count) > capacity; - buffer.resize(new_capacity); - - if (overflow) - { - size_t count_before_end = capacity - position; - for (size_t i = 0; i < count_before_end; ++i) - buffer[new_capacity - i] = buffer[capacity - i]; - position = new_capacity - count_before_end; - } - - capacity = new_capacity; - } - - void narrow(size_t new_capacity) - { - std::vector new_buffer(new_capacity); - - count = std::min(new_capacity, count); - for (size_t i = 0; i < count; ++i) - new_buffer[i] = buffer[advance(i)]; - - std::swap(buffer, new_buffer); - - position = 0; - capacity = new_capacity; - } - - - std::vector buffer; - size_t position{0}; - size_t count{0}; - size_t capacity{0}; -}; - - - -} diff --git a/src/Common/tests/gtest_ringbuffer.cpp b/src/Common/tests/gtest_ringbuffer.cpp deleted file mode 100644 index 0b4e2c32375..00000000000 --- a/src/Common/tests/gtest_ringbuffer.cpp +++ /dev/null @@ -1,142 +0,0 @@ -#include - -#include - -#include - -using namespace DB; - -TEST(RingBuffer, Empty) -{ - RingBuffer buffer(1); - - ASSERT_TRUE(buffer.size() == 0u); // NOLINT - ASSERT_TRUE(buffer.empty()); -} - -TEST(RingBuffer, PushAndPop) -{ - RingBuffer buffer(2); - - int i; - ASSERT_TRUE(true == buffer.tryPush(0)); - ASSERT_TRUE(true == buffer.tryPush(1)); - ASSERT_TRUE(false == buffer.tryPush(2)); - - ASSERT_TRUE(2u == buffer.size()); - ASSERT_TRUE(false == buffer.empty()); - - ASSERT_TRUE(true == buffer.tryPop(&i)); - ASSERT_TRUE(0 == i); - ASSERT_TRUE(true == buffer.tryPop(&i)); - ASSERT_TRUE(1 == i); - - ASSERT_TRUE(false == buffer.tryPop(&i)); - 
ASSERT_TRUE(buffer.empty()); - ASSERT_TRUE(true == buffer.empty()); -} - -TEST(RingBuffer, Random) -{ - std::random_device device; - std::mt19937 generator(device()); - - std::uniform_int_distribution<> distribution(0, 3); - - RingBuffer buffer(10); - - int next_element = 0; - int next_received_element = 0; - for (int i = 0; i < 100000; ++i) { - if (distribution(generator) == 0) - { - if (buffer.tryPush(next_element)) - next_element++; - } - else - { - int element; - if (buffer.tryPop(&element)) - { - ASSERT_TRUE(next_received_element == element); - next_received_element++; - } - } - } -} - - -TEST(RingBuffer, Resize) -{ - RingBuffer buffer(10); - - for (size_t i = 0; i < 10; ++i) - ASSERT_TRUE(buffer.tryPush(i)); - - buffer.resize(0); - - ASSERT_TRUE(buffer.empty()); - ASSERT_EQ(buffer.size(), 0u); - - ASSERT_FALSE(buffer.tryPush(42)); - - int value; - ASSERT_FALSE(buffer.tryPop(&value)); - - buffer.resize(1); - - ASSERT_TRUE(buffer.tryPush(42)); - ASSERT_TRUE(buffer.tryPop(&value)); - ASSERT_EQ(value, 42); - - buffer.resize(42); - - for (size_t i = 0; i < 42; ++i) - ASSERT_TRUE(buffer.tryPush(i)); - - buffer.resize(56); - - for (size_t i = 0; i < 42; ++i) - { - ASSERT_TRUE(buffer.tryPop(&value)); - ASSERT_EQ(value, i); - } - - for (size_t i = 0; i < 56; ++i) - ASSERT_TRUE(buffer.tryPush(i)); - - buffer.resize(13); - - for (size_t i = 0; i < 13; ++i) - { - ASSERT_TRUE(buffer.tryPop(&value)); - ASSERT_EQ(value, i); - } -} - - -TEST(RingBuffer, removeElements) -{ - RingBuffer buffer(10); - - for (size_t i = 0; i < 10; ++i) - ASSERT_TRUE(buffer.tryPush(i)); - - int value; - ASSERT_TRUE(buffer.tryPop(&value)); - ASSERT_TRUE(buffer.tryPop(&value)); - ASSERT_TRUE(buffer.tryPop(&value)); - - buffer.eraseAll([](int current) { return current % 2 == 0; }); - - ASSERT_EQ(buffer.size(), 4); - - ASSERT_TRUE(buffer.tryPop(&value)); - ASSERT_EQ(value, 3); - ASSERT_TRUE(buffer.tryPop(&value)); - ASSERT_EQ(value, 5); - ASSERT_TRUE(buffer.tryPop(&value)); - ASSERT_EQ(value, 7); - ASSERT_TRUE(buffer.tryPop(&value)); - ASSERT_EQ(value, 9); -} diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp index f863ce91e72..a8d2f52926c 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp +++ b/src/Storages/MergeTree/BackgroundJobsExecutor.cpp @@ -51,30 +51,21 @@ void BackgroundJobAssignee::postpone() void BackgroundJobAssignee::scheduleMergeMutateTask(ExecutableTaskPtr merge_task) { bool res = getContext()->getMergeMutateExecutor()->trySchedule(merge_task); - if (res) - trigger(); - else - postpone(); + res ? trigger() : postpone(); } void BackgroundJobAssignee::scheduleFetchTask(ExecutableTaskPtr fetch_task) { bool res = getContext()->getFetchesExecutor()->trySchedule(fetch_task); - if (res) - trigger(); - else - postpone(); + res ? trigger() : postpone(); } void BackgroundJobAssignee::scheduleMoveTask(ExecutableTaskPtr move_task) { bool res = getContext()->getMovesExecutor()->trySchedule(move_task); - if (res) - trigger(); - else - postpone(); + res ? 
trigger() : postpone(); } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index 2c3590dc134..15ee6f7f2fc 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -22,8 +22,88 @@ String MergeTreeBackgroundExecutor::toString(Type type) } +void MergeTreeBackgroundExecutor::updateConfiguration() +{ + auto new_threads_count = threads_count_getter(); + auto new_max_tasks_count = max_task_count_getter(); + + try + { + pending.set_capacity(new_max_tasks_count); + active.set_capacity(new_max_tasks_count); + + pool.setMaxFreeThreads(new_threads_count); + pool.setMaxThreads(new_threads_count); + pool.setQueueSize(new_max_tasks_count); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + threads_count = new_threads_count; + max_tasks_count = new_max_tasks_count; +} + + +void MergeTreeBackgroundExecutor::wait() +{ + { + std::lock_guard lock(mutex); + shutdown = true; + has_tasks.notify_all(); + } + + if (scheduler.joinable()) + scheduler.join(); + + pool.wait(); +} + + +bool MergeTreeBackgroundExecutor::trySchedule(ExecutableTaskPtr task) +{ + std::lock_guard lock(mutex); + + if (shutdown) + return false; + + try + { + /// This is needed to increase / decrease the number of threads at runtime + if (update_timer.compareAndRestartDeferred(10.)) + updateConfiguration(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + auto & value = CurrentMetrics::values[metric]; + if (value.load() >= static_cast(max_tasks_count)) + return false; + + /// Just check if the main scheduler thread in excellent condition + if (!scheduler.joinable()) + { + LOG_ERROR(&Poco::Logger::get("MergeTreeBackgroundExecutor"), "Scheduler thread is dead. Trying to alive.."); + scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); + + if (!scheduler.joinable()) + LOG_FATAL(&Poco::Logger::get("MergeTreeBackgroundExecutor"), "Scheduler thread is dead permanently. 
Restart is needed"); + } + + + pending.push_back(std::make_shared(std::move(task), metric)); + + has_tasks.notify_one(); + return true; +} + + void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id) { + /// Executor is global, so protect from any concurrent storage shutdowns std::lock_guard remove_lock(remove_mutex); std::vector tasks_to_wait; @@ -34,8 +114,13 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id currently_deleting.emplace(id); /// Erase storage related tasks from pending and select active tasks to wait for - pending.eraseAll([&] (auto item) -> bool { return item->task->getStorageID() == id; }); - tasks_to_wait = active.getAll([&] (auto item) -> bool { return item->task->getStorageID() == id; }); + auto it = std::remove_if(pending.begin(), pending.end(), + [&] (auto item) -> bool { return item->task->getStorageID() == id; } ); + pending.erase(it, pending.end()); + + /// Copy items to wait for their completion + std::copy_if(active.begin(), active.end(), std::back_inserter(tasks_to_wait), + [&] (auto item) -> bool { return item->task->getStorageID() == id; }); } @@ -60,55 +145,48 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) { setThreadName(name.c_str()); - bool checked{false}; - - auto check_if_currently_deleting = [&] () + auto erase_from_active = [&] { - checked = true; - return active.eraseAll([&] (auto & x) { return x == item; }); + DENY_ALLOCATIONS_IN_SCOPE; + active.erase(std::remove(active.begin(), active.end(), item), active.end()); }; - - SCOPE_EXIT({ - if (checked) - return; - std::lock_guard guard(mutex); - check_if_currently_deleting(); - }); - try { if (item->task->execute()) { + DENY_ALLOCATIONS_IN_SCOPE; std::lock_guard guard(mutex); - if (check_if_currently_deleting()) + if (currently_deleting.contains(item->task->getStorageID())) + { + erase_from_active(); return; + } - pending.tryPush(item); + pending.push_back(item); + erase_from_active(); item->is_done.reset(); has_tasks.notify_one(); return; } + std::lock_guard guard(mutex); + erase_from_active(); + has_tasks.notify_one(); /// In a situation of a lack of memory this method can throw an exception, /// because it may interact somehow with BackgroundSchedulePool, which may allocate memory /// But it is rather safe, because we have try...catch block here, and another one in ThreadPool. item->task->onCompleted(); - - std::lock_guard guard(mutex); - has_tasks.notify_one(); } catch(...) { std::lock_guard guard(mutex); + erase_from_active(); has_tasks.notify_one(); - try - { - item->task->onCompleted(); - } - catch (...) {} tryLogCurrentException(__PRETTY_FUNCTION__); + /// Do not want any exceptions + try { item->task->onCompleted(); } catch (...) {} } } @@ -117,37 +195,36 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() { DENY_ALLOCATIONS_IN_SCOPE; - bool status; - while (true) { std::unique_lock lock(mutex); - has_tasks.wait(lock, [this](){ return !pending.empty() || shutdown_suspend; }); + has_tasks.wait(lock, [this](){ return !pending.empty() || shutdown; }); - if (shutdown_suspend) + if (shutdown) break; - ItemPtr item; - if (!pending.tryPop(&item)) - continue; + ItemPtr item = std::move(pending.front()); + pending.pop_front(); + /// Execute a piece of task bool res = pool.trySchedule([this, item] { routine(item); + /// When storage shutdowns it will wait until all related background tasks + /// are finished, because they may want to interact with its fields + /// and this will cause segfault. 
item->is_done.set(); }); if (!res) { - active.eraseAll([&] (auto x) { return x == item; }); - status = pending.tryPush(item); - assert(status); + active.erase(std::remove(active.begin(), active.end(), item), active.end()); + pending.push_back(item); continue; } - status = active.tryPush(std::move(item)); - assert(status); + active.push_back(std::move(item)); } } diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeMutateExecutor.h index 82da33893e6..67edf282862 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeMutateExecutor.h @@ -8,17 +8,36 @@ #include #include +#include + #include #include #include #include -#include #include namespace DB { +/** + * Executor for a background MergeTree related operations such as merges, mutations, fetches an so on. + * It can execute only successors of ExecutableTask interface. + * Which is a self-written coroutine. It suspends, when returns true from execute() method. + * + * Executor consists of ThreadPool to execute pieces of a task (basically calls 'execute' on a task) + * and a scheduler thread, which manages the tasks. Due to bad experience of working with high memory under + * high memory pressure scheduler thread mustn't do any allocations, + * because it will be a fatal error if this thread will die from a random exception. + * + * There are two queues of a tasks: pending (main queue for all the tasks) and active (currently executing). + * There is an invariant, that task may occur only in one of these queue. It can occur in both queues only in critical sections. + * + * Due to all caveats I described above we use boost::circular_buffer as a container for queues. + * + * Another nuisance that we faces with is than backgroud operations always interacts with an associated Storage. + * So, when a Storage want to shutdown, it must wait until all its background operaions are finished. + */ class MergeTreeBackgroundExecutor : public shared_ptr_helper { public: @@ -54,61 +73,11 @@ public: wait(); } - bool trySchedule(ExecutableTaskPtr task) - { - std::lock_guard lock(mutex); - - if (shutdown_suspend) - return false; - - try - { - /// This is needed to increase / decrease the number of threads at runtime - if (update_timer.compareAndRestartDeferred(10.)) - updateConfiguration(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - - auto & value = CurrentMetrics::values[metric]; - if (value.load() >= static_cast(max_tasks_count)) - return false; - - if (!scheduler.joinable()) - { - LOG_ERROR(&Poco::Logger::get("MergeTreeBackgroundExecutor"), "Scheduler thread is dead. Trying to alive.."); - scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); - - if (!scheduler.joinable()) - LOG_FATAL(&Poco::Logger::get("MergeTreeBackgroundExecutor"), "Scheduler thread is dead permanently. 
Restart is needed"); - } - - - if (!pending.tryPush(std::make_shared(std::move(task), metric))) - return false; - - - has_tasks.notify_one(); - return true; - } + bool trySchedule(ExecutableTaskPtr task); void removeTasksCorrespondingToStorage(StorageID id); - void wait() - { - { - std::lock_guard lock(mutex); - shutdown_suspend = true; - has_tasks.notify_all(); - } - - if (scheduler.joinable()) - scheduler.join(); - - pool.wait(); - } + void wait(); size_t activeCount() { @@ -124,28 +93,7 @@ public: private: - void updateConfiguration() - { - auto new_threads_count = threads_count_getter(); - auto new_max_tasks_count = max_task_count_getter(); - - try - { - pending.resize(new_max_tasks_count); - active.resize(new_max_tasks_count); - - pool.setMaxFreeThreads(new_threads_count); - pool.setMaxThreads(new_threads_count); - pool.setQueueSize(new_max_tasks_count); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - - threads_count = new_threads_count; - max_tasks_count = new_max_tasks_count; - } + void updateConfiguration(); static String toString(Type type); @@ -179,15 +127,15 @@ private: void schedulerThreadFunction(); /// Initially it will be empty - RingBuffer pending{0}; - RingBuffer active{0}; + boost::circular_buffer pending{0}; + boost::circular_buffer active{0}; std::set currently_deleting; std::mutex remove_mutex; std::mutex mutex; std::condition_variable has_tasks; - std::atomic_bool shutdown_suspend{false}; + std::atomic_bool shutdown{false}; ThreadPool pool; ThreadFromGlobalPool scheduler; From bf6ba796f8b5df9e4a384562bb0370118c2222cc Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 2 Sep 2021 21:05:13 +0300 Subject: [PATCH 089/177] Fix UUID overlap in DROP TABLE for internal DDL from MaterializeMySQL This will fix race with DatabaseCatalog::loadMarkedAsDroppedTables(), since MaterializeMySQL, and MaterializedMySQLSyncThread in background, will be started earlier then DatabaseCatalog::loadMarkedAsDroppedTables() and will move those tables to metadata_dropped, and after loadMarkedAsDroppedTables() will start and try to load partially dropped tables and will hit UUID overlap: 12:02:51.536783 [ 3026034 ] {} Application: starting up 12:02:53.019282 [ 3026034 ] {} DatabaseMaterializeMySQL (mysql): Total 9 tables and 0 dictionaries. 12:02:53.041699 [ 3026200 ] {} mysql.data (7143b65f-6982-4600-b143-b65f6982e600): Loading data parts 12:02:53.041740 [ 3026200 ] {} mysql.data (7143b65f-6982-4600-b143-b65f6982e600): There are no data parts 12:02:53.620382 [ 3026034 ] {} DatabaseMaterializeMySQL (mysql): Starting up tables. 12:03:00.669730 [ 3026183 ] {} executeQuery: (internal) /*Materialize MySQL step 1: execute MySQL DDL for dump data*/ DROP TABLE mysql.data 12:03:00.741894 [ 3026269 ] {} DatabaseCatalog: Trying load partially dropped table mysql.data (7143b65f-6982-4600-b143-b65f6982e600) from /var/lib/clickhouse/metadata_dropped/mysql.data.7143b65f-6982-4600-b143-b65f6982e600.sql 12:03:00.742582 [ 3026269 ] {} mysql.data (7143b65f-6982-4600-b143-b65f6982e600): Loading data parts 12:03:00.742650 [ 3026269 ] {} mysql.data (7143b65f-6982-4600-b143-b65f6982e600): There are no data parts 12:03:00.773137 [ 3026034 ] {} Application: Caught exception while loading metadata: Code: 57, e.displayText() = DB::Exception: Mapping for table with UUID=7143b65f-6982-4600-b143-b65f6982e600 already exists. 
It happened due to UUID collision, most likely because some not random UUIDs were manually specified in CREATE queries., Stack trace (when copying this message, always include the lines below): 12:03:01.224557 [ 3026034 ] {} Application: DB::Exception: Mapping for table with UUID=7143b65f-6982-4600-b143-b65f6982e600 already exists. It happened due to UUID collision, most likely because some not random UUIDs were manually specified in CREATE queries. Cc: @zhang2014 --- programs/server/Server.cpp | 4 ++++ src/Interpreters/DatabaseCatalog.cpp | 1 - src/Interpreters/DatabaseCatalog.h | 2 +- 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index bf0d33d9c5c..c09b4f774fa 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -1131,6 +1131,10 @@ if (ThreadFuzzer::instance().isEffective()) global_context->setSystemZooKeeperLogAfterInitializationIfNeeded(); /// After the system database is created, attach virtual system tables (in addition to query_log and part_log) attachSystemTablesServer(*database_catalog.getSystemDatabase(), has_zookeeper); + /// Firstly remove partially dropped databases, to avoid race with MaterializedMySQLSyncThread, + /// that may execute DROP before loadMarkedAsDroppedTables() in background, + /// and so loadMarkedAsDroppedTables() will find it and try to add, and UUID will overlap. + database_catalog.loadMarkedAsDroppedTables(); /// Then, load remaining databases loadMetadata(global_context, default_database); database_catalog.loadDatabases(); diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index 6e0ca97df1d..af60eeeaba3 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -146,7 +146,6 @@ void DatabaseCatalog::initializeAndLoadTemporaryDatabase() void DatabaseCatalog::loadDatabases() { - loadMarkedAsDroppedTables(); auto task_holder = getContext()->getSchedulePool().createTask("DatabaseCatalog", [this](){ this->dropTableDataTask(); }); drop_task = std::make_unique(std::move(task_holder)); (*drop_task)->activate(); diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index 787fd9dc512..72dd28d335b 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -130,6 +130,7 @@ public: void initializeAndLoadTemporaryDatabase(); void loadDatabases(); + void loadMarkedAsDroppedTables(); /// Get an object that protects the table from concurrently executing multiple DDL operations. 
DDLGuardPtr getDDLGuard(const String & database, const String & table); @@ -241,7 +242,6 @@ private: }; using TablesMarkedAsDropped = std::list; - void loadMarkedAsDroppedTables(); void dropTableDataTask(); void dropTableFinally(const TableMarkedAsDropped & table); From 69fa28f332049294f37f5b833333b4d91a3989c4 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 4 Sep 2021 14:29:40 +0300 Subject: [PATCH 090/177] Fixed tests --- .../ExecutablePoolDictionarySource.cpp | 3 --- tests/config/executable_dictionary.xml | 16 ++++++++-------- 2 files changed, 8 insertions(+), 11 deletions(-) diff --git a/src/Dictionaries/ExecutablePoolDictionarySource.cpp b/src/Dictionaries/ExecutablePoolDictionarySource.cpp index 10164fe7310..c5d081bb3e6 100644 --- a/src/Dictionaries/ExecutablePoolDictionarySource.cpp +++ b/src/Dictionaries/ExecutablePoolDictionarySource.cpp @@ -4,9 +4,6 @@ #include #include -#include -#include - #include #include diff --git a/tests/config/executable_dictionary.xml b/tests/config/executable_dictionary.xml index 6089f57a3d7..e2b0bd0663e 100644 --- a/tests/config/executable_dictionary.xml +++ b/tests/config/executable_dictionary.xml @@ -123,8 +123,8 @@ - printf "1\tValue\n" - TabSeparated + JSONEachRow + cd /; clickhouse-local --input-format JSONEachRow --output-format JSONEachRow --structure 'id UInt64' --query "SELECT id, 'Value' AS value FROM table" false @@ -156,8 +156,8 @@ - echo "Value" - TabSeparated + JSONEachRow + cd /; clickhouse-local --input-format JSONEachRow --output-format JSONEachRow --structure 'id UInt64' --query "SELECT 'Value' AS value FROM table" true @@ -197,8 +197,8 @@ - printf "1\tFirstKey\tValue\n" - TabSeparated + JSONEachRow + cd /; clickhouse-local --input-format JSONEachRow --output-format JSONEachRow --structure 'id UInt64, id_key String' --query "SELECT id, id_key, 'Value' AS value FROM table" false @@ -238,8 +238,8 @@ - echo "Value" - TabSeparated + JSONEachRow + cd /; clickhouse-local --input-format JSONEachRow --output-format JSONEachRow --structure 'id UInt64, id_key String' --query "SELECT 'Value' AS value FROM table" true From ac2d9a73a8d62d72114b1ab27a656c922b7f05be Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 4 Sep 2021 13:07:59 +0300 Subject: [PATCH 091/177] User managed slots --- .../materialized-postgresql.md | 14 ++++- docs/en/operations/settings/settings.md | 10 +++- .../DatabaseMaterializedPostgreSQL.cpp | 6 +- .../MaterializedPostgreSQLSettings.h | 2 + .../PostgreSQLReplicationHandler.cpp | 57 ++++++++++++++----- .../PostgreSQL/PostgreSQLReplicationHandler.h | 10 ++-- .../StorageMaterializedPostgreSQL.cpp | 6 +- .../test.py | 41 ++++++++++++- 8 files changed, 119 insertions(+), 27 deletions(-) diff --git a/docs/en/engines/database-engines/materialized-postgresql.md b/docs/en/engines/database-engines/materialized-postgresql.md index 89c7c803bb3..77a5f2af0e0 100644 --- a/docs/en/engines/database-engines/materialized-postgresql.md +++ b/docs/en/engines/database-engines/materialized-postgresql.md @@ -31,6 +31,10 @@ ENGINE = MaterializedPostgreSQL('host:port', ['database' | database], 'user', 'p - [materialized_postgresql_allow_automatic_update](../../operations/settings/settings.md#materialized-postgresql-allow-automatic-update) +- [materialized_postgresql_replication_slot](../../operations/settings/settings.md#materialized-postgresql-replication-slot) + +- [materialized_postgresql_snapshot](../../operations/settings/settings.md#materialized-postgresql-snapshot) + ``` sql CREATE DATABASE database1 ENGINE = 
MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password')
@@ -73,7 +77,7 @@ WHERE oid = 'postgres_table'::regclass;
 !!! warning "Warning"
     Replication of [**TOAST**](https://www.postgresql.org/docs/9.5/storage-toast.html) values is not supported. The default value for the data type will be used.
- 
+
 ## Example of Use {#example-of-use}
 
 ``` sql
 CREATE DATABASE postgresql_db
 ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password');
 
 SELECT * FROM postgresql_db.postgres_table;
 ```
+
+## Notes {#notes}
+
+- Failover of the logical replication slot.
+
+Logical replication slots that exist on the primary are not available on standby replicas.
+So if there is a failover, the new primary (the old physical standby) won't be aware of any slots that existed on the old primary. This will lead to broken replication from PostgreSQL.
+A solution to this is to manage replication slots yourself and define a permanent replication slot (some information can be found [here](https://patroni.readthedocs.io/en/latest/SETTINGS.html)). You'll need to pass the slot name via the `materialized_postgresql_replication_slot` setting, and the slot has to be created with the `EXPORT SNAPSHOT` option. The snapshot identifier needs to be passed via the `materialized_postgresql_snapshot` setting.
diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index a1c7d1aab32..5635321b598 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -3436,6 +3436,14 @@ Possible values:
 
 Default value: `0`.
 
+## materialized_postgresql_replication_slot {#materialized-postgresql-replication-slot}
+
+Allows user-managed replication slots. Must be used together with `materialized_postgresql_snapshot`.
+
+## materialized_postgresql_snapshot {#materialized-postgresql-snapshot}
+
+A text string identifying a snapshot, from which the initial dump of the tables will be performed. Must be used together with `materialized_postgresql_replication_slot`.
+
 ## allow_experimental_projection_optimization {#allow-experimental-projection-optimization}
 
 Enables or disables [projection](../../engines/table-engines/mergetree-family/mergetree.md#projections) optimization when processing `SELECT` queries.
@@ -3449,7 +3457,7 @@ Default value: `0`.
 
 ## force_optimize_projection {#force-optimize-projection}
 
-Enables or disables the obligatory use of [projections](../../engines/table-engines/mergetree-family/mergetree.md#projections) in `SELECT` queries, when projection optimization is enabled (see [allow_experimental_projection_optimization](#allow-experimental-projection-optimization) setting). 
+Enables or disables the obligatory use of [projections](../../engines/table-engines/mergetree-family/mergetree.md#projections) in `SELECT` queries, when projection optimization is enabled (see [allow_experimental_projection_optimization](#allow-experimental-projection-optimization) setting).
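For illustration, here is a minimal sketch of how the two `materialized_postgresql_*` settings documented above are meant to be used together. The slot name `clickhouse_slot`, the connection parameters, and the snapshot identifier are placeholders; the slot must be created over a PostgreSQL replication-protocol connection (for example `psql "dbname=postgres_database replication=database"`), because only then does `EXPORT_SNAPSHOT` return a snapshot identifier:

``` sql
-- On PostgreSQL, over a replication connection:
CREATE_REPLICATION_SLOT clickhouse_slot LOGICAL pgoutput EXPORT_SNAPSHOT;
-- The result row contains the slot name, the start LSN and an exported snapshot
-- identifier such as '00000003-00000002-1'.

-- On ClickHouse, pass both values when creating the database:
CREATE DATABASE postgres_db
ENGINE = MaterializedPostgreSQL('postgres1:5432', 'postgres_database', 'postgres_user', 'postgres_password')
SETTINGS materialized_postgresql_replication_slot = 'clickhouse_slot',
         materialized_postgresql_snapshot = '00000003-00000002-1';
```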
Possible values: diff --git a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp index c9ea8d12ef2..218dda94d31 100644 --- a/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp +++ b/src/Databases/PostgreSQL/DatabaseMaterializedPostgreSQL.cpp @@ -61,10 +61,8 @@ void DatabaseMaterializedPostgreSQL::startSynchronization() connection_info, getContext(), is_attach, - settings->materialized_postgresql_max_block_size.value, - settings->materialized_postgresql_allow_automatic_update, - /* is_materialized_postgresql_database = */ true, - settings->materialized_postgresql_tables_list.value); + *settings, + /* is_materialized_postgresql_database = */ true); postgres::Connection connection(connection_info); NameSet tables_to_replicate; diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.h b/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.h index 1d986b223e9..cc147a01d32 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.h +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLSettings.h @@ -17,6 +17,8 @@ namespace DB M(UInt64, materialized_postgresql_max_block_size, 65536, "Number of row collected before flushing data into table.", 0) \ M(String, materialized_postgresql_tables_list, "", "List of tables for MaterializedPostgreSQL database engine", 0) \ M(Bool, materialized_postgresql_allow_automatic_update, false, "Allow to reload table in the background, when schema changes are detected", 0) \ + M(String, materialized_postgresql_replication_slot, "", "A user-created replication slot", 0) \ + M(String, materialized_postgresql_snapshot, "", "User provided snapshot in case he manages replication slots himself", 0) \ DECLARE_SETTINGS_TRAITS(MaterializedPostgreSQLSettingsTraits, LIST_OF_MATERIALIZED_POSTGRESQL_SETTINGS) diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp index c8c74d2ddaa..1bda6d13e11 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp +++ b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.cpp @@ -32,24 +32,28 @@ PostgreSQLReplicationHandler::PostgreSQLReplicationHandler( const postgres::ConnectionInfo & connection_info_, ContextPtr context_, bool is_attach_, - const size_t max_block_size_, - bool allow_automatic_update_, - bool is_materialized_postgresql_database_, - const String tables_list_) + const MaterializedPostgreSQLSettings & replication_settings, + bool is_materialized_postgresql_database_) : log(&Poco::Logger::get("PostgreSQLReplicationHandler")) , context(context_) , is_attach(is_attach_) , remote_database_name(remote_database_name_) , current_database_name(current_database_name_) , connection_info(connection_info_) - , max_block_size(max_block_size_) - , allow_automatic_update(allow_automatic_update_) + , max_block_size(replication_settings.materialized_postgresql_max_block_size) + , allow_automatic_update(replication_settings.materialized_postgresql_allow_automatic_update) , is_materialized_postgresql_database(is_materialized_postgresql_database_) - , tables_list(tables_list_) + , tables_list(replication_settings.materialized_postgresql_tables_list) + , user_provided_snapshot(replication_settings.materialized_postgresql_snapshot) , connection(std::make_shared(connection_info_)) , milliseconds_to_wait(RESCHEDULE_MS) { - replication_slot = fmt::format("{}_ch_replication_slot", replication_identifier); + replication_slot = 
replication_settings.materialized_postgresql_replication_slot; + if (replication_slot.empty()) + { + user_managed_slot = false; + replication_slot = fmt::format("{}_ch_replication_slot", replication_identifier); + } publication_name = fmt::format("{}_ch_publication", replication_identifier); startup_task = context->getSchedulePool().createTask("PostgreSQLReplicaStartup", [this]{ waitConnectionAndStart(); }); @@ -121,7 +125,20 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) auto initial_sync = [&]() { - createReplicationSlot(tx, start_lsn, snapshot_name); + LOG_TRACE(log, "Starting tables sync load"); + + if (user_managed_slot) + { + if (user_provided_snapshot.empty()) + throw Exception(ErrorCodes::BAD_ARGUMENTS, + "Using a user-defined replication slot must be provided with a snapshot from EXPORT SNAPSHOT when the slot is created." + "Pass it to `materialized_postgresql_snapshot` setting"); + snapshot_name = user_provided_snapshot; + } + else + { + createReplicationSlot(tx, start_lsn, snapshot_name); + } for (const auto & [table_name, storage] : materialized_storages) { @@ -147,12 +164,17 @@ void PostgreSQLReplicationHandler::startSynchronization(bool throw_on_error) /// Recreation of a replication slot imposes reloading of all tables. if (!isReplicationSlotExist(tx, start_lsn, /* temporary */false)) { + if (user_managed_slot) + throw Exception(ErrorCodes::BAD_ARGUMENTS, "Having replication slot `{}` from settings, but it does not exist", replication_slot); + initial_sync(); } /// Always drop replication slot if it is CREATE query and not ATTACH. else if (!is_attach || new_publication) { - dropReplicationSlot(tx); + if (!user_managed_slot) + dropReplicationSlot(tx); + initial_sync(); } /// Synchronization and initial load already took place - do not create any new tables, just fetch StoragePtr's @@ -376,6 +398,8 @@ bool PostgreSQLReplicationHandler::isReplicationSlotExist(pqxx::nontransaction & void PostgreSQLReplicationHandler::createReplicationSlot( pqxx::nontransaction & tx, String & start_lsn, String & snapshot_name, bool temporary) { + assert(temporary || !user_managed_slot); + String query_str, slot_name; if (temporary) slot_name = replication_slot + "_tmp"; @@ -401,6 +425,8 @@ void PostgreSQLReplicationHandler::createReplicationSlot( void PostgreSQLReplicationHandler::dropReplicationSlot(pqxx::nontransaction & tx, bool temporary) { + assert(temporary || !user_managed_slot); + std::string slot_name; if (temporary) slot_name = replication_slot + "_tmp"; @@ -433,14 +459,17 @@ void PostgreSQLReplicationHandler::shutdownFinal() connection->execWithRetry([&](pqxx::nontransaction & tx) { - if (isReplicationSlotExist(tx, last_committed_lsn, /* temporary */false)) - dropReplicationSlot(tx, /* temporary */false); + if (isReplicationSlotExist(tx, last_committed_lsn, /* temporary */true)) + dropReplicationSlot(tx, /* temporary */true); }); + if (user_managed_slot) + return; + connection->execWithRetry([&](pqxx::nontransaction & tx) { - if (isReplicationSlotExist(tx, last_committed_lsn, /* temporary */true)) - dropReplicationSlot(tx, /* temporary */true); + if (isReplicationSlotExist(tx, last_committed_lsn, /* temporary */false)) + dropReplicationSlot(tx, /* temporary */false); }); } catch (Exception & e) diff --git a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h index 3a0bedc0852..eacf6b69b3b 100644 --- a/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h +++ 
b/src/Storages/PostgreSQL/PostgreSQLReplicationHandler.h @@ -1,6 +1,7 @@ #pragma once #include "MaterializedPostgreSQLConsumer.h" +#include "MaterializedPostgreSQLSettings.h" #include #include @@ -25,10 +26,8 @@ public: const postgres::ConnectionInfo & connection_info_, ContextPtr context_, bool is_attach_, - const size_t max_block_size_, - bool allow_automatic_update_, - bool is_materialized_postgresql_database_, - const String tables_list = ""); + const MaterializedPostgreSQLSettings & replication_settings, + bool is_materialized_postgresql_database_); /// Activate task to be run from a separate thread: wait until connection is available and call startReplication(). void startup(); @@ -108,6 +107,9 @@ private: /// A coma-separated list of tables, which are going to be replicated for database engine. By default, a whole database is replicated. String tables_list; + bool user_managed_slot = true; + String user_provided_snapshot; + String replication_slot, publication_name; /// Shared between replication_consumer and replication_handler, but never accessed concurrently. diff --git a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp index aa27a54cdac..73a685af9b4 100644 --- a/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp +++ b/src/Storages/PostgreSQL/StorageMaterializedPostgreSQL.cpp @@ -64,6 +64,8 @@ StorageMaterializedPostgreSQL::StorageMaterializedPostgreSQL( setInMemoryMetadata(storage_metadata); String replication_identifier = remote_database_name + "_" + remote_table_name_; + replication_settings->materialized_postgresql_tables_list = remote_table_name_; + replication_handler = std::make_unique( replication_identifier, remote_database_name, @@ -71,8 +73,8 @@ StorageMaterializedPostgreSQL::StorageMaterializedPostgreSQL( connection_info, getContext(), is_attach, - replication_settings->materialized_postgresql_max_block_size.value, - /* allow_automatic_update */ false, /* is_materialized_postgresql_database */false); + *replication_settings, + /* is_materialized_postgresql_database */false); if (!is_attach) { diff --git a/tests/integration/test_postgresql_replica_database_engine/test.py b/tests/integration/test_postgresql_replica_database_engine/test.py index 68b42d91fb6..1e7188458a9 100644 --- a/tests/integration/test_postgresql_replica_database_engine/test.py +++ b/tests/integration/test_postgresql_replica_database_engine/test.py @@ -31,18 +31,30 @@ postgres_table_template_3 = """ key1 Integer NOT NULL, value1 Integer, key2 Integer NOT NULL, value2 Integer NOT NULL) """ -def get_postgres_conn(ip, port, database=False, auto_commit=True, database_name='postgres_database'): +def get_postgres_conn(ip, port, database=False, auto_commit=True, database_name='postgres_database', replication=False): if database == True: conn_string = "host={} port={} dbname='{}' user='postgres' password='mysecretpassword'".format(ip, port, database_name) else: conn_string = "host={} port={} user='postgres' password='mysecretpassword'".format(ip, port) + if replication: + conn_string += " replication='database'" + conn = psycopg2.connect(conn_string) if auto_commit: conn.set_isolation_level(ISOLATION_LEVEL_AUTOCOMMIT) conn.autocommit = True return conn +def create_replication_slot(conn, slot_name='user_slot'): + cursor = conn.cursor() + cursor.execute('CREATE_REPLICATION_SLOT {} LOGICAL pgoutput EXPORT_SNAPSHOT'.format(slot_name)) + result = cursor.fetchall() + print(result[0][0]) # slot name + print(result[0][1]) # start lsn + 
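+    # result[0][2] is the snapshot identifier exported by EXPORT_SNAPSHOT; the test passes it
+    # to the materialized_postgresql_snapshot setting so the initial dump starts from this snapshot.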
print(result[0][2]) # snapshot + return result[0][2] + def create_postgres_db(cursor, name='postgres_database'): cursor.execute("CREATE DATABASE {}".format(name)) @@ -941,6 +953,33 @@ def test_quoting(started_cluster): drop_materialized_db() +def test_user_managed_slots(started_cluster): + conn = get_postgres_conn(ip=started_cluster.postgres_ip, + port=started_cluster.postgres_port, + database=True) + cursor = conn.cursor() + table_name = 'test_table' + create_postgres_table(cursor, table_name); + instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(10000)".format(table_name)) + + slot_name = 'user_slot' + replication_connection = get_postgres_conn(ip=started_cluster.postgres_ip, port=started_cluster.postgres_port, + database=True, replication=True, auto_commit=True) + snapshot = create_replication_slot(replication_connection, slot_name=slot_name) + create_materialized_db(ip=started_cluster.postgres_ip, + port=started_cluster.postgres_port, + settings=["materialized_postgresql_replication_slot = '{}'".format(slot_name), + "materialized_postgresql_snapshot = '{}'".format(snapshot)]) + check_tables_are_synchronized(table_name); + instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(10000, 10000)".format(table_name)) + check_tables_are_synchronized(table_name); + instance.restart_clickhouse() + instance.query("INSERT INTO postgres_database.{} SELECT number, number from numbers(20000, 10000)".format(table_name)) + check_tables_are_synchronized(table_name); + drop_postgres_table(cursor, table_name) + drop_materialized_db() + + if __name__ == '__main__': cluster.start() input("Cluster created, press any key to destroy...") From bcbc830b0512970de78e86e2037ec0d510e636be Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 4 Sep 2021 15:28:11 +0300 Subject: [PATCH 092/177] Function dictGet small fix --- src/Functions/FunctionsExternalDictionaries.h | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/src/Functions/FunctionsExternalDictionaries.h b/src/Functions/FunctionsExternalDictionaries.h index 4f79b06b44a..73810a306d6 100644 --- a/src/Functions/FunctionsExternalDictionaries.h +++ b/src/Functions/FunctionsExternalDictionaries.h @@ -104,9 +104,11 @@ public: const auto * attr_name_col = checkAndGetColumnConst(sample_columns.getByPosition(1).column.get()); if (!attr_name_col) - throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument of function dictGet... 
must be a constant string"); + throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Second argument of function dictGet must be a constant string"); - return getDictionary(dict_name_col->getValue())->isInjective(attr_name_col->getValue()); + const auto dictionary_name = dict_name_col->getValue(); + const auto attribute_name = attr_name_col->getValue(); + return getDictionary(dictionary_name)->isInjective(attribute_name); } DictionaryStructure getDictionaryStructure(const String & dictionary_name) const @@ -585,14 +587,14 @@ private: { const auto * tuple_column = tuple_col.getColumnPtr(i).get(); - const auto * attribute_name_column = checkAndGetColumn(tuple_column); + const auto * attribute_name_column = checkAndGetColumnConst(tuple_column); if (!attribute_name_column) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Tuple second argument of function {} must contain multiple constant string columns", getName()); - attribute_names.emplace_back(attribute_name_column->getDataAt(0)); + attribute_names.emplace_back(attribute_name_column->getValue()); } } else From 327bd44f23502226b3657ed415afe51b26b3354c Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 4 Sep 2021 15:46:41 +0300 Subject: [PATCH 093/177] Fix gtest --- src/Common/tests/gtest_log.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/tests/gtest_log.cpp b/src/Common/tests/gtest_log.cpp index e60d87a30c0..4769b8c0261 100644 --- a/src/Common/tests/gtest_log.cpp +++ b/src/Common/tests/gtest_log.cpp @@ -26,7 +26,7 @@ TEST(Logger, TestLog) std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM auto my_channel = Poco::AutoPtr(new Poco::StreamChannel(oss)); - auto log = &Poco::Logger::create("TestLogger", my_channel.get()); + auto * log = &Poco::Logger::create("TestLogger", my_channel.get()); log->setLevel("test"); LOG_TEST(log, "Hello World"); @@ -39,7 +39,7 @@ TEST(Logger, TestLog) { std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM auto my_channel = Poco::AutoPtr(new Poco::StreamChannel(oss)); - auto log = &Poco::Logger::create(std::string{level} + "_Logger", my_channel.get()); + auto * log = &Poco::Logger::create(std::string{level} + "_Logger", my_channel.get()); log->setLevel(level); LOG_TEST(log, "Hello World"); From 5f9952d7420d9f173c210abb27d829d0f9708c1e Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 4 Sep 2021 15:56:19 +0300 Subject: [PATCH 094/177] Add clang-tidy-12 --- cmake/analysis.cmake | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/cmake/analysis.cmake b/cmake/analysis.cmake index 267bb34248b..24d8168e2c0 100644 --- a/cmake/analysis.cmake +++ b/cmake/analysis.cmake @@ -6,7 +6,7 @@ if (ENABLE_CLANG_TIDY) message(FATAL_ERROR "clang-tidy requires CMake version at least 3.6.") endif() - find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-11" "clang-tidy-10" "clang-tidy-9" "clang-tidy-8") + find_program (CLANG_TIDY_PATH NAMES "clang-tidy" "clang-tidy-12" "clang-tidy-11" "clang-tidy-10" "clang-tidy-9" "clang-tidy-8") if (CLANG_TIDY_PATH) message(STATUS From a879c907a228041a94e9dee387a56e5083c273fe Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 4 Sep 2021 16:00:42 +0300 Subject: [PATCH 095/177] Update PVS hashsum --- docker/test/pvs/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/pvs/Dockerfile b/docker/test/pvs/Dockerfile index 7bd45ba4018..35e07748845 100644 --- a/docker/test/pvs/Dockerfile +++ b/docker/test/pvs/Dockerfile @@ -28,7 +28,7 @@ RUN apt-get update --yes \ ENV PKG_VERSION="pvs-studio-latest" 
RUN set -x \ - && export PUBKEY_HASHSUM="686e5eb8b3c543a5c54442c39ec876b6c2d912fe8a729099e600017ae53c877dda3368fe38ed7a66024fe26df6b5892a" \ + && export PUBKEY_HASHSUM="ad369a2e9d8b8c30f5a9f2eb131121739b79c78e03fef0f016ea51871a5f78cd4e6257b270dca0ac3be3d1f19d885516" \ && wget -nv https://files.viva64.com/etc/pubkey.txt -O /tmp/pubkey.txt \ && echo "${PUBKEY_HASHSUM} /tmp/pubkey.txt" | sha384sum -c \ && apt-key add /tmp/pubkey.txt \ From 4df6fa1ae7739f9229a149a239c1a3c681d78b69 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sat, 4 Sep 2021 16:09:12 +0300 Subject: [PATCH 096/177] Remove strange changes --- docker/packager/binary/build.sh | 23 +++++++---------------- docker/packager/packager | 3 --- 2 files changed, 7 insertions(+), 19 deletions(-) diff --git a/docker/packager/binary/build.sh b/docker/packager/binary/build.sh index 26f83649762..71402a2fd66 100755 --- a/docker/packager/binary/build.sh +++ b/docker/packager/binary/build.sh @@ -2,24 +2,15 @@ set -x -e -if [ "1" == "${IS_CROSS_DARWIN:0}" ] -then - mkdir -p build/cmake/toolchain/darwin-x86_64 - tar xJf MacOSX11.0.sdk.tar.xz -C build/cmake/toolchain/darwin-x86_64 --strip-components=1 - ln -sf darwin-x86_64 build/cmake/toolchain/darwin-aarch64 -fi +mkdir -p build/cmake/toolchain/darwin-x86_64 +tar xJf MacOSX11.0.sdk.tar.xz -C build/cmake/toolchain/darwin-x86_64 --strip-components=1 +ln -sf darwin-x86_64 build/cmake/toolchain/darwin-aarch64 -if [ "1" == "${IS_CROSS_ARM:0}" ] -then - mkdir -p build/cmake/toolchain/linux-aarch64 - tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build/cmake/toolchain/linux-aarch64 --strip-components=1 -fi +mkdir -p build/cmake/toolchain/linux-aarch64 +tar xJf gcc-arm-8.3-2019.03-x86_64-aarch64-linux-gnu.tar.xz -C build/cmake/toolchain/linux-aarch64 --strip-components=1 -if [ "1" == "${IS_CROSS_ARM:0}" ] -then - mkdir -p build/cmake/toolchain/freebsd-x86_64 - tar xJf freebsd-11.3-toolchain.tar.xz -C build/cmake/toolchain/freebsd-x86_64 --strip-components=1 -fi +mkdir -p build/cmake/toolchain/freebsd-x86_64 +tar xJf freebsd-11.3-toolchain.tar.xz -C build/cmake/toolchain/freebsd-x86_64 --strip-components=1 # Uncomment to debug ccache. Don't put ccache log in /output right away, or it # will be confusingly packed into the "performance" package. 
diff --git a/docker/packager/packager b/docker/packager/packager index 1f472ed98e1..f37d64e9949 100755 --- a/docker/packager/packager +++ b/docker/packager/packager @@ -78,7 +78,6 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ cmake_flags.append("-DLINKER_NAME=ld.lld") if is_cross_darwin: - result.append("IS_CROSS_DARWIN=1") cc = compiler[:-len(DARWIN_SUFFIX)] cmake_flags.append("-DCMAKE_AR:FILEPATH=/cctools/bin/x86_64-apple-darwin-ar") cmake_flags.append("-DCMAKE_INSTALL_NAME_TOOL=/cctools/bin/x86_64-apple-darwin-install_name_tool") @@ -93,11 +92,9 @@ def parse_env_variables(build_type, compiler, sanitizer, package_type, image_typ cmake_flags.append("-DLINKER_NAME=/cctools/bin/aarch64-apple-darwin-ld") cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/darwin/toolchain-aarch64.cmake") elif is_cross_arm: - result.append("IS_CROSS_ARM=1") cc = compiler[:-len(ARM_SUFFIX)] cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/linux/toolchain-aarch64.cmake") elif is_cross_freebsd: - result.append("IS_CROSS_FREEBSD=1") cc = compiler[:-len(FREEBSD_SUFFIX)] cmake_flags.append("-DCMAKE_TOOLCHAIN_FILE=/build/cmake/freebsd/toolchain-x86_64.cmake") else: From 25a879044be772103ba97658817f800f9c2acf8d Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Sat, 4 Sep 2021 17:23:05 +0000 Subject: [PATCH 097/177] Allow allocations to allocate a thread --- src/Common/ThreadPool.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Common/ThreadPool.cpp b/src/Common/ThreadPool.cpp index a50a5dce20a..641b51e0d60 100644 --- a/src/Common/ThreadPool.cpp +++ b/src/Common/ThreadPool.cpp @@ -75,7 +75,7 @@ void ThreadPoolImpl::setQueueSize(size_t value) std::lock_guard lock(mutex); queue_size = value; /// Reserve memory to get rid of allocations - jobs.reserve(queue_size); + jobs.reserve(2 * queue_size); } @@ -123,6 +123,7 @@ ReturnType ThreadPoolImpl::scheduleImpl(Job job, int priority, std::opti /// Check if there are enough threads to process job. 
if (threads.size() < std::min(max_threads, scheduled_jobs + 1)) { + ALLOW_ALLOCATIONS_IN_SCOPE; try { threads.emplace_front(); From 685d061c4ed91332f9b0f8f398b302d823e2add6 Mon Sep 17 00:00:00 2001 From: Maksim Kita Date: Sat, 4 Sep 2021 22:13:20 +0300 Subject: [PATCH 098/177] Fixes tests --- src/Functions/FunctionsExternalDictionaries.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/FunctionsExternalDictionaries.h b/src/Functions/FunctionsExternalDictionaries.h index 73810a306d6..cccf42e470c 100644 --- a/src/Functions/FunctionsExternalDictionaries.h +++ b/src/Functions/FunctionsExternalDictionaries.h @@ -587,14 +587,14 @@ private: { const auto * tuple_column = tuple_col.getColumnPtr(i).get(); - const auto * attribute_name_column = checkAndGetColumnConst(tuple_column); + const auto * attribute_name_column = checkAndGetColumn(tuple_column); if (!attribute_name_column) throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Tuple second argument of function {} must contain multiple constant string columns", getName()); - attribute_names.emplace_back(attribute_name_column->getValue()); + attribute_names.emplace_back(attribute_name_column->getDataAt(0)); } } else From 28517e57fc04427b007bbeddc17741afd39ea90a Mon Sep 17 00:00:00 2001 From: kssenii Date: Sat, 4 Sep 2021 23:55:59 +0300 Subject: [PATCH 099/177] Fix test --- .../test_postgresql_replica_database_engine/test.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/tests/integration/test_postgresql_replica_database_engine/test.py b/tests/integration/test_postgresql_replica_database_engine/test.py index 1e7188458a9..1dd096087ff 100644 --- a/tests/integration/test_postgresql_replica_database_engine/test.py +++ b/tests/integration/test_postgresql_replica_database_engine/test.py @@ -55,6 +55,9 @@ def create_replication_slot(conn, slot_name='user_slot'): print(result[0][2]) # snapshot return result[0][2] +def drop_replication_slot(conn, slot_name='user_slot'): + cursor = conn.cursor() + cursor.execute("select pg_drop_replication_slot('{}')".format(slot_name)) def create_postgres_db(cursor, name='postgres_database'): cursor.execute("CREATE DATABASE {}".format(name)) @@ -978,6 +981,7 @@ def test_user_managed_slots(started_cluster): check_tables_are_synchronized(table_name); drop_postgres_table(cursor, table_name) drop_materialized_db() + drop_replication_slot(replication_connection, slot_name) if __name__ == '__main__': From 3c166df8ea1e0c44483262d588eaec2be81735d4 Mon Sep 17 00:00:00 2001 From: feng lv Date: Sun, 5 Sep 2021 07:25:52 +0000 Subject: [PATCH 100/177] fix minor typo --- src/Storages/StorageMemory.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 6823f661984..6ec3c6c8ac6 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -61,7 +61,7 @@ protected: const Block & src = (*data)[current_index]; Columns columns; - columns.reserve(columns.size()); + columns.reserve(column_names_and_types.size()); /// Add only required columns to `res`. 
for (const auto & elem : column_names_and_types) From b40b4b0b4ddd16e7792a88acd0b7cf6a2c4e2d11 Mon Sep 17 00:00:00 2001 From: kirillikoff Date: Sun, 5 Sep 2021 14:37:24 +0300 Subject: [PATCH 101/177] DOCSUP-13418: Document the regexp_max_matches_per_row setting (#28398) * Document the regexp_max_matches_per_row setting * Update docs/en/operations/settings/settings.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> * Document the regexp_max_matches_per_row setting ru version * Update docs/ru/operations/settings/settings.md Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> Co-authored-by: Tatiana Kirillova Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/operations/settings/settings.md | 10 ++++++++++ docs/ru/operations/settings/settings.md | 12 +++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index a1c7d1aab32..5d162f6a426 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -3457,3 +3457,13 @@ Possible values: - 1 — Projection optimization is obligatory. Default value: `0`. + +## regexp_max_matches_per_row {#regexp-max-matches-per-row} + +Sets the maximum number of matches for a single regular expression per row. Use it to protect against memory overload when using greedy regular expression in the [extractAllGroupsHorizontal](../../sql-reference/functions/string-search-functions.md#extractallgroups-horizontal) function. + +Possible values: + +- Positive integer. + +Default value: `1000`. diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index aac9c30658c..56b04e81a94 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -3273,4 +3273,14 @@ SETTINGS index_granularity = 8192 │ - 0 — Проекции используются опционально. - 1 — Проекции обязательно используются. -Значение по умолчанию: `0`. \ No newline at end of file +Значение по умолчанию: `0`. + +## regexp_max_matches_per_row {#regexp-max-matches-per-row} + +Задает максимальное количество совпадений для регулярного выражения. Настройка применяется для защиты памяти от перегрузки при использовании "жадных" квантификаторов в регулярном выражении для функции [extractAllGroupsHorizontal](../../sql-reference/functions/string-search-functions.md#extractallgroups-horizontal). + +Возможные значения: + +- Положительное целое число. + +Значение по умолчанию: `1000`. \ No newline at end of file From 40d4d64a65e10739b2975e8055490a2a03d0e688 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sun, 5 Sep 2021 15:50:25 +0300 Subject: [PATCH 102/177] Fix PVS Image --- docker/test/pvs/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/pvs/Dockerfile b/docker/test/pvs/Dockerfile index 35e07748845..438f0bd07ec 100644 --- a/docker/test/pvs/Dockerfile +++ b/docker/test/pvs/Dockerfile @@ -38,7 +38,7 @@ RUN set -x \ && dpkg -i "${PKG_VERSION}.deb" CMD echo "Running PVS version $PKG_VERSION" && cd /repo_folder && pvs-studio-analyzer credentials $LICENCE_NAME $LICENCE_KEY -o ./licence.lic \ - && cmake . -D"ENABLE_EMBEDDED_COMPILER"=OFF -D"USE_INTERNAL_PROTOBUF_LIBRARY"=OFF -D"USE_INTERNAL_GRPC_LIBRARY"=OFF \ + && cmake . 
-D"ENABLE_EMBEDDED_COMPILER"=OFF -D"USE_INTERNAL_PROTOBUF_LIBRARY"=OFF -D"USE_INTERNAL_GRPC_LIBRARY"=OFF -DCMAKE_C_COMPILER=clang-12 -DCMAKE_CXX_COMPILER=clang\+\+-12 \ && ninja re2_st clickhouse_grpc_protos \ && pvs-studio-analyzer analyze -o pvs-studio.log -e contrib -j 4 -l ./licence.lic; \ cp /repo_folder/pvs-studio.log /test_output; \ From 04f0efaaf4177e496b9d7146d1b0450b627f415d Mon Sep 17 00:00:00 2001 From: George Date: Mon, 6 Sep 2021 02:06:06 +0300 Subject: [PATCH 103/177] first draft --- docs/en/operations/settings/settings.md | 33 +++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 5d162f6a426..221bdf0e720 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -260,6 +260,39 @@ If an error occurred while reading rows but the error counter is still less than If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` are exceeded, ClickHouse throws an exception. +## input_format_parquet_import_nested {#input_format_parquet_import_nested} + +Enables or disables the ability to insert [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) structures into [Parquet](../../interfaces/formats.md#data-format-parquet) input format as an array of structs. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: `0`. + +## input_format_arrow_import_nested {#input_format_arrow_import_nested} + +Enables or disables the ability to insert [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) structures into [Arrow](../../interfaces/formats.md#data_types-matching-arrow) input format as an array of structs. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: `0`. + +## input_format_orc_import_nested {#input_format_orc_import_nested} + +Enables or disables the ability to insert [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) structures into [ORC](../../interfaces/formats.md#data-format-orc) input format as an array of structs. + +Possible values: + +- 0 — Disabled. +- 1 — Enabled. + +Default value: `0`. + ## input_format_values_interpret_expressions {#settings-input_format_values_interpret_expressions} Enables or disables the full SQL parser if the fast stream parser can’t parse the data. This setting is used only for the [Values](../../interfaces/formats.md#data-format-values) format at the data insertion. For more information about syntax parsing, see the [Syntax](../../sql-reference/syntax.md) section. From 18a7adf0fa17667ef03829d122d88ac23cf93d71 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Sun, 29 Aug 2021 16:49:30 +0800 Subject: [PATCH 104/177] Fix NOT-IN index optimization when not all keys are used. 
--- src/Interpreters/Set.cpp | 8 ++++---- src/Interpreters/Set.h | 4 +++- .../01891_not_in_partition_prune.reference | 2 ++ .../0_stateless/01891_not_in_partition_prune.sql | 15 +++++++++++++++ 4 files changed, 24 insertions(+), 5 deletions(-) diff --git a/src/Interpreters/Set.cpp b/src/Interpreters/Set.cpp index 5ab59ba3f07..5304859aeea 100644 --- a/src/Interpreters/Set.cpp +++ b/src/Interpreters/Set.cpp @@ -402,8 +402,8 @@ void Set::checkTypesEqual(size_t set_type_idx, const DataTypePtr & other_type) c + data_types[set_type_idx]->getName() + " on the right", ErrorCodes::TYPE_MISMATCH); } -MergeTreeSetIndex::MergeTreeSetIndex(const Columns & set_elements, std::vector && index_mapping_) - : indexes_mapping(std::move(index_mapping_)) +MergeTreeSetIndex::MergeTreeSetIndex(const Columns & set_elements, std::vector && indexes_mapping_) + : has_all_keys(set_elements.size() == indexes_mapping_.size()), indexes_mapping(std::move(indexes_mapping_)) { std::sort(indexes_mapping.begin(), indexes_mapping.end(), [](const KeyTuplePositionMapping & l, const KeyTuplePositionMapping & r) @@ -548,11 +548,11 @@ BoolMask MergeTreeSetIndex::checkInRange(const std::vector & key_ranges, break; } } - if (one_element_range) + if (one_element_range && has_all_keys) { /// Here we know that there is one element in range. /// The main difference with the normal case is that we can definitely say that - /// condition in this range always TRUE (can_be_false = 0) xor always FALSE (can_be_true = 0). + /// condition in this range is always TRUE (can_be_false = 0) or always FALSE (can_be_true = 0). /// Check if it's an empty range if (!left_included || !right_included) diff --git a/src/Interpreters/Set.h b/src/Interpreters/Set.h index 727a2c144a1..578913dd0d2 100644 --- a/src/Interpreters/Set.h +++ b/src/Interpreters/Set.h @@ -208,7 +208,7 @@ public: std::vector functions; }; - MergeTreeSetIndex(const Columns & set_elements, std::vector && index_mapping_); + MergeTreeSetIndex(const Columns & set_elements, std::vector && indexes_mapping_); size_t size() const { return ordered_set.at(0)->size(); } @@ -217,6 +217,8 @@ public: BoolMask checkInRange(const std::vector & key_ranges, const DataTypes & data_types) const; private: + // If all arguments in tuple are key columns, we can optimize NOT IN when there is only one element. 
+ bool has_all_keys; Columns ordered_set; std::vector indexes_mapping; diff --git a/tests/queries/0_stateless/01891_not_in_partition_prune.reference b/tests/queries/0_stateless/01891_not_in_partition_prune.reference index 628053cd4f8..9d2517ad760 100644 --- a/tests/queries/0_stateless/01891_not_in_partition_prune.reference +++ b/tests/queries/0_stateless/01891_not_in_partition_prune.reference @@ -4,3 +4,5 @@ 7 107 8 108 9 109 +1970-01-01 1 one +1970-01-01 3 three diff --git a/tests/queries/0_stateless/01891_not_in_partition_prune.sql b/tests/queries/0_stateless/01891_not_in_partition_prune.sql index edbfad93e5d..5bf90fdd65c 100644 --- a/tests/queries/0_stateless/01891_not_in_partition_prune.sql +++ b/tests/queries/0_stateless/01891_not_in_partition_prune.sql @@ -8,3 +8,18 @@ set max_rows_to_read = 5; select * from test1 where i not in (1,2,3,4,5) order by i; drop table test1; + +drop table if exists t1; +drop table if exists t2; + +create table t1 (date Date, a Float64, b String) Engine=MergeTree ORDER BY date; +create table t2 (date Date, a Float64, b String) Engine=MergeTree ORDER BY date; + +insert into t1(a, b) values (1, 'one'), (2, 'two'); +insert into t2(a, b) values (2, 'two'), (3, 'three'); + +select date, a, b from t1 where (date, a, b) NOT IN (select date,a,b from t2); +select date, a, b from t2 where (date, a, b) NOT IN (select date,a,b from t1); + +drop table t1; +drop table t2; From 7b8101f289b33bce483a45bc6f85737225a644b2 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 6 Sep 2021 09:09:35 +0300 Subject: [PATCH 105/177] Minor change --- src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp index b43e7656084..1fc279bff23 100644 --- a/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp +++ b/src/Storages/PostgreSQL/MaterializedPostgreSQLConsumer.cpp @@ -625,9 +625,8 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() tryLogCurrentException(__PRETTY_FUNCTION__); return false; } - catch (const pqxx::broken_connection & e) + catch (const pqxx::broken_connection &) { - LOG_ERROR(log, "Connection error: {}", e.what()); connection->tryUpdateConnection(); return false; } @@ -641,6 +640,7 @@ bool MaterializedPostgreSQLConsumer::readFromReplicationSlot() if (error_message.find("out of relcache_callback_list slots") == std::string::npos) tryLogCurrentException(__PRETTY_FUNCTION__); + connection->tryUpdateConnection(); return false; } catch (const pqxx::conversion_error & e) From 73ef1233efbd301060e33b170c60c01f80e8bac8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Sun, 5 Sep 2021 16:44:14 +0300 Subject: [PATCH 106/177] Fix tidy Fix tidy one more time --- .clang-tidy | 2 ++ src/Compression/CompressionCodecEncrypted.cpp | 2 +- src/IO/WriteBufferFromFile.cpp | 6 ++--- src/IO/WriteBufferFromFile.h | 1 - src/IO/WriteBufferFromFileDescriptor.cpp | 26 +++++++++---------- src/IO/WriteBufferFromFileDescriptor.h | 25 ++++++++++++------ 6 files changed, 35 insertions(+), 27 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index b0971418e0e..ecb8ac6dcbf 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -203,3 +203,5 @@ CheckOptions: value: CamelCase - key: readability-identifier-naming.UsingCase value: CamelCase + - key: modernize-loop-convert.UseCxx20ReverseRanges + value: false diff --git a/src/Compression/CompressionCodecEncrypted.cpp b/src/Compression/CompressionCodecEncrypted.cpp 
index 6b921fb9c0a..ba3f12c32a0 100644 --- a/src/Compression/CompressionCodecEncrypted.cpp +++ b/src/Compression/CompressionCodecEncrypted.cpp @@ -113,7 +113,7 @@ namespace DB std::string CompressionCodecEncrypted::deriveKey(const std::string_view & master_key) { - std::string_view salt(""); // No salt: derive keys in a deterministic manner. + std::string_view salt; // No salt: derive keys in a deterministic manner. std::string_view info("Codec Encrypted('AES-128-GCM-SIV') key generation key"); std::array result; diff --git a/src/IO/WriteBufferFromFile.cpp b/src/IO/WriteBufferFromFile.cpp index 67cd7ba27d6..270882d0774 100644 --- a/src/IO/WriteBufferFromFile.cpp +++ b/src/IO/WriteBufferFromFile.cpp @@ -32,7 +32,7 @@ WriteBufferFromFile::WriteBufferFromFile( mode_t mode, char * existing_memory, size_t alignment) - : WriteBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment), file_name(file_name_) + : WriteBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment, file_name_) { ProfileEvents::increment(ProfileEvents::FileOpen); @@ -65,9 +65,7 @@ WriteBufferFromFile::WriteBufferFromFile( size_t buf_size, char * existing_memory, size_t alignment) - : - WriteBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment), - file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) + : WriteBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment, original_file_name) { fd_ = -1; } diff --git a/src/IO/WriteBufferFromFile.h b/src/IO/WriteBufferFromFile.h index b7d58638113..584a0221f1a 100644 --- a/src/IO/WriteBufferFromFile.h +++ b/src/IO/WriteBufferFromFile.h @@ -25,7 +25,6 @@ namespace DB class WriteBufferFromFile : public WriteBufferFromFileDescriptor { protected: - std::string file_name; CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForWrite}; public: diff --git a/src/IO/WriteBufferFromFileDescriptor.cpp b/src/IO/WriteBufferFromFileDescriptor.cpp index cd265653bb9..f1afca171d2 100644 --- a/src/IO/WriteBufferFromFileDescriptor.cpp +++ b/src/IO/WriteBufferFromFileDescriptor.cpp @@ -61,7 +61,9 @@ void WriteBufferFromFileDescriptor::nextImpl() if ((-1 == res || 0 == res) && errno != EINTR) { ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteFailed); - throwFromErrnoWithPath("Cannot write to file " + getFileName(), getFileName(), + + /// Don't use getFileName() here because this method can be called from destructor + throwFromErrnoWithPath("Cannot write to file " + file_name, file_name, ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); } @@ -74,19 +76,17 @@ void WriteBufferFromFileDescriptor::nextImpl() } -/// Name or some description of file. -std::string WriteBufferFromFileDescriptor::getFileName() const -{ - return "(fd = " + toString(fd) + ")"; -} - - WriteBufferFromFileDescriptor::WriteBufferFromFileDescriptor( int fd_, size_t buf_size, char * existing_memory, - size_t alignment) - : WriteBufferFromFileBase(buf_size, existing_memory, alignment), fd(fd_) {} + size_t alignment, + const std::string & file_name_) + : WriteBufferFromFileBase(buf_size, existing_memory, alignment) + , fd(fd_) + , file_name(file_name_.empty() ? 
"(fd = " + toString(fd) + ")" : file_name_) +{ +} WriteBufferFromFileDescriptor::~WriteBufferFromFileDescriptor() @@ -115,7 +115,7 @@ void WriteBufferFromFileDescriptor::sync() } -off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) +off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) // NOLINT { off_t res = lseek(fd, offset, whence); if (-1 == res) @@ -125,7 +125,7 @@ off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) } -void WriteBufferFromFileDescriptor::truncate(off_t length) +void WriteBufferFromFileDescriptor::truncate(off_t length) // NOLINT { int res = ftruncate(fd, length); if (-1 == res) @@ -133,7 +133,7 @@ void WriteBufferFromFileDescriptor::truncate(off_t length) } -off_t WriteBufferFromFileDescriptor::size() +off_t WriteBufferFromFileDescriptor::size() const { struct stat buf; int res = fstat(fd, &buf); diff --git a/src/IO/WriteBufferFromFileDescriptor.h b/src/IO/WriteBufferFromFileDescriptor.h index 18c0ac64f63..aef332b38b0 100644 --- a/src/IO/WriteBufferFromFileDescriptor.h +++ b/src/IO/WriteBufferFromFileDescriptor.h @@ -13,17 +13,17 @@ class WriteBufferFromFileDescriptor : public WriteBufferFromFileBase protected: int fd; + /// If file has name contains filename, otherwise contains string "(fd=...)" + std::string file_name; + void nextImpl() override; - - /// Name or some description of file. - std::string getFileName() const override; - public: WriteBufferFromFileDescriptor( int fd_ = -1, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, - size_t alignment = 0); + size_t alignment = 0, + const std::string & file_name_ = ""); /** Could be used before initialization if needed 'fd' was not passed to constructor. * It's not possible to change 'fd' during work. @@ -42,10 +42,19 @@ public: void sync() override; - off_t seek(off_t offset, int whence); - void truncate(off_t length); + /// clang-tidy wants these methods to be const, but + /// they are not const semantically + off_t seek(off_t offset, int whence); // NOLINT + void truncate(off_t length); // NOLINT - off_t size(); + /// Name or some description of file. + std::string getFileName() const override + { + return file_name; + } + + + off_t size() const; }; } From 2e5e017d6d9adbab69de388eb5cad88bfcf4310b Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 6 Sep 2021 10:52:20 +0300 Subject: [PATCH 107/177] Revert "Fix tidy" This reverts commit 73ef1233efbd301060e33b170c60c01f80e8bac8. --- .clang-tidy | 2 -- src/Compression/CompressionCodecEncrypted.cpp | 2 +- src/IO/WriteBufferFromFile.cpp | 6 +++-- src/IO/WriteBufferFromFile.h | 1 + src/IO/WriteBufferFromFileDescriptor.cpp | 26 +++++++++---------- src/IO/WriteBufferFromFileDescriptor.h | 25 ++++++------------ 6 files changed, 27 insertions(+), 35 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index ecb8ac6dcbf..b0971418e0e 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -203,5 +203,3 @@ CheckOptions: value: CamelCase - key: readability-identifier-naming.UsingCase value: CamelCase - - key: modernize-loop-convert.UseCxx20ReverseRanges - value: false diff --git a/src/Compression/CompressionCodecEncrypted.cpp b/src/Compression/CompressionCodecEncrypted.cpp index ba3f12c32a0..6b921fb9c0a 100644 --- a/src/Compression/CompressionCodecEncrypted.cpp +++ b/src/Compression/CompressionCodecEncrypted.cpp @@ -113,7 +113,7 @@ namespace DB std::string CompressionCodecEncrypted::deriveKey(const std::string_view & master_key) { - std::string_view salt; // No salt: derive keys in a deterministic manner. 
+ std::string_view salt(""); // No salt: derive keys in a deterministic manner. std::string_view info("Codec Encrypted('AES-128-GCM-SIV') key generation key"); std::array result; diff --git a/src/IO/WriteBufferFromFile.cpp b/src/IO/WriteBufferFromFile.cpp index 270882d0774..67cd7ba27d6 100644 --- a/src/IO/WriteBufferFromFile.cpp +++ b/src/IO/WriteBufferFromFile.cpp @@ -32,7 +32,7 @@ WriteBufferFromFile::WriteBufferFromFile( mode_t mode, char * existing_memory, size_t alignment) - : WriteBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment, file_name_) + : WriteBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment), file_name(file_name_) { ProfileEvents::increment(ProfileEvents::FileOpen); @@ -65,7 +65,9 @@ WriteBufferFromFile::WriteBufferFromFile( size_t buf_size, char * existing_memory, size_t alignment) - : WriteBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment, original_file_name) + : + WriteBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment), + file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) { fd_ = -1; } diff --git a/src/IO/WriteBufferFromFile.h b/src/IO/WriteBufferFromFile.h index 584a0221f1a..b7d58638113 100644 --- a/src/IO/WriteBufferFromFile.h +++ b/src/IO/WriteBufferFromFile.h @@ -25,6 +25,7 @@ namespace DB class WriteBufferFromFile : public WriteBufferFromFileDescriptor { protected: + std::string file_name; CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForWrite}; public: diff --git a/src/IO/WriteBufferFromFileDescriptor.cpp b/src/IO/WriteBufferFromFileDescriptor.cpp index f1afca171d2..cd265653bb9 100644 --- a/src/IO/WriteBufferFromFileDescriptor.cpp +++ b/src/IO/WriteBufferFromFileDescriptor.cpp @@ -61,9 +61,7 @@ void WriteBufferFromFileDescriptor::nextImpl() if ((-1 == res || 0 == res) && errno != EINTR) { ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteFailed); - - /// Don't use getFileName() here because this method can be called from destructor - throwFromErrnoWithPath("Cannot write to file " + file_name, file_name, + throwFromErrnoWithPath("Cannot write to file " + getFileName(), getFileName(), ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); } @@ -76,17 +74,19 @@ void WriteBufferFromFileDescriptor::nextImpl() } +/// Name or some description of file. +std::string WriteBufferFromFileDescriptor::getFileName() const +{ + return "(fd = " + toString(fd) + ")"; +} + + WriteBufferFromFileDescriptor::WriteBufferFromFileDescriptor( int fd_, size_t buf_size, char * existing_memory, - size_t alignment, - const std::string & file_name_) - : WriteBufferFromFileBase(buf_size, existing_memory, alignment) - , fd(fd_) - , file_name(file_name_.empty() ? 
"(fd = " + toString(fd) + ")" : file_name_) -{ -} + size_t alignment) + : WriteBufferFromFileBase(buf_size, existing_memory, alignment), fd(fd_) {} WriteBufferFromFileDescriptor::~WriteBufferFromFileDescriptor() @@ -115,7 +115,7 @@ void WriteBufferFromFileDescriptor::sync() } -off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) // NOLINT +off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) { off_t res = lseek(fd, offset, whence); if (-1 == res) @@ -125,7 +125,7 @@ off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) // NOLINT } -void WriteBufferFromFileDescriptor::truncate(off_t length) // NOLINT +void WriteBufferFromFileDescriptor::truncate(off_t length) { int res = ftruncate(fd, length); if (-1 == res) @@ -133,7 +133,7 @@ void WriteBufferFromFileDescriptor::truncate(off_t length) // NOLINT } -off_t WriteBufferFromFileDescriptor::size() const +off_t WriteBufferFromFileDescriptor::size() { struct stat buf; int res = fstat(fd, &buf); diff --git a/src/IO/WriteBufferFromFileDescriptor.h b/src/IO/WriteBufferFromFileDescriptor.h index aef332b38b0..18c0ac64f63 100644 --- a/src/IO/WriteBufferFromFileDescriptor.h +++ b/src/IO/WriteBufferFromFileDescriptor.h @@ -13,17 +13,17 @@ class WriteBufferFromFileDescriptor : public WriteBufferFromFileBase protected: int fd; - /// If file has name contains filename, otherwise contains string "(fd=...)" - std::string file_name; - void nextImpl() override; + + /// Name or some description of file. + std::string getFileName() const override; + public: WriteBufferFromFileDescriptor( int fd_ = -1, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, - size_t alignment = 0, - const std::string & file_name_ = ""); + size_t alignment = 0); /** Could be used before initialization if needed 'fd' was not passed to constructor. * It's not possible to change 'fd' during work. @@ -42,19 +42,10 @@ public: void sync() override; - /// clang-tidy wants these methods to be const, but - /// they are not const semantically - off_t seek(off_t offset, int whence); // NOLINT - void truncate(off_t length); // NOLINT + off_t seek(off_t offset, int whence); + void truncate(off_t length); - /// Name or some description of file. 
- std::string getFileName() const override - { - return file_name; - } - - - off_t size() const; + off_t size(); }; } From 181bf0dbd65ca82ae5553b474be9b6a7cc710118 Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov Date: Mon, 6 Sep 2021 11:11:45 +0300 Subject: [PATCH 108/177] Fix arcadia build --- src/IO/ya.make | 3 ++- src/IO/ya.make.in | 3 ++- src/Storages/ya.make | 1 + 3 files changed, 5 insertions(+), 2 deletions(-) diff --git a/src/IO/ya.make b/src/IO/ya.make index 4723d8a0a17..cc24a690308 100644 --- a/src/IO/ya.make +++ b/src/IO/ya.make @@ -5,6 +5,7 @@ LIBRARY() ADDINCL( contrib/libs/zstd/include + contrib/libs/lz4 contrib/restricted/fast_float/include ) @@ -14,10 +15,10 @@ PEERDIR( contrib/libs/brotli/enc contrib/libs/poco/NetSSL_OpenSSL contrib/libs/zstd + contrib/libs/lz4 contrib/restricted/fast_float ) - SRCS( AIO.cpp AsynchronousReadBufferFromFile.cpp diff --git a/src/IO/ya.make.in b/src/IO/ya.make.in index 3c5460a5378..21e64baa10a 100644 --- a/src/IO/ya.make.in +++ b/src/IO/ya.make.in @@ -4,6 +4,7 @@ LIBRARY() ADDINCL( contrib/libs/zstd/include + contrib/libs/lz4 contrib/restricted/fast_float/include ) @@ -13,10 +14,10 @@ PEERDIR( contrib/libs/brotli/enc contrib/libs/poco/NetSSL_OpenSSL contrib/libs/zstd + contrib/libs/lz4 contrib/restricted/fast_float ) - SRCS( ) diff --git a/src/Storages/ya.make b/src/Storages/ya.make index 0daea968af5..5b246cf5aca 100644 --- a/src/Storages/ya.make +++ b/src/Storages/ya.make @@ -18,6 +18,7 @@ SRCS( Distributed/DirectoryMonitor.cpp Distributed/DistributedSettings.cpp Distributed/DistributedSink.cpp + ExecutablePoolSettings.cpp IStorage.cpp IndicesDescription.cpp JoinSettings.cpp From d1e91a786056901104eef98b530b54a37b62416d Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 6 Sep 2021 12:16:52 +0300 Subject: [PATCH 109/177] Revert "Revert "Fix tidy"" This reverts commit 2e5e017d6d9adbab69de388eb5cad88bfcf4310b. --- .clang-tidy | 2 ++ src/Compression/CompressionCodecEncrypted.cpp | 2 +- src/IO/WriteBufferFromFile.cpp | 6 ++--- src/IO/WriteBufferFromFile.h | 1 - src/IO/WriteBufferFromFileDescriptor.cpp | 26 +++++++++---------- src/IO/WriteBufferFromFileDescriptor.h | 25 ++++++++++++------ 6 files changed, 35 insertions(+), 27 deletions(-) diff --git a/.clang-tidy b/.clang-tidy index b0971418e0e..ecb8ac6dcbf 100644 --- a/.clang-tidy +++ b/.clang-tidy @@ -203,3 +203,5 @@ CheckOptions: value: CamelCase - key: readability-identifier-naming.UsingCase value: CamelCase + - key: modernize-loop-convert.UseCxx20ReverseRanges + value: false diff --git a/src/Compression/CompressionCodecEncrypted.cpp b/src/Compression/CompressionCodecEncrypted.cpp index 6b921fb9c0a..ba3f12c32a0 100644 --- a/src/Compression/CompressionCodecEncrypted.cpp +++ b/src/Compression/CompressionCodecEncrypted.cpp @@ -113,7 +113,7 @@ namespace DB std::string CompressionCodecEncrypted::deriveKey(const std::string_view & master_key) { - std::string_view salt(""); // No salt: derive keys in a deterministic manner. + std::string_view salt; // No salt: derive keys in a deterministic manner. 
std::string_view info("Codec Encrypted('AES-128-GCM-SIV') key generation key"); std::array result; diff --git a/src/IO/WriteBufferFromFile.cpp b/src/IO/WriteBufferFromFile.cpp index 67cd7ba27d6..270882d0774 100644 --- a/src/IO/WriteBufferFromFile.cpp +++ b/src/IO/WriteBufferFromFile.cpp @@ -32,7 +32,7 @@ WriteBufferFromFile::WriteBufferFromFile( mode_t mode, char * existing_memory, size_t alignment) - : WriteBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment), file_name(file_name_) + : WriteBufferFromFileDescriptor(-1, buf_size, existing_memory, alignment, file_name_) { ProfileEvents::increment(ProfileEvents::FileOpen); @@ -65,9 +65,7 @@ WriteBufferFromFile::WriteBufferFromFile( size_t buf_size, char * existing_memory, size_t alignment) - : - WriteBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment), - file_name(original_file_name.empty() ? "(fd = " + toString(fd_) + ")" : original_file_name) + : WriteBufferFromFileDescriptor(fd_, buf_size, existing_memory, alignment, original_file_name) { fd_ = -1; } diff --git a/src/IO/WriteBufferFromFile.h b/src/IO/WriteBufferFromFile.h index b7d58638113..584a0221f1a 100644 --- a/src/IO/WriteBufferFromFile.h +++ b/src/IO/WriteBufferFromFile.h @@ -25,7 +25,6 @@ namespace DB class WriteBufferFromFile : public WriteBufferFromFileDescriptor { protected: - std::string file_name; CurrentMetrics::Increment metric_increment{CurrentMetrics::OpenFileForWrite}; public: diff --git a/src/IO/WriteBufferFromFileDescriptor.cpp b/src/IO/WriteBufferFromFileDescriptor.cpp index cd265653bb9..f1afca171d2 100644 --- a/src/IO/WriteBufferFromFileDescriptor.cpp +++ b/src/IO/WriteBufferFromFileDescriptor.cpp @@ -61,7 +61,9 @@ void WriteBufferFromFileDescriptor::nextImpl() if ((-1 == res || 0 == res) && errno != EINTR) { ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteFailed); - throwFromErrnoWithPath("Cannot write to file " + getFileName(), getFileName(), + + /// Don't use getFileName() here because this method can be called from destructor + throwFromErrnoWithPath("Cannot write to file " + file_name, file_name, ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); } @@ -74,19 +76,17 @@ void WriteBufferFromFileDescriptor::nextImpl() } -/// Name or some description of file. -std::string WriteBufferFromFileDescriptor::getFileName() const -{ - return "(fd = " + toString(fd) + ")"; -} - - WriteBufferFromFileDescriptor::WriteBufferFromFileDescriptor( int fd_, size_t buf_size, char * existing_memory, - size_t alignment) - : WriteBufferFromFileBase(buf_size, existing_memory, alignment), fd(fd_) {} + size_t alignment, + const std::string & file_name_) + : WriteBufferFromFileBase(buf_size, existing_memory, alignment) + , fd(fd_) + , file_name(file_name_.empty() ? 
"(fd = " + toString(fd) + ")" : file_name_) +{ +} WriteBufferFromFileDescriptor::~WriteBufferFromFileDescriptor() @@ -115,7 +115,7 @@ void WriteBufferFromFileDescriptor::sync() } -off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) +off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) // NOLINT { off_t res = lseek(fd, offset, whence); if (-1 == res) @@ -125,7 +125,7 @@ off_t WriteBufferFromFileDescriptor::seek(off_t offset, int whence) } -void WriteBufferFromFileDescriptor::truncate(off_t length) +void WriteBufferFromFileDescriptor::truncate(off_t length) // NOLINT { int res = ftruncate(fd, length); if (-1 == res) @@ -133,7 +133,7 @@ void WriteBufferFromFileDescriptor::truncate(off_t length) } -off_t WriteBufferFromFileDescriptor::size() +off_t WriteBufferFromFileDescriptor::size() const { struct stat buf; int res = fstat(fd, &buf); diff --git a/src/IO/WriteBufferFromFileDescriptor.h b/src/IO/WriteBufferFromFileDescriptor.h index 18c0ac64f63..aef332b38b0 100644 --- a/src/IO/WriteBufferFromFileDescriptor.h +++ b/src/IO/WriteBufferFromFileDescriptor.h @@ -13,17 +13,17 @@ class WriteBufferFromFileDescriptor : public WriteBufferFromFileBase protected: int fd; + /// If file has name contains filename, otherwise contains string "(fd=...)" + std::string file_name; + void nextImpl() override; - - /// Name or some description of file. - std::string getFileName() const override; - public: WriteBufferFromFileDescriptor( int fd_ = -1, size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, - size_t alignment = 0); + size_t alignment = 0, + const std::string & file_name_ = ""); /** Could be used before initialization if needed 'fd' was not passed to constructor. * It's not possible to change 'fd' during work. @@ -42,10 +42,19 @@ public: void sync() override; - off_t seek(off_t offset, int whence); - void truncate(off_t length); + /// clang-tidy wants these methods to be const, but + /// they are not const semantically + off_t seek(off_t offset, int whence); // NOLINT + void truncate(off_t length); // NOLINT - off_t size(); + /// Name or some description of file. + std::string getFileName() const override + { + return file_name; + } + + + off_t size() const; }; } From 5c75b93fe8b4842fccbe0570514b00680066ac4a Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 6 Sep 2021 12:17:31 +0300 Subject: [PATCH 110/177] Revert one warning --- src/Compression/CompressionCodecEncrypted.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Compression/CompressionCodecEncrypted.cpp b/src/Compression/CompressionCodecEncrypted.cpp index ba3f12c32a0..47f93eb6202 100644 --- a/src/Compression/CompressionCodecEncrypted.cpp +++ b/src/Compression/CompressionCodecEncrypted.cpp @@ -113,7 +113,8 @@ namespace DB std::string CompressionCodecEncrypted::deriveKey(const std::string_view & master_key) { - std::string_view salt; // No salt: derive keys in a deterministic manner. + /// No salt: derive keys in a deterministic manner. 
+ std::string_view salt(""); // NOLINT std::string_view info("Codec Encrypted('AES-128-GCM-SIV') key generation key"); std::array result; From 209b748fcae2f429be40d4582ceff21b7e6a85e4 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 6 Sep 2021 13:11:19 +0300 Subject: [PATCH 111/177] Add missed level --- src/Interpreters/TextLog.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/Interpreters/TextLog.cpp b/src/Interpreters/TextLog.cpp index baf98b6771d..51ffbdd66ee 100644 --- a/src/Interpreters/TextLog.cpp +++ b/src/Interpreters/TextLog.cpp @@ -26,7 +26,8 @@ NamesAndTypesList TextLogElement::getNamesAndTypes() {"Notice", static_cast(Message::PRIO_NOTICE)}, {"Information", static_cast(Message::PRIO_INFORMATION)}, {"Debug", static_cast(Message::PRIO_DEBUG)}, - {"Trace", static_cast(Message::PRIO_TRACE)} + {"Trace", static_cast(Message::PRIO_TRACE)}, + {"Test", static_cast(Message::PRIO_TEST)}, }); return From fcfe77cb5741de5edccd6c713582893f7d2e24f8 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 6 Sep 2021 13:35:37 +0300 Subject: [PATCH 112/177] Fix allocation in buffer --- src/IO/WriteBufferFromFileDescriptor.cpp | 22 ++++++++++++++++++---- src/IO/WriteBufferFromFileDescriptor.h | 8 ++------ 2 files changed, 20 insertions(+), 10 deletions(-) diff --git a/src/IO/WriteBufferFromFileDescriptor.cpp b/src/IO/WriteBufferFromFileDescriptor.cpp index f1afca171d2..96579626dfc 100644 --- a/src/IO/WriteBufferFromFileDescriptor.cpp +++ b/src/IO/WriteBufferFromFileDescriptor.cpp @@ -63,7 +63,10 @@ void WriteBufferFromFileDescriptor::nextImpl() ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteFailed); /// Don't use getFileName() here because this method can be called from destructor - throwFromErrnoWithPath("Cannot write to file " + file_name, file_name, + String error_file_name = file_name; + if (error_file_name.empty()) + error_file_name = "(fd = " + toString(fd) + ")"; + throwFromErrnoWithPath("Cannot write to file " + error_file_name, error_file_name, ErrorCodes::CANNOT_WRITE_TO_FILE_DESCRIPTOR); } @@ -75,16 +78,18 @@ void WriteBufferFromFileDescriptor::nextImpl() ProfileEvents::increment(ProfileEvents::WriteBufferFromFileDescriptorWriteBytes, bytes_written); } - +/// NOTE: This class can be used as a very low-level building block, for example +/// in trace collector. In such places allocations of memory can be dangerous, +/// so don't allocate anything in this consturctor. WriteBufferFromFileDescriptor::WriteBufferFromFileDescriptor( int fd_, size_t buf_size, char * existing_memory, size_t alignment, - const std::string & file_name_) + std::string file_name_) : WriteBufferFromFileBase(buf_size, existing_memory, alignment) , fd(fd_) - , file_name(file_name_.empty() ? 
"(fd = " + toString(fd) + ")" : file_name_) + , file_name(std::move(file_name_)) { } @@ -142,4 +147,13 @@ off_t WriteBufferFromFileDescriptor::size() const return buf.st_size; } +std::string WriteBufferFromFileDescriptor::getFileName() const +{ + if (file_name.empty()) + return "(fd = " + toString(fd) + ")"; + + return file_name; +} + + } diff --git a/src/IO/WriteBufferFromFileDescriptor.h b/src/IO/WriteBufferFromFileDescriptor.h index aef332b38b0..cad45067548 100644 --- a/src/IO/WriteBufferFromFileDescriptor.h +++ b/src/IO/WriteBufferFromFileDescriptor.h @@ -23,7 +23,7 @@ public: size_t buf_size = DBMS_DEFAULT_BUFFER_SIZE, char * existing_memory = nullptr, size_t alignment = 0, - const std::string & file_name_ = ""); + std::string file_name_ = ""); /** Could be used before initialization if needed 'fd' was not passed to constructor. * It's not possible to change 'fd' during work. @@ -48,11 +48,7 @@ public: void truncate(off_t length); // NOLINT /// Name or some description of file. - std::string getFileName() const override - { - return file_name; - } - + std::string getFileName() const override; off_t size() const; }; From 9db10a7164e0e22a3749d360244da995a38bec44 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 6 Sep 2021 14:34:17 +0300 Subject: [PATCH 113/177] Fix typo: --- src/IO/WriteBufferFromFileDescriptor.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/IO/WriteBufferFromFileDescriptor.cpp b/src/IO/WriteBufferFromFileDescriptor.cpp index 96579626dfc..38aaa945362 100644 --- a/src/IO/WriteBufferFromFileDescriptor.cpp +++ b/src/IO/WriteBufferFromFileDescriptor.cpp @@ -80,7 +80,7 @@ void WriteBufferFromFileDescriptor::nextImpl() /// NOTE: This class can be used as a very low-level building block, for example /// in trace collector. In such places allocations of memory can be dangerous, -/// so don't allocate anything in this consturctor. +/// so don't allocate anything in this constructor. 
WriteBufferFromFileDescriptor::WriteBufferFromFileDescriptor( int fd_, size_t buf_size, From 02490155153f127d3ad10b1f8949fde3aff8903e Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 6 Sep 2021 11:37:51 +0000 Subject: [PATCH 114/177] added priority queue --- contrib/boost | 2 +- contrib/boost-cmake/CMakeLists.txt | 10 +++- src/CMakeLists.txt | 2 +- src/Common/PriorityQueue.h | 53 ------------------- src/Common/ThreadPool.cpp | 12 +++-- src/Common/ThreadPool.h | 4 +- src/Common/tests/gtest_priority_queue.cpp | 25 --------- .../MergeTree/MergeMutateExecutor.cpp | 20 ++++--- 8 files changed, 34 insertions(+), 94 deletions(-) delete mode 100644 src/Common/PriorityQueue.h delete mode 100644 src/Common/tests/gtest_priority_queue.cpp diff --git a/contrib/boost b/contrib/boost index 4b98e2befd3..66d17f060c4 160000 --- a/contrib/boost +++ b/contrib/boost @@ -1 +1 @@ -Subproject commit 4b98e2befd3f3265b0db0acb5d20c4812ef8d88e +Subproject commit 66d17f060c4867aeea99fa2a20cfdae89ae2a2ec diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 9a2234ac9f6..7996d1b66b8 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -15,11 +15,12 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) coroutine graph circular_buffer + heap ) if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND - Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY AND Boost_CIRCULAR_BUFFER_LIBRARY) + Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY AND Boost_CIRCULAR_BUFFER_LIBRARY AND Boost_HEAP_LIBRARY) set(EXTERNAL_BOOST_FOUND 1) @@ -242,9 +243,14 @@ if (NOT EXTERNAL_BOOST_FOUND) target_include_directories (_boost_graph PRIVATE ${LIBRARY_DIR}) target_link_libraries(_boost_graph PRIVATE _boost_regex) - + # circular buffer add_library(_boost_circular_buffer INTERFACE) add_library(boost::circular_buffer ALIAS _boost_circular_buffer) target_include_directories(_boost_circular_buffer SYSTEM BEFORE INTERFACE ${LIBRARY_DIR}) + # heap + add_library(_boost_heap INTERFACE) + add_library(boost::heap ALIAS _boost_heap) + target_include_directories(_boost_heap SYSTEM BEFORE INTERFACE ${LIBRARY_DIR}) + endif () diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 54e73389d3f..48cff3e8e91 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -297,7 +297,6 @@ target_link_libraries(clickhouse_common_io PUBLIC boost::program_options boost::system - boost::circular_buffer ${CITYHASH_LIBRARIES} ${ZLIB_LIBRARIES} pcg_random @@ -337,6 +336,7 @@ dbms_target_link_libraries ( boost::filesystem boost::program_options boost::circular_buffer + boost::heap clickhouse_common_config clickhouse_common_zookeeper clickhouse_dictionaries_embedded diff --git a/src/Common/PriorityQueue.h b/src/Common/PriorityQueue.h deleted file mode 100644 index b845fc0caa8..00000000000 --- a/src/Common/PriorityQueue.h +++ /dev/null @@ -1,53 +0,0 @@ -#pragma once - -#include - -namespace DB -{ - - -template > -class PriorityQueue -{ -public: - - T pop() - { - assert(!buffer.empty()); - std::pop_heap(buffer.begin(), buffer.end(), comparator); - auto element = std::move(buffer.back()); - buffer.pop_back(); - return element; - } - - void push(T element) - { - buffer.push_back(std::move(element)); - std::push_heap(buffer.begin(), buffer.end(), comparator); - } - - template< class... Args > - void emplace(Args &&... 
args) - { - buffer.emplace_back(std::forward(args)...); - std::push_heap(buffer.begin(), buffer.end(), comparator); - } - - bool empty() { return buffer.empty(); } - size_t size() { return buffer.size(); } - void reserve(size_t count) { buffer.reserve(count); } - void resize(size_t count) - { - buffer.resize(count); - std::make_heap(buffer.begin(), buffer.end(), comparator); - } - -private: - - Comparator comparator; - std::vector buffer; - - -}; - -} diff --git a/src/Common/ThreadPool.cpp b/src/Common/ThreadPool.cpp index 641b51e0d60..f14d68da662 100644 --- a/src/Common/ThreadPool.cpp +++ b/src/Common/ThreadPool.cpp @@ -123,7 +123,6 @@ ReturnType ThreadPoolImpl::scheduleImpl(Job job, int priority, std::opti /// Check if there are enough threads to process job. if (threads.size() < std::min(max_threads, scheduled_jobs + 1)) { - ALLOW_ALLOCATIONS_IN_SCOPE; try { threads.emplace_front(); @@ -249,9 +248,16 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ need_shutdown = shutdown; if (!jobs.empty()) - job = std::move(jobs.pop().job); + { + job = std::move(jobs.top().job); + jobs.pop(); + } else - return; /// shutdown is true, simply finish the thread. + { + /// shutdown is true, simply finish the thread. + return; + } + } if (!need_shutdown) diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index 25d86a2fabc..b7e59041839 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -9,6 +9,8 @@ #include #include +#include + #include #include #include @@ -104,7 +106,7 @@ private: } }; - DB::PriorityQueue jobs; + boost::heap::priority_queue jobs; std::list threads; std::exception_ptr first_exception; diff --git a/src/Common/tests/gtest_priority_queue.cpp b/src/Common/tests/gtest_priority_queue.cpp deleted file mode 100644 index c2c943ea4cc..00000000000 --- a/src/Common/tests/gtest_priority_queue.cpp +++ /dev/null @@ -1,25 +0,0 @@ -#include - -#include - -#include - -using namespace DB; - -TEST(PriorityQueue, Simple) -{ - PriorityQueue my; - std::priority_queue original; - - for (int i = 0; i < 1000; ++i) - { - my.push(i); - original.emplace(i); - } - - for (int i = 0; i < 1000; ++i) - { - ASSERT_EQ(my.pop(), original.top()); - original.pop(); - } -} diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeMutateExecutor.cpp index 15ee6f7f2fc..cbed2aa9f69 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeMutateExecutor.cpp @@ -207,15 +207,19 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() ItemPtr item = std::move(pending.front()); pending.pop_front(); - /// Execute a piece of task - bool res = pool.trySchedule([this, item] + bool res = false; { - routine(item); - /// When storage shutdowns it will wait until all related background tasks - /// are finished, because they may want to interact with its fields - /// and this will cause segfault. - item->is_done.set(); - }); + ALLOW_ALLOCATIONS_IN_SCOPE; + /// Execute a piece of task + res = pool.trySchedule([this, item] + { + routine(item); + /// When storage shutdowns it will wait until all related background tasks + /// are finished, because they may want to interact with its fields + /// and this will cause segfault. 
+ item->is_done.set(); + }); + } if (!res) { From ea0fbf81af801858b84f87a9657216b623f053cc Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 6 Sep 2021 12:01:16 +0000 Subject: [PATCH 115/177] Renaming --- src/Common/ThreadPool.h | 1 - src/Interpreters/Context.cpp | 2 +- ...xecutor.cpp => BackgroundJobsAssignee.cpp} | 26 +++++++++---------- ...obsExecutor.h => BackgroundJobsAssignee.h} | 8 +++--- ...or.cpp => MergeTreeBackgroundExecutor.cpp} | 10 +++---- ...ecutor.h => MergeTreeBackgroundExecutor.h} | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 6 ++--- src/Storages/MergeTree/MergeTreeData.h | 10 +++---- .../MergeTree/tests/gtest_executor.cpp | 2 +- src/Storages/StorageMergeTree.cpp | 2 +- src/Storages/StorageMergeTree.h | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 4 +-- src/Storages/StorageReplicatedMergeTree.h | 4 +-- src/Storages/ya.make | 3 ++- 14 files changed, 41 insertions(+), 41 deletions(-) rename src/Storages/MergeTree/{BackgroundJobsExecutor.cpp => BackgroundJobsAssignee.cpp} (79%) rename src/Storages/MergeTree/{BackgroundJobsExecutor.h => BackgroundJobsAssignee.h} (92%) rename src/Storages/MergeTree/{MergeMutateExecutor.cpp => MergeTreeBackgroundExecutor.cpp} (96%) rename src/Storages/MergeTree/{MergeMutateExecutor.h => MergeTreeBackgroundExecutor.h} (97%) diff --git a/src/Common/ThreadPool.h b/src/Common/ThreadPool.h index b7e59041839..f2b6ed10a39 100644 --- a/src/Common/ThreadPool.h +++ b/src/Common/ThreadPool.h @@ -13,7 +13,6 @@ #include #include -#include #include /** Very simple thread pool similar to boost::threadpool. diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index 2c7bf9e6eca..a75f552ab05 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -77,7 +77,7 @@ #include #include #include -#include +#include #include #include #include diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp similarity index 79% rename from src/Storages/MergeTree/BackgroundJobsExecutor.cpp rename to src/Storages/MergeTree/BackgroundJobsAssignee.cpp index a8d2f52926c..598c43f2153 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.cpp +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp @@ -1,4 +1,4 @@ -#include +#include #include #include #include @@ -8,7 +8,7 @@ namespace DB { -BackgroundJobAssignee::BackgroundJobAssignee(MergeTreeData & data_, BackgroundJobAssignee::Type type_, ContextPtr global_context_) +BackgroundJobsAssignee::BackgroundJobsAssignee(MergeTreeData & data_, BackgroundJobsAssignee::Type type_, ContextPtr global_context_) : WithContext(global_context_) , data(data_) , sleep_settings(global_context_->getBackgroundMoveTaskSchedulingSettings()) @@ -17,7 +17,7 @@ BackgroundJobAssignee::BackgroundJobAssignee(MergeTreeData & data_, BackgroundJo { } -void BackgroundJobAssignee::trigger() +void BackgroundJobsAssignee::trigger() { std::lock_guard lock(holder_mutex); @@ -29,7 +29,7 @@ void BackgroundJobAssignee::trigger() holder->schedule(); } -void BackgroundJobAssignee::postpone() +void BackgroundJobsAssignee::postpone() { std::lock_guard lock(holder_mutex); @@ -48,28 +48,28 @@ void BackgroundJobAssignee::postpone() } -void BackgroundJobAssignee::scheduleMergeMutateTask(ExecutableTaskPtr merge_task) +void BackgroundJobsAssignee::scheduleMergeMutateTask(ExecutableTaskPtr merge_task) { bool res = getContext()->getMergeMutateExecutor()->trySchedule(merge_task); res ? 
trigger() : postpone(); } -void BackgroundJobAssignee::scheduleFetchTask(ExecutableTaskPtr fetch_task) +void BackgroundJobsAssignee::scheduleFetchTask(ExecutableTaskPtr fetch_task) { bool res = getContext()->getFetchesExecutor()->trySchedule(fetch_task); res ? trigger() : postpone(); } -void BackgroundJobAssignee::scheduleMoveTask(ExecutableTaskPtr move_task) +void BackgroundJobsAssignee::scheduleMoveTask(ExecutableTaskPtr move_task) { bool res = getContext()->getMovesExecutor()->trySchedule(move_task); res ? trigger() : postpone(); } -String BackgroundJobAssignee::toString(Type type) +String BackgroundJobsAssignee::toString(Type type) { switch (type) { @@ -80,16 +80,16 @@ String BackgroundJobAssignee::toString(Type type) } } -void BackgroundJobAssignee::start() +void BackgroundJobsAssignee::start() { std::lock_guard lock(holder_mutex); if (!holder) - holder = getContext()->getSchedulePool().createTask("BackgroundJobAssignee:" + toString(type), [this]{ main(); }); + holder = getContext()->getSchedulePool().createTask("BackgroundJobsAssignee:" + toString(type), [this]{ main(); }); holder->activateAndSchedule(); } -void BackgroundJobAssignee::finish() +void BackgroundJobsAssignee::finish() { /// No lock here, because scheduled tasks could call trigger method if (holder) @@ -105,7 +105,7 @@ void BackgroundJobAssignee::finish() } -void BackgroundJobAssignee::main() +void BackgroundJobsAssignee::main() try { bool succeed = false; @@ -128,7 +128,7 @@ catch (...) /// Catch any exception to avoid thread termination. postpone(); } -BackgroundJobAssignee::~BackgroundJobAssignee() +BackgroundJobsAssignee::~BackgroundJobsAssignee() { try { diff --git a/src/Storages/MergeTree/BackgroundJobsExecutor.h b/src/Storages/MergeTree/BackgroundJobsAssignee.h similarity index 92% rename from src/Storages/MergeTree/BackgroundJobsExecutor.h rename to src/Storages/MergeTree/BackgroundJobsAssignee.h index 82bd6882b96..6d2778adea2 100644 --- a/src/Storages/MergeTree/BackgroundJobsExecutor.h +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.h @@ -1,6 +1,6 @@ #pragma once -#include +#include #include #include #include @@ -29,7 +29,7 @@ struct ExecutableTaskSchedulingSettings class MergeTreeData; -class BackgroundJobAssignee : protected WithContext +class BackgroundJobsAssignee : protected WithContext { private: MergeTreeData & data; @@ -66,9 +66,9 @@ public: void scheduleMoveTask(ExecutableTaskPtr move_task); /// Just call finish - virtual ~BackgroundJobAssignee(); + virtual ~BackgroundJobsAssignee(); - BackgroundJobAssignee( + BackgroundJobsAssignee( MergeTreeData & data_, Type type, ContextPtr global_context_); diff --git a/src/Storages/MergeTree/MergeMutateExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp similarity index 96% rename from src/Storages/MergeTree/MergeMutateExecutor.cpp rename to src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index cbed2aa9f69..f43bbce2ebd 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -1,7 +1,7 @@ -#include +#include #include -#include +#include namespace DB @@ -57,7 +57,7 @@ void MergeTreeBackgroundExecutor::wait() if (scheduler.joinable()) scheduler.join(); - pool.wait(); + /// ThreadPool will be finalized in destructor. 
} @@ -115,7 +115,7 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id /// Erase storage related tasks from pending and select active tasks to wait for auto it = std::remove_if(pending.begin(), pending.end(), - [&] (auto item) -> bool { return item->task->getStorageID() == id; } ); + [&] (auto item) -> bool { return item->task->getStorageID() == id; }); pending.erase(it, pending.end()); /// Copy items to wait for their completion @@ -179,7 +179,7 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) /// But it is rather safe, because we have try...catch block here, and another one in ThreadPool. item->task->onCompleted(); } - catch(...) + catch (...) { std::lock_guard guard(mutex); erase_from_active(); diff --git a/src/Storages/MergeTree/MergeMutateExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h similarity index 97% rename from src/Storages/MergeTree/MergeMutateExecutor.h rename to src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 67edf282862..bdeeecb7592 100644 --- a/src/Storages/MergeTree/MergeMutateExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -35,7 +35,7 @@ namespace DB * * Due to all caveats I described above we use boost::circular_buffer as a container for queues. * - * Another nuisance that we faces with is than backgroud operations always interacts with an associated Storage. + * Another nuisance that we faces with is than background operations always interact with an associated Storage. * So, when a Storage want to shutdown, it must wait until all its background operaions are finished. */ class MergeTreeBackgroundExecutor : public shared_ptr_helper diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index a57c27c33ef..6592b9a043d 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -200,8 +200,8 @@ MergeTreeData::MergeTreeData( , data_parts_by_info(data_parts_indexes.get()) , data_parts_by_state_and_info(data_parts_indexes.get()) , parts_mover(this) - , background_executor(*this, BackgroundJobAssignee::Type::DataProcessing, getContext()) - , background_moves_executor(*this, BackgroundJobAssignee::Type::Moving, getContext()) + , background_executor(*this, BackgroundJobsAssignee::Type::DataProcessing, getContext()) + , background_moves_executor(*this, BackgroundJobsAssignee::Type::Moving, getContext()) { const auto settings = getSettings(); allow_nullable_key = attach || settings->allow_nullable_key; @@ -5029,7 +5029,7 @@ MergeTreeData::CurrentlyMovingPartsTagger::~CurrentlyMovingPartsTagger() } } -bool MergeTreeData::scheduleDataMovingJob(BackgroundJobAssignee & executor) +bool MergeTreeData::scheduleDataMovingJob(BackgroundJobsAssignee & executor) { if (parts_mover.moves_blocker.isCancelled()) return false; diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index bc0b12c3e46..c3b9eaf1b59 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -3,7 +3,7 @@ #include #include #include -#include +#include #include #include #include @@ -827,9 +827,9 @@ public: PinnedPartUUIDsPtr getPinnedPartUUIDs() const; /// Schedules background job to like merge/mutate/fetch an executor - virtual bool scheduleDataProcessingJob(BackgroundJobAssignee & executor) = 0; + virtual bool scheduleDataProcessingJob(BackgroundJobsAssignee & executor) = 0; /// Schedules job to move parts between disks/volumes and so on. 
- bool scheduleDataMovingJob(BackgroundJobAssignee & executor); + bool scheduleDataMovingJob(BackgroundJobsAssignee & executor); bool areBackgroundMovesNeeded() const; /// Lock part in zookeeper for shared data in several nodes @@ -925,8 +925,8 @@ protected: /// Executors are common for both ReplicatedMergeTree and plain MergeTree /// but they are being started and finished in derived classes, so let them be protected. - BackgroundJobAssignee background_executor; - BackgroundJobAssignee background_moves_executor; + BackgroundJobsAssignee background_executor; + BackgroundJobsAssignee background_moves_executor; /// Every task that is finished will ask to assign a new one into an executor. std::function common_assignee_trigger; diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index ece859fbb0f..1377d9add9a 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -6,7 +6,7 @@ #include #include -#include +#include using namespace DB; diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 92b8be63826..46bc319a288 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -1041,7 +1041,7 @@ bool StorageMergeTree::mutateSelectedPart(const StorageMetadataPtr & metadata_sn return true; } -bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee & executor) //-V657 +bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & executor) //-V657 { if (shutdown_called) return false; diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index c05e4cf44ab..09d362b6eef 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -95,7 +95,7 @@ public: RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context) override; - bool scheduleDataProcessingJob(BackgroundJobAssignee & executor) override; + bool scheduleDataProcessingJob(BackgroundJobsAssignee & executor) override; MergeTreeDeduplicationLog * getDeduplicationLog() { return deduplication_log.get(); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index c49868d6230..942eb0e561c 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -17,7 +17,7 @@ #include #include #include -#include +#include #include #include #include @@ -3173,7 +3173,7 @@ bool StorageReplicatedMergeTree::processQueueEntry(ReplicatedMergeTreeQueue::Sel }); } -bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobAssignee & executor) +bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & executor) { /// If replication queue is stopped exit immediately as we successfully executed the task if (queue.actions_blocker.isCancelled()) diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index cf7fe5169fa..ddb14378f90 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -29,7 +29,7 @@ #include #include #include -#include +#include namespace DB @@ -218,7 +218,7 @@ public: const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger); /// Schedules job to execute in background pool (merge, mutate, drop range and so on) - bool scheduleDataProcessingJob(BackgroundJobAssignee 
& executor) override; + bool scheduleDataProcessingJob(BackgroundJobsAssignee & executor) override; /// Checks that fetches are not disabled with action blocker and pool for fetches /// is not overloaded diff --git a/src/Storages/ya.make b/src/Storages/ya.make index 0daea968af5..1cbb611e8d3 100644 --- a/src/Storages/ya.make +++ b/src/Storages/ya.make @@ -27,7 +27,7 @@ SRCS( MemorySettings.cpp MergeTree/ActiveDataPartSet.cpp MergeTree/AllMergeSelector.cpp - MergeTree/BackgroundJobsExecutor.cpp + MergeTree/BackgroundJobsAssignee.cpp MergeTree/BoolMask.cpp MergeTree/DataPartsExchange.cpp MergeTree/DropPartsRanges.cpp @@ -40,6 +40,7 @@ SRCS( MergeTree/LevelMergeSelector.cpp MergeTree/MergeAlgorithm.cpp MergeTree/MergeList.cpp + MergeTree/MergeTreeBackgroundExecutor.cpp MergeTree/MergeTreeBaseSelectProcessor.cpp MergeTree/MergeTreeBlockReadUtils.cpp MergeTree/MergeTreeData.cpp From 47f46e388da63be9eef8fc8f8da57e70d8e0d690 Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Mon, 6 Sep 2021 16:54:26 +0300 Subject: [PATCH 116/177] fix optimization of disjunctions chain --- .../LogicalExpressionsOptimizer.cpp | 24 +++++++------------ .../02023_transform_or_to_in.reference | 2 ++ .../0_stateless/02023_transform_or_to_in.sql | 15 ++++++++++++ 3 files changed, 26 insertions(+), 15 deletions(-) create mode 100644 tests/queries/0_stateless/02023_transform_or_to_in.reference create mode 100644 tests/queries/0_stateless/02023_transform_or_to_in.sql diff --git a/src/Interpreters/LogicalExpressionsOptimizer.cpp b/src/Interpreters/LogicalExpressionsOptimizer.cpp index c0d5a16fa65..936ed0149d2 100644 --- a/src/Interpreters/LogicalExpressionsOptimizer.cpp +++ b/src/Interpreters/LogicalExpressionsOptimizer.cpp @@ -225,22 +225,19 @@ void LogicalExpressionsOptimizer::addInExpression(const DisjunctiveEqualityChain /// 1. Create a new IN expression based on information from the OR-chain. - /// Construct a list of literals `x1, ..., xN` from the string `expr = x1 OR ... OR expr = xN` - ASTPtr value_list = std::make_shared(); + /// Construct a tuple of literals `x1, ..., xN` from the string `expr = x1 OR ... OR expr = xN` + + Tuple tuple; + tuple.reserve(equality_functions.size()); + for (const auto * function : equality_functions) { const auto & operands = getFunctionOperands(function); - value_list->children.push_back(operands[1]); + tuple.push_back(operands[1]->as()->value); } /// Sort the literals so that they are specified in the same order in the IN expression. - /// Otherwise, they would be specified in the order of the ASTLiteral addresses, which is nondeterministic. - std::sort(value_list->children.begin(), value_list->children.end(), [](const DB::ASTPtr & lhs, const DB::ASTPtr & rhs) - { - const auto * val_lhs = lhs->as(); - const auto * val_rhs = rhs->as(); - return val_lhs->value < val_rhs->value; - }); + std::sort(tuple.begin(), tuple.end()); /// Get the expression `expr` from the chain `expr = x1 OR ... 
OR expr = xN` ASTPtr equals_expr_lhs; @@ -250,14 +247,11 @@ void LogicalExpressionsOptimizer::addInExpression(const DisjunctiveEqualityChain equals_expr_lhs = operands[0]; } - auto tuple_function = std::make_shared(); - tuple_function->name = "tuple"; - tuple_function->arguments = value_list; - tuple_function->children.push_back(tuple_function->arguments); + auto tuple_literal = std::make_shared(std::move(tuple)); ASTPtr expression_list = std::make_shared(); expression_list->children.push_back(equals_expr_lhs); - expression_list->children.push_back(tuple_function); + expression_list->children.push_back(tuple_literal); /// Construct the expression `expr IN (x1, ..., xN)` auto in_function = std::make_shared(); diff --git a/tests/queries/0_stateless/02023_transform_or_to_in.reference b/tests/queries/0_stateless/02023_transform_or_to_in.reference new file mode 100644 index 00000000000..aa47d0d46d4 --- /dev/null +++ b/tests/queries/0_stateless/02023_transform_or_to_in.reference @@ -0,0 +1,2 @@ +0 +0 diff --git a/tests/queries/0_stateless/02023_transform_or_to_in.sql b/tests/queries/0_stateless/02023_transform_or_to_in.sql new file mode 100644 index 00000000000..c4ceeb76931 --- /dev/null +++ b/tests/queries/0_stateless/02023_transform_or_to_in.sql @@ -0,0 +1,15 @@ +DROP TABLE IF EXISTS t_transform_or; + +CREATE TABLE t_transform_or(B AggregateFunction(uniq, String), A String) Engine=MergeTree ORDER BY (A); + +INSERT INTO t_transform_or SELECT uniqState(''), '0'; + +SELECT uniqMergeIf(B, (A = '1') OR (A = '2') OR (A = '3')) +FROM cluster(test_cluster_two_shards, currentDatabase(), t_transform_or) +SETTINGS legacy_column_name_of_tuple_literal = 0; + +SELECT uniqMergeIf(B, (A = '1') OR (A = '2') OR (A = '3')) +FROM cluster(test_cluster_two_shards, currentDatabase(), t_transform_or) +SETTINGS legacy_column_name_of_tuple_literal = 1; + +DROP TABLE t_transform_or; From 17d65b3d7ba1fe5c0f009f7829613bb069b5f937 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 6 Sep 2021 14:04:37 +0000 Subject: [PATCH 117/177] Allow allocations --- src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index f43bbce2ebd..6961a4df501 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -147,7 +147,6 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) auto erase_from_active = [&] { - DENY_ALLOCATIONS_IN_SCOPE; active.erase(std::remove(active.begin(), active.end(), item), active.end()); }; @@ -155,7 +154,6 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) { if (item->task->execute()) { - DENY_ALLOCATIONS_IN_SCOPE; std::lock_guard guard(mutex); if (currently_deleting.contains(item->task->getStorageID())) From 29a18d8d10f4c25090a8784a3637b094803cce92 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 6 Sep 2021 15:58:08 +0000 Subject: [PATCH 118/177] Try to fix unbundled --- contrib/boost-cmake/CMakeLists.txt | 5 ----- contrib/poco | 2 +- docker/packager/unbundled/Dockerfile | 1 + src/CMakeLists.txt | 10 ++++++++-- 4 files changed, 10 insertions(+), 8 deletions(-) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index 7996d1b66b8..b19b39f8b81 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -14,8 +14,6 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) context coroutine graph - 
circular_buffer - heap ) if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND @@ -36,7 +34,6 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (_boost_context INTERFACE) add_library (_boost_coroutine INTERFACE) add_library (_boost_graph INTERFACE) - add_library (_boost_circular_buffer INTERFACE) target_link_libraries (_boost_filesystem INTERFACE ${Boost_FILESYSTEM_LIBRARY}) target_link_libraries (_boost_iostreams INTERFACE ${Boost_IOSTREAMS_LIBRARY}) @@ -46,7 +43,6 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) target_link_libraries (_boost_context INTERFACE ${Boost_CONTEXT_LIBRARY}) target_link_libraries (_boost_coroutine INTERFACE ${Boost_COROUTINE_LIBRARY}) target_link_libraries (_boost_graph INTERFACE ${Boost_GRAPH_LIBRARY}) - target_link_libraries (_boost_circular_buffer INTERFACE ${Boost_CIRCULAR_BUFFER_LIBRARY}) add_library (boost::filesystem ALIAS _boost_filesystem) add_library (boost::iostreams ALIAS _boost_iostreams) @@ -56,7 +52,6 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) add_library (boost::context ALIAS _boost_context) add_library (boost::coroutine ALIAS _boost_coroutine) add_library (boost::graph ALIAS _boost_graph) - add_library (boost::circular_buffer ALIAS _boost_circular_buffer) else() set(EXTERNAL_BOOST_FOUND 0) message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system boost") diff --git a/contrib/poco b/contrib/poco index 46c80daf1b0..7351c4691b5 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 46c80daf1b015aa10474ce82e3d24b578c6ae422 +Subproject commit 7351c4691b5d401f59e3959adfc5b4fa263b32da diff --git a/docker/packager/unbundled/Dockerfile b/docker/packager/unbundled/Dockerfile index b2d9f555f19..1d0eee155ef 100644 --- a/docker/packager/unbundled/Dockerfile +++ b/docker/packager/unbundled/Dockerfile @@ -17,6 +17,7 @@ RUN apt-get update \ devscripts \ libc++-dev \ libc++abi-dev \ + libboost-all-dev \ libboost-program-options-dev \ libboost-system-dev \ libboost-filesystem-dev \ diff --git a/src/CMakeLists.txt b/src/CMakeLists.txt index 48cff3e8e91..c7c555c2f3b 100644 --- a/src/CMakeLists.txt +++ b/src/CMakeLists.txt @@ -335,8 +335,6 @@ dbms_target_link_libraries ( PRIVATE boost::filesystem boost::program_options - boost::circular_buffer - boost::heap clickhouse_common_config clickhouse_common_zookeeper clickhouse_dictionaries_embedded @@ -351,6 +349,14 @@ dbms_target_link_libraries ( clickhouse_common_io ) +if (NOT_UNBUNDLED) + dbms_target_link_libraries ( + PUBLIC + boost::circular_buffer + boost::heap + ) +endif() + target_include_directories(clickhouse_common_io PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/Core/include") # uses some includes from core dbms_target_include_directories(PUBLIC "${CMAKE_CURRENT_BINARY_DIR}/Core/include") From a327144d40d95e44db74aac1689b712c3d2f8db0 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 6 Sep 2021 16:39:46 +0000 Subject: [PATCH 119/177] Better --- contrib/boost-cmake/CMakeLists.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/boost-cmake/CMakeLists.txt b/contrib/boost-cmake/CMakeLists.txt index b19b39f8b81..27072910135 100644 --- a/contrib/boost-cmake/CMakeLists.txt +++ b/contrib/boost-cmake/CMakeLists.txt @@ -18,7 +18,7 @@ if (NOT USE_INTERNAL_BOOST_LIBRARY) if(Boost_INCLUDE_DIR AND Boost_FILESYSTEM_LIBRARY AND Boost_PROGRAM_OPTIONS_LIBRARY AND Boost_REGEX_LIBRARY AND Boost_SYSTEM_LIBRARY AND Boost_CONTEXT_LIBRARY AND - Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY AND Boost_CIRCULAR_BUFFER_LIBRARY AND Boost_HEAP_LIBRARY) + Boost_COROUTINE_LIBRARY AND Boost_GRAPH_LIBRARY) 
set(EXTERNAL_BOOST_FOUND 1) From 474f34e78c70e9e2f1d2aa910d1216e92897c4d3 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 6 Sep 2021 16:41:49 +0000 Subject: [PATCH 120/177] Move back --- contrib/poco | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/poco b/contrib/poco index 7351c4691b5..46c80daf1b0 160000 --- a/contrib/poco +++ b/contrib/poco @@ -1 +1 @@ -Subproject commit 7351c4691b5d401f59e3959adfc5b4fa263b32da +Subproject commit 46c80daf1b015aa10474ce82e3d24b578c6ae422 From 324ab97f50c8d8d00a837411b25fc515555a946a Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 6 Sep 2021 16:44:46 +0000 Subject: [PATCH 121/177] Fix gcc build --- src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index 6961a4df501..d2d547809e4 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -19,6 +19,7 @@ String MergeTreeBackgroundExecutor::toString(Type type) case Type::MOVE: return "Move"; } + __builtin_unreachable(); } From 74c0b61e530e560f7baa897a9577cb8eaac08585 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 6 Sep 2021 18:33:31 +0000 Subject: [PATCH 122/177] Better --- src/Common/ThreadPool.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Common/ThreadPool.cpp b/src/Common/ThreadPool.cpp index f14d68da662..2d6c524ae48 100644 --- a/src/Common/ThreadPool.cpp +++ b/src/Common/ThreadPool.cpp @@ -249,7 +249,9 @@ void ThreadPoolImpl::worker(typename std::list::iterator thread_ if (!jobs.empty()) { - job = std::move(jobs.top().job); + /// boost::priority_queue does not provide interface for getting non-const reference to an element + /// to prevent us from modifying its priority. We have to use const_cast to force move semantics on JobWithPriority::job. + job = std::move(const_cast(jobs.top().job)); jobs.pop(); } else From 7bea8200d80b24022c285b252258805a5427e3d2 Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 6 Sep 2021 19:18:09 +0000 Subject: [PATCH 123/177] Revert "Merge pull request #28082 from zhongyuankai/add-system-of-table_views" This reverts commit e5bcfba89e379c458ba15475421fdbcf8614cd0e, reversing changes made to d77f2436517712e0d3678533cd464c60fe5a0aed. 
--- src/Interpreters/DatabaseCatalog.cpp | 6 -- src/Interpreters/DatabaseCatalog.h | 1 - src/Storages/System/StorageSystemViews.cpp | 68 ------------------- src/Storages/System/StorageSystemViews.h | 24 ------- src/Storages/System/attachSystemTables.cpp | 2 - .../0_stateless/02015_system_views.reference | 1 - .../0_stateless/02015_system_views.sql | 14 ---- tests/queries/skip_list.json | 3 +- 8 files changed, 1 insertion(+), 118 deletions(-) delete mode 100644 src/Storages/System/StorageSystemViews.cpp delete mode 100644 src/Storages/System/StorageSystemViews.h delete mode 100644 tests/queries/0_stateless/02015_system_views.reference delete mode 100644 tests/queries/0_stateless/02015_system_views.sql diff --git a/src/Interpreters/DatabaseCatalog.cpp b/src/Interpreters/DatabaseCatalog.cpp index af60eeeaba3..20ebc0a9ee5 100644 --- a/src/Interpreters/DatabaseCatalog.cpp +++ b/src/Interpreters/DatabaseCatalog.cpp @@ -617,12 +617,6 @@ Dependencies DatabaseCatalog::getDependencies(const StorageID & from) const return Dependencies(iter->second.begin(), iter->second.end()); } -ViewDependencies DatabaseCatalog::getViewDependencies() const -{ - std::lock_guard lock{databases_mutex}; - return ViewDependencies(view_dependencies.begin(), view_dependencies.end()); -} - void DatabaseCatalog::updateDependency(const StorageID & old_from, const StorageID & old_where, const StorageID & new_from, const StorageID & new_where) diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index 72dd28d335b..071b80690df 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -175,7 +175,6 @@ public: void addDependency(const StorageID & from, const StorageID & where); void removeDependency(const StorageID & from, const StorageID & where); Dependencies getDependencies(const StorageID & from) const; - ViewDependencies getViewDependencies() const; /// For Materialized and Live View void updateDependency(const StorageID & old_from, const StorageID & old_where,const StorageID & new_from, const StorageID & new_where); diff --git a/src/Storages/System/StorageSystemViews.cpp b/src/Storages/System/StorageSystemViews.cpp deleted file mode 100644 index 0bb2724b358..00000000000 --- a/src/Storages/System/StorageSystemViews.cpp +++ /dev/null @@ -1,68 +0,0 @@ -#include -#include -#include -#include -#include -#include -#include -#include - -namespace DB -{ - -class Context; - -NamesAndTypesList StorageSystemViews::getNamesAndTypes() -{ - auto view_type_datatype = std::make_shared(DataTypeEnum8::Values{ - {"Default", static_cast(QueryViewsLogElement::ViewType::DEFAULT)}, - {"Materialized", static_cast(QueryViewsLogElement::ViewType::MATERIALIZED)}, - {"Live", static_cast(QueryViewsLogElement::ViewType::LIVE)}}); - - return { - {"database", std::make_shared()}, - {"name", std::make_shared()}, - {"main_dependency_database", std::make_shared()}, - {"main_dependency_table", std::make_shared()}, - {"view_type", std::move(view_type_datatype)}, - }; -} - -void StorageSystemViews::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const -{ - const auto access = context->getAccess(); - const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES); - - for (const auto & [table_id, view_ids] : DatabaseCatalog::instance().getViewDependencies()) - { - const bool check_access_for_tables = check_access_for_databases && !access->isGranted(AccessType::SHOW_TABLES, table_id.database_name); - - if (check_access_for_tables && 
!access->isGranted(AccessType::SHOW_TABLES, table_id.database_name, table_id.table_name)) - continue; - - size_t col_num; - for (const auto & view_id : view_ids) - { - auto view_ptr = DatabaseCatalog::instance().getTable(view_id, context); - QueryViewsLogElement::ViewType type = QueryViewsLogElement::ViewType::DEFAULT; - - if (typeid_cast(view_ptr.get())) - { - type = QueryViewsLogElement::ViewType::MATERIALIZED; - } - else if (typeid_cast(view_ptr.get())) - { - type = QueryViewsLogElement::ViewType::LIVE; - } - - col_num = 0; - res_columns[col_num++]->insert(view_id.database_name); - res_columns[col_num++]->insert(view_id.table_name); - res_columns[col_num++]->insert(table_id.database_name); - res_columns[col_num++]->insert(table_id.table_name); - res_columns[col_num++]->insert(type); - } - } -} - -} diff --git a/src/Storages/System/StorageSystemViews.h b/src/Storages/System/StorageSystemViews.h deleted file mode 100644 index 67fcb79067e..00000000000 --- a/src/Storages/System/StorageSystemViews.h +++ /dev/null @@ -1,24 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -class StorageSystemViews final : public shared_ptr_helper, public IStorageSystemOneBlock -{ - friend struct shared_ptr_helper; -protected: - using IStorageSystemOneBlock::IStorageSystemOneBlock; - - void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo &) const override; - -public: - std::string getName() const override { return "SystemViews"; } - - static NamesAndTypesList getNamesAndTypes(); - -}; - -} diff --git a/src/Storages/System/attachSystemTables.cpp b/src/Storages/System/attachSystemTables.cpp index 3656a239adb..95e86487073 100644 --- a/src/Storages/System/attachSystemTables.cpp +++ b/src/Storages/System/attachSystemTables.cpp @@ -44,7 +44,6 @@ #include #include #include -#include #include #include #include @@ -96,7 +95,6 @@ void attachSystemTablesLocal(IDatabase & system_database) attach(system_database, "zeros_mt", true); attach(system_database, "databases"); attach(system_database, "tables"); - attach(system_database, "views"); attach(system_database, "columns"); attach(system_database, "functions"); attach(system_database, "events"); diff --git a/tests/queries/0_stateless/02015_system_views.reference b/tests/queries/0_stateless/02015_system_views.reference deleted file mode 100644 index a1b1b2a9fd3..00000000000 --- a/tests/queries/0_stateless/02015_system_views.reference +++ /dev/null @@ -1 +0,0 @@ -02015_db materialized_view 02015_db view_source_tb Materialized diff --git a/tests/queries/0_stateless/02015_system_views.sql b/tests/queries/0_stateless/02015_system_views.sql deleted file mode 100644 index a6375dcb591..00000000000 --- a/tests/queries/0_stateless/02015_system_views.sql +++ /dev/null @@ -1,14 +0,0 @@ -DROP DATABASE IF EXISTS 02015_db; -CREATE DATABASE IF NOT EXISTS 02015_db; - -DROP TABLE IF EXISTS 02015_db.view_source_tb; -CREATE TABLE IF NOT EXISTS 02015_db.view_source_tb (a UInt8, s String) ENGINE = MergeTree() ORDER BY a; - -DROP TABLE IF EXISTS 02015_db.materialized_view; -CREATE MATERIALIZED VIEW IF NOT EXISTS 02015_db.materialized_view ENGINE = ReplacingMergeTree() ORDER BY a AS SELECT * FROM 02015_db.view_source_tb; - -SELECT * FROM system.views WHERE database='02015_db' and name = 'materialized_view'; - -DROP TABLE IF EXISTS 02015_db.materialized_view; -DROP TABLE IF EXISTS 02015_db.view_source_tb; -DROP DATABASE IF EXISTS 02015_db; diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json index 0143cc78dbe..335ed370b9b 100644 
--- a/tests/queries/skip_list.json +++ b/tests/queries/skip_list.json @@ -512,7 +512,6 @@ "01532_execute_merges_on_single_replica", /// static zk path "01530_drop_database_atomic_sync", /// creates database "02001_add_default_database_to_system_users", ///create user - "02002_row_level_filter_bug", ///create user - "02015_system_views" + "02002_row_level_filter_bug" ///create user ] } From 3f1e904722530680bea03964998807aed8a78959 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 6 Sep 2021 23:31:34 +0300 Subject: [PATCH 124/177] Second iteration --- docs/en/interfaces/formats.md | 6 ++++++ docs/en/operations/settings/settings.md | 6 +++--- .../data-types/nested-data-structures/nested.md | 8 +++++++- 3 files changed, 16 insertions(+), 4 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index bac67ecf140..d28045ce213 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1270,6 +1270,8 @@ You can insert Parquet data from a file into ClickHouse table by the following c $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` +To insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested). + You can select data from a ClickHouse table and save them into some file in the Parquet format by the following command: ``` bash @@ -1328,6 +1330,8 @@ You can insert Arrow data from a file into ClickHouse table by the following com $ cat filename.arrow | clickhouse-client --query="INSERT INTO some_table FORMAT Arrow" ``` +To insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested). + ### Selecting Data {#selecting-data-arrow} You can select data from a ClickHouse table and save them into some file in the Arrow format by the following command: @@ -1384,6 +1388,8 @@ You can insert ORC data from a file into ClickHouse table by the following comma $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` +To insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested). + ### Selecting Data {#selecting-data-2} You can select data from a ClickHouse table and save them into some file in the ORC format by the following command: diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 221bdf0e720..e8355eeaa1d 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -262,7 +262,7 @@ If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` ar ## input_format_parquet_import_nested {#input_format_parquet_import_nested} -Enables or disables the ability to insert [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) structures into [Parquet](../../interfaces/formats.md#data-format-parquet) input format as an array of structs. 
+Enables or disables the ability to insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [Parquet](../../interfaces/formats.md#data-format-parquet) input format. Possible values: @@ -273,7 +273,7 @@ Default value: `0`. ## input_format_arrow_import_nested {#input_format_arrow_import_nested} -Enables or disables the ability to insert [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) structures into [Arrow](../../interfaces/formats.md#data_types-matching-arrow) input format as an array of structs. +Enables or disables the ability to insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [Arrow](../../interfaces/formats.md#data_types-matching-arrow) input format. Possible values: @@ -284,7 +284,7 @@ Default value: `0`. ## input_format_orc_import_nested {#input_format_orc_import_nested} -Enables or disables the ability to insert [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) structures into [ORC](../../interfaces/formats.md#data-format-orc) input format as an array of structs. +Enables or disables the ability to insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [ORC](../../interfaces/formats.md#data-format-orc) input format. Possible values: diff --git a/docs/en/sql-reference/data-types/nested-data-structures/nested.md b/docs/en/sql-reference/data-types/nested-data-structures/nested.md index ec6c613a956..281465f2075 100644 --- a/docs/en/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/en/sql-reference/data-types/nested-data-structures/nested.md @@ -3,7 +3,9 @@ toc_priority: 57 toc_title: Nested(Name1 Type1, Name2 Type2, ...) --- -# Nested(name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} +# Nested {#nested} + +## Nested(name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} A nested data structure is like a table inside a cell. The parameters of a nested data structure – the column names and types – are specified the same way as in a [CREATE TABLE](../../../sql-reference/statements/create/table.md) query. Each table row can correspond to any number of rows in a nested data structure. @@ -101,4 +103,8 @@ For a DESCRIBE query, the columns in a nested data structure are listed separate The ALTER query for elements in a nested data structure has limitations. +## Inserting {#inserting} + +It is possible to insert `Array(Struct)` columns into nested columns with the settings [input_format_parquet_import_nested](../../../operations/settings/settings.md#input_format_parquet_import_nested), [input_format_arrow_import_nested](../../../operations/settings/settings.md#input_format_arrow_import_nested) and [input_format_orc_import_nested](../../../operations/settings/settings.md#input_format_orc_import_nested) in [Parquet](../../../interfaces/formats.md#data-format-parquet), [Arrow](../../../interfaces/formats.md#data_types-matching-arrow) and [ORC](../../../interfaces/formats.md#data-format-orc) input formats correspondingly. 
+ [Original article](https://clickhouse.tech/docs/en/data_types/nested_data_structures/nested/) From dbdb3356ba4f0d3c02a3d8ff2da0063509a4f596 Mon Sep 17 00:00:00 2001 From: George Date: Mon, 6 Sep 2021 23:34:43 +0300 Subject: [PATCH 125/177] fixed links --- docs/en/interfaces/formats.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index d28045ce213..a83ee999070 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1270,7 +1270,7 @@ You can insert Parquet data from a file into ClickHouse table by the following c $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` -To insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested). +To insert `Array(Struct)` columns into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested). You can select data from a ClickHouse table and save them into some file in the Parquet format by the following command: @@ -1330,7 +1330,7 @@ You can insert Arrow data from a file into ClickHouse table by the following com $ cat filename.arrow | clickhouse-client --query="INSERT INTO some_table FORMAT Arrow" ``` -To insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested). +To insert `Array(Struct)` columns into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested). ### Selecting Data {#selecting-data-arrow} @@ -1388,7 +1388,7 @@ You can insert ORC data from a file into ClickHouse table by the following comma $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -To insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested). +To insert `Array(Struct)` columns into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested). ### Selecting Data {#selecting-data-2} From cbc10cd3c0a5b266ec261edbf916307f6e61042f Mon Sep 17 00:00:00 2001 From: kssenii Date: Mon, 6 Sep 2021 21:11:21 +0000 Subject: [PATCH 126/177] Revert "Merge pull request #28397 from zhongyuankai/DOCSUP-13927-document-system_views" This reverts commit d6f89fd9cfbdb39da4be8248007d68df0f09e6d1, reversing changes made to 68f6ecec62b60ec9826da459663e4da0e13d7bfb. 
--- docs/en/operations/system-tables/views.md | 44 ----------------------- 1 file changed, 44 deletions(-) delete mode 100644 docs/en/operations/system-tables/views.md diff --git a/docs/en/operations/system-tables/views.md b/docs/en/operations/system-tables/views.md deleted file mode 100644 index 8edebf00a91..00000000000 --- a/docs/en/operations/system-tables/views.md +++ /dev/null @@ -1,44 +0,0 @@ -# system.views {#system-views} - -Contains the dependencies of all views and the type to which the view belongs. The metadata of the view comes from the [system.tables](tables.md). - -Columns: - -- `database` ([String](../../sql-reference/data-types/string.md)) — The name of the database the view is in. - -- `name` ([String](../../sql-reference/data-types/string.md)) — Name of the view. - -- `main_dependency_database` ([String](../../sql-reference/data-types/string.md)) — The name of the database on which the view depends. - -- `main_dependency_table` ([String](../../sql-reference/data-types/string.md)) - The name of the table on which the view depends. - -- `view_type` ([Enum8](../../sql-reference/data-types/enum.md)) — Type of the view. Values: - - `'Default' = 1` — [Default views](../../sql-reference/statements/create/view.md#normal). Should not appear in this log. - - `'Materialized' = 2` — [Materialized views](../../sql-reference/statements/create/view.md#materialized). - - `'Live' = 3` — [Live views](../../sql-reference/statements/create/view.md#live-view). - -**Example** - -```sql -SELECT * FROM system.views LIMIT 2 FORMAT Vertical; -``` - -```text -Row 1: -────── -database: default -name: live_view -main_dependency_database: default -main_dependency_table: view_source_tb -view_type: Live - -Row 2: -────── -database: default -name: materialized_view -main_dependency_database: default -main_dependency_table: view_source_tb -view_type: Materialized -``` - -[Original article](https://clickhouse.tech/docs/en/operations/system-tables/views) From 503b7a59f07f5da9d8cad7a6196be1939582e2f2 Mon Sep 17 00:00:00 2001 From: bharatnc Date: Mon, 6 Sep 2021 09:54:59 -0700 Subject: [PATCH 127/177] fix getNumberOfArguments() for s2Rect functions This fixes the value returned by the getNumberOfArguments() by the s2RectAdd and the s2RectContains functions. Only 3 arguments are used by these functions and not 4: - low s2Point of rectangle - high s2Point of rectangle - given s2Point The given s2Point is used to groow the size of the bounding rectangle to include the given S2Point in case of the s2RectAdd function. In case of s2RectContains, the function determines if the bounded rectangle contains the given point. PS: I wonder if it's more apt to call rectAdd as rectGrow since it's used to grow the size of a given rectangle. 
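For context, a minimal SQL sketch of how these two functions are invoked after this fix, passing the rectangle's low and high S2 points followed by the S2 point being added or tested. This is illustrative only: the literal S2 cell IDs below are arbitrary placeholder values chosen for the example and do not come from this patch.

```sql
-- Grow the bounding rectangle (given by its low and high S2 points)
-- so that it also covers the third S2 point, yielding the enlarged rectangle.
SELECT s2RectAdd(5178914411069187297, 5177056748191934217, 5179056748191934217) AS grown_rect;

-- Check whether the rectangle bounded by the first two S2 points
-- contains the third S2 point (1 if it does, 0 otherwise).
SELECT s2RectContains(5178914411069187297, 5177056748191934217, 5177914411069187297) AS is_contained;
```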
--- src/Functions/s2RectAdd.cpp | 2 +- src/Functions/s2RectContains.cpp | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Functions/s2RectAdd.cpp b/src/Functions/s2RectAdd.cpp index 90578567da2..d9b12ce22a3 100644 --- a/src/Functions/s2RectAdd.cpp +++ b/src/Functions/s2RectAdd.cpp @@ -41,7 +41,7 @@ public: return name; } - size_t getNumberOfArguments() const override { return 4; } + size_t getNumberOfArguments() const override { return 3; } bool useDefaultImplementationForConstants() const override { return true; } diff --git a/src/Functions/s2RectContains.cpp b/src/Functions/s2RectContains.cpp index 5f556c3ec14..27fed9e2031 100644 --- a/src/Functions/s2RectContains.cpp +++ b/src/Functions/s2RectContains.cpp @@ -41,7 +41,7 @@ public: return name; } - size_t getNumberOfArguments() const override { return 4; } + size_t getNumberOfArguments() const override { return 3; } bool useDefaultImplementationForConstants() const override { return true; } From cfe2207be37b91149352639351c78894046a8280 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Mon, 6 Sep 2021 22:07:41 +0000 Subject: [PATCH 128/177] Better --- .../MergeTree/BackgroundJobsAssignee.cpp | 1 + .../{ExecutableTask.h => IExecutableTask.h} | 9 ++--- .../MergeTree/MergeTreeBackgroundExecutor.cpp | 34 ++++++------------- .../MergeTree/MergeTreeBackgroundExecutor.h | 2 +- .../MergeTree/tests/gtest_executor.cpp | 4 +-- 5 files changed, 20 insertions(+), 30 deletions(-) rename src/Storages/MergeTree/{ExecutableTask.h => IExecutableTask.h} (83%) diff --git a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp index 598c43f2153..e104e188cc6 100644 --- a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp @@ -78,6 +78,7 @@ String BackgroundJobsAssignee::toString(Type type) case Type::Moving: return "Moving"; } + __builtin_unreachable(); } void BackgroundJobsAssignee::start() diff --git a/src/Storages/MergeTree/ExecutableTask.h b/src/Storages/MergeTree/IExecutableTask.h similarity index 83% rename from src/Storages/MergeTree/ExecutableTask.h rename to src/Storages/MergeTree/IExecutableTask.h index 2b89a98cac1..2e546e83035 100644 --- a/src/Storages/MergeTree/ExecutableTask.h +++ b/src/Storages/MergeTree/IExecutableTask.h @@ -9,19 +9,19 @@ namespace DB { -class ExecutableTask +class IExecutableTask { public: virtual bool execute() = 0; virtual void onCompleted() = 0; virtual StorageID getStorageID() = 0; - virtual ~ExecutableTask() = default; + virtual ~IExecutableTask() = default; }; -using ExecutableTaskPtr = std::shared_ptr; +using ExecutableTaskPtr = std::shared_ptr; -class LambdaAdapter : public shared_ptr_helper, public ExecutableTask +class LambdaAdapter : public shared_ptr_helper, public IExecutableTask { public: @@ -32,6 +32,7 @@ public: bool execute() override { res = inner(); + inner = {}; return false; } diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index d2d547809e4..01431abad88 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -33,7 +33,7 @@ void MergeTreeBackgroundExecutor::updateConfiguration() pending.set_capacity(new_max_tasks_count); active.set_capacity(new_max_tasks_count); - pool.setMaxFreeThreads(new_threads_count); + pool.setMaxFreeThreads(0); pool.setMaxThreads(new_threads_count); pool.setQueueSize(new_max_tasks_count); } @@ -130,13 
+130,6 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id { std::lock_guard lock(mutex); - - for (auto & item : tasks_to_wait) - { - assert(item.use_count() == 1); - item.reset(); - } - currently_deleting.erase(id); } } @@ -146,7 +139,7 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) { setThreadName(name.c_str()); - auto erase_from_active = [&] + auto erase_from_active = [this, item] { active.erase(std::remove(active.begin(), active.end(), item), active.end()); }; @@ -177,6 +170,7 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) /// because it may interact somehow with BackgroundSchedulePool, which may allocate memory /// But it is rather safe, because we have try...catch block here, and another one in ThreadPool. item->task->onCompleted(); + item->task.reset(); } catch (...) { @@ -192,8 +186,6 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) void MergeTreeBackgroundExecutor::schedulerThreadFunction() { - DENY_ALLOCATIONS_IN_SCOPE; - while (true) { std::unique_lock lock(mutex); @@ -206,19 +198,15 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() ItemPtr item = std::move(pending.front()); pending.pop_front(); - bool res = false; + /// Execute a piece of task + bool res = pool.trySchedule([this, item] () mutable { - ALLOW_ALLOCATIONS_IN_SCOPE; - /// Execute a piece of task - res = pool.trySchedule([this, item] - { - routine(item); - /// When storage shutdowns it will wait until all related background tasks - /// are finished, because they may want to interact with its fields - /// and this will cause segfault. - item->is_done.set(); - }); - } + routine(item); + /// When storage shutdowns it will wait until all related background tasks + /// are finished, because they may want to interact with its fields + /// and this will cause segfault. + item->is_done.set(); + }); if (!res) { diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index bdeeecb7592..d64652a3b10 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -14,7 +14,7 @@ #include #include #include -#include +#include namespace DB diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index 1377d9add9a..b3f8ad2a0ca 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -5,7 +5,7 @@ #include #include -#include +#include #include using namespace DB; @@ -17,7 +17,7 @@ namespace CurrentMetrics std::random_device device; -class FakeExecutableTask : public ExecutableTask +class FakeExecutableTask : public IExecutableTask { public: explicit FakeExecutableTask(String name_) : generator(device()), distribution(0, 5), name(name_) From bcc31f1f3e9616940eb8e3ddd3a51b89d368c734 Mon Sep 17 00:00:00 2001 From: Vitaly Baranov Date: Mon, 6 Sep 2021 22:42:32 +0300 Subject: [PATCH 129/177] Remove unnecessary changes. 
--- programs/server/Server.cpp | 3 --- src/Access/AccessControlManager.h | 2 -- src/Access/SettingsProfilesCache.cpp | 1 + src/Core/MySQL/Authentication.cpp | 2 -- src/Core/MySQL/MySQLSession.h | 19 ------------------- src/Core/PostgreSQLProtocol.h | 3 +-- src/Interpreters/Context.cpp | 18 ++---------------- src/Interpreters/Context.h | 12 ------------ src/Interpreters/InterpreterSetQuery.cpp | 5 ----- src/Interpreters/ya.make | 1 - .../Formats/Impl/MySQLOutputFormat.h | 2 -- src/Server/HTTPHandler.h | 3 +-- src/TableFunctions/TableFunctionMySQL.cpp | 3 +-- .../01702_system_query_log.reference | 2 +- 14 files changed, 7 insertions(+), 69 deletions(-) delete mode 100644 src/Core/MySQL/MySQLSession.h diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index bf4e2f947dc..6a19fc9e036 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -48,11 +48,8 @@ #include #include #include -#include #include -#include #include -#include #include #include #include diff --git a/src/Access/AccessControlManager.h b/src/Access/AccessControlManager.h index c20216a291a..79f7073ef69 100644 --- a/src/Access/AccessControlManager.h +++ b/src/Access/AccessControlManager.h @@ -153,8 +153,6 @@ public: const ExternalAuthenticators & getExternalAuthenticators() const; - String getProfileName(const UUID & profile_id) const; - private: class ContextAccessCache; class CustomSettingsPrefixes; diff --git a/src/Access/SettingsProfilesCache.cpp b/src/Access/SettingsProfilesCache.cpp index 20880b94aba..3cd73720c3e 100644 --- a/src/Access/SettingsProfilesCache.cpp +++ b/src/Access/SettingsProfilesCache.cpp @@ -116,6 +116,7 @@ void SettingsProfilesCache::mergeSettingsAndConstraints() } } + void SettingsProfilesCache::mergeSettingsAndConstraintsFor(EnabledSettings & enabled) const { SettingsProfileElements merged_settings; diff --git a/src/Core/MySQL/Authentication.cpp b/src/Core/MySQL/Authentication.cpp index 0eb080892c1..aeb9a411082 100644 --- a/src/Core/MySQL/Authentication.cpp +++ b/src/Core/MySQL/Authentication.cpp @@ -2,8 +2,6 @@ #include #include #include -#include -#include #include #include diff --git a/src/Core/MySQL/MySQLSession.h b/src/Core/MySQL/MySQLSession.h deleted file mode 100644 index 1ba17a40483..00000000000 --- a/src/Core/MySQL/MySQLSession.h +++ /dev/null @@ -1,19 +0,0 @@ -#pragma once - -#include -#include - -namespace DB -{ - -class MySQLSession : public DB::Session -{ -public: - using DB::Session::Session; - - uint8_t sequence_id = 0; - uint32_t client_capabilities = 0; - size_t max_packet_size = 0; -}; - -} diff --git a/src/Core/PostgreSQLProtocol.h b/src/Core/PostgreSQLProtocol.h index f0de4bbb843..2b92258394e 100644 --- a/src/Core/PostgreSQLProtocol.h +++ b/src/Core/PostgreSQLProtocol.h @@ -1,12 +1,11 @@ #pragma once #include -#include -#include #include #include #include #include +#include #include #include #include diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index da7228ebd0f..c4fb89067e2 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -626,20 +626,14 @@ ConfigurationPtr Context::getUsersConfig() return shared->users_config; } -std::shared_ptr Context::getContextAccessForUser(const UUID & user_id) const -{ - return getAccessControlManager().getContextAccess( - user_id, /* current_roles = */ {}, /* use_default_roles = */ true, - settings, current_database, client_info); -} - void Context::setUser(const UUID & user_id_) { auto lock = getLock(); user_id = user_id_; - access = 
getContextAccessForUser(user_id_); + access = getAccessControlManager().getContextAccess( + user_id_, /* current_roles = */ {}, /* use_default_roles = */ true, settings, current_database, client_info); auto user = access->getUser(); current_roles = std::make_shared>(user->granted_roles.findGranted(user->default_roles)); @@ -1271,14 +1265,6 @@ ContextMutablePtr Context::getBufferContext() const return buffer_context; } -Session * Context::getSessionOrNull() const -{ - if (hasSessionContext()) - return getSession(); - else - return nullptr; -} - const EmbeddedDictionaries & Context::getEmbeddedDictionaries() const { diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 3e373c493f5..fbf64908f84 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -276,8 +276,6 @@ private: /// XXX: move this stuff to shared part instead. ContextMutablePtr buffer_context; /// Buffer context. Could be equal to this. - /// Non-owning, only here for MySQLOutputFormat to be able to modify sequence_id, see setSession() and getSession() - Session * session = nullptr; /// A flag, used to distinguish between user query and internal query to a database engine (MaterializePostgreSQL). bool is_internal_query = false; @@ -373,8 +371,6 @@ public: /// Normally you shouldn't call this function. Use the Session class to do authentication instead. void setUser(const UUID & user_id_); - std::shared_ptr getContextAccessForUser(const UUID & user_id) const; - UserPtr getUser() const; String getUserName() const; std::optional getUserID() const; @@ -604,14 +600,6 @@ public: ContextMutablePtr getGlobalContext() const; - // Exists only due to MySQLOutputFormat - Session * getSession() const { return getSessionContext()->session; } - void setSession(Session * new_session) - { - session = getSessionContext()->session = new_session; - } - Session * getSessionOrNull() const; - bool hasGlobalContext() const { return !global_context.expired(); } bool isGlobalContext() const { diff --git a/src/Interpreters/InterpreterSetQuery.cpp b/src/Interpreters/InterpreterSetQuery.cpp index 73af2bbe3c0..1c6a4236bf6 100644 --- a/src/Interpreters/InterpreterSetQuery.cpp +++ b/src/Interpreters/InterpreterSetQuery.cpp @@ -9,13 +9,8 @@ namespace DB BlockIO InterpreterSetQuery::execute() { const auto & ast = query_ptr->as(); - getContext()->checkSettingsConstraints(ast.changes); - // Here settings are pushed to the session context and are not visible in the query context getContext()->getSessionContext()->applySettingsChanges(ast.changes); - // Make setting changes also available to the query context. - getContext()->applySettingsChanges(ast.changes); - return {}; } diff --git a/src/Interpreters/ya.make b/src/Interpreters/ya.make index 9263435e003..0bc7cb11cf0 100644 --- a/src/Interpreters/ya.make +++ b/src/Interpreters/ya.make @@ -148,7 +148,6 @@ SRCS( RowRefs.cpp SelectIntersectExceptQueryVisitor.cpp Session.cpp - Session.cpp SessionLog.cpp Set.cpp SetVariants.cpp diff --git a/src/Processors/Formats/Impl/MySQLOutputFormat.h b/src/Processors/Formats/Impl/MySQLOutputFormat.h index a285a6d75f3..a8e1ada3d6a 100644 --- a/src/Processors/Formats/Impl/MySQLOutputFormat.h +++ b/src/Processors/Formats/Impl/MySQLOutputFormat.h @@ -14,7 +14,6 @@ class IColumn; class IDataType; class WriteBuffer; struct FormatSettings; -class MySQLSession; /** A stream for outputting data in a binary line-by-line format. 
*/ @@ -35,7 +34,6 @@ public: private: void initialize(); -private: bool initialized = false; uint32_t client_capabilities = 0; uint8_t * sequence_id = nullptr; diff --git a/src/Server/HTTPHandler.h b/src/Server/HTTPHandler.h index f52958a191d..98f573f8cef 100644 --- a/src/Server/HTTPHandler.h +++ b/src/Server/HTTPHandler.h @@ -83,13 +83,12 @@ private: // The request_credential instance may outlive a single request/response loop. // This happens only when the authentication mechanism requires more than a single request/response exchange (e.g., SPNEGO). - std::shared_ptr request_session; std::unique_ptr request_credentials; // Returns true when the user successfully authenticated, // the session instance will be configured accordingly, and the request_credentials instance will be dropped. // Returns false when the user is not authenticated yet, and the 'Negotiate' response is sent, - // the request_session and request_credentials instances are preserved. + // the session and request_credentials instances are preserved. // Throws an exception if authentication failed. bool authenticateUser( HTTPServerRequest & request, diff --git a/src/TableFunctions/TableFunctionMySQL.cpp b/src/TableFunctions/TableFunctionMySQL.cpp index 92387b13d55..09f9cf8b1f5 100644 --- a/src/TableFunctions/TableFunctionMySQL.cpp +++ b/src/TableFunctions/TableFunctionMySQL.cpp @@ -61,9 +61,8 @@ void TableFunctionMySQL::parseArguments(const ASTPtr & ast_function, ContextPtr user_name = args[3]->as().value.safeGet(); password = args[4]->as().value.safeGet(); - const auto & settings = context->getSettingsRef(); /// Split into replicas if needed. 3306 is the default MySQL port number - const size_t max_addresses = settings.glob_expansion_max_elements; + size_t max_addresses = context->getSettingsRef().glob_expansion_max_elements; auto addresses = parseRemoteDescriptionForExternalDatabase(host_port, max_addresses, 3306); pool.emplace(remote_database_name, addresses, user_name, password); diff --git a/tests/queries/0_stateless/01702_system_query_log.reference b/tests/queries/0_stateless/01702_system_query_log.reference index 3458c2e5ed4..1f329feac22 100644 --- a/tests/queries/0_stateless/01702_system_query_log.reference +++ b/tests/queries/0_stateless/01702_system_query_log.reference @@ -8,7 +8,6 @@ GRANT queries REVOKE queries Misc queries ACTUAL LOG CONTENT: - -- fire all kinds of queries and then check if those are present in the system.query_log\nSET log_comment=\'system.query_log logging test\'; Select SELECT \'DROP queries and also a cleanup before the test\'; Drop DROP DATABASE IF EXISTS sqllt SYNC; DROP USER IF EXISTS sqllt_user; @@ -83,4 +82,5 @@ Rename RENAME TABLE sqllt.table TO sqllt.table_new; Rename RENAME TABLE sqllt.table_new TO sqllt.table; Drop TRUNCATE TABLE sqllt.table; Drop DROP TABLE sqllt.table SYNC; + SET log_comment=\'\'; DROP queries and also a cleanup after the test From 81d3e330870b4a8491cee44a54a45d58cf5da4a4 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 7 Sep 2021 10:36:39 +0300 Subject: [PATCH 130/177] Add crashing test --- ...02015_shard_crash_clang_12_build.reference | 1 + .../02015_shard_crash_clang_12_build.sh | 45 +++++++++++++++++++ 2 files changed, 46 insertions(+) create mode 100644 tests/queries/0_stateless/02015_shard_crash_clang_12_build.reference create mode 100755 tests/queries/0_stateless/02015_shard_crash_clang_12_build.sh diff --git a/tests/queries/0_stateless/02015_shard_crash_clang_12_build.reference b/tests/queries/0_stateless/02015_shard_crash_clang_12_build.reference new 
file mode 100644 index 00000000000..d00491fd7e5 --- /dev/null +++ b/tests/queries/0_stateless/02015_shard_crash_clang_12_build.reference @@ -0,0 +1 @@ +1 diff --git a/tests/queries/0_stateless/02015_shard_crash_clang_12_build.sh b/tests/queries/0_stateless/02015_shard_crash_clang_12_build.sh new file mode 100755 index 00000000000..287a9b45f4c --- /dev/null +++ b/tests/queries/0_stateless/02015_shard_crash_clang_12_build.sh @@ -0,0 +1,45 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + + +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS local" +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS distributed" + +$CLICKHOUSE_CLIENT --query "CREATE TABLE local (x UInt8) ENGINE = Memory;" +$CLICKHOUSE_CLIENT --query "CREATE TABLE distributed AS local ENGINE = Distributed(test_cluster_two_shards, currentDatabase(), local, x);" + +$CLICKHOUSE_CLIENT --insert_distributed_sync=0 --network_compression_method='zstd' --query "INSERT INTO distributed SELECT number FROM numbers(256);" +$CLICKHOUSE_CLIENT --insert_distributed_sync=0 --network_compression_method='zstd' --query "SYSTEM FLUSH DISTRIBUTED distributed;" + +function select_thread() +{ + while true; do + $CLICKHOUSE_CLIENT --insert_distributed_sync=0 --network_compression_method='zstd' --query "SELECT count() FROM local" >/dev/null + $CLICKHOUSE_CLIENT --insert_distributed_sync=0 --network_compression_method='zstd' --query "SELECT count() FROM distributed" >/dev/null + done +} + +export -f select_thread; + +TIMEOUT=30 + +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & +timeout $TIMEOUT bash -c select_thread 2> /dev/null & + +wait + +$CLICKHOUSE_CLIENT --query "SELECT 1" + +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS local" +$CLICKHOUSE_CLIENT --query "DROP TABLE IF EXISTS distributed" From 5ff99314b1590d1a1fbf7d6a8bdbc27618dcb77b Mon Sep 17 00:00:00 2001 From: Anton Ivashkin Date: Tue, 7 Sep 2021 12:51:00 +0300 Subject: [PATCH 131/177] Reduce default settings for S3 multipart upload part size --- src/Core/Settings.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 887964bb233..09dfd347423 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -70,8 +70,8 @@ class IColumn; M(UInt64, idle_connection_timeout, 3600, "Close idle TCP connections after specified number of seconds.", 0) \ M(UInt64, distributed_connections_pool_size, DBMS_DEFAULT_DISTRIBUTED_CONNECTIONS_POOL_SIZE, "Maximum number of connections with one remote server in the pool.", 0) \ M(UInt64, connections_with_failover_max_tries, DBMS_CONNECTION_POOL_WITH_FAILOVER_DEFAULT_MAX_TRIES, "The maximum number of attempts to connect to replicas.", 0) \ - M(UInt64, s3_min_upload_part_size, 512*1024*1024, "The minimum size of part to upload during multipart upload to S3.", 0) \ - M(UInt64, s3_max_single_part_upload_size, 64*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ + M(UInt64, s3_min_upload_part_size, 32*1024*1024, "The minimum size 
of part to upload during multipart upload to S3.", 0) \ + M(UInt64, s3_max_single_part_upload_size, 32*1024*1024, "The maximum size of object to upload using singlepart upload to S3.", 0) \ M(UInt64, s3_max_single_read_retries, 4, "The maximum number of retries during single S3 read.", 0) \ M(UInt64, s3_max_redirects, 10, "Max number of S3 redirects hops allowed.", 0) \ M(UInt64, s3_max_connections, 1024, "The maximum number of connections per server.", 0) \ From 978dd19fa242469bbfac1e3701c23d59dd291b05 Mon Sep 17 00:00:00 2001 From: ZhiYong Wang Date: Tue, 7 Sep 2021 19:05:26 +0800 Subject: [PATCH 132/177] Fix coredump in creating distributed table --- src/Storages/StorageDistributed.cpp | 7 ++++++- ...02017_create_distributed_table_coredump.reference | 0 .../02017_create_distributed_table_coredump.sql | 12 ++++++++++++ 3 files changed, 18 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/02017_create_distributed_table_coredump.reference create mode 100644 tests/queries/0_stateless/02017_create_distributed_table_coredump.sql diff --git a/src/Storages/StorageDistributed.cpp b/src/Storages/StorageDistributed.cpp index df7d568deb9..1ad80f8aea6 100644 --- a/src/Storages/StorageDistributed.cpp +++ b/src/Storages/StorageDistributed.cpp @@ -1332,7 +1332,12 @@ void registerStorageDistributed(StorageFactory & factory) String remote_table = engine_args[2]->as().value.safeGet(); const auto & sharding_key = engine_args.size() >= 4 ? engine_args[3] : nullptr; - const auto & storage_policy = engine_args.size() >= 5 ? engine_args[4]->as().value.safeGet() : "default"; + String storage_policy = "default"; + if (engine_args.size() >= 5) + { + engine_args[4] = evaluateConstantExpressionOrIdentifierAsLiteral(engine_args[4], local_context); + storage_policy = engine_args[4]->as().value.safeGet(); + } /// Check that sharding_key exists in the table and has numeric type. 
if (sharding_key) diff --git a/tests/queries/0_stateless/02017_create_distributed_table_coredump.reference b/tests/queries/0_stateless/02017_create_distributed_table_coredump.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/02017_create_distributed_table_coredump.sql b/tests/queries/0_stateless/02017_create_distributed_table_coredump.sql new file mode 100644 index 00000000000..27c98c3e237 --- /dev/null +++ b/tests/queries/0_stateless/02017_create_distributed_table_coredump.sql @@ -0,0 +1,12 @@ +drop table if exists t; +drop table if exists td1; +drop table if exists td2; +drop table if exists td3; +create table t (val UInt32) engine = MergeTree order by val; +create table td1 engine = Distributed(test_shard_localhost, currentDatabase(), 't') as t; +create table td2 engine = Distributed(test_shard_localhost, currentDatabase(), 't', xxHash32(val), default) as t; +create table td3 engine = Distributed(test_shard_localhost, currentDatabase(), 't', xxHash32(val), 'default') as t; +drop table if exists t; +drop table if exists td1; +drop table if exists td2; +drop table if exists td3; From 69604eab3f72fc8c6e14b387a73dd4166319d680 Mon Sep 17 00:00:00 2001 From: Vitaly Date: Tue, 7 Sep 2021 14:05:55 +0300 Subject: [PATCH 133/177] Add Settings.Names, Settings.Values aliases for system.processes table --- src/Storages/System/StorageSystemProcesses.cpp | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/src/Storages/System/StorageSystemProcesses.cpp b/src/Storages/System/StorageSystemProcesses.cpp index e2685af7718..5e6ba37226c 100644 --- a/src/Storages/System/StorageSystemProcesses.cpp +++ b/src/Storages/System/StorageSystemProcesses.cpp @@ -73,7 +73,9 @@ NamesAndAliases StorageSystemProcesses::getNamesAndAliases() return { {"ProfileEvents.Names", {std::make_shared(std::make_shared())}, "mapKeys(ProfileEvents)"}, - {"ProfileEvents.Values", {std::make_shared(std::make_shared())}, "mapValues(ProfileEvents)"} + {"ProfileEvents.Values", {std::make_shared(std::make_shared())}, "mapValues(ProfileEvents)"}, + {"Settings.Names", {std::make_shared(std::make_shared())}, "mapKeys(Settings)" }, + {"Settings.Values", {std::make_shared(std::make_shared())}, "mapValues(Settings)"} }; } From 8a0765c60d0414506bca146e0734586fb38b5d3f Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 7 Sep 2021 12:45:39 +0000 Subject: [PATCH 134/177] Review fixes (partially) --- src/Common/ThreadPool.cpp | 2 +- .../MergeTree/MergeTreeBackgroundExecutor.cpp | 29 +++++++------------ .../MergeTree/MergeTreeBackgroundExecutor.h | 3 +- 3 files changed, 13 insertions(+), 21 deletions(-) diff --git a/src/Common/ThreadPool.cpp b/src/Common/ThreadPool.cpp index 2d6c524ae48..4b6b795a5cd 100644 --- a/src/Common/ThreadPool.cpp +++ b/src/Common/ThreadPool.cpp @@ -75,7 +75,7 @@ void ThreadPoolImpl::setQueueSize(size_t value) std::lock_guard lock(mutex); queue_size = value; /// Reserve memory to get rid of allocations - jobs.reserve(2 * queue_size); + jobs.reserve(queue_size); } diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index 01431abad88..b185e957120 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -1,5 +1,7 @@ #include +#include + #include #include @@ -25,8 +27,8 @@ String MergeTreeBackgroundExecutor::toString(Type type) void MergeTreeBackgroundExecutor::updateConfiguration() { - auto new_threads_count = 
threads_count_getter(); - auto new_max_tasks_count = max_task_count_getter(); + auto new_threads_count = std::max(1u, threads_count_getter()); + auto new_max_tasks_count = std::max(1, max_task_count_getter()); try { @@ -58,7 +60,7 @@ void MergeTreeBackgroundExecutor::wait() if (scheduler.joinable()) scheduler.join(); - /// ThreadPool will be finalized in destructor. + pool.wait(); } @@ -104,16 +106,10 @@ bool MergeTreeBackgroundExecutor::trySchedule(ExecutableTaskPtr task) void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id) { - /// Executor is global, so protect from any concurrent storage shutdowns - std::lock_guard remove_lock(remove_mutex); - std::vector tasks_to_wait; { std::lock_guard lock(mutex); - /// Mark this StorageID as deleting - currently_deleting.emplace(id); - /// Erase storage related tasks from pending and select active tasks to wait for auto it = std::remove_if(pending.begin(), pending.end(), [&] (auto item) -> bool { return item->task->getStorageID() == id; }); @@ -122,16 +118,14 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id /// Copy items to wait for their completion std::copy_if(active.begin(), active.end(), std::back_inserter(tasks_to_wait), [&] (auto item) -> bool { return item->task->getStorageID() == id; }); + + for (auto & item : tasks_to_wait) + item->is_currently_deleting = true; } for (auto & item : tasks_to_wait) item->is_done.wait(); - - { - std::lock_guard lock(mutex); - currently_deleting.erase(id); - } } @@ -150,7 +144,7 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) { std::lock_guard guard(mutex); - if (currently_deleting.contains(item->task->getStorageID())) + if (item->is_currently_deleting) { erase_from_active(); return; @@ -158,7 +152,6 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) pending.push_back(item); erase_from_active(); - item->is_done.reset(); has_tasks.notify_one(); return; } @@ -205,12 +198,12 @@ void MergeTreeBackgroundExecutor::schedulerThreadFunction() /// When storage shutdowns it will wait until all related background tasks /// are finished, because they may want to interact with its fields /// and this will cause segfault. 
- item->is_done.set(); + if (item->is_currently_deleting) + item->is_done.set(); }); if (!res) { - active.erase(std::remove(active.begin(), active.end(), item), active.end()); pending.push_back(item); continue; } diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index d64652a3b10..76043bab762 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -118,6 +118,7 @@ private: ExecutableTaskPtr task; CurrentMetrics::Increment increment; + std::atomic_bool is_currently_deleting{false}; Poco::Event is_done; }; @@ -129,9 +130,7 @@ private: /// Initially it will be empty boost::circular_buffer pending{0}; boost::circular_buffer active{0}; - std::set currently_deleting; - std::mutex remove_mutex; std::mutex mutex; std::condition_variable has_tasks; From d9ca1e29c3eac0ebb4c8da8a449d0a514a61a2b6 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 7 Sep 2021 15:56:32 +0300 Subject: [PATCH 135/177] Increase stack size for coroutines --- src/Common/FiberStack.h | 9 ++++++++- .../0_stateless/02015_shard_crash_clang_12_build.sh | 2 ++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/Common/FiberStack.h b/src/Common/FiberStack.h index c509540cc9e..aec4befb515 100644 --- a/src/Common/FiberStack.h +++ b/src/Common/FiberStack.h @@ -27,7 +27,12 @@ private: size_t stack_size; size_t page_size = 0; public: - static constexpr size_t default_stack_size = 128 * 1024; /// 64KB was not enough for tests + /// NOTE: If you see random segfaults in CI and stack starts from boost::context::...fiber... + /// probably it worth to try to increase stack size for coroutines. + /// + /// Current value is just enough for all tests in our CI. It's not selected in some special + /// way. We will have 36 pages with 4KB page size. + static constexpr size_t default_stack_size = 144 * 1024; /// 64KB was not enough for tests explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_) { @@ -43,6 +48,8 @@ public: if (MAP_FAILED == vp) DB::throwFromErrno(fmt::format("FiberStack: Cannot mmap {}.", ReadableSize(num_bytes)), DB::ErrorCodes::CANNOT_ALLOCATE_MEMORY); + /// TODO: make reports on illegal guard page access more clear. + /// Currently we will see segfault and almost random stacktrace. if (-1 == ::mprotect(vp, page_size, PROT_NONE)) { ::munmap(vp, num_bytes); diff --git a/tests/queries/0_stateless/02015_shard_crash_clang_12_build.sh b/tests/queries/0_stateless/02015_shard_crash_clang_12_build.sh index 287a9b45f4c..f6ede6592ff 100755 --- a/tests/queries/0_stateless/02015_shard_crash_clang_12_build.sh +++ b/tests/queries/0_stateless/02015_shard_crash_clang_12_build.sh @@ -1,5 +1,7 @@ #!/usr/bin/env bash +# This test reproduces crash in case of insufficient coroutines stack size + CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) # shellcheck source=../shell_config.sh . 
"$CURDIR"/../shell_config.sh From 9b38248e6476159a12c2f25363e6f4abbac3fff7 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 7 Sep 2021 13:13:23 +0000 Subject: [PATCH 136/177] Remove scheduler thread --- .../MergeTree/MergeTreeBackgroundExecutor.cpp | 66 +++++++------------ .../MergeTree/MergeTreeBackgroundExecutor.h | 4 +- 2 files changed, 26 insertions(+), 44 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index b185e957120..244c4f761f4 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -38,6 +38,9 @@ void MergeTreeBackgroundExecutor::updateConfiguration() pool.setMaxFreeThreads(0); pool.setMaxThreads(new_threads_count); pool.setQueueSize(new_max_tasks_count); + + for (size_t number = threads_count; number < new_threads_count; ++number) + pool.scheduleOrThrowOnError([this, number] { threadFunction(number); }); } catch (...) { @@ -57,9 +60,6 @@ void MergeTreeBackgroundExecutor::wait() has_tasks.notify_all(); } - if (scheduler.joinable()) - scheduler.join(); - pool.wait(); } @@ -86,17 +86,6 @@ bool MergeTreeBackgroundExecutor::trySchedule(ExecutableTaskPtr task) if (value.load() >= static_cast(max_tasks_count)) return false; - /// Just check if the main scheduler thread in excellent condition - if (!scheduler.joinable()) - { - LOG_ERROR(&Poco::Logger::get("MergeTreeBackgroundExecutor"), "Scheduler thread is dead. Trying to alive.."); - scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); - - if (!scheduler.joinable()) - LOG_FATAL(&Poco::Logger::get("MergeTreeBackgroundExecutor"), "Scheduler thread is dead permanently. Restart is needed"); - } - - pending.push_back(std::make_shared(std::move(task), metric)); has_tasks.notify_one(); @@ -131,8 +120,6 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id void MergeTreeBackgroundExecutor::routine(ItemPtr item) { - setThreadName(name.c_str()); - auto erase_from_active = [this, item] { active.erase(std::remove(active.begin(), active.end(), item), active.end()); @@ -177,38 +164,35 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) } -void MergeTreeBackgroundExecutor::schedulerThreadFunction() +void MergeTreeBackgroundExecutor::threadFunction(size_t number) { + setThreadName(name.c_str()); + while (true) { - std::unique_lock lock(mutex); - - has_tasks.wait(lock, [this](){ return !pending.empty() || shutdown; }); - - if (shutdown) - break; - - ItemPtr item = std::move(pending.front()); - pending.pop_front(); - - /// Execute a piece of task - bool res = pool.trySchedule([this, item] () mutable + ItemPtr item; { - routine(item); - /// When storage shutdowns it will wait until all related background tasks - /// are finished, because they may want to interact with its fields - /// and this will cause segfault. 
- if (item->is_currently_deleting) - item->is_done.set(); - }); + std::unique_lock lock(mutex); + has_tasks.wait(lock, [this](){ return !pending.empty() || shutdown; }); - if (!res) - { - pending.push_back(item); - continue; + if (number >= threads_count) + break; + + if (shutdown) + break; + + item = std::move(pending.front()); + pending.pop_front(); + active.push_back(item); } - active.push_back(std::move(item)); + routine(item); + + /// When storage shutdowns it will wait until all related background tasks + /// are finished, because they may want to interact with its fields + /// and this will cause segfault. + if (item->is_currently_deleting) + item->is_done.set(); } } diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 76043bab762..1c640f8e6ec 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -65,7 +65,6 @@ public: name = toString(type); updateConfiguration(); - scheduler = ThreadFromGlobalPool([this]() { schedulerThreadFunction(); }); } ~MergeTreeBackgroundExecutor() @@ -125,7 +124,7 @@ private: using ItemPtr = std::shared_ptr; void routine(ItemPtr item); - void schedulerThreadFunction(); + void threadFunction(size_t number); /// Initially it will be empty boost::circular_buffer pending{0}; @@ -137,7 +136,6 @@ private: std::atomic_bool shutdown{false}; ThreadPool pool; - ThreadFromGlobalPool scheduler; }; } From 231f64cf37adfe66b6f5394c77918aee0031c614 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 7 Sep 2021 13:51:30 +0000 Subject: [PATCH 137/177] Fix unit test --- src/Storages/MergeTree/MergeTreeBackgroundExecutor.h | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 1c640f8e6ec..5dffe7f5908 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -118,7 +118,10 @@ private: ExecutableTaskPtr task; CurrentMetrics::Increment increment; std::atomic_bool is_currently_deleting{false}; - Poco::Event is_done; + /// Actually autoreset=false is needed only for unit test + /// where multiple threads could remove tasks corresponding to the same storage + /// This scenario in not possible in reality. + Poco::Event is_done{/*autoreset=*/false}; }; using ItemPtr = std::shared_ptr; From 36f57555f68fb8088d7ee553f6801d1e77f50c2e Mon Sep 17 00:00:00 2001 From: George Date: Tue, 7 Sep 2021 19:50:07 +0300 Subject: [PATCH 138/177] Edited and translated to Russian --- .../external-dicts-dict-layout.md | 5 +++-- .../table-engines/integrations/mongodb.md | 16 +++++++++++++-- docs/ru/operations/settings/settings-users.md | 2 +- .../external-dicts-dict-layout.md | 20 ++++++++++++++++--- .../sql-reference/statements/create/user.md | 1 + 5 files changed, 36 insertions(+), 8 deletions(-) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index 00a9610ce91..eb8ca425d24 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -300,8 +300,9 @@ When searching for a dictionary, the cache is searched first. 
For each block of If keys are not found in dictionary, then update cache task is created and added into update queue. Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`. -For cache dictionaries, the expiration [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired, and it is re-requested the next time it needs to be used this behaviour can be configured with setting `allow_read_expired_keys`. -This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the `system.dictionaries` table. +For cache dictionaries, the expiration [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. + +This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table. If setting `allow_read_expired_keys` is set to 1, by default 0. Then dictionary can support asynchronous updates. If a client requests keys and all of them are in cache, but some of them are expired, then dictionary will return expired keys for a client and request them asynchronously from the source. diff --git a/docs/ru/engines/table-engines/integrations/mongodb.md b/docs/ru/engines/table-engines/integrations/mongodb.md index 05820d03fe6..66c30250926 100644 --- a/docs/ru/engines/table-engines/integrations/mongodb.md +++ b/docs/ru/engines/table-engines/integrations/mongodb.md @@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name name1 [type1], name2 [type2], ... -) ENGINE = MongoDB(host:port, database, collection, user, password); +) ENGINE = MongoDB(host:port, database, collection, user, password [, options]); ``` **Параметры движка** @@ -30,9 +30,11 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name - `password` — пароль пользователя. +- `options` — MongoDB connection string options (optional parameter). 
+ ## Примеры использования {#usage-example} -Таблица в ClickHouse для чтения данных из колекции MongoDB: +Создание в ClickHouse для чтения данных из колекции MongoDB: ``` text CREATE TABLE mongo_table @@ -42,6 +44,16 @@ CREATE TABLE mongo_table ) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'testuser', 'clickhouse'); ``` +Чтение из сервера MongoDB, защищенного SSL: + +``` text +CREATE TABLE mongo_table_ssl +( + key UInt64, + data String +) ENGINE = MongoDB('mongo2:27017', 'test', 'simple_table', 'testuser', 'clickhouse', 'ssl=true'); +``` + Запрос к таблице: ``` sql diff --git a/docs/ru/operations/settings/settings-users.md b/docs/ru/operations/settings/settings-users.md index 6a10e518817..4570ce38bad 100644 --- a/docs/ru/operations/settings/settings-users.md +++ b/docs/ru/operations/settings/settings-users.md @@ -28,7 +28,7 @@ toc_title: "Настройки пользователей" profile_name default - + default diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index 06fe4ae327a..b61c2cbcbd7 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -297,9 +297,13 @@ RANGE(MIN StartDate MAX EndDate); При поиске в словаре сначала просматривается кэш. На каждый блок данных, все не найденные в кэше или устаревшие ключи запрашиваются у источника с помощью `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. Затем, полученные данные записываются в кэш. -Для cache-словарей может быть задано время устаревания [lifetime](external-dicts-dict-lifetime.md) данных в кэше. Если от загрузки данных в ячейке прошло больше времени, чем `lifetime`, то значение не используется, и будет запрошено заново при следующей необходимости его использовать. +Если ключи не были найдены в словаре, то создается задание для обновления кэша, которое добавляется в очередь обновлений. Параметры очереди обновлений можно устанавливать настройками `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates` -Это наименее эффективный из всех способов размещения словарей. Скорость работы кэша очень сильно зависит от правильности настройки и сценария использования. Словарь типа cache показывает высокую производительность лишь при достаточно больших hit rate-ах (рекомендуется 99% и выше). Посмотреть средний hit rate можно в таблице `system.dictionaries`. +Для cache-словарей может быть задано время устаревания [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) данных в кэше. Если от загрузки данных в ячейке прошло больше времени, чем `lifetime`, то значение не используется, а ключ устаревает. Ключ будет запрошен заново при следующей необходимости его использовать. Это можно настроить с помощью `allow_read_expired_keys`. + +Это наименее эффективный из всех способов размещения словарей. Скорость работы кэша очень сильно зависит от правильности настройки и сценария использования. Словарь типа cache показывает высокую производительность лишь при достаточно больших hit rate-ах (рекомендуется 99% и выше). Посмотреть средний hit rate можно в таблице [system.dictionaries](../../../operations/system-tables/dictionaries.md). + +Если параметр `allow_read_expired_keys` выставлен на 1 (0 по умолчанию), то словарь поддерживает асинхронные обновления. 
Если клиент запрашивает ключи, которые находятся в кэше, но при этом некоторые из них устарели, то словарь вернет устаревшие ключи клиенту и запросит их асинхронно у источника. Чтобы увеличить производительность кэша, используйте подзапрос с `LIMIT`, а снаружи вызывайте функцию со словарём. @@ -312,6 +316,16 @@ RANGE(MIN StartDate MAX EndDate); 1000000000 + + 0 + + 100000 + + 10 + + 60000 + + 4 ``` @@ -338,7 +352,7 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) ### ssd_cache {#ssd-cache} -Похож на `cache`, но хранит данные на SSD и индекс в оперативной памяти. +Похож на `cache`, но хранит данные на SSD и индекс в оперативной памяти. Все параметры, относящиеся к очереди обновлений, могут также быть применены к SSD-кэш словарям. ``` xml diff --git a/docs/ru/sql-reference/statements/create/user.md b/docs/ru/sql-reference/statements/create/user.md index 22efaa71bfc..f6248d97ba9 100644 --- a/docs/ru/sql-reference/statements/create/user.md +++ b/docs/ru/sql-reference/statements/create/user.md @@ -15,6 +15,7 @@ CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']}] [HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] [DEFAULT ROLE role [,...]] + [DEFAULT DATABASE database | NONE] [GRANTEES {user | role | ANY | NONE} [,...] [EXCEPT {user | role} [,...]]] [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY | WRITABLE] | PROFILE 'profile_name'] [,...] ``` From b984caaab7145f4cd2239eb7de2b0d7052b43da7 Mon Sep 17 00:00:00 2001 From: George Date: Tue, 7 Sep 2021 19:55:13 +0300 Subject: [PATCH 139/177] Revert "Edited and translated to Russian" This reverts commit 36f57555f68fb8088d7ee553f6801d1e77f50c2e. --- .../external-dicts-dict-layout.md | 5 ++--- .../table-engines/integrations/mongodb.md | 16 ++------------- docs/ru/operations/settings/settings-users.md | 2 +- .../external-dicts-dict-layout.md | 20 +++---------------- .../sql-reference/statements/create/user.md | 1 - 5 files changed, 8 insertions(+), 36 deletions(-) diff --git a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index eb8ca425d24..00a9610ce91 100644 --- a/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -300,9 +300,8 @@ When searching for a dictionary, the cache is searched first. For each block of If keys are not found in dictionary, then update cache task is created and added into update queue. Update queue properties can be controlled with settings `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates`. -For cache dictionaries, the expiration [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired. The key is re-requested the next time it needs to be used. This behaviour can be configured with setting `allow_read_expired_keys`. 
- -This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the [system.dictionaries](../../../operations/system-tables/dictionaries.md) table. +For cache dictionaries, the expiration [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) of data in the cache can be set. If more time than `lifetime` has passed since loading the data in a cell, the cell’s value is not used and key becomes expired, and it is re-requested the next time it needs to be used this behaviour can be configured with setting `allow_read_expired_keys`. +This is the least effective of all the ways to store dictionaries. The speed of the cache depends strongly on correct settings and the usage scenario. A cache type dictionary performs well only when the hit rates are high enough (recommended 99% and higher). You can view the average hit rate in the `system.dictionaries` table. If setting `allow_read_expired_keys` is set to 1, by default 0. Then dictionary can support asynchronous updates. If a client requests keys and all of them are in cache, but some of them are expired, then dictionary will return expired keys for a client and request them asynchronously from the source. diff --git a/docs/ru/engines/table-engines/integrations/mongodb.md b/docs/ru/engines/table-engines/integrations/mongodb.md index 66c30250926..05820d03fe6 100644 --- a/docs/ru/engines/table-engines/integrations/mongodb.md +++ b/docs/ru/engines/table-engines/integrations/mongodb.md @@ -15,7 +15,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name name1 [type1], name2 [type2], ... -) ENGINE = MongoDB(host:port, database, collection, user, password [, options]); +) ENGINE = MongoDB(host:port, database, collection, user, password); ``` **Параметры движка** @@ -30,11 +30,9 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name - `password` — пароль пользователя. -- `options` — MongoDB connection string options (optional parameter). 
- ## Примеры использования {#usage-example} -Создание в ClickHouse для чтения данных из колекции MongoDB: +Таблица в ClickHouse для чтения данных из колекции MongoDB: ``` text CREATE TABLE mongo_table @@ -44,16 +42,6 @@ CREATE TABLE mongo_table ) ENGINE = MongoDB('mongo1:27017', 'test', 'simple_table', 'testuser', 'clickhouse'); ``` -Чтение из сервера MongoDB, защищенного SSL: - -``` text -CREATE TABLE mongo_table_ssl -( - key UInt64, - data String -) ENGINE = MongoDB('mongo2:27017', 'test', 'simple_table', 'testuser', 'clickhouse', 'ssl=true'); -``` - Запрос к таблице: ``` sql diff --git a/docs/ru/operations/settings/settings-users.md b/docs/ru/operations/settings/settings-users.md index 4570ce38bad..6a10e518817 100644 --- a/docs/ru/operations/settings/settings-users.md +++ b/docs/ru/operations/settings/settings-users.md @@ -28,7 +28,7 @@ toc_title: "Настройки пользователей" profile_name default - default + diff --git a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md index b61c2cbcbd7..06fe4ae327a 100644 --- a/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md +++ b/docs/ru/sql-reference/dictionaries/external-dictionaries/external-dicts-dict-layout.md @@ -297,13 +297,9 @@ RANGE(MIN StartDate MAX EndDate); При поиске в словаре сначала просматривается кэш. На каждый блок данных, все не найденные в кэше или устаревшие ключи запрашиваются у источника с помощью `SELECT attrs... FROM db.table WHERE id IN (k1, k2, ...)`. Затем, полученные данные записываются в кэш. -Если ключи не были найдены в словаре, то создается задание для обновления кэша, которое добавляется в очередь обновлений. Параметры очереди обновлений можно устанавливать настройками `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds`, `max_threads_for_updates` +Для cache-словарей может быть задано время устаревания [lifetime](external-dicts-dict-lifetime.md) данных в кэше. Если от загрузки данных в ячейке прошло больше времени, чем `lifetime`, то значение не используется, и будет запрошено заново при следующей необходимости его использовать. -Для cache-словарей может быть задано время устаревания [lifetime](../../../sql-reference/dictionaries/external-dictionaries/external-dicts-dict-lifetime.md) данных в кэше. Если от загрузки данных в ячейке прошло больше времени, чем `lifetime`, то значение не используется, а ключ устаревает. Ключ будет запрошен заново при следующей необходимости его использовать. Это можно настроить с помощью `allow_read_expired_keys`. - -Это наименее эффективный из всех способов размещения словарей. Скорость работы кэша очень сильно зависит от правильности настройки и сценария использования. Словарь типа cache показывает высокую производительность лишь при достаточно больших hit rate-ах (рекомендуется 99% и выше). Посмотреть средний hit rate можно в таблице [system.dictionaries](../../../operations/system-tables/dictionaries.md). - -Если параметр `allow_read_expired_keys` выставлен на 1 (0 по умолчанию), то словарь поддерживает асинхронные обновления. Если клиент запрашивает ключи, которые находятся в кэше, но при этом некоторые из них устарели, то словарь вернет устаревшие ключи клиенту и запросит их асинхронно у источника. +Это наименее эффективный из всех способов размещения словарей. Скорость работы кэша очень сильно зависит от правильности настройки и сценария использования. 
Словарь типа cache показывает высокую производительность лишь при достаточно больших hit rate-ах (рекомендуется 99% и выше). Посмотреть средний hit rate можно в таблице `system.dictionaries`. Чтобы увеличить производительность кэша, используйте подзапрос с `LIMIT`, а снаружи вызывайте функцию со словарём. @@ -316,16 +312,6 @@ RANGE(MIN StartDate MAX EndDate); 1000000000 - - 0 - - 100000 - - 10 - - 60000 - - 4 ``` @@ -352,7 +338,7 @@ LAYOUT(CACHE(SIZE_IN_CELLS 1000000000)) ### ssd_cache {#ssd-cache} -Похож на `cache`, но хранит данные на SSD и индекс в оперативной памяти. Все параметры, относящиеся к очереди обновлений, могут также быть применены к SSD-кэш словарям. +Похож на `cache`, но хранит данные на SSD и индекс в оперативной памяти. ``` xml diff --git a/docs/ru/sql-reference/statements/create/user.md b/docs/ru/sql-reference/statements/create/user.md index f6248d97ba9..22efaa71bfc 100644 --- a/docs/ru/sql-reference/statements/create/user.md +++ b/docs/ru/sql-reference/statements/create/user.md @@ -15,7 +15,6 @@ CREATE USER [IF NOT EXISTS | OR REPLACE] name1 [ON CLUSTER cluster_name1] [NOT IDENTIFIED | IDENTIFIED {[WITH {no_password | plaintext_password | sha256_password | sha256_hash | double_sha1_password | double_sha1_hash}] BY {'password' | 'hash'}} | {WITH ldap SERVER 'server_name'} | {WITH kerberos [REALM 'realm']}] [HOST {LOCAL | NAME 'name' | REGEXP 'name_regexp' | IP 'address' | LIKE 'pattern'} [,...] | ANY | NONE] [DEFAULT ROLE role [,...]] - [DEFAULT DATABASE database | NONE] [GRANTEES {user | role | ANY | NONE} [,...] [EXCEPT {user | role} [,...]]] [SETTINGS variable [= value] [MIN [=] min_value] [MAX [=] max_value] [READONLY | WRITABLE] | PROFILE 'profile_name'] [,...] ``` From e398af08dc1830f3e120ef031e42a5f48e5c309f Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Tue, 7 Sep 2021 19:52:29 +0000 Subject: [PATCH 140/177] Maybe fix test --- .../01200_mutations_memory_consumption.sql | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/tests/queries/0_stateless/01200_mutations_memory_consumption.sql b/tests/queries/0_stateless/01200_mutations_memory_consumption.sql index 764117586b9..f6f6d32a477 100644 --- a/tests/queries/0_stateless/01200_mutations_memory_consumption.sql +++ b/tests/queries/0_stateless/01200_mutations_memory_consumption.sql @@ -15,9 +15,9 @@ ALTER TABLE table_with_single_pk DELETE WHERE key % 77 = 0 SETTINGS mutations_sy SYSTEM FLUSH LOGS; -- Memory usage for all mutations must be almost constant and less than --- read_bytes. +-- read_bytes (with a margin) SELECT - DISTINCT read_bytes >= peak_memory_usage + DISTINCT 1.1 * read_bytes >= peak_memory_usage FROM system.part_log WHERE event_type = 'MutatePart' AND table = 'table_with_single_pk' AND database = currentDatabase(); @@ -43,9 +43,9 @@ ALTER TABLE table_with_multi_pk DELETE WHERE key1 % 77 = 0 SETTINGS mutations_sy SYSTEM FLUSH LOGS; -- Memory usage for all mutations must be almost constant and less than --- read_bytes. +-- read_bytes (with a margin) SELECT - DISTINCT read_bytes >= peak_memory_usage + DISTINCT 1.1 * read_bytes >= peak_memory_usage FROM system.part_log WHERE event_type = 'MutatePart' AND table = 'table_with_multi_pk' AND database = currentDatabase(); @@ -73,9 +73,9 @@ ALTER TABLE table_with_function_pk DELETE WHERE key1 % 77 = 0 SETTINGS mutations SYSTEM FLUSH LOGS; -- Memory usage for all mutations must be almost constant and less than --- read_bytes. 
+-- read_bytes (with a margin) SELECT - DISTINCT read_bytes >= peak_memory_usage + DISTINCT 1.1 * read_bytes >= peak_memory_usage FROM system.part_log WHERE event_type = 'MutatePart' AND table = 'table_with_function_pk' AND database = currentDatabase(); @@ -101,9 +101,9 @@ ALTER TABLE table_without_pk DELETE WHERE key1 % 77 = 0 SETTINGS mutations_sync SYSTEM FLUSH LOGS; -- Memory usage for all mutations must be almost constant and less than --- read_bytes. +-- read_bytes (with a margin) SELECT - DISTINCT read_bytes >= peak_memory_usage + DISTINCT 1.1 * read_bytes >= peak_memory_usage FROM system.part_log WHERE event_type = 'MutatePart' AND table = 'table_without_pk' AND database = currentDatabase(); From db0767a194500b2ca0be303f713da7e22e516735 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 7 Sep 2021 23:29:57 +0300 Subject: [PATCH 141/177] Implement detach_not_byte_identical_parts Maybe useful for further analysis of non byte-identical parts. --- src/Storages/MergeTree/MergeTreeSettings.h | 1 + src/Storages/StorageReplicatedMergeTree.cpp | 10 ++++++++-- 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h index 56ba0f25ded..889b89b9a27 100644 --- a/src/Storages/MergeTree/MergeTreeSettings.h +++ b/src/Storages/MergeTree/MergeTreeSettings.h @@ -89,6 +89,7 @@ struct Settings; M(Bool, replicated_can_become_leader, true, "If true, Replicated tables replicas on this node will try to acquire leadership.", 0) \ M(Seconds, zookeeper_session_expiration_check_period, 60, "ZooKeeper session expiration check period, in seconds.", 0) \ M(Bool, detach_old_local_parts_when_cloning_replica, true, "Do not remove old local parts when repairing lost replica.", 0) \ + M(Bool, detach_not_byte_identical_parts, false, "Do not remove non byte-idential parts for ReplicatedMergeTree, instead detach them (maybe useful for further analysis).", 0) \ M(UInt64, max_replicated_fetches_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for replicated fetches. Zero means unlimited.", 0) \ M(UInt64, max_replicated_sends_network_bandwidth, 0, "The maximum speed of data exchange over the network in bytes per second for replicated sends. Zero means unlimited.", 0) \ \ diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index dcb1913f8b7..5045b34da67 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1811,7 +1811,10 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) write_part_log(ExecutionStatus::fromCurrentException()); - tryRemovePartImmediately(std::move(part)); + if (storage_settings_ptr->detach_not_byte_identical_parts) + forgetPartAndMoveToDetached(std::move(part), "merge-not-byte-identical"); + else + tryRemovePartImmediately(std::move(part)); /// No need to delete the part from ZK because we can be sure that the commit transaction /// didn't go through. 
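
The `detach_not_byte_identical_parts` knob introduced above is a per-table MergeTree setting with a default of `false`, so replicas keep the old delete-immediately behaviour unless a table explicitly opts in. A minimal usage sketch, not taken from this patch and assuming a hypothetical replicated table `db.rmt`:

``` sql
-- Sketch only: keep non byte-identical parts for one table instead of deleting them.
ALTER TABLE db.rmt MODIFY SETTING detach_not_byte_identical_parts = 1;

-- Such parts stay on disk under detached/ with the 'merge-not-byte-identical' /
-- 'mutate-not-byte-identical' prefixes used in this patch; they can be listed here
-- (the reason column may stay empty if the server does not recognise the prefix).
SELECT name, reason FROM system.detached_parts WHERE database = 'db' AND table = 'rmt';
```
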
@@ -1935,7 +1938,10 @@ bool StorageReplicatedMergeTree::tryExecutePartMutation(const StorageReplicatedM write_part_log(ExecutionStatus::fromCurrentException()); - tryRemovePartImmediately(std::move(new_part)); + if (storage_settings_ptr->detach_not_byte_identical_parts) + forgetPartAndMoveToDetached(std::move(new_part), "mutate-not-byte-identical"); + else + tryRemovePartImmediately(std::move(new_part)); /// No need to delete the part from ZK because we can be sure that the commit transaction /// didn't go through. From fe3c53040ab3486aa9a0e96f581f13437addec98 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Thu, 2 Sep 2021 21:02:35 +0300 Subject: [PATCH 142/177] Cache thread name to avoid prctl(PR_GET_NAME) for query_log/text_log --- src/Common/setThreadName.cpp | 23 ++++++++++++++++------- src/Common/setThreadName.h | 2 +- 2 files changed, 17 insertions(+), 8 deletions(-) diff --git a/src/Common/setThreadName.cpp b/src/Common/setThreadName.cpp index 9e45c62a9c9..958404b9ad1 100644 --- a/src/Common/setThreadName.cpp +++ b/src/Common/setThreadName.cpp @@ -22,6 +22,10 @@ namespace ErrorCodes } +/// Cache thread_name to avoid prctl(PR_GET_NAME) for query_log/text_log +static thread_local std::string thread_name; + + void setThreadName(const char * name) { #ifndef NDEBUG @@ -40,24 +44,29 @@ void setThreadName(const char * name) if (0 != prctl(PR_SET_NAME, name, 0, 0, 0)) #endif DB::throwFromErrno("Cannot set thread name with prctl(PR_SET_NAME, ...)", DB::ErrorCodes::PTHREAD_ERROR); + + thread_name = name; } -std::string getThreadName() +const std::string & getThreadName() { - std::string name(16, '\0'); + if (!thread_name.empty()) + return thread_name; + + thread_name.resize(16); #if defined(__APPLE__) || defined(OS_SUNOS) - if (pthread_getname_np(pthread_self(), name.data(), name.size())) + if (pthread_getname_np(pthread_self(), thread_name.data(), thread_name.size())) throw DB::Exception("Cannot get thread name with pthread_getname_np()", DB::ErrorCodes::PTHREAD_ERROR); #elif defined(__FreeBSD__) // TODO: make test. 
freebsd will have this function soon https://freshbsd.org/commit/freebsd/r337983 -// if (pthread_get_name_np(pthread_self(), name.data(), name.size())) +// if (pthread_get_name_np(pthread_self(), thread_name.data(), thread_name.size())) // throw DB::Exception("Cannot get thread name with pthread_get_name_np()", DB::ErrorCodes::PTHREAD_ERROR); #else - if (0 != prctl(PR_GET_NAME, name.data(), 0, 0, 0)) + if (0 != prctl(PR_GET_NAME, thread_name.data(), 0, 0, 0)) DB::throwFromErrno("Cannot get thread name with prctl(PR_GET_NAME)", DB::ErrorCodes::PTHREAD_ERROR); #endif - name.resize(std::strlen(name.data())); - return name; + thread_name.resize(std::strlen(thread_name.data())); + return thread_name; } diff --git a/src/Common/setThreadName.h b/src/Common/setThreadName.h index cdcb6b46914..ea988885db2 100644 --- a/src/Common/setThreadName.h +++ b/src/Common/setThreadName.h @@ -7,4 +7,4 @@ */ void setThreadName(const char * name); -std::string getThreadName(); +const std::string & getThreadName(); From 6062dd0021bdb6b019f539652790257abb24b0d9 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 8 Sep 2021 00:21:21 +0000 Subject: [PATCH 143/177] Better --- src/Common/MemoryTracker.h | 4 +- src/Interpreters/Context.cpp | 14 +++--- .../MergeTree/BackgroundJobsAssignee.cpp | 4 +- .../MergeTree/BackgroundJobsAssignee.h | 5 ++- src/Storages/MergeTree/IExecutableTask.h | 41 ++++++++++++----- .../MergeTree/MergeTreeBackgroundExecutor.cpp | 25 +++++++---- .../MergeTree/MergeTreeBackgroundExecutor.h | 44 ++++++++++++------- src/Storages/MergeTree/MergeTreeData.cpp | 16 +++---- src/Storages/MergeTree/MergeTreeData.h | 16 +++++-- src/Storages/MergeTree/MergeTreeSink.cpp | 2 +- .../MergeTree/ReplicatedMergeTreeQueue.cpp | 6 +-- .../ReplicatedMergeTreeRestartingThread.cpp | 4 +- .../MergeTree/tests/gtest_executor.cpp | 2 +- src/Storages/StorageMergeTree.cpp | 34 +++++++------- src/Storages/StorageMergeTree.h | 2 +- src/Storages/StorageReplicatedMergeTree.cpp | 18 ++++---- src/Storages/StorageReplicatedMergeTree.h | 2 +- ...200_mutations_memory_consumption.reference | 8 ++-- .../01200_mutations_memory_consumption.sql | 28 +++++++----- 19 files changed, 166 insertions(+), 109 deletions(-) diff --git a/src/Common/MemoryTracker.h b/src/Common/MemoryTracker.h index d4de7a2cafa..d87e922ef09 100644 --- a/src/Common/MemoryTracker.h +++ b/src/Common/MemoryTracker.h @@ -62,8 +62,8 @@ private: void logMemoryUsage(Int64 current) const; public: - MemoryTracker(VariableContext level_ = VariableContext::Thread); - MemoryTracker(MemoryTracker * parent_, VariableContext level_ = VariableContext::Thread); + explicit MemoryTracker(VariableContext level_ = VariableContext::Thread); + explicit MemoryTracker(MemoryTracker * parent_, VariableContext level_ = VariableContext::Thread); ~MemoryTracker(); diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index a75f552ab05..ed25fbead37 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -2729,19 +2729,21 @@ PartUUIDsPtr Context::getIgnoredPartUUIDs() const void Context::initializeBackgroundExecutors() { + // Initialize background executors with callbacks to be able to change pool size and tasks count at runtime. 
+ shared->merge_mutate_executor = MergeTreeBackgroundExecutor::create ( MergeTreeBackgroundExecutor::Type::MERGE_MUTATE, - [this] () { return getSettingsRef().background_pool_size; }, - [this] () { return getSettingsRef().background_pool_size; }, + [this] () { auto lock = getLock(); return getSettingsRef().background_pool_size; }, + [this] () { auto lock = getLock(); return getSettingsRef().background_pool_size; }, CurrentMetrics::BackgroundPoolTask ); shared->moves_executor = MergeTreeBackgroundExecutor::create ( MergeTreeBackgroundExecutor::Type::MOVE, - [this] () { return getSettingsRef().background_move_pool_size; }, - [this] () { return getSettingsRef().background_move_pool_size; }, + [this] () { auto lock = getLock(); return getSettingsRef().background_move_pool_size; }, + [this] () { auto lock = getLock(); return getSettingsRef().background_move_pool_size; }, CurrentMetrics::BackgroundMovePoolTask ); @@ -2749,8 +2751,8 @@ void Context::initializeBackgroundExecutors() shared->fetch_executor = MergeTreeBackgroundExecutor::create ( MergeTreeBackgroundExecutor::Type::FETCH, - [this] () { return getSettingsRef().background_fetches_pool_size; }, - [this] () { return getSettingsRef().background_fetches_pool_size; }, + [this] () { auto lock = getLock(); return getSettingsRef().background_fetches_pool_size; }, + [this] () { auto lock = getLock(); return getSettingsRef().background_fetches_pool_size; }, CurrentMetrics::BackgroundFetchesPoolTask ); } diff --git a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp index e104e188cc6..c1cb0b497cd 100644 --- a/src/Storages/MergeTree/BackgroundJobsAssignee.cpp +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.cpp @@ -85,7 +85,7 @@ void BackgroundJobsAssignee::start() { std::lock_guard lock(holder_mutex); if (!holder) - holder = getContext()->getSchedulePool().createTask("BackgroundJobsAssignee:" + toString(type), [this]{ main(); }); + holder = getContext()->getSchedulePool().createTask("BackgroundJobsAssignee:" + toString(type), [this]{ threadFunc(); }); holder->activateAndSchedule(); } @@ -106,7 +106,7 @@ void BackgroundJobsAssignee::finish() } -void BackgroundJobsAssignee::main() +void BackgroundJobsAssignee::threadFunc() try { bool succeed = false; diff --git a/src/Storages/MergeTree/BackgroundJobsAssignee.h b/src/Storages/MergeTree/BackgroundJobsAssignee.h index 6d2778adea2..619b6fbb966 100644 --- a/src/Storages/MergeTree/BackgroundJobsAssignee.h +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.h @@ -29,7 +29,7 @@ struct ExecutableTaskSchedulingSettings class MergeTreeData; -class BackgroundJobsAssignee : protected WithContext +class BackgroundJobsAssignee : public WithContext { private: MergeTreeData & data; @@ -49,6 +49,7 @@ private: std::mutex holder_mutex; public: + /// enum class Type { DataProcessing, @@ -77,7 +78,7 @@ private: static String toString(Type type); /// Function that executes in background scheduling pool - void main(); + void threadFunc(); }; diff --git a/src/Storages/MergeTree/IExecutableTask.h b/src/Storages/MergeTree/IExecutableTask.h index 2e546e83035..0811cf62d00 100644 --- a/src/Storages/MergeTree/IExecutableTask.h +++ b/src/Storages/MergeTree/IExecutableTask.h @@ -9,10 +9,21 @@ namespace DB { +/** + * Generic interface for background operations. Simply this is self-made coroutine. + * The main method is executeStep, which will return true + * if the task wants to execute another 'step' in near future and false otherwise. 
+ * + * Each storage assigns some operations such as merges, mutations, fetches, etc. + * We need to ask a storage or some another entity to try to assign another operation when current operation is completed. + * + * Each task corresponds to a storage, that's why there is a method getStorageID. + * This is needed to correctly shutdown a storage, e.g. we need to wait for all background operations to complete. + */ class IExecutableTask { public: - virtual bool execute() = 0; + virtual bool executeStep() = 0; virtual void onCompleted() = 0; virtual StorageID getStorageID() = 0; virtual ~IExecutableTask() = default; @@ -21,29 +32,37 @@ public: using ExecutableTaskPtr = std::shared_ptr; -class LambdaAdapter : public shared_ptr_helper, public IExecutableTask +/** + * Some background operations won't represent a coroutines (don't want to be executed step-by-step). For this we have this wrapper. + */ +class ExecutableLambdaAdapter : public shared_ptr_helper, public IExecutableTask { public: - template - explicit LambdaAdapter(InnerJob && inner_, Callback && callback_, StorageID id_) - : inner(inner_), callback(callback_), id(id_) {} + template + explicit ExecutableLambdaAdapter( + Job && job_to_execute_, + Callback && job_result_callback_, + StorageID id_) + : job_to_execute(job_to_execute_) + , job_result_callback(job_result_callback_) + , id(id_) {} - bool execute() override + bool executeStep() override { - res = inner(); - inner = {}; + res = job_to_execute(); + job_to_execute = {}; return false; } - void onCompleted() override { callback(!res); } + void onCompleted() override { job_result_callback(!res); } StorageID getStorageID() override { return id; } private: bool res = false; - std::function inner; - std::function callback; + std::function job_to_execute; + std::function job_result_callback; StorageID id; }; diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index 244c4f761f4..ed841ff0fc9 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -39,6 +39,9 @@ void MergeTreeBackgroundExecutor::updateConfiguration() pool.setMaxThreads(new_threads_count); pool.setQueueSize(new_max_tasks_count); + std::cout << "threads_count " << threads_count << std::endl; + + /// We don't enter this loop if size is decreased. for (size_t number = threads_count; number < new_threads_count; ++number) pool.scheduleOrThrowOnError([this, number] { threadFunction(number); }); } @@ -74,6 +77,8 @@ bool MergeTreeBackgroundExecutor::trySchedule(ExecutableTaskPtr task) try { /// This is needed to increase / decrease the number of threads at runtime + /// Using stopwatch here not to do it so often. + /// No need to move the time to a config. 
if (update_timer.compareAndRestartDeferred(10.)) updateConfiguration(); } @@ -86,7 +91,7 @@ bool MergeTreeBackgroundExecutor::trySchedule(ExecutableTaskPtr task) if (value.load() >= static_cast(max_tasks_count)) return false; - pending.push_back(std::make_shared(std::move(task), metric)); + pending.push_back(std::make_shared(std::move(task), metric)); has_tasks.notify_one(); return true; @@ -95,7 +100,7 @@ bool MergeTreeBackgroundExecutor::trySchedule(ExecutableTaskPtr task) void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id) { - std::vector tasks_to_wait; + std::vector tasks_to_wait; { std::lock_guard lock(mutex); @@ -118,7 +123,7 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id } -void MergeTreeBackgroundExecutor::routine(ItemPtr item) +void MergeTreeBackgroundExecutor::routine(TaskRuntimeDataPtr item) { auto erase_from_active = [this, item] { @@ -127,7 +132,7 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) try { - if (item->task->execute()) + if (item->task->executeStep()) { std::lock_guard guard(mutex); @@ -143,9 +148,12 @@ void MergeTreeBackgroundExecutor::routine(ItemPtr item) return; } - std::lock_guard guard(mutex); - erase_from_active(); - has_tasks.notify_one(); + { + std::lock_guard guard(mutex); + erase_from_active(); + has_tasks.notify_one(); + } + /// In a situation of a lack of memory this method can throw an exception, /// because it may interact somehow with BackgroundSchedulePool, which may allocate memory /// But it is rather safe, because we have try...catch block here, and another one in ThreadPool. @@ -170,11 +178,12 @@ void MergeTreeBackgroundExecutor::threadFunction(size_t number) while (true) { - ItemPtr item; + TaskRuntimeDataPtr item; { std::unique_lock lock(mutex); has_tasks.wait(lock, [this](){ return !pending.empty() || shutdown; }); + /// Decrease the number of threads (setting could be dynamically reloaded) if (number >= threads_count) break; diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 5dffe7f5908..9aca20bed47 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -25,15 +25,25 @@ namespace DB * It can execute only successors of ExecutableTask interface. * Which is a self-written coroutine. It suspends, when returns true from execute() method. * - * Executor consists of ThreadPool to execute pieces of a task (basically calls 'execute' on a task) - * and a scheduler thread, which manages the tasks. Due to bad experience of working with high memory under - * high memory pressure scheduler thread mustn't do any allocations, - * because it will be a fatal error if this thread will die from a random exception. - * * There are two queues of a tasks: pending (main queue for all the tasks) and active (currently executing). + * Pending queue is needed since the number of tasks will be more than thread to execute. + * Pending tasks are tasks that successfully scheduled to an executor or tasks that have some extra steps to execute. * There is an invariant, that task may occur only in one of these queue. It can occur in both queues only in critical sections. * - * Due to all caveats I described above we use boost::circular_buffer as a container for queues. 
+ * Pending: Active: + * + * |s| |s| |s| |s| |s| |s| |s| |s| |s| |s| |s| + * |s| |s| |s| |s| |s| |s| |s| |s| |s| |s| + * |s| |s| |s| |s| |s| |s| |s| + * |s| |s| |s| |s| + * |s| |s| + * |s| + * + * Each task is simply a sequence of steps. Heavier tasks have longer sequences. + * When a step of a task is executed, we move the task back to the pending queue and take another one from the queue's head. + * With this architecture all small merges / mutations will be executed faster than bigger ones. + * + * We use boost::circular_buffer as a container for queues to avoid doing any allocations. + * + * Another nuisance we face is that background operations always interact with an associated Storage. + * So, when a Storage wants to shut down, it must wait until all its background operations are finished. @@ -43,7 +53,6 @@ class MergeTreeBackgroundExecutor : public shared_ptr_helper; - using Callback = std::function; enum class Type { @@ -107,13 +116,16 @@ private: AtomicStopwatch update_timer; - struct Item + /** + * Has a RAII member to track how many tasks are waiting for execution or executing at the moment. + * Also has some flags and primitives to wait for the current task to be executed. + */ + struct TaskRuntimeData { - explicit Item(ExecutableTaskPtr && task_, CurrentMetrics::Metric metric_) + TaskRuntimeData(ExecutableTaskPtr && task_, CurrentMetrics::Metric metric_) : task(std::move(task_)) , increment(std::move(metric_)) - { - } + {} ExecutableTaskPtr task; CurrentMetrics::Increment increment; @@ -124,14 +136,16 @@ private: Poco::Event is_done{/*autoreset=*/false}; }; - using ItemPtr = std::shared_ptr; + using TaskRuntimeDataPtr = std::shared_ptr; - void routine(ItemPtr item); + void routine(TaskRuntimeDataPtr item); + + /// Numbers all the threads in the ThreadPool, to be able to lower the number of threads at runtime.
void threadFunction(size_t number); /// Initially it will be empty - boost::circular_buffer pending{0}; - boost::circular_buffer active{0}; + boost::circular_buffer pending{0}; + boost::circular_buffer active{0}; std::mutex mutex; std::condition_variable has_tasks; diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index 6592b9a043d..ff469078630 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -200,8 +200,8 @@ MergeTreeData::MergeTreeData( , data_parts_by_info(data_parts_indexes.get()) , data_parts_by_state_and_info(data_parts_indexes.get()) , parts_mover(this) - , background_executor(*this, BackgroundJobsAssignee::Type::DataProcessing, getContext()) - , background_moves_executor(*this, BackgroundJobsAssignee::Type::Moving, getContext()) + , background_operations_assignee(*this, BackgroundJobsAssignee::Type::DataProcessing, getContext()) + , background_moves_assignee(*this, BackgroundJobsAssignee::Type::Moving, getContext()) { const auto settings = getSettings(); allow_nullable_key = attach || settings->allow_nullable_key; @@ -311,17 +311,17 @@ MergeTreeData::MergeTreeData( common_assignee_trigger = [this] (bool delay) noexcept { if (delay) - background_executor.postpone(); + background_operations_assignee.postpone(); else - background_executor.trigger(); + background_operations_assignee.trigger(); }; moves_assignee_trigger = [this] (bool delay) noexcept { if (delay) - background_moves_executor.postpone(); + background_moves_assignee.postpone(); else - background_moves_executor.trigger(); + background_moves_assignee.trigger(); }; } @@ -5029,7 +5029,7 @@ MergeTreeData::CurrentlyMovingPartsTagger::~CurrentlyMovingPartsTagger() } } -bool MergeTreeData::scheduleDataMovingJob(BackgroundJobsAssignee & executor) +bool MergeTreeData::scheduleDataMovingJob(BackgroundJobsAssignee & assignee) { if (parts_mover.moves_blocker.isCancelled()) return false; @@ -5038,7 +5038,7 @@ bool MergeTreeData::scheduleDataMovingJob(BackgroundJobsAssignee & executor) if (moving_tagger->parts_to_move.empty()) return false; - executor.scheduleMoveTask(LambdaAdapter::create( + assignee.scheduleMoveTask(ExecutableLambdaAdapter::create( [this, moving_tagger] () mutable { return moveParts(moving_tagger); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index c3b9eaf1b59..2fab402a74a 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -827,9 +827,9 @@ public: PinnedPartUUIDsPtr getPinnedPartUUIDs() const; /// Schedules background job to like merge/mutate/fetch an executor - virtual bool scheduleDataProcessingJob(BackgroundJobsAssignee & executor) = 0; + virtual bool scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) = 0; /// Schedules job to move parts between disks/volumes and so on. - bool scheduleDataMovingJob(BackgroundJobsAssignee & executor); + bool scheduleDataMovingJob(BackgroundJobsAssignee & assignee); bool areBackgroundMovesNeeded() const; /// Lock part in zookeeper for shared data in several nodes @@ -925,10 +925,18 @@ protected: /// Executors are common for both ReplicatedMergeTree and plain MergeTree /// but they are being started and finished in derived classes, so let them be protected. - BackgroundJobsAssignee background_executor; - BackgroundJobsAssignee background_moves_executor; + /// + /// Why there are two executors, not one? Or an executor for each kind of operation? + /// It is historically formed. 
+ /// Another explanation is that moving operations are common for Replicated and Plain MergeTree classes. + /// Task that schedules this operations is executed with its own timetable and triggered in a specific places in code. + /// And for ReplicatedMergeTree we don't have LogEntry type for this operation. + BackgroundJobsAssignee background_operations_assignee; + BackgroundJobsAssignee background_moves_assignee; + /// Strongly connected with two fields above. /// Every task that is finished will ask to assign a new one into an executor. + /// These callbacks will be passed to the constructor of each task. std::function common_assignee_trigger; std::function moves_assignee_trigger; diff --git a/src/Storages/MergeTree/MergeTreeSink.cpp b/src/Storages/MergeTree/MergeTreeSink.cpp index 600c25b3774..c3193ac68b5 100644 --- a/src/Storages/MergeTree/MergeTreeSink.cpp +++ b/src/Storages/MergeTree/MergeTreeSink.cpp @@ -37,7 +37,7 @@ void MergeTreeSink::consume(Chunk chunk) PartLog::addNewPart(storage.getContext(), part, watch.elapsed()); /// Initiate async merge - it will be done if it's good time for merge and if there are space in 'background_pool'. - storage.background_executor.trigger(); + storage.background_operations_assignee.trigger(); } } } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index dee1aa47cb4..a2e81e97335 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -624,7 +624,7 @@ int32_t ReplicatedMergeTreeQueue::pullLogsToQueue(zkutil::ZooKeeperPtr zookeeper } } - storage.background_executor.trigger(); + storage.background_operations_assignee.trigger(); } return stat.version; @@ -713,7 +713,7 @@ void ReplicatedMergeTreeQueue::updateMutations(zkutil::ZooKeeperPtr zookeeper, C } if (some_active_mutations_were_killed) - storage.background_executor.trigger(); + storage.background_operations_assignee.trigger(); if (!entries_to_load.empty()) { @@ -847,7 +847,7 @@ ReplicatedMergeTreeMutationEntryPtr ReplicatedMergeTreeQueue::removeMutation( } if (mutation_was_active) - storage.background_executor.trigger(); + storage.background_operations_assignee.trigger(); return entry; } diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp index 5c19f4166fa..f9ea8c623f7 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeRestartingThread.cpp @@ -204,7 +204,7 @@ bool ReplicatedMergeTreeRestartingThread::tryStartup() storage.partial_shutdown_event.reset(); /// Start queue processing - storage.background_executor.start(); + storage.background_operations_assignee.start(); storage.queue_updating_task->activateAndSchedule(); storage.mutations_updating_task->activateAndSchedule(); @@ -389,7 +389,7 @@ void ReplicatedMergeTreeRestartingThread::partialShutdown() auto fetch_lock = storage.fetcher.blocker.cancel(); auto merge_lock = storage.merger_mutator.merges_blocker.cancel(); auto move_lock = storage.parts_mover.moves_blocker.cancel(); - storage.background_executor.finish(); + storage.background_operations_assignee.finish(); } LOG_TRACE(log, "Threads finished"); diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index b3f8ad2a0ca..bda61e207e9 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ 
b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -24,7 +24,7 @@ public: { } - bool execute() override + bool executeStep() override { auto sleep_time = distribution(generator); std::this_thread::sleep_for(std::chrono::milliseconds(5 * sleep_time)); diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 46bc319a288..dbcc6f4a5de 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -112,7 +112,7 @@ void StorageMergeTree::startup() try { - background_executor.start(); + background_operations_assignee.start(); startBackgroundMovesIfNeeded(); } catch (...) @@ -150,8 +150,8 @@ void StorageMergeTree::shutdown() merger_mutator.merges_blocker.cancelForever(); parts_mover.moves_blocker.cancelForever(); - background_executor.finish(); - background_moves_executor.finish(); + background_operations_assignee.finish(); + background_moves_assignee.finish(); try { @@ -411,7 +411,7 @@ Int64 StorageMergeTree::startMutation(const MutationCommands & commands, String LOG_INFO(log, "Added mutation: {}", mutation_file_name); } - background_executor.trigger(); + background_operations_assignee.trigger(); return version; } @@ -637,7 +637,7 @@ CancellationCode StorageMergeTree::killMutation(const String & mutation_id) } /// Maybe there is another mutation that was blocked by the killed one. Try to execute it immediately. - background_executor.trigger(); + background_operations_assignee.trigger(); return CancellationCode::CancelSent; } @@ -1041,7 +1041,7 @@ bool StorageMergeTree::mutateSelectedPart(const StorageMetadataPtr & metadata_sn return true; } -bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & executor) //-V657 +bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) //-V657 { if (shutdown_called) return false; @@ -1073,7 +1073,7 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & execut if (merge_entry) { - executor.scheduleMergeMutateTask(LambdaAdapter::create( + assignee.scheduleMergeMutateTask(ExecutableLambdaAdapter::create( [this, metadata_snapshot, merge_entry, share_lock] () mutable { return mergeSelectedParts(metadata_snapshot, false, {}, *merge_entry, share_lock); @@ -1082,27 +1082,27 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & execut } if (mutate_entry) { - executor.scheduleMergeMutateTask(LambdaAdapter::create( + assignee.scheduleMergeMutateTask(ExecutableLambdaAdapter::create( [this, metadata_snapshot, merge_entry, mutate_entry, share_lock] () mutable { return mutateSelectedPart(metadata_snapshot, *mutate_entry, share_lock); }, common_assignee_trigger, getStorageID())); return true; } - bool executed = false; + bool scheduled = false; if (time_after_previous_cleanup_temporary_directories.compareAndRestartDeferred(getContext()->getSettingsRef().merge_tree_clear_old_temporary_directories_interval_seconds)) { - executor.scheduleMergeMutateTask(LambdaAdapter::create( + assignee.scheduleMergeMutateTask(ExecutableLambdaAdapter::create( [this, share_lock] () { clearOldTemporaryDirectories(getSettings()->temporary_directories_lifetime.totalSeconds()); return true; }, common_assignee_trigger, getStorageID())); - executed = true; + scheduled = true; } if (time_after_previous_cleanup_parts.compareAndRestartDeferred(getContext()->getSettingsRef().merge_tree_clear_old_parts_interval_seconds)) { - executor.scheduleMergeMutateTask(LambdaAdapter::create( + assignee.scheduleMergeMutateTask(ExecutableLambdaAdapter::create( [this, 
share_lock] () { /// All use relative_data_path which changes during rename @@ -1113,10 +1113,10 @@ bool StorageMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & execut clearEmptyParts(); return true; }, common_assignee_trigger, getStorageID())); - executed = true; + scheduled = true; } - return executed; + return scheduled; } Int64 StorageMergeTree::getCurrentMutationVersion( @@ -1567,9 +1567,9 @@ ActionLock StorageMergeTree::getActionLock(StorageActionBlockType action_type) void StorageMergeTree::onActionLockRemove(StorageActionBlockType action_type) { if (action_type == ActionLocks::PartsMerge || action_type == ActionLocks::PartsTTLMerge) - background_executor.trigger(); + background_operations_assignee.trigger(); else if (action_type == ActionLocks::PartsMove) - background_moves_executor.trigger(); + background_moves_assignee.trigger(); } CheckResults StorageMergeTree::checkData(const ASTPtr & query, ContextPtr local_context) @@ -1647,7 +1647,7 @@ MutationCommands StorageMergeTree::getFirstAlterMutationCommandsForPart(const Da void StorageMergeTree::startBackgroundMovesIfNeeded() { if (areBackgroundMovesNeeded()) - background_moves_executor.start(); + background_moves_assignee.start(); } std::unique_ptr StorageMergeTree::getDefaultSettings() const diff --git a/src/Storages/StorageMergeTree.h b/src/Storages/StorageMergeTree.h index 09d362b6eef..0fc7b9f9f91 100644 --- a/src/Storages/StorageMergeTree.h +++ b/src/Storages/StorageMergeTree.h @@ -95,7 +95,7 @@ public: RestoreDataTasks restoreFromBackup(const BackupPtr & backup, const String & data_path_in_backup, const ASTs & partitions, ContextMutablePtr context) override; - bool scheduleDataProcessingJob(BackgroundJobsAssignee & executor) override; + bool scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) override; MergeTreeDeduplicationLog * getDeduplicationLog() { return deduplication_log.get(); } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 942eb0e561c..5593c088624 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -3173,7 +3173,7 @@ bool StorageReplicatedMergeTree::processQueueEntry(ReplicatedMergeTreeQueue::Sel }); } -bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & executor) +bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) { /// If replication queue is stopped exit immediately as we successfully executed the task if (queue.actions_blocker.isCancelled()) @@ -3188,7 +3188,7 @@ bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobsAssigne /// Depending on entry type execute in fetches (small) pool or big merge_mutate pool if (selected_entry->log_entry->type == LogEntry::GET_PART) { - executor.scheduleFetchTask(LambdaAdapter::create( + assignee.scheduleFetchTask(ExecutableLambdaAdapter::create( [this, selected_entry] () mutable { return processQueueEntry(selected_entry); @@ -3197,7 +3197,7 @@ bool StorageReplicatedMergeTree::scheduleDataProcessingJob(BackgroundJobsAssigne } else { - executor.scheduleMergeMutateTask(LambdaAdapter::create( + assignee.scheduleMergeMutateTask(ExecutableLambdaAdapter::create( [this, selected_entry] () mutable { return processQueueEntry(selected_entry); @@ -4337,7 +4337,7 @@ void StorageReplicatedMergeTree::shutdown() parts_mover.moves_blocker.cancelForever(); restarting_thread.shutdown(); - background_executor.finish(); + background_operations_assignee.finish(); 
part_moves_between_shards_orchestrator.shutdown(); { @@ -4347,7 +4347,7 @@ void StorageReplicatedMergeTree::shutdown() /// MUTATE, etc. query. queue.pull_log_blocker.cancelForever(); } - background_moves_executor.finish(); + background_moves_assignee.finish(); auto data_parts_exchange_ptr = std::atomic_exchange(&data_parts_exchange_endpoint, InterserverIOEndpointPtr{}); if (data_parts_exchange_ptr) @@ -6947,9 +6947,9 @@ void StorageReplicatedMergeTree::onActionLockRemove(StorageActionBlockType actio if (action_type == ActionLocks::PartsMerge || action_type == ActionLocks::PartsTTLMerge || action_type == ActionLocks::PartsFetch || action_type == ActionLocks::PartsSend || action_type == ActionLocks::ReplicationQueue) - background_executor.trigger(); + background_operations_assignee.trigger(); else if (action_type == ActionLocks::PartsMove) - background_moves_executor.trigger(); + background_moves_assignee.trigger(); } bool StorageReplicatedMergeTree::waitForShrinkingQueueSize(size_t queue_size, UInt64 max_wait_milliseconds) @@ -6961,7 +6961,7 @@ bool StorageReplicatedMergeTree::waitForShrinkingQueueSize(size_t queue_size, UI /// This is significant, because the execution of this task could be delayed at BackgroundPool. /// And we force it to be executed. - background_executor.trigger(); + background_operations_assignee.trigger(); Poco::Event target_size_event; auto callback = [&target_size_event, queue_size] (size_t new_queue_size) @@ -7195,7 +7195,7 @@ MutationCommands StorageReplicatedMergeTree::getFirstAlterMutationCommandsForPar void StorageReplicatedMergeTree::startBackgroundMovesIfNeeded() { if (areBackgroundMovesNeeded()) - background_moves_executor.start(); + background_moves_assignee.start(); } std::unique_ptr StorageReplicatedMergeTree::getDefaultSettings() const diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index ddb14378f90..faeb3cbd1a2 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -218,7 +218,7 @@ public: const zkutil::EphemeralNodeHolder::Ptr & metadata_drop_lock, Poco::Logger * logger); /// Schedules job to execute in background pool (merge, mutate, drop range and so on) - bool scheduleDataProcessingJob(BackgroundJobsAssignee & executor) override; + bool scheduleDataProcessingJob(BackgroundJobsAssignee & assignee) override; /// Checks that fetches are not disabled with action blocker and pool for fetches /// is not overloaded diff --git a/tests/queries/0_stateless/01200_mutations_memory_consumption.reference b/tests/queries/0_stateless/01200_mutations_memory_consumption.reference index 98fb6a68656..1bb9c019697 100644 --- a/tests/queries/0_stateless/01200_mutations_memory_consumption.reference +++ b/tests/queries/0_stateless/01200_mutations_memory_consumption.reference @@ -1,4 +1,4 @@ -1 -1 -1 -1 +[[1]] +[[1]] +[[1]] +[[1]] diff --git a/tests/queries/0_stateless/01200_mutations_memory_consumption.sql b/tests/queries/0_stateless/01200_mutations_memory_consumption.sql index f6f6d32a477..8632f7c29d6 100644 --- a/tests/queries/0_stateless/01200_mutations_memory_consumption.sql +++ b/tests/queries/0_stateless/01200_mutations_memory_consumption.sql @@ -6,7 +6,8 @@ CREATE TABLE table_with_single_pk value String ) ENGINE = MergeTree -ORDER BY key; +ORDER BY key +SETTINGS min_compress_block_size=65536, max_compress_block_size=65536; INSERT INTO table_with_single_pk SELECT number, toString(number % 10) FROM numbers(10000000); @@ -15,9 +16,9 @@ ALTER TABLE table_with_single_pk 
DELETE WHERE key % 77 = 0 SETTINGS mutations_sy SYSTEM FLUSH LOGS; -- Memory usage for all mutations must be almost constant and less than --- read_bytes (with a margin) +-- read_bytes SELECT - DISTINCT 1.1 * read_bytes >= peak_memory_usage + arrayDistinct(groupArray(if (read_bytes >= peak_memory_usage, [1], [read_bytes, peak_memory_usage]))) FROM system.part_log WHERE event_type = 'MutatePart' AND table = 'table_with_single_pk' AND database = currentDatabase(); @@ -34,7 +35,8 @@ CREATE TABLE table_with_multi_pk value String ) ENGINE = MergeTree -ORDER BY (key1, key2, key3); +ORDER BY (key1, key2, key3) +SETTINGS min_compress_block_size=65536, max_compress_block_size=65536; INSERT INTO table_with_multi_pk SELECT number % 32, number, toDateTime('2019-10-01 00:00:00'), toString(number % 10) FROM numbers(10000000); @@ -43,9 +45,9 @@ ALTER TABLE table_with_multi_pk DELETE WHERE key1 % 77 = 0 SETTINGS mutations_sy SYSTEM FLUSH LOGS; -- Memory usage for all mutations must be almost constant and less than --- read_bytes (with a margin) +-- read_bytes SELECT - DISTINCT 1.1 * read_bytes >= peak_memory_usage + arrayDistinct(groupArray(if (read_bytes >= peak_memory_usage, [1], [read_bytes, peak_memory_usage]))) FROM system.part_log WHERE event_type = 'MutatePart' AND table = 'table_with_multi_pk' AND database = currentDatabase(); @@ -64,7 +66,8 @@ CREATE TABLE table_with_function_pk value String ) ENGINE = MergeTree -ORDER BY (cast(value as UInt64), key2); +ORDER BY (cast(value as UInt64), key2) +SETTINGS min_compress_block_size=65536, max_compress_block_size=65536; INSERT INTO table_with_function_pk SELECT number % 32, number, toDateTime('2019-10-01 00:00:00'), toString(number % 10) FROM numbers(10000000); @@ -73,9 +76,9 @@ ALTER TABLE table_with_function_pk DELETE WHERE key1 % 77 = 0 SETTINGS mutations SYSTEM FLUSH LOGS; -- Memory usage for all mutations must be almost constant and less than --- read_bytes (with a margin) +-- read_bytes SELECT - DISTINCT 1.1 * read_bytes >= peak_memory_usage + arrayDistinct(groupArray(if (read_bytes >= peak_memory_usage, [1], [read_bytes, peak_memory_usage]))) FROM system.part_log WHERE event_type = 'MutatePart' AND table = 'table_with_function_pk' AND database = currentDatabase(); @@ -92,7 +95,8 @@ CREATE TABLE table_without_pk value String ) ENGINE = MergeTree -ORDER BY tuple(); +ORDER BY tuple() +SETTINGS min_compress_block_size=65536, max_compress_block_size=65536; INSERT INTO table_without_pk SELECT number % 32, number, toDateTime('2019-10-01 00:00:00'), toString(number % 10) FROM numbers(10000000); @@ -101,9 +105,9 @@ ALTER TABLE table_without_pk DELETE WHERE key1 % 77 = 0 SETTINGS mutations_sync SYSTEM FLUSH LOGS; -- Memory usage for all mutations must be almost constant and less than --- read_bytes (with a margin) +-- read_bytes SELECT - DISTINCT 1.1 * read_bytes >= peak_memory_usage + arrayDistinct(groupArray(if (read_bytes >= peak_memory_usage, [1], [read_bytes, peak_memory_usage]))) FROM system.part_log WHERE event_type = 'MutatePart' AND table = 'table_without_pk' AND database = currentDatabase(); From 5e133a3cc638b5e16fee5529c258553e6b43d90d Mon Sep 17 00:00:00 2001 From: Dmitrii Kovalkov Date: Wed, 8 Sep 2021 07:58:22 +0300 Subject: [PATCH 144/177] Run generate-ya-make.sh --- src/Storages/ya.make | 1 - 1 file changed, 1 deletion(-) diff --git a/src/Storages/ya.make b/src/Storages/ya.make index 5b246cf5aca..11a1ad212c1 100644 --- a/src/Storages/ya.make +++ b/src/Storages/ya.make @@ -214,7 +214,6 @@ SRCS( System/StorageSystemTables.cpp 
System/StorageSystemUserDirectories.cpp System/StorageSystemUsers.cpp - System/StorageSystemViews.cpp System/StorageSystemWarnings.cpp System/StorageSystemZeros.cpp System/StorageSystemZooKeeper.cpp From e5d7791f87094a216d90756d0eea5ce7d2f117d3 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 8 Sep 2021 09:34:33 +0300 Subject: [PATCH 145/177] Fix test_MemoryTracking integration test flakiness --- .../test_MemoryTracking/configs/no_system_log.xml | 12 +++++++++++- .../configs/users.d/overrides.xml | 8 ++++++++ tests/integration/test_MemoryTracking/test.py | 4 ++-- 3 files changed, 21 insertions(+), 3 deletions(-) create mode 100644 tests/integration/test_MemoryTracking/configs/users.d/overrides.xml diff --git a/tests/integration/test_MemoryTracking/configs/no_system_log.xml b/tests/integration/test_MemoryTracking/configs/no_system_log.xml index 7e235458d39..e9b98fb6f0e 100644 --- a/tests/integration/test_MemoryTracking/configs/no_system_log.xml +++ b/tests/integration/test_MemoryTracking/configs/no_system_log.xml @@ -1,7 +1,17 @@ - + + + + + + + + + + + diff --git a/tests/integration/test_MemoryTracking/configs/users.d/overrides.xml b/tests/integration/test_MemoryTracking/configs/users.d/overrides.xml new file mode 100644 index 00000000000..4cc9c058193 --- /dev/null +++ b/tests/integration/test_MemoryTracking/configs/users.d/overrides.xml @@ -0,0 +1,8 @@ + + + + 0 + 0 + + + diff --git a/tests/integration/test_MemoryTracking/test.py b/tests/integration/test_MemoryTracking/test.py index 1ee528eaa8d..c006780682f 100644 --- a/tests/integration/test_MemoryTracking/test.py +++ b/tests/integration/test_MemoryTracking/test.py @@ -11,6 +11,8 @@ cluster = ClickHouseCluster(__file__) node = cluster.add_instance('node', main_configs=[ 'configs/no_system_log.xml', 'configs/asynchronous_metrics_update_period_s.xml', +], user_configs=[ + 'conifgs/users.d/overrides.xml', ]) @pytest.fixture(scope='module', autouse=True) @@ -23,8 +25,6 @@ def start_cluster(): query_settings = { 'max_threads': 1, - 'query_profiler_real_time_period_ns': 0, - 'query_profiler_cpu_time_period_ns': 0, 'log_queries': 0, } sample_query = "SELECT groupArray(repeat('a', 1000)) FROM numbers(10000) GROUP BY number%10 FORMAT JSON" From ed37a22c3767439105971d199a701498f3d1c398 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 8 Sep 2021 09:35:18 +0000 Subject: [PATCH 146/177] Delete debug print --- src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index ed841ff0fc9..89c7bb37952 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -39,8 +39,6 @@ void MergeTreeBackgroundExecutor::updateConfiguration() pool.setMaxThreads(new_threads_count); pool.setQueueSize(new_max_tasks_count); - std::cout << "threads_count " << threads_count << std::endl; - /// We don't enter this loop if size is decreased. 
for (size_t number = threads_count; number < new_threads_count; ++number) pool.scheduleOrThrowOnError([this, number] { threadFunction(number); }); From 5588853893bd1c96f7a96de75f8df027b93bed13 Mon Sep 17 00:00:00 2001 From: gyuton <40863448+gyuton@users.noreply.github.com> Date: Wed, 8 Sep 2021 13:25:06 +0300 Subject: [PATCH 147/177] Apply suggestions from code review Co-authored-by: olgarev <56617294+olgarev@users.noreply.github.com> --- docs/en/interfaces/formats.md | 6 +++--- docs/en/operations/settings/settings.md | 18 +++++++++--------- .../nested-data-structures/nested.md | 2 +- 3 files changed, 13 insertions(+), 13 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index a83ee999070..129a50bb7fc 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1270,7 +1270,7 @@ You can insert Parquet data from a file into ClickHouse table by the following c $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` -To insert `Array(Struct)` columns into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested). +To insert `Array(Struct)` values into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns you must switch on the [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested) setting. You can select data from a ClickHouse table and save them into some file in the Parquet format by the following command: @@ -1330,7 +1330,7 @@ You can insert Arrow data from a file into ClickHouse table by the following com $ cat filename.arrow | clickhouse-client --query="INSERT INTO some_table FORMAT Arrow" ``` -To insert `Array(Struct)` columns into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested). +To insert `Array(Struct)` values into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns you must switch on the [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested) setting. ### Selecting Data {#selecting-data-arrow} @@ -1388,7 +1388,7 @@ You can insert ORC data from a file into ClickHouse table by the following comma $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -To insert `Array(Struct)` columns into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns use the setting [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested). +To insert `Array(Struct)` values into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns you must switch on the [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested) setting. 
### Selecting Data {#selecting-data-2} diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index e8355eeaa1d..9c54f925613 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -262,34 +262,34 @@ If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` ar ## input_format_parquet_import_nested {#input_format_parquet_import_nested} -Enables or disables the ability to insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [Parquet](../../interfaces/formats.md#data-format-parquet) input format. +Enables or disables the ability to insert `Array(Struct)` values into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [Parquet](../../interfaces/formats.md#data-format-parquet) input format. Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — `Array(Struct)` values can not be inserted into `Nested` columns. +- 1 — `Array(Struct)` values can be inserted into `Nested` columns. Default value: `0`. ## input_format_arrow_import_nested {#input_format_arrow_import_nested} -Enables or disables the ability to insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [Arrow](../../interfaces/formats.md#data_types-matching-arrow) input format. +Enables or disables the ability to insert `Array(Struct)` values into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [Arrow](../../interfaces/formats.md#data_types-matching-arrow) input format. Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — `Array(Struct)` values can not be inserted into `Nested` columns. +- 1 — `Array(Struct)` values can be inserted into `Nested` columns. Default value: `0`. ## input_format_orc_import_nested {#input_format_orc_import_nested} -Enables or disables the ability to insert `Array(Struct)` columns into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [ORC](../../interfaces/formats.md#data-format-orc) input format. +Enables or disables the ability to insert `Array(Struct)` values into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [ORC](../../interfaces/formats.md#data-format-orc) input format. Possible values: -- 0 — Disabled. -- 1 — Enabled. +- 0 — `Array(Struct)` values can not be inserted into `Nested` columns. +- 1 — `Array(Struct)` values can be inserted into `Nested` columns. Default value: `0`. diff --git a/docs/en/sql-reference/data-types/nested-data-structures/nested.md b/docs/en/sql-reference/data-types/nested-data-structures/nested.md index 281465f2075..0c53248baf9 100644 --- a/docs/en/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/en/sql-reference/data-types/nested-data-structures/nested.md @@ -105,6 +105,6 @@ The ALTER query for elements in a nested data structure has limitations. 
## Inserting {#inserting} -It is possible to insert `Array(Struct)` columns into nested columns with the settings [input_format_parquet_import_nested](../../../operations/settings/settings.md#input_format_parquet_import_nested), [input_format_arrow_import_nested](../../../operations/settings/settings.md#input_format_arrow_import_nested) and [input_format_orc_import_nested](../../../operations/settings/settings.md#input_format_orc_import_nested) in [Parquet](../../../interfaces/formats.md#data-format-parquet), [Arrow](../../../interfaces/formats.md#data_types-matching-arrow) and [ORC](../../../interfaces/formats.md#data-format-orc) input formats correspondingly. +To insert `Array(Struct)` values into `Nested` columns, you must switch on the [input_format_parquet_import_nested](../../../operations/settings/settings.md#input_format_parquet_import_nested), [input_format_arrow_import_nested](../../../operations/settings/settings.md#input_format_arrow_import_nested) or [input_format_orc_import_nested](../../../operations/settings/settings.md#input_format_orc_import_nested) settings for [Parquet](../../../interfaces/formats.md#data-format-parquet), [Arrow](../../../interfaces/formats.md#data_types-matching-arrow) and [ORC](../../../interfaces/formats.md#data-format-orc) input formats respectively. [Original article](https://clickhouse.tech/docs/en/data_types/nested_data_structures/nested/) From 3a949289118ba4b56e8007f3fc01f28cebbb657f Mon Sep 17 00:00:00 2001 From: George Date: Wed, 8 Sep 2021 13:40:43 +0300 Subject: [PATCH 148/177] translated to Russian --- docs/ru/interfaces/formats.md | 12 +++++-- docs/ru/operations/settings/settings.md | 33 +++++++++++++++++++ .../nested-data-structures/nested.md | 7 +++- 3 files changed, 48 insertions(+), 4 deletions(-) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 1dd10e1f76d..93be1736ef7 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1180,7 +1180,7 @@ ClickHouse поддерживает настраиваемую точность Типы данных столбцов в ClickHouse могут отличаться от типов данных соответствующих полей файла в формате Parquet. При вставке данных ClickHouse интерпретирует типы данных в соответствии с таблицей выше, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к тому типу, который установлен для столбца таблицы. -### Вставка и выборка данных {#vstavka-i-vyborka-dannykh} +### Вставка и выборка данных {#inserting-and-selecting-data} Чтобы вставить в ClickHouse данные из файла в формате Parquet, выполните команду следующего вида: @@ -1188,6 +1188,8 @@ ClickHouse поддерживает настраиваемую точность $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` +Чтобы вставить данные типа `Array(Struct)` в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md), нужно включить настройку [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested). 
+ Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата Parquet, используйте команду следующего вида: ``` bash @@ -1246,6 +1248,8 @@ ClickHouse поддерживает настраиваемую точность $ cat filename.arrow | clickhouse-client --query="INSERT INTO some_table FORMAT Arrow" ``` +Чтобы вставить данные типа `Array(Struct)` в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md), нужно включить настройку [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested). + ### Вывод данных {#selecting-data-arrow} Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата Arrow, используйте команду следующего вида: @@ -1294,7 +1298,7 @@ ClickHouse поддерживает настраиваемую точность Типы данных столбцов в таблицах ClickHouse могут отличаться от типов данных для соответствующих полей ORC. При вставке данных ClickHouse интерпретирует типы данных ORC согласно таблице соответствия, а затем [приводит](../sql-reference/functions/type-conversion-functions/#type_conversion_function-cast) данные к типу, установленному для столбца таблицы ClickHouse. -### Вставка данных {#vstavka-dannykh-1} +### Вставка данных {#inserting-data-2} Чтобы вставить в ClickHouse данные из файла в формате ORC, используйте команду следующего вида: @@ -1302,7 +1306,9 @@ ClickHouse поддерживает настраиваемую точность $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -### Вывод данных {#vyvod-dannykh-1} +Чтобы вставить данные типа `Array(Struct)` в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md), нужно включить настройку [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested). + +### Вывод данных {#selecting-data-2} Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата ORC, используйте команду следующего вида: diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index 56b04e81a94..d69c7fddb30 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -237,6 +237,39 @@ ClickHouse применяет настройку в тех случаях, ко В случае превышения `input_format_allow_errors_ratio` ClickHouse генерирует исключение. +## input_format_parquet_import_nested {#input_format_parquet_import_nested} + +Включает или отключает возможность вставки данных типа `Array(Struct)` в колонки типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) в формате ввода [Parquet](../../interfaces/formats.md#data-format-parquet). + +Возможные значения: + +- 0 — данные типа `Array(Struct)` не могут быть вставлены в колонки `Nested`. +- 1 — данные типа `Array(Struct)` могут быть вставлены в колонки `Nested`. + +Значение по умолчанию: `0`. + +## input_format_arrow_import_nested {#input_format_arrow_import_nested} + +Включает или отключает возможность вставки данных типа `Array(Struct)` в колонки типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) в формате ввода [Arrow](../../interfaces/formats.md#data_types-matching-arrow). + +Возможные значения: + +- 0 — данные типа `Array(Struct)` не могут быть вставлены в колонки `Nested`. +- 1 — данные типа `Array(Struct)` могут быть вставлены в колонки `Nested`. + +Значение по умолчанию: `0`. 
+ +## input_format_orc_import_nested {#input_format_orc_import_nested} + +Включает или отключает возможность вставки данных типа `Array(Struct)` в колонки типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) в формате ввода [ORC](../../interfaces/formats.md#data-format-orc). + +Возможные значения: + +- 0 — данные типа `Array(Struct)` не могут быть вставлены в колонки `Nested`. +- 1 — данные типа `Array(Struct)` могут быть вставлены в колонки `Nested`. + +Значение по умолчанию: `0`. + ## input_format_values_interpret_expressions {#settings-input_format_values_interpret_expressions} Включает или отключает парсер SQL, если потоковый парсер не может проанализировать данные. Этот параметр используется только для формата [Values](../../interfaces/formats.md#data-format-values) при вставке данных. Дополнительные сведения о парсерах читайте в разделе [Синтаксис](../../sql-reference/syntax.md). diff --git a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md index 718fe77ae95..8d48cd55713 100644 --- a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md @@ -1,4 +1,6 @@ -# Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} +# Nested {#nested} + +## Nested(Name1 Type1, Name2 Type2, …) {#nestedname1-type1-name2-type2} Вложенная структура данных - это как будто вложенная таблица. Параметры вложенной структуры данных - имена и типы столбцов, указываются так же, как у запроса CREATE. Каждой строке таблицы может соответствовать произвольное количество строк вложенной структуры данных. @@ -96,3 +98,6 @@ LIMIT 10 Работоспособность запроса ALTER для элементов вложенных структур данных, является сильно ограниченной. +## Inserting {#inserting} + +Чтобы вставить данные типа `Array(Struct)` в колонки `Nested`, нужно включить настройку [input_format_parquet_import_nested](../../../operations/settings/settings.md#input_format_parquet_import_nested), [input_format_arrow_import_nested](../../../operations/settings/settings.md#input_format_arrow_import_nested) или [input_format_orc_import_nested](../../../operations/settings/settings.md#input_format_orc_import_nested) для форматов ввода [Parquet](../../../interfaces/formats.md#data-format-parquet), [Arrow](../../../interfaces/formats.md#data_types-matching-arrow) и [ORC](../../../interfaces/formats.md#data-format-orc) соответственно. 
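A minimal end-to-end sketch of the same workflow for the Parquet setting described above (all object names here are assumptions for illustration; the Parquet file must contain a list-of-struct column matching `n`):

``` bash
$ clickhouse-client --query="CREATE TABLE test_parquet_nested (n Nested(a UInt32, b String)) ENGINE = MergeTree ORDER BY tuple()"
$ cat data.parquet | clickhouse-client --input_format_parquet_import_nested=1 --query="INSERT INTO test_parquet_nested FORMAT Parquet"
# Nested is stored as parallel arrays, so the imported data is read back as n.a and n.b.
$ clickhouse-client --query="SELECT n.a, n.b FROM test_parquet_nested LIMIT 1"
```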
From f8a6dfb0c41638f3f1cb32c3228fcd58ab780267 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 8 Sep 2021 14:37:16 +0300 Subject: [PATCH 149/177] Fix race on mutation_pointer update --- src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index c71a79d2009..4c5838cde9b 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -144,7 +144,10 @@ bool ReplicatedMergeTreeQueue::load(zkutil::ZooKeeperPtr zookeeper) updated = true; } - zookeeper->tryGet(fs::path(replica_path) / "mutation_pointer", mutation_pointer); + { /// Mutation pointer is a part of "state" and must be updated with state mutex + std::lock_guard lock(state_mutex); + zookeeper->tryGet(fs::path(replica_path) / "mutation_pointer", mutation_pointer); + } } updateTimesInZooKeeper(zookeeper, min_unprocessed_insert_time_changed, {}); From 3b7fa065404273d827b9008c145b2af3a9a8e5bb Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 8 Sep 2021 12:42:03 +0000 Subject: [PATCH 150/177] Review fixes --- src/Interpreters/Context.cpp | 20 +-- src/Interpreters/Context.h | 6 +- .../MergeTree/BackgroundJobsAssignee.h | 12 +- .../MergeTree/MergeTreeBackgroundExecutor.cpp | 147 ++++++++---------- .../MergeTree/MergeTreeBackgroundExecutor.h | 34 ++-- src/Storages/ya.make | 1 - 6 files changed, 102 insertions(+), 118 deletions(-) diff --git a/src/Interpreters/Context.cpp b/src/Interpreters/Context.cpp index ed25fbead37..25a36b12c7e 100644 --- a/src/Interpreters/Context.cpp +++ b/src/Interpreters/Context.cpp @@ -1538,9 +1538,9 @@ BackgroundSchedulePool & Context::getBufferFlushSchedulePool() const return *shared->buffer_flush_schedule_pool; } -ExecutableTaskSchedulingSettings Context::getBackgroundProcessingTaskSchedulingSettings() const +BackgroundTaskSchedulingSettings Context::getBackgroundProcessingTaskSchedulingSettings() const { - ExecutableTaskSchedulingSettings task_settings; + BackgroundTaskSchedulingSettings task_settings; const auto & config = getConfigRef(); task_settings.thread_sleep_seconds = config.getDouble("background_processing_pool_thread_sleep_seconds", 10); @@ -1553,9 +1553,9 @@ ExecutableTaskSchedulingSettings Context::getBackgroundProcessingTaskSchedulingS return task_settings; } -ExecutableTaskSchedulingSettings Context::getBackgroundMoveTaskSchedulingSettings() const +BackgroundTaskSchedulingSettings Context::getBackgroundMoveTaskSchedulingSettings() const { - ExecutableTaskSchedulingSettings task_settings; + BackgroundTaskSchedulingSettings task_settings; const auto & config = getConfigRef(); task_settings.thread_sleep_seconds = config.getDouble("background_move_processing_pool_thread_sleep_seconds", 10); @@ -2734,16 +2734,16 @@ void Context::initializeBackgroundExecutors() shared->merge_mutate_executor = MergeTreeBackgroundExecutor::create ( MergeTreeBackgroundExecutor::Type::MERGE_MUTATE, - [this] () { auto lock = getLock(); return getSettingsRef().background_pool_size; }, - [this] () { auto lock = getLock(); return getSettingsRef().background_pool_size; }, + getSettingsRef().background_pool_size, + getSettingsRef().background_pool_size, CurrentMetrics::BackgroundPoolTask ); shared->moves_executor = MergeTreeBackgroundExecutor::create ( MergeTreeBackgroundExecutor::Type::MOVE, - [this] () { auto lock = getLock(); return getSettingsRef().background_move_pool_size; }, - [this] 
() { auto lock = getLock(); return getSettingsRef().background_move_pool_size; }, + getSettingsRef().background_move_pool_size, + getSettingsRef().background_move_pool_size, CurrentMetrics::BackgroundMovePoolTask ); @@ -2751,8 +2751,8 @@ void Context::initializeBackgroundExecutors() shared->fetch_executor = MergeTreeBackgroundExecutor::create ( MergeTreeBackgroundExecutor::Type::FETCH, - [this] () { auto lock = getLock(); return getSettingsRef().background_fetches_pool_size; }, - [this] () { auto lock = getLock(); return getSettingsRef().background_fetches_pool_size; }, + getSettingsRef().background_fetches_pool_size, + getSettingsRef().background_fetches_pool_size, CurrentMetrics::BackgroundFetchesPoolTask ); } diff --git a/src/Interpreters/Context.h b/src/Interpreters/Context.h index 8179ebfd1c2..d38b26427f4 100644 --- a/src/Interpreters/Context.h +++ b/src/Interpreters/Context.h @@ -112,7 +112,7 @@ using OutputFormatPtr = std::shared_ptr; class IVolume; using VolumePtr = std::shared_ptr; struct NamedSession; -struct ExecutableTaskSchedulingSettings; +struct BackgroundTaskSchedulingSettings; #if USE_NLP class SynonymsExtensions; @@ -693,8 +693,8 @@ public: void dropCaches() const; /// Settings for MergeTree background tasks stored in config.xml - ExecutableTaskSchedulingSettings getBackgroundProcessingTaskSchedulingSettings() const; - ExecutableTaskSchedulingSettings getBackgroundMoveTaskSchedulingSettings() const; + BackgroundTaskSchedulingSettings getBackgroundProcessingTaskSchedulingSettings() const; + BackgroundTaskSchedulingSettings getBackgroundMoveTaskSchedulingSettings() const; BackgroundSchedulePool & getBufferFlushSchedulePool() const; BackgroundSchedulePool & getSchedulePool() const; diff --git a/src/Storages/MergeTree/BackgroundJobsAssignee.h b/src/Storages/MergeTree/BackgroundJobsAssignee.h index 619b6fbb966..2be92502347 100644 --- a/src/Storages/MergeTree/BackgroundJobsAssignee.h +++ b/src/Storages/MergeTree/BackgroundJobsAssignee.h @@ -9,10 +9,10 @@ namespace DB { -/// Settings for background tasks scheduling. Each background executor has one +/// Settings for background tasks scheduling. Each background assignee has one /// BackgroundSchedulingPoolTask and depending on execution result may put this /// task to sleep according to settings. Look at scheduleTask function for details. -struct ExecutableTaskSchedulingSettings +struct BackgroundTaskSchedulingSettings { double thread_sleep_seconds_random_part = 1.0; double thread_sleep_seconds_if_nothing_to_do = 0.1; @@ -35,7 +35,7 @@ private: MergeTreeData & data; /// Settings for execution control of background scheduling task - ExecutableTaskSchedulingSettings sleep_settings; + BackgroundTaskSchedulingSettings sleep_settings; /// Useful for random backoff timeouts generation pcg64 rng; @@ -49,7 +49,11 @@ private: std::mutex holder_mutex; public: - /// + /// In case of ReplicatedMergeTree the first assignee will be responsible for + /// polling the replication queue and schedule operations according to the LogEntry type + /// e.g. merges, mutations and fetches. The same will be for Plain MergeTree except there is no + /// replication queue, so we will just scan parts and decide what to do. + /// Moving operations are the same for all types of MergeTree and also have their own timetable. 
enum class Type { DataProcessing, diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index 89c7bb37952..f4d2261cfe0 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -25,34 +25,6 @@ String MergeTreeBackgroundExecutor::toString(Type type) } -void MergeTreeBackgroundExecutor::updateConfiguration() -{ - auto new_threads_count = std::max(1u, threads_count_getter()); - auto new_max_tasks_count = std::max(1, max_task_count_getter()); - - try - { - pending.set_capacity(new_max_tasks_count); - active.set_capacity(new_max_tasks_count); - - pool.setMaxFreeThreads(0); - pool.setMaxThreads(new_threads_count); - pool.setQueueSize(new_max_tasks_count); - - /// We don't enter this loop if size is decreased. - for (size_t number = threads_count; number < new_threads_count; ++number) - pool.scheduleOrThrowOnError([this, number] { threadFunction(number); }); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - - threads_count = new_threads_count; - max_tasks_count = new_max_tasks_count; -} - - void MergeTreeBackgroundExecutor::wait() { { @@ -72,19 +44,6 @@ bool MergeTreeBackgroundExecutor::trySchedule(ExecutableTaskPtr task) if (shutdown) return false; - try - { - /// This is needed to increase / decrease the number of threads at runtime - /// Using stopwatch here not to do it so often. - /// No need to move the time to a config. - if (update_timer.compareAndRestartDeferred(10.)) - updateConfiguration(); - } - catch (...) - { - tryLogCurrentException(__PRETTY_FUNCTION__); - } - auto & value = CurrentMetrics::values[metric]; if (value.load() >= static_cast(max_tasks_count)) return false; @@ -123,35 +82,59 @@ void MergeTreeBackgroundExecutor::removeTasksCorrespondingToStorage(StorageID id void MergeTreeBackgroundExecutor::routine(TaskRuntimeDataPtr item) { + DENY_ALLOCATIONS_IN_SCOPE; + + /// All operations with queues are considered no to do any allocations + auto erase_from_active = [this, item] { active.erase(std::remove(active.begin(), active.end(), item), active.end()); }; + bool need_execute_again = false; + try { - if (item->task->executeStep()) + ALLOW_ALLOCATIONS_IN_SCOPE; + need_execute_again = item->task->executeStep(); + } + catch (...) + { + std::lock_guard guard(mutex); + erase_from_active(); + has_tasks.notify_one(); + /// Do not want any exceptions + try { item->task->onCompleted(); } catch (...) {} + tryLogCurrentException(__PRETTY_FUNCTION__); + } + + + if (need_execute_again) + { + std::lock_guard guard(mutex); + + if (item->is_currently_deleting) { - std::lock_guard guard(mutex); - - if (item->is_currently_deleting) - { - erase_from_active(); - return; - } - - pending.push_back(item); erase_from_active(); - has_tasks.notify_one(); return; } - { - std::lock_guard guard(mutex); - erase_from_active(); - has_tasks.notify_one(); - } + pending.push_back(item); + erase_from_active(); + has_tasks.notify_one(); + return; + } + + { + std::lock_guard guard(mutex); + erase_from_active(); + has_tasks.notify_one(); + } + + try + { + ALLOW_ALLOCATIONS_IN_SCOPE; /// In a situation of a lack of memory this method can throw an exception, /// because it may interact somehow with BackgroundSchedulePool, which may allocate memory /// But it is rather safe, because we have try...catch block here, and another one in ThreadPool. @@ -160,46 +143,46 @@ void MergeTreeBackgroundExecutor::routine(TaskRuntimeDataPtr item) } catch (...) 
{ - std::lock_guard guard(mutex); - erase_from_active(); - has_tasks.notify_one(); tryLogCurrentException(__PRETTY_FUNCTION__); - /// Do not want any exceptions - try { item->task->onCompleted(); } catch (...) {} } } -void MergeTreeBackgroundExecutor::threadFunction(size_t number) +void MergeTreeBackgroundExecutor::threadFunction() { setThreadName(name.c_str()); + DENY_ALLOCATIONS_IN_SCOPE; + while (true) { - TaskRuntimeDataPtr item; + try { - std::unique_lock lock(mutex); - has_tasks.wait(lock, [this](){ return !pending.empty() || shutdown; }); + TaskRuntimeDataPtr item; + { + std::unique_lock lock(mutex); + has_tasks.wait(lock, [this](){ return !pending.empty() || shutdown; }); - /// Decrease the number of threads (setting could be dynamically reloaded) - if (number >= threads_count) - break; + if (shutdown) + break; - if (shutdown) - break; + item = std::move(pending.front()); + pending.pop_front(); + active.push_back(item); + } - item = std::move(pending.front()); - pending.pop_front(); - active.push_back(item); + routine(item); + + /// When storage shutdowns it will wait until all related background tasks + /// are finished, because they may want to interact with its fields + /// and this will cause segfault. + if (item->is_currently_deleting) + item->is_done.set(); + } + catch (...) + { + tryLogCurrentException(__PRETTY_FUNCTION__); } - - routine(item); - - /// When storage shutdowns it will wait until all related background tasks - /// are finished, because they may want to interact with its fields - /// and this will cause segfault. - if (item->is_currently_deleting) - item->is_done.set(); } } diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 9aca20bed47..7af1524a023 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -23,7 +23,7 @@ namespace DB /** * Executor for a background MergeTree related operations such as merges, mutations, fetches an so on. * It can execute only successors of ExecutableTask interface. - * Which is a self-written coroutine. It suspends, when returns true from execute() method. + * Which is a self-written coroutine. It suspends, when returns true from executeStep() method. * * There are two queues of a tasks: pending (main queue for all the tasks) and active (currently executing). * Pending queue is needed since the number of tasks will be more than thread to execute. 
@@ -52,8 +52,6 @@ class MergeTreeBackgroundExecutor : public shared_ptr_helper; - enum class Type { MERGE_MUTATE, @@ -63,17 +61,25 @@ public: MergeTreeBackgroundExecutor( Type type_, - CountGetter && threads_count_getter_, - CountGetter && max_task_count_getter_, + size_t threads_count_, + size_t max_tasks_count_, CurrentMetrics::Metric metric_) : type(type_) - , threads_count_getter(threads_count_getter_) - , max_task_count_getter(max_task_count_getter_) + , threads_count(threads_count_) + , max_tasks_count(max_tasks_count_) , metric(metric_) { name = toString(type); - updateConfiguration(); + pool.setMaxThreads(std::max(1UL, threads_count)); + pool.setMaxFreeThreads(std::max(1UL, threads_count)); + pool.setQueueSize(std::max(1UL, threads_count)); + + for (size_t number = 0; number < threads_count; ++number) + pool.scheduleOrThrowOnError([this] { threadFunction(); }); + + pending.set_capacity(max_tasks_count); + active.set_capacity(max_tasks_count); } ~MergeTreeBackgroundExecutor() @@ -101,20 +107,13 @@ public: private: - void updateConfiguration(); - static String toString(Type type); Type type; String name; - CountGetter threads_count_getter; - CountGetter max_task_count_getter; - CurrentMetrics::Metric metric; - size_t threads_count{0}; size_t max_tasks_count{0}; - - AtomicStopwatch update_timer; + CurrentMetrics::Metric metric; /** * Has RAII class to determine how many tasks are waiting for the execution and executing at the moment. @@ -140,8 +139,7 @@ private: void routine(TaskRuntimeDataPtr item); - /// Number all the threads in ThreadPool. To be able to lower the number of threads in runtime. - void threadFunction(size_t number); + void threadFunction(); /// Initially it will be empty boost::circular_buffer pending{0}; diff --git a/src/Storages/ya.make b/src/Storages/ya.make index c11a4bfea41..4ea5cb14db4 100644 --- a/src/Storages/ya.make +++ b/src/Storages/ya.make @@ -215,7 +215,6 @@ SRCS( System/StorageSystemTables.cpp System/StorageSystemUserDirectories.cpp System/StorageSystemUsers.cpp - System/StorageSystemViews.cpp System/StorageSystemWarnings.cpp System/StorageSystemZeros.cpp System/StorageSystemZooKeeper.cpp From 2b59fbacd3f5c274d7eb86cb314a9269a546d9ba Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 8 Sep 2021 16:14:00 +0300 Subject: [PATCH 151/177] Update tests/integration/test_MemoryTracking/test.py --- tests/integration/test_MemoryTracking/test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tests/integration/test_MemoryTracking/test.py b/tests/integration/test_MemoryTracking/test.py index c006780682f..59b30b8f08b 100644 --- a/tests/integration/test_MemoryTracking/test.py +++ b/tests/integration/test_MemoryTracking/test.py @@ -12,7 +12,7 @@ node = cluster.add_instance('node', main_configs=[ 'configs/no_system_log.xml', 'configs/asynchronous_metrics_update_period_s.xml', ], user_configs=[ - 'conifgs/users.d/overrides.xml', + 'configs/users.d/overrides.xml', ]) @pytest.fixture(scope='module', autouse=True) From f2c9d94faa490fd84a650c0333d4c7b54ac8c1cc Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 8 Sep 2021 16:37:32 +0300 Subject: [PATCH 152/177] Changelog 21.9 (draft) --- CHANGELOG.md | 227 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 227 insertions(+) diff --git a/CHANGELOG.md b/CHANGELOG.md index 71cdac17825..d3ca1c544dc 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,230 @@ +### ClickHouse release v21.9, 2021-09- + +#### Backward Incompatible Change + +* Do not output trailing zeros in text 
representation of `Decimal` types. Example: `1.23` will be printed instead of `1.230000` for decimal with scale 6. This closes [#15794](https://github.com/ClickHouse/ClickHouse/issues/15794). It may introduce slight incompatibility if your applications somehow relied on the trailing zeros. Serialization in output formats can be controlled with the setting `output_format_decimal_trailing_zeros`. Implementation of `toString` and casting to String is changed unconditionally. [#27680](https://github.com/ClickHouse/ClickHouse/pull/27680) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Do not allow to apply parametric aggregate function with `-Merge` combinator to aggregate function state if state was produced by aggregate function with different parameters. For example, state of `fooState(42)(x)` cannot be finalized with `fooMerge(s)` or `fooMerge(123)(s)`, parameters must be specified explicitly like `fooMerge(42)(s)` and must be equal. It does not affect some special aggregate functions like `quantile` and `sequence*` that use parameters for finalization only. [#26847](https://github.com/ClickHouse/ClickHouse/pull/26847) ([tavplubix](https://github.com/tavplubix)). +* Under clickhouse-local, always treat local addresses with a port as remote. [#26736](https://github.com/ClickHouse/ClickHouse/pull/26736) ([Raúl Marín](https://github.com/Algunenano)). +* Fix the issue that in case of some sophisticated query with column aliases identical to the names of expressions, bad cast may happen. This fixes [#25447](https://github.com/ClickHouse/ClickHouse/issues/25447). This fixes [#26914](https://github.com/ClickHouse/ClickHouse/issues/26914). This fix may introduce backward incompatibility: if there are different expressions with identical names, exception will be thrown. It may break some rare cases when `enable_optimize_predicate_expression` is set. [#26639](https://github.com/ClickHouse/ClickHouse/pull/26639) ([alexey-milovidov](https://github.com/alexey-milovidov)). + +#### New Feature + +* Added ComplexKeyRangeHashed dictionary. Closes [#22029](https://github.com/ClickHouse/ClickHouse/issues/22029). [#27629](https://github.com/ClickHouse/ClickHouse/pull/27629) ([Maksim Kita](https://github.com/kitaisreal)). +* Enable using constants from with and select in aggregate function parameters. Close [#10945](https://github.com/ClickHouse/ClickHouse/issues/10945). [#27531](https://github.com/ClickHouse/ClickHouse/pull/27531) ([abel-cheng](https://github.com/abel-cheng)). +* Add "tupleToNameValuePairs", a function that turns a named tuple into an array of pairs. [#27505](https://github.com/ClickHouse/ClickHouse/pull/27505) ([Braulio Valdivielso Martínez](https://github.com/BraulioVM)). +* - Add replicated storage of user, roles, row policies, quotas and settings profiles through ZooKeeper (experimental). [#27426](https://github.com/ClickHouse/ClickHouse/pull/27426) ([Kevin Michel](https://github.com/kmichel-aiven)). +* Add support for `bzip2` compression method for import/export. Closes [#22428](https://github.com/ClickHouse/ClickHouse/issues/22428). [#27377](https://github.com/ClickHouse/ClickHouse/pull/27377) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added `bitmapSubsetOffsetLimit(bitmap, offset, cardinality_limit)` function. It creates a subset of bitmap limit the results to `cardinality_limit` with offset of `offset`. [#27234](https://github.com/ClickHouse/ClickHouse/pull/27234) ([DHBin](https://github.com/DHBin)). +* add column default_database to system.users. 
[#27054](https://github.com/ClickHouse/ClickHouse/pull/27054) ([kevin wan](https://github.com/MaxWk)). +* Added support for custom query for MySQL, PostgreSQL, ClickHouse, JDBC, Cassandra dictionary source. Closes [#1270](https://github.com/ClickHouse/ClickHouse/issues/1270). [#26995](https://github.com/ClickHouse/ClickHouse/pull/26995) ([Maksim Kita](https://github.com/kitaisreal)). +* Supported cluster macros inside table functions 'cluster' and 'clusterAllReplicas'. [#26913](https://github.com/ClickHouse/ClickHouse/pull/26913) ([polyprogrammist](https://github.com/PolyProgrammist)). +* Add new functions `currentRoles()`, `enabledRoles()`, `defaultRoles()`. [#26780](https://github.com/ClickHouse/ClickHouse/pull/26780) ([Vitaly Baranov](https://github.com/vitlibar)). +* New functions `currentProfiles()`, `enabledProfiles()`, `defaultProfiles()`. [#26714](https://github.com/ClickHouse/ClickHouse/pull/26714) ([Vitaly Baranov](https://github.com/vitlibar)). +* Introduce syntax for here documents. Example `SELECT $doc$VALUE$doc$`. [#26671](https://github.com/ClickHouse/ClickHouse/pull/26671) ([Maksim Kita](https://github.com/kitaisreal)). +* Functions that return (initial_)query_id of the current query. This closes [#23682](https://github.com/ClickHouse/ClickHouse/issues/23682). [#26410](https://github.com/ClickHouse/ClickHouse/pull/26410) ([Alexey Boykov](https://github.com/mathalex)). +* - Add `REPLACE GRANT` feature. [#26384](https://github.com/ClickHouse/ClickHouse/pull/26384) ([Caspian](https://github.com/Cas-pian)). +* Implement window function `nth_value(expr, N)` that returns the value of the Nth row of the window frame. [#26334](https://github.com/ClickHouse/ClickHouse/pull/26334) ([Zuo, RuoYu](https://github.com/ryzuo)). +* `EXPLAIN` query now has `EXPLAIN ESTIMATE ...` mode that will show information about read rows, marks and parts from MergeTree tables. Closes [#23941](https://github.com/ClickHouse/ClickHouse/issues/23941). [#26131](https://github.com/ClickHouse/ClickHouse/pull/26131) ([fastio](https://github.com/fastio)). +* Added `system.zookeeper_log` table. All actions of ZooKeeper client are logged into this table. Implements [#25449](https://github.com/ClickHouse/ClickHouse/issues/25449). [#26129](https://github.com/ClickHouse/ClickHouse/pull/26129) ([tavplubix](https://github.com/tavplubix)). +* Zero-copy replication for ReplicatedMergeTree over HDFS storage. [#25918](https://github.com/ClickHouse/ClickHouse/pull/25918) ([Zhichang Yu](https://github.com/yuzhichang)). +* Allow to insert Nested type as array of structs in Arrow, ORC and Parquet input format. Also add some refactoring around ArrowColumnToCHColumn. [#25902](https://github.com/ClickHouse/ClickHouse/pull/25902) ([Kruglov Pavel](https://github.com/Avogar)). +* Add a new datatype Date32 (store data as Int32), support date range same with Datetime64 support load parquet date32 to ClickHouse Date32 Add new function toDate32 like toDate Support some function :. [#25774](https://github.com/ClickHouse/ClickHouse/pull/25774) ([LiuNeng](https://github.com/liuneng1994)). +* enable setting default database per user [#25268](https://github.com/ClickHouse/ClickHouse/issues/25268). [#25687](https://github.com/ClickHouse/ClickHouse/pull/25687) ([kevin wan](https://github.com/MaxWk)). +* Added NLP functions for tokenization, stemming, lemmatizing and search in synonyms extensions. [#24997](https://github.com/ClickHouse/ClickHouse/pull/24997) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added S2 geometry library. 
[#24980](https://github.com/ClickHouse/ClickHouse/pull/24980) ([Andr0901](https://github.com/Andr0901)). +* Add support for INTERSECT, EXCEPT, ANY, ALL operators. [#24757](https://github.com/ClickHouse/ClickHouse/pull/24757) ([Kirill Ershov](https://github.com/zdikov)). +* Support conditions in the JOIN ON section. Closes [#21868](https://github.com/ClickHouse/ClickHouse/issues/21868). [#24420](https://github.com/ClickHouse/ClickHouse/pull/24420) ([Vladimir C](https://github.com/vdimir)). +* Add encryption at the virtual file system level using the AES_128_CTR algorithm ... [#24206](https://github.com/ClickHouse/ClickHouse/pull/24206) ([Latysheva Alexandra](https://github.com/alexelex)). +* Add SQLite table engine, table function, database engine. [#24194](https://github.com/ClickHouse/ClickHouse/pull/24194) ([Arslan Gumerov](https://github.com/g-arslan)). +* Implementation of short-circuit function evaluation, closes [#12587](https://github.com/ClickHouse/ClickHouse/issues/12587). Add setting `short_circuit_function_evaluation` to configure short-circuit function evaluation. [#23367](https://github.com/ClickHouse/ClickHouse/pull/23367) ([Kruglov Pavel](https://github.com/Avogar)). +* Add an optional parameter to MongoDB engine to accept connection string options and support SSL connection. Closes [#21189](https://github.com/ClickHouse/ClickHouse/issues/21189). Closes [#21041](https://github.com/ClickHouse/ClickHouse/issues/21041). [#22045](https://github.com/ClickHouse/ClickHouse/pull/22045) ([Omar Bazaraa](https://github.com/OmarBazaraa)). +* Added a compression codec `AES_128_GCM_SIV` which encrypts columns instead of compressing them. [#19896](https://github.com/ClickHouse/ClickHouse/pull/19896) ([PHO](https://github.com/depressed-pho)). + +#### Performance Improvement + +* Fewer `clock_gettime` syscalls, which may lead to a performance improvement for some types of fast queries. [#27492](https://github.com/ClickHouse/ClickHouse/pull/27492) ([filimonov](https://github.com/filimonov)). +* Improve the performance of fast queries when `max_execution_time=0` by reducing the number of `clock_gettime` system calls. [#27325](https://github.com/ClickHouse/ClickHouse/pull/27325) ([filimonov](https://github.com/filimonov)). +* Specialize date time related comparison to achieve better performance. This fixes [#27083](https://github.com/ClickHouse/ClickHouse/issues/27083). [#27122](https://github.com/ClickHouse/ClickHouse/pull/27122) ([Amos Bird](https://github.com/amosbird)). +* Share file descriptors in concurrent reads of the same files. There is no noticeable performance difference on Linux. But the number of opened files will be significantly (10..100 times) lower on typical servers and it makes operations easier. See [#26214](https://github.com/ClickHouse/ClickHouse/issues/26214). [#26768](https://github.com/ClickHouse/ClickHouse/pull/26768) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Improve latency of short queries that require reading from tables with a large number of columns. [#26371](https://github.com/ClickHouse/ClickHouse/pull/26371) ([Anton Popov](https://github.com/CurtizJ)). +* Don't build sets for indices when analyzing a query. [#26365](https://github.com/ClickHouse/ClickHouse/pull/26365) ([Raúl Marín](https://github.com/Algunenano)). +* Vectorize the SUM of Nullable integer types with native representation ([David Manzanares](https://github.com/davidmanzanares), [Raúl Marín](https://github.com/Algunenano)).
[#26248](https://github.com/ClickHouse/ClickHouse/pull/26248) ([Raúl Marín](https://github.com/Algunenano)). +* Compile columns with `Enum` types. [#26237](https://github.com/ClickHouse/ClickHouse/pull/26237) ([Maksim Kita](https://github.com/kitaisreal)). +* Compile aggregate functions `groupBitOr`, `groupBitAnd`, `groupBitXor`. [#26161](https://github.com/ClickHouse/ClickHouse/pull/26161) ([Maksim Kita](https://github.com/kitaisreal)). +* Improved memory usage with better block size prediction when reading empty DEFAULT columns. Closes [#17317](https://github.com/ClickHouse/ClickHouse/issues/17317). [#25917](https://github.com/ClickHouse/ClickHouse/pull/25917) ([Vladimir Chebotarev](https://github.com/excitoon)). +* Reduce memory usage and number of read rows in queries with `ORDER BY primary_key`. [#25721](https://github.com/ClickHouse/ClickHouse/pull/25721) ([Anton Popov](https://github.com/CurtizJ)). + +#### Improvement + +* Remove duplicate index analysis and avoid possible invalid limit checks during projection analysis. [#27742](https://github.com/ClickHouse/ClickHouse/pull/27742) ([Amos Bird](https://github.com/amosbird)). +* Enables query parameters to be passed in the body of http requests. [#27706](https://github.com/ClickHouse/ClickHouse/pull/27706) ([Hermano Lustosa](https://github.com/hllustosa)). +* - Add `FROM INFILE` command. [#27655](https://github.com/ClickHouse/ClickHouse/pull/27655) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Disable arrayJoin on partition expressions. [#27648](https://github.com/ClickHouse/ClickHouse/pull/27648) ([Raúl Marín](https://github.com/Algunenano)). +* Log client IP address if authentication fails. [#27514](https://github.com/ClickHouse/ClickHouse/pull/27514) ([Misko Lee](https://github.com/imiskolee)). +* Use bytes instead of strings for binary data in the GRPC protocol. [#27431](https://github.com/ClickHouse/ClickHouse/pull/27431) ([Vitaly Baranov](https://github.com/vitlibar)). +* Send response with error message if HTTP port is not set and user tries to send HTTP request to TCP port. [#27385](https://github.com/ClickHouse/ClickHouse/pull/27385) ([Braulio Valdivielso Martínez](https://github.com/BraulioVM)). +* Add Cast function for internal usage, which will not preserve type nullability, but non-internal cast will preserve according to setting cast_keep_nullable. Closes [#12636](https://github.com/ClickHouse/ClickHouse/issues/12636). [#27382](https://github.com/ClickHouse/ClickHouse/pull/27382) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add setting `log_formatted_queries` to log additional formatted query into `system.query_log`. It's useful for normalized query analysis because functions like `normalizeQuery` and `normalizeQueryKeepNames` don't parse/format queries in order to achieve better performance. [#27380](https://github.com/ClickHouse/ClickHouse/pull/27380) ([Amos Bird](https://github.com/amosbird)). +* Add two settings `max_hyperscan_regexp_length` and `max_hyperscan_regexp_total_length` to prevent huge regexp being used in hyperscan related functions, such as `multiMatchAny`. [#27378](https://github.com/ClickHouse/ClickHouse/pull/27378) ([Amos Bird](https://github.com/amosbird)). +* Memory consumed by bitmap aggregate functions now is taken into account for memory limits. This closes [#26555](https://github.com/ClickHouse/ClickHouse/issues/26555). [#27252](https://github.com/ClickHouse/ClickHouse/pull/27252) ([alexey-milovidov](https://github.com/alexey-milovidov)). 
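
As an editorial aside, here is a minimal sketch of how the two hyperscan limits described a few entries above could be applied. The limit values and the sample patterns below are arbitrary illustrations, not taken from this patch:

```sql
-- Illustrative values only; the settings cap regexp sizes for hyperscan-based functions such as multiMatchAny.
SET max_hyperscan_regexp_length = 1000;         -- limit for each individual pattern
SET max_hyperscan_regexp_total_length = 10000;  -- limit for all patterns of a single call combined
SELECT multiMatchAny('clickhouse', ['click.*house', '^ch']);
```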
+* Add a new data-skipping minmax index format for proper Nullable support. [#27250](https://github.com/ClickHouse/ClickHouse/pull/27250) ([Azat Khuzhin](https://github.com/azat)). +* Add 10 seconds cache for S3 proxy resolver. [#27216](https://github.com/ClickHouse/ClickHouse/pull/27216) ([ianton-ru](https://github.com/ianton-ru)). +* Split global mutex into individual regexp construction. This helps avoid huge regexp construction blocking other related threads. Not sure how to properly test the improvement. [#27211](https://github.com/ClickHouse/ClickHouse/pull/27211) ([Amos Bird](https://github.com/amosbird)). +* Support schema for postgres database engine. Closes [#27166](https://github.com/ClickHouse/ClickHouse/issues/27166). [#27198](https://github.com/ClickHouse/ClickHouse/pull/27198) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Memory client in client. [#27191](https://github.com/ClickHouse/ClickHouse/pull/27191) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Mark window functions as ready for general use. Remove the `allow_experimental_window_functions` setting. [#27184](https://github.com/ClickHouse/ClickHouse/pull/27184) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* Try recording `query_kind` even when query fails to start. [#27182](https://github.com/ClickHouse/ClickHouse/pull/27182) ([Amos Bird](https://github.com/amosbird)). +* Added column `replica_is_active`, which maps replica name to replica active status, to table `system.replicas`. Closes [#27138](https://github.com/ClickHouse/ClickHouse/issues/27138). [#27180](https://github.com/ClickHouse/ClickHouse/pull/27180) ([Maksim Kita](https://github.com/kitaisreal)). +* Allow to pass query settings via server URI in Web UI. [#27177](https://github.com/ClickHouse/ClickHouse/pull/27177) ([kolsys](https://github.com/kolsys)). +* Add a new metric called `MaxPushedDDLEntryID`, which is the maximum DDL entry id that the current node has pushed to ZooKeeper. [#27174](https://github.com/ClickHouse/ClickHouse/pull/27174) ([Fuwang Hu](https://github.com/fuwhu)). +* Add compression for `INTO OUTFILE` that automatically chooses the compression algorithm. Closes [#3473](https://github.com/ClickHouse/ClickHouse/issues/3473). [#27134](https://github.com/ClickHouse/ClickHouse/pull/27134) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Improved the node existence check and the empty-string node check when clickhouse-keeper creates a znode. [#27125](https://github.com/ClickHouse/ClickHouse/pull/27125) ([小路](https://github.com/nicelulu)). +* Enable distributed_push_down_limit by default. [#27104](https://github.com/ClickHouse/ClickHouse/pull/27104) ([Azat Khuzhin](https://github.com/azat)). +* Improve compatibility with non-whole-minute timezone offsets. [#27080](https://github.com/ClickHouse/ClickHouse/pull/27080) ([Raúl Marín](https://github.com/Algunenano)). +* Merge join correctly handles an empty set on the right side. [#27078](https://github.com/ClickHouse/ClickHouse/pull/27078) ([Vladimir C](https://github.com/vdimir)). +* Now functions can be shard-level constants, which means that if such a function is executed in the context of some distributed table, it generates a normal column, otherwise it produces a constant value. Notable functions are: `hostName()`, `tcpPort()`, `version()`, `buildId()`, `uptime()`, etc. [#27020](https://github.com/ClickHouse/ClickHouse/pull/27020) ([Amos Bird](https://github.com/amosbird)).
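
A rough illustration of the shard-level constant behaviour described in the last entry above; the cluster name `default` is a placeholder for any configured cluster, not something defined by this patch:

```sql
-- hostName() is a constant when evaluated locally ...
SELECT hostName();

-- ... but becomes an ordinary per-shard column when the query runs through a cluster context.
SELECT hostName(), count()
FROM clusterAllReplicas('default', system.one)
GROUP BY hostName();
```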
+* Updated `extractAllGroupsHorizontal`: the upper limit on the number of matches per row can be set via an optional third argument. ... [#26961](https://github.com/ClickHouse/ClickHouse/pull/26961) ([Vasily Nemkov](https://github.com/Enmk)). +* Expose rocksdb statistics via system.rocksdb table. Read rocksdb options from ClickHouse config (`rocksdb`/`rocksdb_TABLE` keys). [#26821](https://github.com/ClickHouse/ClickHouse/pull/26821) ([Azat Khuzhin](https://github.com/azat)). +* Less verbose internal RocksDB logs. This closes [#26252](https://github.com/ClickHouse/ClickHouse/issues/26252). [#26789](https://github.com/ClickHouse/ClickHouse/pull/26789) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Changing default roles affects new sessions only. [#26759](https://github.com/ClickHouse/ClickHouse/pull/26759) ([Vitaly Baranov](https://github.com/vitlibar)). +* Watchdog is disabled in docker by default. Fix for not handling ctrl+c. [#26757](https://github.com/ClickHouse/ClickHouse/pull/26757) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). +* Remove unnecessary exception throwing. [#26740](https://github.com/ClickHouse/ClickHouse/pull/26740) ([Caspian](https://github.com/Cas-pian)). +* Support multiple keys for encrypted disk. Display error message if the key is probably wrong. (see https://github.com/ClickHouse/ClickHouse/pull/26465#issuecomment-882015970). [#26733](https://github.com/ClickHouse/ClickHouse/pull/26733) ([Vitaly Baranov](https://github.com/vitlibar)). +* SET PROFILE now applies constraints too if they're set for a passed profile. [#26730](https://github.com/ClickHouse/ClickHouse/pull/26730) ([Vitaly Baranov](https://github.com/vitlibar)). +* Improve handling of KILL QUERY requests. [#26675](https://github.com/ClickHouse/ClickHouse/pull/26675) ([Raúl Marín](https://github.com/Algunenano)). +* `mapPopulateSeries` function supports `Map` type. [#26663](https://github.com/ClickHouse/ClickHouse/pull/26663) ([Ildus Kurbangaliev](https://github.com/ildus)). +* Fix excessive (x2) connect attempts with skip_unavailable_shards. [#26658](https://github.com/ClickHouse/ClickHouse/pull/26658) ([Azat Khuzhin](https://github.com/azat)). +* Avoid hanging clickhouse-benchmark if connection fails (i.e. on EMFILE). [#26656](https://github.com/ClickHouse/ClickHouse/pull/26656) ([Azat Khuzhin](https://github.com/azat)). +* Improve usage of the Kafka engine on high-performance machines, which can reduce the workload of the query node. [#26642](https://github.com/ClickHouse/ClickHouse/pull/26642) ([feihengye](https://github.com/feihengye)). +* Add round-robin support for clickhouse-benchmark (it does not differ from the regular multi host/port run except for statistics report). [#26607](https://github.com/ClickHouse/ClickHouse/pull/26607) ([Azat Khuzhin](https://github.com/azat)). +* Executable dictionaries (ExecutableDictionarySource, ExecutablePoolDictionarySource) enable creation with DDL query using clickhouse-local. Closes [#22355](https://github.com/ClickHouse/ClickHouse/issues/22355). [#26510](https://github.com/ClickHouse/ClickHouse/pull/26510) ([Maksim Kita](https://github.com/kitaisreal)). +* Set client query kind for MySQL and PostgreSQL handlers. [#26498](https://github.com/ClickHouse/ClickHouse/pull/26498) ([anneji-dev](https://github.com/anneji-dev)). +* Apply `LIMIT` on the shards for queries like `SELECT * FROM dist ORDER BY key LIMIT 10` w/ `distributed_push_down_limit=1`.
Avoid running `Distinct`/`LIMIT BY` steps for queries like `SELECT DISTINCT sharding_key FROM dist ORDER BY key`. Now `distributed_push_down_limit` is respected by `optimize_distributed_group_by_sharding_key` optimization. [#26466](https://github.com/ClickHouse/ClickHouse/pull/26466) ([Azat Khuzhin](https://github.com/azat)). +* After https://github.com/ClickHouse/ClickHouse/pull/26377. The encryption algorithm now should be specified explicitly if it's not the default (`aes_128_ctr`):. [#26465](https://github.com/ClickHouse/ClickHouse/pull/26465) ([Vitaly Baranov](https://github.com/vitlibar)). +* Updated protobuf to 3.17.3. Changelogs are available on https://github.com/protocolbuffers/protobuf/releases. [#26424](https://github.com/ClickHouse/ClickHouse/pull/26424) ([Ilya Yatsishin](https://github.com/qoega)). +* Enable `use_hedged_requests` setting that allows to mitigate tail latencies on large clusters. [#26380](https://github.com/ClickHouse/ClickHouse/pull/26380) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Added comments for the code written in https://github.com/ClickHouse/ClickHouse/pull/24206; the code has been improved in several places. [#26377](https://github.com/ClickHouse/ClickHouse/pull/26377) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix behaviour with non-existing host in user allowed host list. [#26368](https://github.com/ClickHouse/ClickHouse/pull/26368) ([ianton-ru](https://github.com/ianton-ru)). +* Add ability to set Distributed directory monitor settings via CREATE TABLE (i.e. `CREATE TABLE dist (key Int) Engine=Distributed(cluster, db, table) SETTINGS monitor_batch_inserts=1` and similar). [#26336](https://github.com/ClickHouse/ClickHouse/pull/26336) ([Azat Khuzhin](https://github.com/azat)). +* Save server address in history URLs in web UI if it differs from the origin of web UI. This closes [#26044](https://github.com/ClickHouse/ClickHouse/issues/26044). [#26322](https://github.com/ClickHouse/ClickHouse/pull/26322) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add events to profile calls to sleep / sleepEachRow. [#26320](https://github.com/ClickHouse/ClickHouse/pull/26320) ([Raúl Marín](https://github.com/Algunenano)). +* Allow to reuse connections of shards among different clusters. It also avoids creating new connections when using `cluster` table function. [#26318](https://github.com/ClickHouse/ClickHouse/pull/26318) ([Amos Bird](https://github.com/amosbird)). +* Control the execution period of clearing old temporary directories by a parameter with a default value. [#26212](https://github.com/ClickHouse/ClickHouse/issues/26212). [#26313](https://github.com/ClickHouse/ClickHouse/pull/26313) ([fastio](https://github.com/fastio)). +* Add a setting `function_range_max_elements_in_block` to tune the safety threshold for data volume generated by function `range`. This closes [#26303](https://github.com/ClickHouse/ClickHouse/issues/26303). [#26305](https://github.com/ClickHouse/ClickHouse/pull/26305) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* When a client connects to the server, it receives information about all warnings that have already been collected by the server. (It can be disabled by using option `--no-warnings`). [#26282](https://github.com/ClickHouse/ClickHouse/pull/26282) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Make `toTimeZone` monotonic when the timezone is a constant value, to support partition pruning when using SQL like:.
[#26261](https://github.com/ClickHouse/ClickHouse/pull/26261) ([huangzhaowei](https://github.com/SaintBacchus)). +* Check hash function at table creation, not at sampling. Add settings in MergeTreeSettings, if someone create a table with incorrect sampling column but sampling never be used, disable this settings for starting the server without exception. [#26256](https://github.com/ClickHouse/ClickHouse/pull/26256) ([zhaoyu](https://github.com/zxc111)). +* - Add `system.warnings` table to collect warnings about server configuration. [#26246](https://github.com/ClickHouse/ClickHouse/pull/26246) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Added `output_format_avro_string_column_pattern` setting to put specified String columns to Avro as string instead of default bytes. Implements [#22414](https://github.com/ClickHouse/ClickHouse/issues/22414). [#26245](https://github.com/ClickHouse/ClickHouse/pull/26245) ([Ilya Golshtein](https://github.com/ilejn)). +* Add information about column sizes in `system.columns` table for `Log` and `TinyLog` tables. This closes [#9001](https://github.com/ClickHouse/ClickHouse/issues/9001). [#26241](https://github.com/ClickHouse/ClickHouse/pull/26241) ([Nikolay Degterinsky](https://github.com/evillique)). +* Don't throw exception when querying `system.detached_parts` table if there is custom disk configuration and `detached` directory does not exist on some disks. This closes [#26078](https://github.com/ClickHouse/ClickHouse/issues/26078). [#26236](https://github.com/ClickHouse/ClickHouse/pull/26236) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Check for non-deterministic functions in keys, including constant expressions like `now()`, `today()`. This closes [#25875](https://github.com/ClickHouse/ClickHouse/issues/25875). This closes [#11333](https://github.com/ClickHouse/ClickHouse/issues/11333). [#26235](https://github.com/ClickHouse/ClickHouse/pull/26235) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* convert timestamp and timestamptz data types to DateTime64 in postgres engine. [#26234](https://github.com/ClickHouse/ClickHouse/pull/26234) ([jasine](https://github.com/jasine)). +* Apply aggressive IN index analysis for projections so that better projection candidate can be selected. [#26218](https://github.com/ClickHouse/ClickHouse/pull/26218) ([Amos Bird](https://github.com/amosbird)). +* Remove GLOBAL keyword for IN when scalar function is passed. In previous versions, if user specified `GLOBAL IN f(x)` exception was thrown. [#26217](https://github.com/ClickHouse/ClickHouse/pull/26217) ([Amos Bird](https://github.com/amosbird)). +* Add error id (like `BAD_ARGUMENTS`) to exception messages. This closes [#25862](https://github.com/ClickHouse/ClickHouse/issues/25862). [#26172](https://github.com/ClickHouse/ClickHouse/pull/26172) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix incorrect output with --progress option for clickhouse-local. Progress bar will be cleared once it gets to 100% - same as it is done for clickhouse-client. Closes [#17484](https://github.com/ClickHouse/ClickHouse/issues/17484). [#26128](https://github.com/ClickHouse/ClickHouse/pull/26128) ([Kseniia Sumarokova](https://github.com/kssenii)). +* add merge_selecting_sleep_ms setting. [#26120](https://github.com/ClickHouse/ClickHouse/pull/26120) ([lthaooo](https://github.com/lthaooo)). +* Remove complicated usage of Linux AIO with one block readahead and replace it with plain simple synchronous IO with O_DIRECT. 
In previous versions, the setting `min_bytes_to_use_direct_io` may not work correctly if `max_threads` is greater than one. Reading with direct IO (that is disabled by default for queries and enabled by default for large merges) will work in a less efficient way. This closes [#25997](https://github.com/ClickHouse/ClickHouse/issues/25997). [#26003](https://github.com/ClickHouse/ClickHouse/pull/26003) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* If the file descriptor in a `File` table is a regular file, allow reading from it multiple times. It allows `clickhouse-local` to read multiple times from stdin (with multiple SELECT queries or subqueries) if stdin is a regular file like `clickhouse-local --query "SELECT * FROM table UNION ALL SELECT * FROM table" ... < file`. This closes [#11124](https://github.com/ClickHouse/ClickHouse/issues/11124). Co-authored with @alexey-milovidov. [#25960](https://github.com/ClickHouse/ClickHouse/pull/25960) ([BoloniniD](https://github.com/BoloniniD)). +* Flush `Distributed` table on `REPLACE TABLE` query. Resolves [#24566](https://github.com/ClickHouse/ClickHouse/issues/24566) - Do not replace (or create) table on `[CREATE OR] REPLACE TABLE ... AS SELECT` query if insertion into new table fails. Resolves [#23175](https://github.com/ClickHouse/ClickHouse/issues/23175). [#25895](https://github.com/ClickHouse/ClickHouse/pull/25895) ([tavplubix](https://github.com/tavplubix)). +* Add `views` column to system.query_log containing the names of the (materialized or live) views executed by the query. - Adds a new log table (`system.query_views_log`) that contains information about each view executed during a query. - Modifies view execution: When an exception is thrown while executing a view, any view that has already started will continue running until it finishes. This used to be the behaviour under parallel_view_processing=true and now it's always the same behaviour. - Dependent views now report reading progress to the context. [#25714](https://github.com/ClickHouse/ClickHouse/pull/25714) ([Raúl Marín](https://github.com/Algunenano)). +* Do connection draining asynchronously upon finishing executing distributed queries. A new server setting `max_threads_for_connection_collector` is added, which specifies the number of workers that recycle connections in the background. If the pool is full, the connection will be drained synchronously but a bit differently than before: it's drained after we send EOS to the client, the query will succeed immediately after receiving enough data, and any exception will be logged instead of being thrown to the client. Added setting `drain_timeout` (3 seconds by default). Connection draining will disconnect upon timeout. [#25674](https://github.com/ClickHouse/ClickHouse/pull/25674) ([Amos Bird](https://github.com/amosbird)). +* Support for multiple includes in configuration. It is possible to include users configuration, remote servers configuration from multiple sources. Simply place `` element with `from_zk`, `from_env` or `incl` attribute and it will be replaced with the substitution. [#24404](https://github.com/ClickHouse/ClickHouse/pull/24404) ([nvartolomei](https://github.com/nvartolomei)). +* Fix multiple block insertion into distributed table with `insert_distributed_one_random_shard = 1`. This is a marginal feature. Mark as improvement. [#23140](https://github.com/ClickHouse/ClickHouse/pull/23140) ([Amos Bird](https://github.com/amosbird)). +* Support `LowCardinality` and `FixedString` keys/values for `Map` type.
[#21543](https://github.com/ClickHouse/ClickHouse/pull/21543) ([hexiaoting](https://github.com/hexiaoting)). +* Enable reloading of local disk config. [#19526](https://github.com/ClickHouse/ClickHouse/pull/19526) ([taiyang-li](https://github.com/taiyang-li)). +* Now we ran stateful and stateless tests in random timezones. Fixes [#12439](https://github.com/ClickHouse/ClickHouse/issues/12439). Reading String as DateTime and writing DateTime as String in Protobuf format now respect timezone. Reading UInt16 as DateTime in Arrow and Parquet formats now treat it as Date and then converts to DateTime with respect to DateTime's timezone, because Date is serialized in Arrow and Parquet as UInt16. GraphiteMergeTree now respect time zone for rounding of times. Fixes [#5098](https://github.com/ClickHouse/ClickHouse/issues/5098). Author: @alexey-milovidov. [#15408](https://github.com/ClickHouse/ClickHouse/pull/15408) ([alesapin](https://github.com/alesapin)). +* Now KeyConditions can correctly skip nullable keys, including isNull and isNotNull. https://github.com/ClickHouse/ClickHouse/pull/12433 Cond. [#12455](https://github.com/ClickHouse/ClickHouse/pull/12455) ([Amos Bird](https://github.com/amosbird)). + +#### Bug Fix + +* Fix a couple of bugs that may cause replicas to diverge. [#27808](https://github.com/ClickHouse/ClickHouse/pull/27808) ([tavplubix](https://github.com/tavplubix)). +* Fix a rare bug in `DROP PART` which can lead to the error `Unexpected merged part intersects drop range`. [#27807](https://github.com/ClickHouse/ClickHouse/pull/27807) ([alesapin](https://github.com/alesapin)). +* Prevent crashes for some formats when NULL (tombstone) message was coming from Kafka. Closes [#19255](https://github.com/ClickHouse/ClickHouse/issues/19255). [#27794](https://github.com/ClickHouse/ClickHouse/pull/27794) ([filimonov](https://github.com/filimonov)). +* Fix possible crash when asynchronous connection draining is enabled and hedged connection is disabled. [#27774](https://github.com/ClickHouse/ClickHouse/pull/27774) ([Amos Bird](https://github.com/amosbird)). +* Remove duplicated source files in CMakeLists.txt in arrow-cmake. [#27736](https://github.com/ClickHouse/ClickHouse/pull/27736) ([taiyang-li](https://github.com/taiyang-li)). +* After https://github.com/ClickHouse/ClickHouse/pull/25687. Add backquotes for the default database shown in CREATE USER. [#27702](https://github.com/ClickHouse/ClickHouse/pull/27702) ([Vitaly Baranov](https://github.com/vitlibar)). +* After https://github.com/ClickHouse/ClickHouse/pull/26384. To execute `GRANT WITH REPLACE OPTION` now the current user should have `GRANT OPTION` for access rights it's going to grant AND for access rights it's going to revoke. [#27701](https://github.com/ClickHouse/ClickHouse/pull/27701) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix column filtering with union distinct in subquery. Closes [#27578](https://github.com/ClickHouse/ClickHouse/issues/27578). [#27689](https://github.com/ClickHouse/ClickHouse/pull/27689) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix bad type cast when functions like `arrayHas` are applied to arrays of LowCardinality of Nullable of different non-numeric types like `DateTime` and `DateTime64`. In previous versions bad cast occurs. In new version it will lead to exception. This closes [#26330](https://github.com/ClickHouse/ClickHouse/issues/26330). [#27682](https://github.com/ClickHouse/ClickHouse/pull/27682) ([alexey-milovidov](https://github.com/alexey-milovidov)). 
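
A small sketch of the `WITH REPLACE OPTION` grant semantics referenced in the bug-fix entry just above (and introduced in the New Feature list earlier); the user and database names here are hypothetical, not taken from this patch:

```sql
CREATE USER IF NOT EXISTS test_user;
GRANT SELECT ON db1.* TO test_user;
-- Replaces all previous access rights of test_user with the new set;
-- per the fix above, the grantor needs GRANT OPTION for both the granted and the revoked rights.
GRANT INSERT ON db1.* TO test_user WITH REPLACE OPTION;
```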
+* Fix postgresql table function resulting in non-closing connections. Closes [#26088](https://github.com/ClickHouse/ClickHouse/issues/26088). [#27662](https://github.com/ClickHouse/ClickHouse/pull/27662) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fixed another case of `Unexpected merged part ... intersecting drop range ...` error. [#27656](https://github.com/ClickHouse/ClickHouse/pull/27656) ([tavplubix](https://github.com/tavplubix)). +* Fix bug with aliased column in `Distributed` table. [#27652](https://github.com/ClickHouse/ClickHouse/pull/27652) ([Vladimir C](https://github.com/vdimir)). +* After setting `max_memory_usage*` to non-zero value it was not possible to reset it back to 0 (unlimited). It's fixed. [#27638](https://github.com/ClickHouse/ClickHouse/pull/27638) ([tavplubix](https://github.com/tavplubix)). +* Fixed underflow of the time value when constructing it from components. Closes [#27193](https://github.com/ClickHouse/ClickHouse/issues/27193). [#27605](https://github.com/ClickHouse/ClickHouse/pull/27605) ([Vasily Nemkov](https://github.com/Enmk)). +* Fix crash during projection materialization when some parts contain missing columns. This fixes [#27512](https://github.com/ClickHouse/ClickHouse/issues/27512). [#27528](https://github.com/ClickHouse/ClickHouse/pull/27528) ([Amos Bird](https://github.com/amosbird)). +* Fix the `BackgroundMessageBrokerSchedulePoolTask` metric, which was possibly mistyped. [#27452](https://github.com/ClickHouse/ClickHouse/pull/27452) ([Ben](https://github.com/benbiti)). +* Fix distributed queries with zero shards and aggregation. [#27427](https://github.com/ClickHouse/ClickHouse/pull/27427) ([Azat Khuzhin](https://github.com/azat)). +* /proc/info contains metrics like. [#27361](https://github.com/ClickHouse/ClickHouse/pull/27361) ([Mike Kot](https://github.com/myrrc)). +* Fix incorrect result for query with row-level security, prewhere and LowCardinality filter. Fixes [#27179](https://github.com/ClickHouse/ClickHouse/issues/27179). [#27329](https://github.com/ClickHouse/ClickHouse/pull/27329) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fixed incorrect validation of partition id for MergeTree tables that were created with the old syntax. [#27328](https://github.com/ClickHouse/ClickHouse/pull/27328) ([tavplubix](https://github.com/tavplubix)). +* Fix MySQL protocol when using parallel formats (CSV / TSV). [#27326](https://github.com/ClickHouse/ClickHouse/pull/27326) ([Raúl Marín](https://github.com/Algunenano)). +* Fix `Cannot find column` error for queries with sampling. Was introduced in [#24574](https://github.com/ClickHouse/ClickHouse/issues/24574). Fixes [#26522](https://github.com/ClickHouse/ClickHouse/issues/26522). [#27301](https://github.com/ClickHouse/ClickHouse/pull/27301) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix errors like `Expected ColumnLowCardinality, gotUInt8` or `Bad cast from type DB::ColumnVector to DB::ColumnLowCardinality` for some queries with `LowCardinality` in `PREWHERE`. Fixes [#23515](https://github.com/ClickHouse/ClickHouse/issues/23515). [#27298](https://github.com/ClickHouse/ClickHouse/pull/27298) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix `distributed_group_by_no_merge=2`+`distributed_push_down_limit=1` or `optimize_distributed_group_by_sharding_key=1` with `LIMIT BY` and `LIMIT OFFSET`. [#27249](https://github.com/ClickHouse/ClickHouse/pull/27249) ([Azat Khuzhin](https://github.com/azat)). +* Fix mutation stuck on invalid partitions in non-replicated MergeTree.
[#27248](https://github.com/ClickHouse/ClickHouse/pull/27248) ([Azat Khuzhin](https://github.com/azat)). +* In case of ambiguity, lambda functions prefer its arguments to other aliases or identifiers. [#27235](https://github.com/ClickHouse/ClickHouse/pull/27235) ([Raúl Marín](https://github.com/Algunenano)). +* Fix column structure in merge join, close [#27091](https://github.com/ClickHouse/ClickHouse/issues/27091). [#27217](https://github.com/ClickHouse/ClickHouse/pull/27217) ([Vladimir C](https://github.com/vdimir)). +* Fix on-disk format breakage for secondary indices over Nullable column (no stable release had been affected). [#27197](https://github.com/ClickHouse/ClickHouse/pull/27197) ([Azat Khuzhin](https://github.com/azat)). +* In rare cases `system.detached_parts` table might contain incorrect information for some parts, it's fixed. Fixes [#27114](https://github.com/ClickHouse/ClickHouse/issues/27114). [#27183](https://github.com/ClickHouse/ClickHouse/pull/27183) ([tavplubix](https://github.com/tavplubix)). +* - Fix uninitialized memory in functions `multiSearch*` with empty array, close [#27169](https://github.com/ClickHouse/ClickHouse/issues/27169). [#27181](https://github.com/ClickHouse/ClickHouse/pull/27181) ([Vladimir C](https://github.com/vdimir)). +* Fix synchronization in GRPCServer This PR fixes [#27024](https://github.com/ClickHouse/ClickHouse/issues/27024). [#27064](https://github.com/ClickHouse/ClickHouse/pull/27064) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fixed `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache` configuration parsing. Options `allow_read_expired_keys`, `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds` were not parsed for dictionaries with non `cache` type. [#27032](https://github.com/ClickHouse/ClickHouse/pull/27032) ([Maksim Kita](https://github.com/kitaisreal)). +* [RFC] Fix possible mutation stack due to race with DROP_RANGE. [#27002](https://github.com/ClickHouse/ClickHouse/pull/27002) ([Azat Khuzhin](https://github.com/azat)). +* Now partition ID in queries like `ALTER TABLE ... PARTITION ID xxx` validates for correctness. Fixes [#25718](https://github.com/ClickHouse/ClickHouse/issues/25718). [#26963](https://github.com/ClickHouse/ClickHouse/pull/26963) ([alesapin](https://github.com/alesapin)). +* Fix "Unknown column name" error with multiple JOINs in some cases, close [#26899](https://github.com/ClickHouse/ClickHouse/issues/26899). [#26957](https://github.com/ClickHouse/ClickHouse/pull/26957) ([Vladimir C](https://github.com/vdimir)). +* Fix reading of custom TLDs (stops processing with lower buffer or bigger file). [#26948](https://github.com/ClickHouse/ClickHouse/pull/26948) ([Azat Khuzhin](https://github.com/azat)). +* Fix error `Missing columns: 'xxx'` when `DEFAULT` column references other non materialized column without `DEFAULT` expression. Fixes [#26591](https://github.com/ClickHouse/ClickHouse/issues/26591). [#26900](https://github.com/ClickHouse/ClickHouse/pull/26900) ([alesapin](https://github.com/alesapin)). +* Fix library-bridge ids load. [#26834](https://github.com/ClickHouse/ClickHouse/pull/26834) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Aggregate function parameters might be lost when applying some combinators causing exceptions like `Conversion from AggregateFunction(topKArray, Array(String)) to AggregateFunction(topKArray(10), Array(String)) is not supported`. It's fixed. 
Fixes [#26196](https://github.com/ClickHouse/ClickHouse/issues/26196) and [#26433](https://github.com/ClickHouse/ClickHouse/issues/26433). [#26814](https://github.com/ClickHouse/ClickHouse/pull/26814) ([tavplubix](https://github.com/tavplubix)). +* Add `event_time_microseconds` value for `REMOVE_PART` in `system.part_log`. In previous versions it was not set. [#26720](https://github.com/ClickHouse/ClickHouse/pull/26720) ([Azat Khuzhin](https://github.com/azat)). +* Do not remove data on ReplicatedMergeTree table shutdown to avoid creating inconsistency between data and metadata. [#26716](https://github.com/ClickHouse/ClickHouse/pull/26716) ([nvartolomei](https://github.com/nvartolomei)). +* Sometimes SET ROLE could work incorrectly, this PR fixes that. [#26707](https://github.com/ClickHouse/ClickHouse/pull/26707) ([Vitaly Baranov](https://github.com/vitlibar)). +* ParallelFormattingOutputFormat: Use mutex to handle the join to the collector_thread (https://github.com/ClickHouse/ClickHouse/issues/26694). [#26703](https://github.com/ClickHouse/ClickHouse/pull/26703) ([Raúl Marín](https://github.com/Algunenano)). +* Fix potential nullptr dereference in window functions. This fixes [#25276](https://github.com/ClickHouse/ClickHouse/issues/25276). [#26668](https://github.com/ClickHouse/ClickHouse/pull/26668) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* Fix history file conversion if file is empty. [#26589](https://github.com/ClickHouse/ClickHouse/pull/26589) ([Azat Khuzhin](https://github.com/azat)). +* Fix incorrect function names of groupBitmapAnd/Or/Xor. This fixes. [#26557](https://github.com/ClickHouse/ClickHouse/pull/26557) ([Amos Bird](https://github.com/amosbird)). +* Update `chown` cmd check in clickhouse-server docker entrypoint. It fixes the bug where cluster pod restart failed (or timed out) on Kubernetes. [#26545](https://github.com/ClickHouse/ClickHouse/pull/26545) ([Ky Li](https://github.com/Kylinrix)). +* Fix crash in rabbitmq shutdown in case rabbitmq setup was not started. Closes [#26504](https://github.com/ClickHouse/ClickHouse/issues/26504). [#26529](https://github.com/ClickHouse/ClickHouse/pull/26529) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix issues with `CREATE DICTIONARY` query if dictionary name or database name was quoted. Closes [#26491](https://github.com/ClickHouse/ClickHouse/issues/26491). [#26508](https://github.com/ClickHouse/ClickHouse/pull/26508) ([Maksim Kita](https://github.com/kitaisreal)). +* Fix broken name resolution after rewriting column aliases. This fixes [#26432](https://github.com/ClickHouse/ClickHouse/issues/26432). [#26475](https://github.com/ClickHouse/ClickHouse/pull/26475) ([Amos Bird](https://github.com/amosbird)). +* Fix some fuzzed msan crash. Fixes [#22517](https://github.com/ClickHouse/ClickHouse/issues/22517). [#26428](https://github.com/ClickHouse/ClickHouse/pull/26428) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Now, scalar subquery always returns `Nullable` result if its type can be `Nullable`. It is needed because in case of an empty subquery its result should be `Null`. Previously, it was possible to get an error about incompatible types (type deduction does not execute scalar subquery, and it could use a not-nullable type). Scalar subquery with an empty result which can't be converted to `Nullable` (like `Array` or `Tuple`) now throws an error. Fixes [#25411](https://github.com/ClickHouse/ClickHouse/issues/25411).
[#26423](https://github.com/ClickHouse/ClickHouse/pull/26423) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix infinite non joined block stream in `partial_merge_join` close [#26325](https://github.com/ClickHouse/ClickHouse/issues/26325). [#26374](https://github.com/ClickHouse/ClickHouse/pull/26374) ([Vladimir C](https://github.com/vdimir)). +* Fix possible crash when login as dropped user. This PR fixes [#26073](https://github.com/ClickHouse/ClickHouse/issues/26073). [#26363](https://github.com/ClickHouse/ClickHouse/pull/26363) ([Vitaly Baranov](https://github.com/vitlibar)). +* Fix `optimize_distributed_group_by_sharding_key` for multiple columns (leads to incorrect result w/ `optimize_skip_unused_shards=1`/`allow_nondeterministic_optimize_skip_unused_shards=1` and multiple columns in sharding key expression). [#26353](https://github.com/ClickHouse/ClickHouse/pull/26353) ([Azat Khuzhin](https://github.com/azat)). +* Fixed rare bug in lost replica recovery that may cause replicas to diverge. [#26321](https://github.com/ClickHouse/ClickHouse/pull/26321) ([tavplubix](https://github.com/tavplubix)). +* Fix zstd decompression in case there are escape sequences at the end of internal buffer. Closes [#26013](https://github.com/ClickHouse/ClickHouse/issues/26013). [#26314](https://github.com/ClickHouse/ClickHouse/pull/26314) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix logical error on join with totals, close [#26017](https://github.com/ClickHouse/ClickHouse/issues/26017). [#26250](https://github.com/ClickHouse/ClickHouse/pull/26250) ([Vladimir C](https://github.com/vdimir)). +* Remove excessive newline in `thread_name` column in `system.stack_trace` table. This fixes [#24124](https://github.com/ClickHouse/ClickHouse/issues/24124). [#26210](https://github.com/ClickHouse/ClickHouse/pull/26210) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix potential crash if more than one `untuple` expression is used. [#26179](https://github.com/ClickHouse/ClickHouse/pull/26179) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Don't throw exception in `toString` for nullable enum, close [#25806](https://github.com/ClickHouse/ClickHouse/issues/25806). [#26123](https://github.com/ClickHouse/ClickHouse/pull/26123) ([Vladimir C](https://github.com/vdimir)). +* Fixed incorrect `sequence_id` in MySQL protocol packets that ClickHouse sends on exception during query execution. It might cause MySQL client to reset connection to ClickHouse server. Fixes [#21184](https://github.com/ClickHouse/ClickHouse/issues/21184). [#26051](https://github.com/ClickHouse/ClickHouse/pull/26051) ([tavplubix](https://github.com/tavplubix)). +* Before this patch `cutToFirstSignificantSubdomainCustom()`/`cutToFirstSignificantSubdomainCustomWithWWW()`/`firstSignificantSubdomainCustom()` returns incorrect type for consts, and hence `optimize_skip_unused_shards` does not work:. [#26041](https://github.com/ClickHouse/ClickHouse/pull/26041) ([Azat Khuzhin](https://github.com/azat)). +* Fix possible mismatched header when using normal projection with prewhere. This fixes [#26020](https://github.com/ClickHouse/ClickHouse/issues/26020). [#26038](https://github.com/ClickHouse/ClickHouse/pull/26038) ([Amos Bird](https://github.com/amosbird)). +* Fix sharding_key from column w/o function for remote() (before `select * from remote('127.1', system.one, dummy)` leads to `Unknown column: dummy, there are only columns .` error). 
[#25824](https://github.com/ClickHouse/ClickHouse/pull/25824) ([Azat Khuzhin](https://github.com/azat)). +* Fixed `Not found column ...` and `Missing column ...` errors when selecting from `MaterializeMySQL`. Fixes [#23708](https://github.com/ClickHouse/ClickHouse/issues/23708), [#24830](https://github.com/ClickHouse/ClickHouse/issues/24830), [#25794](https://github.com/ClickHouse/ClickHouse/issues/25794). [#25822](https://github.com/ClickHouse/ClickHouse/pull/25822) ([tavplubix](https://github.com/tavplubix)). +* Fix `optimize_skip_unused_shards_rewrite_in` for non-UInt64 types (may select incorrect shards eventually or throw `Cannot infer type of an empty tuple` or `Function tuple requires at least one argument`). [#25798](https://github.com/ClickHouse/ClickHouse/pull/25798) ([Azat Khuzhin](https://github.com/azat)). + +#### Build/Testing/Packaging Improvement + +* Improve support for build with `clang-13`. [#27777](https://github.com/ClickHouse/ClickHouse/pull/27777) ([Sergei Semin](https://github.com/syominsergey)). +* Add support for build with `clang-13`. This closes [#27705](https://github.com/ClickHouse/ClickHouse/issues/27705). [#27714](https://github.com/ClickHouse/ClickHouse/pull/27714) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Add CMake options to build with or without specific CPU instruction set. This is for [#17469](https://github.com/ClickHouse/ClickHouse/issues/17469) and [#27509](https://github.com/ClickHouse/ClickHouse/issues/27509). [#27508](https://github.com/ClickHouse/ClickHouse/pull/27508) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Fix linking of auxiliar programs when using dynamic libraries. [#26958](https://github.com/ClickHouse/ClickHouse/pull/26958) ([Raúl Marín](https://github.com/Algunenano)). +* Tests: Fix CLICKHOUSE_CLIENT_SECURE with the default config. [#26901](https://github.com/ClickHouse/ClickHouse/pull/26901) ([Raúl Marín](https://github.com/Algunenano)). +* Enabling RBAC TestFlows tests and crossing out new fails. [#26747](https://github.com/ClickHouse/ClickHouse/pull/26747) ([vzakaznikov](https://github.com/vzakaznikov)). +* Fix /clickhouse/window functions/tests/non distributed/errors/error window function in join. [#26744](https://github.com/ClickHouse/ClickHouse/pull/26744) ([vzakaznikov](https://github.com/vzakaznikov)). +* `clickhouse-test` supports SQL tests with [Jinja2](https://jinja.palletsprojects.com/en/3.0.x/templates/#synopsis) templates. [#26579](https://github.com/ClickHouse/ClickHouse/pull/26579) ([Vladimir C](https://github.com/vdimir)). +* Update RocksDB to 2021-07-16 master. [#26411](https://github.com/ClickHouse/ClickHouse/pull/26411) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Enabling Testflows tests for Kerberos support. [#21659](https://github.com/ClickHouse/ClickHouse/pull/21659) ([Andrey](https://github.com/zvonand)). + +#### Other + +* Rename `MaterializeMySQL` to `MaterializedMySQL`. [#26822](https://github.com/ClickHouse/ClickHouse/pull/26822) ([tavplubix](https://github.com/tavplubix)). 
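
To close out the 21.9 notes, here is a self-contained sketch of the `EXPLAIN ESTIMATE` mode listed under New Feature above; the table name and the inserted data are made up purely for illustration:

```sql
CREATE TABLE t_estimate (key UInt64) ENGINE = MergeTree ORDER BY key;
INSERT INTO t_estimate SELECT number FROM numbers(1000000);
-- Reports the estimated rows, marks and parts to be read from the MergeTree table.
EXPLAIN ESTIMATE SELECT count() FROM t_estimate WHERE key > 100000;
```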
+ + ### ClickHouse release v21.8, 2021-08-12 #### Upgrade Notes From 06eca1cbc0e013b6660fde130bf7baeb12f1aa61 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 8 Sep 2021 12:06:24 -0300 Subject: [PATCH 153/177] Update install.md --- docs/en/getting-started/install.md | 3 +++ 1 file changed, 3 insertions(+) diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 517be8704e4..36109bc9e04 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -38,6 +38,9 @@ You can also download and install packages manually from [here](https://repo.cli - `clickhouse-client` — Creates a symbolic link for `clickhouse-client` and other client-related tools. and installs client configuration files. - `clickhouse-common-static-dbg` — Installs ClickHouse compiled binary files with debug info. +!!! attention "Attention" + If you need to install specific version of ClickHouse you have to install all packages with the same version: `apt-get install clickhouse-server=21.6.5.37 clickhouse-client=21.6.5.37 clickhouse-common-static=21.6.5.37` + ### From RPM Packages {#from-rpm-packages} It is recommended to use official pre-compiled `rpm` packages for CentOS, RedHat, and all other rpm-based Linux distributions. From 27ec8bd1943dc046d6a62071b3faf25d32e660ba Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 8 Sep 2021 12:09:18 -0300 Subject: [PATCH 154/177] Update install.md --- docs/en/getting-started/install.md | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 36109bc9e04..16d3f7e9fb2 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -39,7 +39,8 @@ You can also download and install packages manually from [here](https://repo.cli - `clickhouse-common-static-dbg` — Installs ClickHouse compiled binary files with debug info. !!! attention "Attention" - If you need to install specific version of ClickHouse you have to install all packages with the same version: `apt-get install clickhouse-server=21.6.5.37 clickhouse-client=21.6.5.37 clickhouse-common-static=21.6.5.37` + If you need to install specific version of ClickHouse you have to install all packages with the same version: + `apt-get install clickhouse-server=21.6.5.37 clickhouse-client=21.6.5.37 clickhouse-common-static=21.6.5.37` ### From RPM Packages {#from-rpm-packages} From afb860c18abed64eda885819ceffa2d568332ff6 Mon Sep 17 00:00:00 2001 From: Denny Crane Date: Wed, 8 Sep 2021 12:22:05 -0300 Subject: [PATCH 155/177] Update install.md --- docs/ru/getting-started/install.md | 13 +++++++++++++ 1 file changed, 13 insertions(+) diff --git a/docs/ru/getting-started/install.md b/docs/ru/getting-started/install.md index 1cbeb70ef96..9f8a179759e 100644 --- a/docs/ru/getting-started/install.md +++ b/docs/ru/getting-started/install.md @@ -31,6 +31,19 @@ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not su Если вы хотите использовать наиболее свежую версию, замените `stable` на `testing` (рекомендуется для тестовых окружений). +Также вы можете вручную скачать и установить пакеты из [репозитория](https://repo.clickhouse.tech/deb/stable/main/). + +#### Пакеты {#packages} + +- `clickhouse-common-static` — Устанавливает исполняемые файлы ClickHouse. +- `clickhouse-server` — Создает символические ссылки для `clickhouse-server` и устанавливает конфигурационные файлы. 
+- `clickhouse-client` — Создает символические ссылки для `clickhouse-client` и других клиентских инструментов и устанавливает конфигурационные файлы `clickhouse-client`. +- `clickhouse-common-static-dbg` — Устанавливает исполняемые файлы ClickHouse собранные с отладочной информацией. + +!!! attention "Внимание" + Если вам нужно установить ClickHouse определенной версии, вы должны установить все пакеты одной версии: + `apt-get install clickhouse-server=21.6.5.37 clickhouse-client=21.6.5.37 clickhouse-common-static=21.6.5.37` + ### Из RPM пакетов {#from-rpm-packages} Команда ClickHouse в Яндексе рекомендует использовать официальные предкомпилированные `rpm` пакеты для CentOS, RedHat и всех остальных дистрибутивов Linux, основанных на rpm. From dae9a01b4b9edb4fce2b2ad18b518ac17cc2b925 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 8 Sep 2021 16:12:59 +0000 Subject: [PATCH 156/177] Fix build --- src/Storages/MergeTree/tests/gtest_executor.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/src/Storages/MergeTree/tests/gtest_executor.cpp b/src/Storages/MergeTree/tests/gtest_executor.cpp index bda61e207e9..394977aaf5b 100644 --- a/src/Storages/MergeTree/tests/gtest_executor.cpp +++ b/src/Storages/MergeTree/tests/gtest_executor.cpp @@ -65,8 +65,8 @@ TEST(Executor, RemoveTasks) auto executor = DB::MergeTreeBackgroundExecutor::create ( DB::MergeTreeBackgroundExecutor::Type::MERGE_MUTATE, - [] () { return tasks_kinds; }, - [] () { return tasks_kinds * batch; }, + tasks_kinds, + tasks_kinds * batch, CurrentMetrics::BackgroundPoolTask ); @@ -108,8 +108,8 @@ TEST(Executor, RemoveTasksStress) auto executor = DB::MergeTreeBackgroundExecutor::create ( DB::MergeTreeBackgroundExecutor::Type::MERGE_MUTATE, - [] () { return tasks_kinds; }, - [] () { return tasks_kinds * batch * (schedulers_count + removers_count); }, + tasks_kinds, + tasks_kinds * batch * (schedulers_count + removers_count), CurrentMetrics::BackgroundPoolTask ); From 5ab0f3ac28ae1b6ad4ec0caacb3a5edd8acaf679 Mon Sep 17 00:00:00 2001 From: George Date: Wed, 8 Sep 2021 19:38:59 +0300 Subject: [PATCH 157/177] Updated docs --- docs/en/interfaces/formats.md | 6 +++--- .../data-types/nested-data-structures/nested.md | 4 ---- docs/ru/interfaces/formats.md | 6 +++--- .../data-types/nested-data-structures/nested.md | 4 ---- 4 files changed, 6 insertions(+), 14 deletions(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 129a50bb7fc..68a52dd702e 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -1270,7 +1270,7 @@ You can insert Parquet data from a file into ClickHouse table by the following c $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` -To insert `Array(Struct)` values into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns you must switch on the [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested) setting. +To insert data into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs values you must switch on the [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested) setting. 
You can select data from a ClickHouse table and save them into some file in the Parquet format by the following command: @@ -1330,7 +1330,7 @@ You can insert Arrow data from a file into ClickHouse table by the following com $ cat filename.arrow | clickhouse-client --query="INSERT INTO some_table FORMAT Arrow" ``` -To insert `Array(Struct)` values into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns you must switch on the [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested) setting. +To insert data into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs values you must switch on the [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested) setting. ### Selecting Data {#selecting-data-arrow} @@ -1388,7 +1388,7 @@ You can insert ORC data from a file into ClickHouse table by the following comma $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -To insert `Array(Struct)` values into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns you must switch on the [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested) setting. +To insert data into [Nested](../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs values you must switch on the [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested) setting. ### Selecting Data {#selecting-data-2} diff --git a/docs/en/sql-reference/data-types/nested-data-structures/nested.md b/docs/en/sql-reference/data-types/nested-data-structures/nested.md index 0c53248baf9..65849f9cd0f 100644 --- a/docs/en/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/en/sql-reference/data-types/nested-data-structures/nested.md @@ -103,8 +103,4 @@ For a DESCRIBE query, the columns in a nested data structure are listed separate The ALTER query for elements in a nested data structure has limitations. -## Inserting {#inserting} - -To insert `Array(Struct)` values into `Nested` columns, you must switch on the [input_format_parquet_import_nested](../../../operations/settings/settings.md#input_format_parquet_import_nested), [input_format_arrow_import_nested](../../../operations/settings/settings.md#input_format_arrow_import_nested) or [input_format_orc_import_nested](../../../operations/settings/settings.md#input_format_orc_import_nested) settings for [Parquet](../../../interfaces/formats.md#data-format-parquet), [Arrow](../../../interfaces/formats.md#data_types-matching-arrow) and [ORC](../../../interfaces/formats.md#data-format-orc) input formats respectively. - [Original article](https://clickhouse.tech/docs/en/data_types/nested_data_structures/nested/) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 93be1736ef7..5bd0f8db406 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1188,7 +1188,7 @@ ClickHouse поддерживает настраиваемую точность $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` -Чтобы вставить данные типа `Array(Struct)` в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md), нужно включить настройку [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested). 
+Чтобы вставлять данные в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур, нужно включить настройку [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested). Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата Parquet, используйте команду следующего вида: @@ -1248,7 +1248,7 @@ ClickHouse поддерживает настраиваемую точность $ cat filename.arrow | clickhouse-client --query="INSERT INTO some_table FORMAT Arrow" ``` -Чтобы вставить данные типа `Array(Struct)` в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md), нужно включить настройку [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested). +Чтобы вставлять данные в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур, нужно включить настройку [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested). ### Вывод данных {#selecting-data-arrow} @@ -1306,7 +1306,7 @@ ClickHouse поддерживает настраиваемую точность $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -Чтобы вставить данные типа `Array(Struct)` в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md), нужно включить настройку [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested). +Чтобы вставлять данные в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур, нужно включить настройку [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested). ### Вывод данных {#selecting-data-2} diff --git a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md index 8d48cd55713..db957e57502 100644 --- a/docs/ru/sql-reference/data-types/nested-data-structures/nested.md +++ b/docs/ru/sql-reference/data-types/nested-data-structures/nested.md @@ -97,7 +97,3 @@ LIMIT 10 При запросе DESCRIBE, столбцы вложенной структуры данных перечисляются так же по отдельности. Работоспособность запроса ALTER для элементов вложенных структур данных, является сильно ограниченной. - -## Inserting {#inserting} - -Чтобы вставить данные типа `Array(Struct)` в колонки `Nested`, нужно включить настройку [input_format_parquet_import_nested](../../../operations/settings/settings.md#input_format_parquet_import_nested), [input_format_arrow_import_nested](../../../operations/settings/settings.md#input_format_arrow_import_nested) или [input_format_orc_import_nested](../../../operations/settings/settings.md#input_format_orc_import_nested) для форматов ввода [Parquet](../../../interfaces/formats.md#data-format-parquet), [Arrow](../../../interfaces/formats.md#data_types-matching-arrow) и [ORC](../../../interfaces/formats.md#data-format-orc) соответственно. 
From 43850b51d79927cee37fab7dd978433db48dac80 Mon Sep 17 00:00:00 2001 From: George Date: Wed, 8 Sep 2021 19:40:47 +0300 Subject: [PATCH 158/177] Small fix --- docs/ru/interfaces/formats.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/ru/interfaces/formats.md b/docs/ru/interfaces/formats.md index 5bd0f8db406..970c6c36e9f 100644 --- a/docs/ru/interfaces/formats.md +++ b/docs/ru/interfaces/formats.md @@ -1188,7 +1188,7 @@ ClickHouse поддерживает настраиваемую точность $ cat {filename} | clickhouse-client --query="INSERT INTO {some_table} FORMAT Parquet" ``` -Чтобы вставлять данные в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур, нужно включить настройку [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested). +Чтобы вставить данные в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур, нужно включить настройку [input_format_parquet_import_nested](../operations/settings/settings.md#input_format_parquet_import_nested). Чтобы получить данные из таблицы ClickHouse и сохранить их в файл формата Parquet, используйте команду следующего вида: @@ -1248,7 +1248,7 @@ ClickHouse поддерживает настраиваемую точность $ cat filename.arrow | clickhouse-client --query="INSERT INTO some_table FORMAT Arrow" ``` -Чтобы вставлять данные в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур, нужно включить настройку [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested). +Чтобы вставить данные в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур, нужно включить настройку [input_format_arrow_import_nested](../operations/settings/settings.md#input_format_arrow_import_nested). ### Вывод данных {#selecting-data-arrow} @@ -1306,7 +1306,7 @@ ClickHouse поддерживает настраиваемую точность $ cat filename.orc | clickhouse-client --query="INSERT INTO some_table FORMAT ORC" ``` -Чтобы вставлять данные в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур, нужно включить настройку [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested). +Чтобы вставить данные в колонки типа [Nested](../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур, нужно включить настройку [input_format_orc_import_nested](../operations/settings/settings.md#input_format_orc_import_nested). 
### Вывод данных {#selecting-data-2} From a24186734effcfbabd656540f43d056ca0bcbeca Mon Sep 17 00:00:00 2001 From: George Date: Wed, 8 Sep 2021 20:14:25 +0300 Subject: [PATCH 159/177] minor changes --- docs/en/operations/settings/settings.md | 18 +++++++++--------- docs/ru/operations/settings/settings.md | 18 +++++++++--------- 2 files changed, 18 insertions(+), 18 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index 9c54f925613..d5758976457 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -262,34 +262,34 @@ If both `input_format_allow_errors_num` and `input_format_allow_errors_ratio` ar ## input_format_parquet_import_nested {#input_format_parquet_import_nested} -Enables or disables the ability to insert `Array(Struct)` values into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [Parquet](../../interfaces/formats.md#data-format-parquet) input format. +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Parquet](../../interfaces/formats.md#data-format-parquet) input format. Possible values: -- 0 — `Array(Struct)` values can not be inserted into `Nested` columns. -- 1 — `Array(Struct)` values can be inserted into `Nested` columns. +- 0 — Data can not be inserted into `Nested` columns as an array of structs. +- 1 — Data can be inserted into `Nested` columns as an array of structs. Default value: `0`. ## input_format_arrow_import_nested {#input_format_arrow_import_nested} -Enables or disables the ability to insert `Array(Struct)` values into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [Arrow](../../interfaces/formats.md#data_types-matching-arrow) input format. +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [Arrow](../../interfaces/formats.md#data_types-matching-arrow) input format. Possible values: -- 0 — `Array(Struct)` values can not be inserted into `Nested` columns. -- 1 — `Array(Struct)` values can be inserted into `Nested` columns. +- 0 — Data can not be inserted into `Nested` columns as an array of structs. +- 1 — Data can be inserted into `Nested` columns as an array of structs. Default value: `0`. ## input_format_orc_import_nested {#input_format_orc_import_nested} -Enables or disables the ability to insert `Array(Struct)` values into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns in [ORC](../../interfaces/formats.md#data-format-orc) input format. +Enables or disables the ability to insert the data into [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) columns as an array of structs in [ORC](../../interfaces/formats.md#data-format-orc) input format. Possible values: -- 0 — `Array(Struct)` values can not be inserted into `Nested` columns. -- 1 — `Array(Struct)` values can be inserted into `Nested` columns. +- 0 — Data can not be inserted into `Nested` columns as an array of structs. +- 1 — Data can be inserted into `Nested` columns as an array of structs. Default value: `0`. 
diff --git a/docs/ru/operations/settings/settings.md b/docs/ru/operations/settings/settings.md index d69c7fddb30..9ad300b8c9c 100644 --- a/docs/ru/operations/settings/settings.md +++ b/docs/ru/operations/settings/settings.md @@ -239,34 +239,34 @@ ClickHouse применяет настройку в тех случаях, ко ## input_format_parquet_import_nested {#input_format_parquet_import_nested} -Включает или отключает возможность вставки данных типа `Array(Struct)` в колонки типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) в формате ввода [Parquet](../../interfaces/formats.md#data-format-parquet). +Включает или отключает возможность вставки данных в колонки типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур в формате ввода [Parquet](../../interfaces/formats.md#data-format-parquet). Возможные значения: -- 0 — данные типа `Array(Struct)` не могут быть вставлены в колонки `Nested`. -- 1 — данные типа `Array(Struct)` могут быть вставлены в колонки `Nested`. +- 0 — данные не могут быть вставлены в колонки типа `Nested` в виде массива структур. +- 1 — данные могут быть вставлены в колонки типа `Nested` в виде массива структур. Значение по умолчанию: `0`. ## input_format_arrow_import_nested {#input_format_arrow_import_nested} -Включает или отключает возможность вставки данных типа `Array(Struct)` в колонки типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) в формате ввода [Arrow](../../interfaces/formats.md#data_types-matching-arrow). +Включает или отключает возможность вставки данных в колонки типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур в формате ввода [Arrow](../../interfaces/formats.md#data_types-matching-arrow). Возможные значения: -- 0 — данные типа `Array(Struct)` не могут быть вставлены в колонки `Nested`. -- 1 — данные типа `Array(Struct)` могут быть вставлены в колонки `Nested`. +- 0 — данные не могут быть вставлены в колонки типа `Nested` в виде массива структур. +- 1 — данные могут быть вставлены в колонки типа `Nested` в виде массива структур. Значение по умолчанию: `0`. ## input_format_orc_import_nested {#input_format_orc_import_nested} -Включает или отключает возможность вставки данных типа `Array(Struct)` в колонки типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) в формате ввода [ORC](../../interfaces/formats.md#data-format-orc). +Включает или отключает возможность вставки данных в колонки типа [Nested](../../sql-reference/data-types/nested-data-structures/nested.md) в виде массива структур в формате ввода [ORC](../../interfaces/formats.md#data-format-orc). Возможные значения: -- 0 — данные типа `Array(Struct)` не могут быть вставлены в колонки `Nested`. -- 1 — данные типа `Array(Struct)` могут быть вставлены в колонки `Nested`. +- 0 — данные не могут быть вставлены в колонки типа `Nested` в виде массива структур. +- 1 — данные могут быть вставлены в колонки типа `Nested` в виде массива структур. Значение по умолчанию: `0`.
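As a side note for reviewers, here is a minimal usage sketch of the `input_format_parquet_import_nested` setting documented in the patches above; the database, table and file names are made up for illustration and are not part of the patch:

```bash
# Hypothetical example: the Parquet file has an `events` column that is a list of
# structs; with the setting enabled it can be loaded into a Nested column.
clickhouse-client --query "
    CREATE TABLE test.visits (id UInt64, events Nested(name String, ts DateTime))
    ENGINE = MergeTree ORDER BY id"

cat visits.parquet | clickhouse-client \
    --input_format_parquet_import_nested=1 \
    --query "INSERT INTO test.visits FORMAT Parquet"
```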
From 1416b1ccba65a2a7551f2557fcb30ef9f775af08 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 8 Sep 2021 20:21:25 +0300 Subject: [PATCH 160/177] Update MergeTreeBackgroundExecutor.cpp --- src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp | 5 ----- 1 file changed, 5 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp index f4d2261cfe0..d1678f6a70d 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.cpp @@ -100,11 +100,6 @@ void MergeTreeBackgroundExecutor::routine(TaskRuntimeDataPtr item) } catch (...) { - std::lock_guard guard(mutex); - erase_from_active(); - has_tasks.notify_one(); - /// Do not want any exceptions - try { item->task->onCompleted(); } catch (...) {} tryLogCurrentException(__PRETTY_FUNCTION__); } From df414ae71c54faa1eaaa218b80ee904f9948cd07 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Wed, 8 Sep 2021 21:10:34 +0300 Subject: [PATCH 161/177] Fix uncaught exception during server termination Example of a stacktrace:
``` [ 47463 ] {} BaseDaemon: Received signal 15 [ 47463 ] {} Application: Received termination signal (Terminated) [ 47462 ] {} Application: Received termination signal. [ 47462 ] {} Application: Waiting for current connections to close. [ 47463 ] {} BaseDaemon: Received signal 15 [ 47463 ] {} Application: Received termination signal (Terminated) ... [ 47463 ] {} BaseDaemon: Received signal -1 [ 47463 ] {} BaseDaemon: (version 21.9.1.1, build id: 63945F58FC2C28ED) (from thread 47462) Terminate called for uncaught exception: [ 47463 ] {} BaseDaemon: Code: 210. DB::NetException: Connection reset by peer, while writing to socket (10.7.141.42:9000). (NETWORK_ERROR), Stack trace (when copying this message, always include the lines below): [ 47463 ] {} BaseDaemon: [ 47463 ] {} BaseDaemon: 0. DB::Exception::Exception(std::__1::basic_string, std::__1::allocator > const&, int, bool) @ 0x94ca99a in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 1. DB::WriteBufferFromPocoSocket::nextImpl() @ 0x10676a3b in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 2. DB::Connection::sendCancel() @ 0x11554701 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 3. DB::MultiplexedConnections::sendCancel() @ 0x1157e766 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 4. DB::RemoteQueryExecutor::tryCancel(char const*, std::__1::unique_ptr >*) @ 0x10392000 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 5. DB::PipelineExecutor::cancel() @ 0x11697ffe in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 6. DB::QueryStatus::cancelQuery(bool) @ 0x10c19fc8 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 7. DB::ProcessList::killAllQueries() @ 0x10c1a6ae in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 8. basic_scope_guard, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&)::$_18>::~basic_scope_guard() @ 0x95587ad in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 9. DB::Server::main(std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) @ 0x95528a2 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 10. Poco::Util::Application::run() @ 0x141e85a3 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 11. DB::Server::run() @ 0x9541dac in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 12. mainEntryClickHouseServer(int, char**) @ 0x9540153 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 13. main @ 0x94c569e in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: 14. __libc_start_main @ 0x26d0a in /usr/lib/x86_64-linux-gnu/libc-2.31.so [ 47463 ] {} BaseDaemon: 15. _start @ 0x9490a2a in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 47463 ] {} BaseDaemon: (version 21.9.1.1) [ 47463 ] {} BaseDaemon: Received signal 6 [ 11858 ] {} BaseDaemon: ######################################## [ 11858 ] {} BaseDaemon: (version 21.9.1.1, build id: 63945F58FC2C28ED) (from thread 47462) (no query) Received signal Aborted (6) [ 11858 ] {} BaseDaemon: [ 11858 ] {} BaseDaemon: Stack trace: 0x7ff04c196ce1 0x7ff04c180537 0xff91f28 0x163304e3 0x1633044c 0x94c60cb 0x10c1a135 0x10c1a6ae 0x95587ad 0x95528a2 0x141e85a3 0x9541dac 0x9540153 0x94c569e 0x7ff04c181d0a 0x9490a2a [ 11858 ] {} BaseDaemon: 1. 
raise @ 0x3bce1 in /usr/lib/x86_64-linux-gnu/libc-2.31.so [ 11858 ] {} BaseDaemon: 2. abort @ 0x25537 in /usr/lib/x86_64-linux-gnu/libc-2.31.so [ 11858 ] {} BaseDaemon: 3. terminate_handler() @ 0xff91f28 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 4. std::__terminate(void (*)()) @ 0x163304e3 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 5. std::terminate() @ 0x1633044c in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 6. ? @ 0x94c60cb in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 7. ? @ 0x10c1a135 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 8. DB::ProcessList::killAllQueries() @ 0x10c1a6ae in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 9. basic_scope_guard, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&)::$_18>::~basic_scope_guard() @ 0x95587ad in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 10. DB::Server::main(std::__1::vector, std::__1::allocator >, std::__1::allocator, std::__1::allocator > > > const&) @ 0x95528a2 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 11. Poco::Util::Application::run() @ 0x141e85a3 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 12. DB::Server::run() @ 0x9541dac in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 13. mainEntryClickHouseServer(int, char**) @ 0x9540153 in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 14. main @ 0x94c569e in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug [ 11858 ] {} BaseDaemon: 15. __libc_start_main @ 0x26d0a in /usr/lib/x86_64-linux-gnu/libc-2.31.so [ 11858 ] {} BaseDaemon: 16. _start @ 0x9490a2a in /usr/lib/debug/.build-id/63/945f58fc2c28ed.debug ```
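For context, a minimal C++ sketch (not ClickHouse's actual `scope_guard` code) of why this aborts: destructors are implicitly `noexcept`, so when the guard's callback throws out of the destructor, `std::terminate` is called, which is exactly what the trace above shows. A "safe" guard swallows the exception instead:

```cpp
#include <iostream>
#include <stdexcept>

// Simplified stand-ins for the SCOPE_EXIT / SCOPE_EXIT_SAFE macros.
template <typename F>
struct Guard
{
    F fn;
    ~Guard() { fn(); }  // destructor is noexcept: a throwing callback aborts the program
};

template <typename F>
struct SafeGuard
{
    F fn;
    ~SafeGuard()
    {
        try { fn(); }                     // run the callback, but never let an exception escape
        catch (...) { std::cerr << "exception suppressed in scope guard\n"; }
    }
};

int main()
{
    auto on_exit = [] { throw std::runtime_error("Connection reset by peer"); };
    SafeGuard<decltype(on_exit)> safe{on_exit};   // logs and continues
    // Guard<decltype(on_exit)> unsafe{on_exit}; // would call std::terminate, like the log above
}
```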
--- programs/server/Server.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index c09b4f774fa..e3e02e5782e 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -14,7 +14,7 @@ #include #include #include -#include +#include #include #include #include @@ -1510,7 +1510,7 @@ if (ThreadFuzzer::instance().isEffective()) server.start(); LOG_INFO(log, "Ready for connections."); - SCOPE_EXIT({ + SCOPE_EXIT_SAFE({ LOG_DEBUG(log, "Received termination signal."); LOG_DEBUG(log, "Waiting for current connections to close."); From ba4effd69a44bf3bed8fdd49a2f35ed9395a77df Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 8 Sep 2021 21:41:11 +0300 Subject: [PATCH 162/177] Update install.md --- docs/en/getting-started/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 16d3f7e9fb2..1c3c957ca1d 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -40,7 +40,7 @@ You can also download and install packages manually from [here](https://repo.cli !!! attention "Attention" If you need to install specific version of ClickHouse you have to install all packages with the same version: - `apt-get install clickhouse-server=21.6.5.37 clickhouse-client=21.6.5.37 clickhouse-common-static=21.6.5.37` + `apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` ### From RPM Packages {#from-rpm-packages} From 821704192b60622ac3fa2a13fcecfc9d0ebb55f9 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 8 Sep 2021 21:41:27 +0300 Subject: [PATCH 163/177] Update install.md --- docs/ru/getting-started/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/getting-started/install.md b/docs/ru/getting-started/install.md index 9f8a179759e..5fa93a96c01 100644 --- a/docs/ru/getting-started/install.md +++ b/docs/ru/getting-started/install.md @@ -42,7 +42,7 @@ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not su !!! attention "Внимание" Если вам нужно установить ClickHouse определенной версии, вы должны установить все пакеты одной версии: - `apt-get install clickhouse-server=21.6.5.37 clickhouse-client=21.6.5.37 clickhouse-common-static=21.6.5.37` + `apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` ### Из RPM пакетов {#from-rpm-packages} From 2819fc4e2a300140dfb43a3db73794a6e857212e Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 8 Sep 2021 21:42:10 +0300 Subject: [PATCH 164/177] Update install.md --- docs/en/getting-started/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/getting-started/install.md b/docs/en/getting-started/install.md index 1c3c957ca1d..66ee99ef345 100644 --- a/docs/en/getting-started/install.md +++ b/docs/en/getting-started/install.md @@ -40,7 +40,7 @@ You can also download and install packages manually from [here](https://repo.cli !!! 
attention "Attention" If you need to install specific version of ClickHouse you have to install all packages with the same version: - `apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` + `sudo apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` ### From RPM Packages {#from-rpm-packages} From 6291ac4bbb87fb01b00dcc6da73a632727c8850c Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Wed, 8 Sep 2021 21:42:22 +0300 Subject: [PATCH 165/177] Update install.md --- docs/ru/getting-started/install.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/getting-started/install.md b/docs/ru/getting-started/install.md index 5fa93a96c01..7bc21179e21 100644 --- a/docs/ru/getting-started/install.md +++ b/docs/ru/getting-started/install.md @@ -42,7 +42,7 @@ grep -q sse4_2 /proc/cpuinfo && echo "SSE 4.2 supported" || echo "SSE 4.2 not su !!! attention "Внимание" Если вам нужно установить ClickHouse определенной версии, вы должны установить все пакеты одной версии: - `apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` + `sudo apt-get install clickhouse-server=21.8.5.7 clickhouse-client=21.8.5.7 clickhouse-common-static=21.8.5.7` ### Из RPM пакетов {#from-rpm-packages} From 40df3e6c7b6aa53e0a69969267999470518d8a67 Mon Sep 17 00:00:00 2001 From: feng lv Date: Wed, 8 Sep 2021 15:07:27 +0000 Subject: [PATCH 166/177] reduce debug build binary size by clang optimization --- CMakeLists.txt | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index de517b1b589..e765ca02a4f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -165,6 +165,13 @@ if (COMPILER_CLANG) if (NOT CMAKE_BUILD_TYPE_UC STREQUAL "RELEASE") set(COMPILER_FLAGS "${COMPILER_FLAGS} -gdwarf-aranges") endif () + + if (CMAKE_CXX_COMPILER_VERSION VERSION_GREATER_EQUAL 12.0.0) + if (CMAKE_BUILD_TYPE_UC STREQUAL "DEBUG" OR CMAKE_BUILD_TYPE_UC STREQUAL "RELWITHDEBINFO") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -Xclang -fuse-ctor-homing") + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -Xclang -fuse-ctor-homing") + endif() + endif() endif () # If turned `ON`, assumes the user has either the system GTest library or the bundled one.
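A quick way one might sanity-check that a local compiler accepts the cc1 flag added above (this check is only an illustration, not part of the patch; `clang++-12` is an assumption):

```bash
# Compile a trivial out-of-line constructor with the flag; a clang that does not
# know -fuse-ctor-homing reports "unknown argument" and exits non-zero.
echo 'struct S { S(); }; S::S() {} int main() { S s; }' \
  | clang++-12 -x c++ - -g -c -Xclang -fuse-ctor-homing -o /dev/null \
  && echo "ctor homing supported"
```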
From f12504b002c0d5c6e10329b9846546f3ed96e335 Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Wed, 8 Sep 2021 20:30:29 +0000 Subject: [PATCH 167/177] Fix stupid race --- src/Storages/MergeTree/MergeTreeBackgroundExecutor.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h index 7af1524a023..27a08f4628a 100644 --- a/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h +++ b/src/Storages/MergeTree/MergeTreeBackgroundExecutor.h @@ -71,15 +71,15 @@ public: { name = toString(type); + pending.set_capacity(max_tasks_count); + active.set_capacity(max_tasks_count); + pool.setMaxThreads(std::max(1UL, threads_count)); pool.setMaxFreeThreads(std::max(1UL, threads_count)); pool.setQueueSize(std::max(1UL, threads_count)); for (size_t number = 0; number < threads_count; ++number) pool.scheduleOrThrowOnError([this] { threadFunction(); }); - - pending.set_capacity(max_tasks_count); - active.set_capacity(max_tasks_count); } ~MergeTreeBackgroundExecutor() From 4a6fcf80de35cdd134d578d51d1525d767b20063 Mon Sep 17 00:00:00 2001 From: alexey-milovidov Date: Thu, 9 Sep 2021 00:34:45 +0300 Subject: [PATCH 168/177] Update CHANGELOG.md --- CHANGELOG.md | 206 ++++++++++++++++++++++++--------------- 1 file changed, 95 insertions(+), 111 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d3ca1c544dc..0e92fc59509 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,4 +1,4 @@ -### ClickHouse release v21.9, 2021-09- +### ClickHouse release v21.9, 2021-09-09 #### Backward Incompatible Change @@ -6,202 +6,194 @@ * Do not allow to apply parametric aggregate function with `-Merge` combinator to aggregate function state if state was produced by aggregate function with different parameters. For example, state of `fooState(42)(x)` cannot be finalized with `fooMerge(s)` or `fooMerge(123)(s)`, parameters must be specified explicitly like `fooMerge(42)(s)` and must be equal. It does not affect some special aggregate functions like `quantile` and `sequence*` that use parameters for finalization only. [#26847](https://github.com/ClickHouse/ClickHouse/pull/26847) ([tavplubix](https://github.com/tavplubix)). * Under clickhouse-local, always treat local addresses with a port as remote. [#26736](https://github.com/ClickHouse/ClickHouse/pull/26736) ([Raúl Marín](https://github.com/Algunenano)). * Fix the issue that in case of some sophisticated query with column aliases identical to the names of expressions, bad cast may happen. This fixes [#25447](https://github.com/ClickHouse/ClickHouse/issues/25447). This fixes [#26914](https://github.com/ClickHouse/ClickHouse/issues/26914). This fix may introduce backward incompatibility: if there are different expressions with identical names, exception will be thrown. It may break some rare cases when `enable_optimize_predicate_expression` is set. [#26639](https://github.com/ClickHouse/ClickHouse/pull/26639) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Now, scalar subquery always returns `Nullable` result if its type can be `Nullable`. It is needed because in case of an empty subquery its result should be `Null`. Previously, it was possible to get an error about incompatible types (type deduction does not execute scalar subquery, and it could use not-nullable type). Scalar subquery with empty result which can't be converted to `Nullable` (like `Array` or `Tuple`) now throws an error.
Fixes [#25411](https://github.com/ClickHouse/ClickHouse/issues/25411). [#26423](https://github.com/ClickHouse/ClickHouse/pull/26423) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). #### New Feature -* Added ComplexKeyRangeHashed dictionary. Closes [#22029](https://github.com/ClickHouse/ClickHouse/issues/22029). [#27629](https://github.com/ClickHouse/ClickHouse/pull/27629) ([Maksim Kita](https://github.com/kitaisreal)). -* Enable using constants from with and select in aggregate function parameters. Close [#10945](https://github.com/ClickHouse/ClickHouse/issues/10945). [#27531](https://github.com/ClickHouse/ClickHouse/pull/27531) ([abel-cheng](https://github.com/abel-cheng)). -* Add "tupleToNameValuePairs", a function that turns a named tuple into an array of pairs. [#27505](https://github.com/ClickHouse/ClickHouse/pull/27505) ([Braulio Valdivielso Martínez](https://github.com/BraulioVM)). -* - Add replicated storage of user, roles, row policies, quotas and settings profiles through ZooKeeper (experimental). [#27426](https://github.com/ClickHouse/ClickHouse/pull/27426) ([Kevin Michel](https://github.com/kmichel-aiven)). +* Implementation of short circuit function evaluation, closes [#12587](https://github.com/ClickHouse/ClickHouse/issues/12587). Add settings `short_circuit_function_evaluation` to configure short circuit function evaluation. [#23367](https://github.com/ClickHouse/ClickHouse/pull/23367) ([Kruglov Pavel](https://github.com/Avogar)). +* Add support for INTERSECT, EXCEPT, ANY, ALL operators. [#24757](https://github.com/ClickHouse/ClickHouse/pull/24757) ([Kirill Ershov](https://github.com/zdikov)). ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add support for encryption at the virtual file system level (data encryption at rest) using AES-CTR algorithm. [#24206](https://github.com/ClickHouse/ClickHouse/pull/24206) ([Latysheva Alexandra](https://github.com/alexelex)). ([Vitaly Baranov](https://github.com/vitlibar)) [#26733](https://github.com/ClickHouse/ClickHouse/pull/26733) [#26377](https://github.com/ClickHouse/ClickHouse/pull/26377) [#26465](https://github.com/ClickHouse/ClickHouse/pull/26465). +* Added natural language processing (NLP) functions for tokenization, stemming, lemmatizing and search in synonyms extensions. [#24997](https://github.com/ClickHouse/ClickHouse/pull/24997) ([Nikolay Degterinsky](https://github.com/evillique)). +* Added integration with S2 geometry library. [#24980](https://github.com/ClickHouse/ClickHouse/pull/24980) ([Andr0901](https://github.com/Andr0901)). ([Nikita Mikhaylov](https://github.com/nikitamikhaylov)). +* Add SQLite table engine, table function, database engine. [#24194](https://github.com/ClickHouse/ClickHouse/pull/24194) ([Arslan Gumerov](https://github.com/g-arslan)). ([Kseniia Sumarokova](https://github.com/kssenii)). +* Added support for custom query for `MySQL`, `PostgreSQL`, `ClickHouse`, `JDBC`, `Cassandra` dictionary source. Closes [#1270](https://github.com/ClickHouse/ClickHouse/issues/1270). [#26995](https://github.com/ClickHouse/ClickHouse/pull/26995) ([Maksim Kita](https://github.com/kitaisreal)). +* Introduce syntax for here documents. Example `SELECT $doc$ VALUE $doc$`. [#26671](https://github.com/ClickHouse/ClickHouse/pull/26671) ([Maksim Kita](https://github.com/kitaisreal)). +* Add shared (replicated) storage of user, roles, row policies, quotas and settings profiles through ZooKeeper. [#27426](https://github.com/ClickHouse/ClickHouse/pull/27426) ([Kevin Michel](https://github.com/kmichel-aiven)). 
+* Add compression for `INTO OUTFILE` that automatically chooses the compression algorithm. Closes [#3473](https://github.com/ClickHouse/ClickHouse/issues/3473). [#27134](https://github.com/ClickHouse/ClickHouse/pull/27134) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Add `INSERT ... FROM INFILE` similarly to `SELECT ... INTO OUTFILE`. [#27655](https://github.com/ClickHouse/ClickHouse/pull/27655) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Added `complex_key_range_hashed` dictionary. Closes [#22029](https://github.com/ClickHouse/ClickHouse/issues/22029). [#27629](https://github.com/ClickHouse/ClickHouse/pull/27629) ([Maksim Kita](https://github.com/kitaisreal)). +* Support expressions in JOIN ON section. Close [#21868](https://github.com/ClickHouse/ClickHouse/issues/21868). [#24420](https://github.com/ClickHouse/ClickHouse/pull/24420) ([Vladimir C](https://github.com/vdimir)). +* When a client connects to the server, it receives information about all warnings that were already collected by the server. (It can be disabled by using option `--no-warnings`). Add `system.warnings` table to collect warnings about server configuration. [#26246](https://github.com/ClickHouse/ClickHouse/pull/26246) ([Filatenkov Artur](https://github.com/FArthur-cmd)). [#26282](https://github.com/ClickHouse/ClickHouse/pull/26282) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Allow using constant expressions from WITH and SELECT in aggregate function parameters. Close [#10945](https://github.com/ClickHouse/ClickHouse/issues/10945). [#27531](https://github.com/ClickHouse/ClickHouse/pull/27531) ([abel-cheng](https://github.com/abel-cheng)). +* Add `tupleToNameValuePairs`, a function that turns a named tuple into an array of pairs. [#27505](https://github.com/ClickHouse/ClickHouse/pull/27505) ([Braulio Valdivielso Martínez](https://github.com/BraulioVM)). * Add support for `bzip2` compression method for import/export. Closes [#22428](https://github.com/ClickHouse/ClickHouse/issues/22428). [#27377](https://github.com/ClickHouse/ClickHouse/pull/27377) ([Nikolay Degterinsky](https://github.com/evillique)). * Added `bitmapSubsetOffsetLimit(bitmap, offset, cardinality_limit)` function. It creates a subset of bitmap limit the results to `cardinality_limit` with offset of `offset`. [#27234](https://github.com/ClickHouse/ClickHouse/pull/27234) ([DHBin](https://github.com/DHBin)). * Add column `default_database` to `system.users`. [#27054](https://github.com/ClickHouse/ClickHouse/pull/27054) ([kevin wan](https://github.com/MaxWk)). * Supported `cluster` macros inside table functions 'cluster' and 'clusterAllReplicas'. [#26913](https://github.com/ClickHouse/ClickHouse/pull/26913) ([polyprogrammist](https://github.com/PolyProgrammist)). * Add new functions `currentRoles()`, `enabledRoles()`, `defaultRoles()`.
[#26780](https://github.com/ClickHouse/ClickHouse/pull/26780) ([Vitaly Baranov](https://github.com/vitlibar)). * New functions `currentProfiles()`, `enabledProfiles()`, `defaultProfiles()`. [#26714](https://github.com/ClickHouse/ClickHouse/pull/26714) ([Vitaly Baranov](https://github.com/vitlibar)). -* Introduce syntax for here documents. Example `SELECT $doc$VALUE$doc$`. [#26671](https://github.com/ClickHouse/ClickHouse/pull/26671) ([Maksim Kita](https://github.com/kitaisreal)). -* Functions that return (initial_)query_id of the current query. This closes [#23682](https://github.com/ClickHouse/ClickHouse/issues/23682). [#26410](https://github.com/ClickHouse/ClickHouse/pull/26410) ([Alexey Boykov](https://github.com/mathalex)). -* - Add `REPLACE GRANT` feature. [#26384](https://github.com/ClickHouse/ClickHouse/pull/26384) ([Caspian](https://github.com/Cas-pian)). +* Add functions that return (initial_)query_id of the current query. This closes [#23682](https://github.com/ClickHouse/ClickHouse/issues/23682). [#26410](https://github.com/ClickHouse/ClickHouse/pull/26410) ([Alexey Boykov](https://github.com/mathalex)). +* Add `REPLACE GRANT` feature. [#26384](https://github.com/ClickHouse/ClickHouse/pull/26384) ([Caspian](https://github.com/Cas-pian)). * Implement window function `nth_value(expr, N)` that returns the value of the Nth row of the window frame. [#26334](https://github.com/ClickHouse/ClickHouse/pull/26334) ([Zuo, RuoYu](https://github.com/ryzuo)). * `EXPLAIN` query now has `EXPLAIN ESTIMATE ...` mode that will show information about read rows, marks and parts from MergeTree tables. Closes [#23941](https://github.com/ClickHouse/ClickHouse/issues/23941). [#26131](https://github.com/ClickHouse/ClickHouse/pull/26131) ([fastio](https://github.com/fastio)). * Added `system.zookeeper_log` table. All actions of ZooKeeper client are logged into this table. Implements [#25449](https://github.com/ClickHouse/ClickHouse/issues/25449). [#26129](https://github.com/ClickHouse/ClickHouse/pull/26129) ([tavplubix](https://github.com/tavplubix)). -* Zero-copy replication for ReplicatedMergeTree over HDFS storage. [#25918](https://github.com/ClickHouse/ClickHouse/pull/25918) ([Zhichang Yu](https://github.com/yuzhichang)). -* Allow to insert Nested type as array of structs in Arrow, ORC and Parquet input format. Also add some refactoring around ArrowColumnToCHColumn. [#25902](https://github.com/ClickHouse/ClickHouse/pull/25902) ([Kruglov Pavel](https://github.com/Avogar)). -* Add a new datatype Date32 (store data as Int32), support date range same with Datetime64 support load parquet date32 to ClickHouse Date32 Add new function toDate32 like toDate Support some function :. [#25774](https://github.com/ClickHouse/ClickHouse/pull/25774) ([LiuNeng](https://github.com/liuneng1994)). -* enable setting default database per user [#25268](https://github.com/ClickHouse/ClickHouse/issues/25268). [#25687](https://github.com/ClickHouse/ClickHouse/pull/25687) ([kevin wan](https://github.com/MaxWk)). -* Added NLP functions for tokenization, stemming, lemmatizing and search in synonyms extensions. [#24997](https://github.com/ClickHouse/ClickHouse/pull/24997) ([Nikolay Degterinsky](https://github.com/evillique)). -* Added S2 geometry library. [#24980](https://github.com/ClickHouse/ClickHouse/pull/24980) ([Andr0901](https://github.com/Andr0901)). -* Add support for INTERSECT, EXCEPT, ANY, ALL operators. [#24757](https://github.com/ClickHouse/ClickHouse/pull/24757) ([Kirill Ershov](https://github.com/zdikov)). 
-* * Support conditions in JOIN ON section. Close [#21868](https://github.com/ClickHouse/ClickHouse/issues/21868). [#24420](https://github.com/ClickHouse/ClickHouse/pull/24420) ([Vladimir C](https://github.com/vdimir)). -* adding encryption at the virtual file system level using AES_128_CTR algorithm ... [#24206](https://github.com/ClickHouse/ClickHouse/pull/24206) ([Latysheva Alexandra](https://github.com/alexelex)). -* Add SQLite table engine, table function, database engine. [#24194](https://github.com/ClickHouse/ClickHouse/pull/24194) ([Arslan Gumerov](https://github.com/g-arslan)). -* Implementation of short circuit function evaluation, closes [#12587](https://github.com/ClickHouse/ClickHouse/issues/12587). Add settings `short_circuit_function_evaluation` to configure short circuit function evaluation. [#23367](https://github.com/ClickHouse/ClickHouse/pull/23367) ([Kruglov Pavel](https://github.com/Avogar)). -* Add an optional parameter to MongoDB engine to accept connection string options and support SSL connection. Closes [#21189](https://github.com/ClickHouse/ClickHouse/issues/21189). Closes [#21041](https://github.com/ClickHouse/ClickHouse/issues/21041). [#22045](https://github.com/ClickHouse/ClickHouse/pull/22045) ([Omar Bazaraa](https://github.com/OmarBazaraa)). -* - Added a compression codec `AES_128_GCM_SIV` which encrypts columns instead of compressing them. [#19896](https://github.com/ClickHouse/ClickHouse/pull/19896) ([PHO](https://github.com/depressed-pho)). +* Zero-copy replication for `ReplicatedMergeTree` over `HDFS` storage. [#25918](https://github.com/ClickHouse/ClickHouse/pull/25918) ([Zhichang Yu](https://github.com/yuzhichang)). +* Allow to insert Nested type as array of structs in `Arrow`, `ORC` and `Parquet` input format. [#25902](https://github.com/ClickHouse/ClickHouse/pull/25902) ([Kruglov Pavel](https://github.com/Avogar)). +* Add a new datatype `Date32` (stored as Int32) that supports the same date range as `DateTime64`. Parquet `date32` values can be loaded into ClickHouse `Date32` columns. Also adds a new function `toDate32`, similar to `toDate`. [#25774](https://github.com/ClickHouse/ClickHouse/pull/25774) ([LiuNeng](https://github.com/liuneng1994)). +* Allow setting default database for users. [#25268](https://github.com/ClickHouse/ClickHouse/issues/25268). [#25687](https://github.com/ClickHouse/ClickHouse/pull/25687) ([kevin wan](https://github.com/MaxWk)). +* Add an optional parameter to `MongoDB` engine to accept connection string options and support SSL connection. Closes [#21189](https://github.com/ClickHouse/ClickHouse/issues/21189). Closes [#21041](https://github.com/ClickHouse/ClickHouse/issues/21041). [#22045](https://github.com/ClickHouse/ClickHouse/pull/22045) ([Omar Bazaraa](https://github.com/OmarBazaraa)). + +#### Experimental Feature + +* Added a compression codec `AES_128_GCM_SIV` which encrypts columns instead of compressing them. [#19896](https://github.com/ClickHouse/ClickHouse/pull/19896) ([PHO](https://github.com/depressed-pho)). Will be rewritten, do not use. +* Rename `MaterializeMySQL` to `MaterializedMySQL`. [#26822](https://github.com/ClickHouse/ClickHouse/pull/26822) ([tavplubix](https://github.com/tavplubix)). #### Performance Improvement -* Less number of `clock_gettime` syscalls that may lead to performance improvement for some types of fast queries. [#27492](https://github.com/ClickHouse/ClickHouse/pull/27492) ([filimonov](https://github.com/filimonov)).
-* Improve the performance of fast queries when `max_execution_time=0` by reducing the number of `clock_gettime` system calls. [#27325](https://github.com/ClickHouse/ClickHouse/pull/27325) ([filimonov](https://github.com/filimonov)). +* Improve the performance of fast queries when `max_execution_time = 0` by reducing the number of `clock_gettime` system calls. [#27325](https://github.com/ClickHouse/ClickHouse/pull/27325) ([filimonov](https://github.com/filimonov)). * Specialize date time related comparison to achieve better performance. This fixes [#27083](https://github.com/ClickHouse/ClickHouse/issues/27083) . [#27122](https://github.com/ClickHouse/ClickHouse/pull/27122) ([Amos Bird](https://github.com/amosbird)). * Share file descriptors in concurrent reads of the same files. There is no noticeable performance difference on Linux. But the number of opened files will be significantly (10..100 times) lower on typical servers and it makes operations easier. See [#26214](https://github.com/ClickHouse/ClickHouse/issues/26214). [#26768](https://github.com/ClickHouse/ClickHouse/pull/26768) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Improve latency of short queries, that require reading from tables with large number of columns. [#26371](https://github.com/ClickHouse/ClickHouse/pull/26371) ([Anton Popov](https://github.com/CurtizJ)). * Don't build sets for indices when analyzing a query. [#26365](https://github.com/ClickHouse/ClickHouse/pull/26365) ([Raúl Marín](https://github.com/Algunenano)). -* - Vectorize the SUM of Nullable integer types with native representation ([David Manzanares](https://github.com/davidmanzanares), [Raúl Marín](https://github.com/Algunenano)). [#26248](https://github.com/ClickHouse/ClickHouse/pull/26248) ([Raúl Marín](https://github.com/Algunenano)). -* Compile columns with `Enum` types. [#26237](https://github.com/ClickHouse/ClickHouse/pull/26237) ([Maksim Kita](https://github.com/kitaisreal)). +* Vectorize the SUM of Nullable integer types with native representation ([David Manzanares](https://github.com/davidmanzanares), [Raúl Marín](https://github.com/Algunenano)). [#26248](https://github.com/ClickHouse/ClickHouse/pull/26248) ([Raúl Marín](https://github.com/Algunenano)). +* Compile expressions involving columns with `Enum` types. [#26237](https://github.com/ClickHouse/ClickHouse/pull/26237) ([Maksim Kita](https://github.com/kitaisreal)). * Compile aggregate functions `groupBitOr`, `groupBitAnd`, `groupBitXor`. [#26161](https://github.com/ClickHouse/ClickHouse/pull/26161) ([Maksim Kita](https://github.com/kitaisreal)). * Improved memory usage with better block size prediction when reading empty DEFAULT columns. Closes [#17317](https://github.com/ClickHouse/ClickHouse/issues/17317). [#25917](https://github.com/ClickHouse/ClickHouse/pull/25917) ([Vladimir Chebotarev](https://github.com/excitoon)). * Reduce memory usage and number of read rows in queries with `ORDER BY primary_key`. [#25721](https://github.com/ClickHouse/ClickHouse/pull/25721) ([Anton Popov](https://github.com/CurtizJ)). +* Enable `distributed_push_down_limit` by default. [#27104](https://github.com/ClickHouse/ClickHouse/pull/27104) ([Azat Khuzhin](https://github.com/azat)). +* Make `toTimeZone` monotonic when the timezone argument is a constant value, to support partition pruning for queries that filter on such expressions. [#26261](https://github.com/ClickHouse/ClickHouse/pull/26261) ([huangzhaowei](https://github.com/SaintBacchus)). #### Improvement +* Mark window functions as ready for general use.
Remove the `allow_experimental_window_functions` setting. [#27184](https://github.com/ClickHouse/ClickHouse/pull/27184) ([Alexander Kuzmenkov](https://github.com/akuzm)). +* Improve compatibility with non-whole-minute timezone offsets. [#27080](https://github.com/ClickHouse/ClickHouse/pull/27080) ([Raúl Marín](https://github.com/Algunenano)). +* If file descriptor in `File` table is regular file - allow to read multiple times from it. It allows `clickhouse-local` to read multiple times from stdin (with multiple SELECT queries or subqueries) if stdin is a regular file like `clickhouse-local --query "SELECT * FROM table UNION ALL SELECT * FROM table" ... < file`. This closes [#11124](https://github.com/ClickHouse/ClickHouse/issues/11124). Co-authored with ([alexey-milovidov](https://github.com/alexey-milovidov)). [#25960](https://github.com/ClickHouse/ClickHouse/pull/25960) ([BoloniniD](https://github.com/BoloniniD)). * Remove duplicate index analysis and avoid possible invalid limit checks during projection analysis. [#27742](https://github.com/ClickHouse/ClickHouse/pull/27742) ([Amos Bird](https://github.com/amosbird)). -* Enables query parameters to be passed in the body of http requests. [#27706](https://github.com/ClickHouse/ClickHouse/pull/27706) ([Hermano Lustosa](https://github.com/hllustosa)). -* - Add `FROM INFILE` command. [#27655](https://github.com/ClickHouse/ClickHouse/pull/27655) ([Filatenkov Artur](https://github.com/FArthur-cmd)). -* Disable arrayJoin on partition expressions. [#27648](https://github.com/ClickHouse/ClickHouse/pull/27648) ([Raúl Marín](https://github.com/Algunenano)). +* Enable query parameters to be passed in the body of HTTP requests. [#27706](https://github.com/ClickHouse/ClickHouse/pull/27706) ([Hermano Lustosa](https://github.com/hllustosa)). +* Disallow `arrayJoin` on partition expressions. [#27648](https://github.com/ClickHouse/ClickHouse/pull/27648) ([Raúl Marín](https://github.com/Algunenano)). * Log client IP address if authentication fails. [#27514](https://github.com/ClickHouse/ClickHouse/pull/27514) ([Misko Lee](https://github.com/imiskolee)). * Use bytes instead of strings for binary data in the GRPC protocol. [#27431](https://github.com/ClickHouse/ClickHouse/pull/27431) ([Vitaly Baranov](https://github.com/vitlibar)). * Send response with error message if HTTP port is not set and user tries to send HTTP request to TCP port. [#27385](https://github.com/ClickHouse/ClickHouse/pull/27385) ([Braulio Valdivielso Martínez](https://github.com/BraulioVM)). -* Add Cast function for internal usage, which will not preserve type nullability, but non-internal cast will preserve according to setting cast_keep_nullable. Closes [#12636](https://github.com/ClickHouse/ClickHouse/issues/12636). [#27382](https://github.com/ClickHouse/ClickHouse/pull/27382) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Add `_CAST` function for internal usage, which will not preserve type nullability, but non-internal cast will preserve according to setting `cast_keep_nullable`. Closes [#12636](https://github.com/ClickHouse/ClickHouse/issues/12636). [#27382](https://github.com/ClickHouse/ClickHouse/pull/27382) ([Kseniia Sumarokova](https://github.com/kssenii)). * Add setting `log_formatted_queries` to log additional formatted query into `system.query_log`. It's useful for normalized query analysis because functions like `normalizeQuery` and `normalizeQueryKeepNames` don't parse/format queries in order to achieve better performance. 
[#27380](https://github.com/ClickHouse/ClickHouse/pull/27380) ([Amos Bird](https://github.com/amosbird)). * Add two settings `max_hyperscan_regexp_length` and `max_hyperscan_regexp_total_length` to prevent huge regexp being used in hyperscan related functions, such as `multiMatchAny`. [#27378](https://github.com/ClickHouse/ClickHouse/pull/27378) ([Amos Bird](https://github.com/amosbird)). * Memory consumed by bitmap aggregate functions now is taken into account for memory limits. This closes [#26555](https://github.com/ClickHouse/ClickHouse/issues/26555). [#27252](https://github.com/ClickHouse/ClickHouse/pull/27252) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Add new index data skipping minmax index format for proper Nullable support. [#27250](https://github.com/ClickHouse/ClickHouse/pull/27250) ([Azat Khuzhin](https://github.com/azat)). * Add 10 seconds cache for S3 proxy resolver. [#27216](https://github.com/ClickHouse/ClickHouse/pull/27216) ([ianton-ru](https://github.com/ianton-ru)). -* Split global mutex into individual regexp construction. This helps avoid huge regexp construction blocking other related threads. Not sure how to proper test the improvement. [#27211](https://github.com/ClickHouse/ClickHouse/pull/27211) ([Amos Bird](https://github.com/amosbird)). -* Support schema for postgres database engine. Closes [#27166](https://github.com/ClickHouse/ClickHouse/issues/27166). [#27198](https://github.com/ClickHouse/ClickHouse/pull/27198) ([Kseniia Sumarokova](https://github.com/kssenii)). -* Memory client in client. [#27191](https://github.com/ClickHouse/ClickHouse/pull/27191) ([Filatenkov Artur](https://github.com/FArthur-cmd)). -* Mark window functions as ready for general use. Remove the `allow_experimental_window_functions` setting. [#27184](https://github.com/ClickHouse/ClickHouse/pull/27184) ([Alexander Kuzmenkov](https://github.com/akuzm)). -* Try recording `query_kind` even when query fails to start. [#27182](https://github.com/ClickHouse/ClickHouse/pull/27182) ([Amos Bird](https://github.com/amosbird)). +* Split global mutex into individual regexp construction. This helps avoid huge regexp construction blocking other related threads. [#27211](https://github.com/ClickHouse/ClickHouse/pull/27211) ([Amos Bird](https://github.com/amosbird)). +* Support schema for PostgreSQL database engine. Closes [#27166](https://github.com/ClickHouse/ClickHouse/issues/27166). [#27198](https://github.com/ClickHouse/ClickHouse/pull/27198) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Track memory usage in clickhouse-client. [#27191](https://github.com/ClickHouse/ClickHouse/pull/27191) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Try recording `query_kind` in `system.query_log` even when query fails to start. [#27182](https://github.com/ClickHouse/ClickHouse/pull/27182) ([Amos Bird](https://github.com/amosbird)). * Added columns `replica_is_active` that maps replica name to is replica active status to table `system.replicas`. Closes [#27138](https://github.com/ClickHouse/ClickHouse/issues/27138). [#27180](https://github.com/ClickHouse/ClickHouse/pull/27180) ([Maksim Kita](https://github.com/kitaisreal)). * Allow to pass query settings via server URI in Web UI. [#27177](https://github.com/ClickHouse/ClickHouse/pull/27177) ([kolsys](https://github.com/kolsys)). -* add a new metric called MaxPushedDDLEntryID which is the maximum ddl entry id that current node push to zookeeper. 
[#27174](https://github.com/ClickHouse/ClickHouse/pull/27174) ([Fuwang Hu](https://github.com/fuwhu)). -* Add compression for `INTO OUTPUT` that automatically choose compression algorithm. Closes [#3473](https://github.com/ClickHouse/ClickHouse/issues/3473). [#27134](https://github.com/ClickHouse/ClickHouse/pull/27134) ([Filatenkov Artur](https://github.com/FArthur-cmd)). -* Improved the existence condition judgment and empty string node judgment when clickhouse-keeper creates znode. [#27125](https://github.com/ClickHouse/ClickHouse/pull/27125) ([小路](https://github.com/nicelulu)). -* Enable distributed_push_down_limit by default. [#27104](https://github.com/ClickHouse/ClickHouse/pull/27104) ([Azat Khuzhin](https://github.com/azat)). -* Improve compatibility with non-whole-minute timezone offsets. [#27080](https://github.com/ClickHouse/ClickHouse/pull/27080) ([Raúl Marín](https://github.com/Algunenano)). -* * Merge join correctly handles empty set in the right. [#27078](https://github.com/ClickHouse/ClickHouse/pull/27078) ([Vladimir C](https://github.com/vdimir)). +* Add a new metric called `MaxPushedDDLEntryID` which is the maximum ddl entry id that current node push to zookeeper. [#27174](https://github.com/ClickHouse/ClickHouse/pull/27174) ([Fuwang Hu](https://github.com/fuwhu)). +* Improved the existence condition judgment and empty string node judgment when `clickhouse-keeper` creates znode. [#27125](https://github.com/ClickHouse/ClickHouse/pull/27125) ([小路](https://github.com/nicelulu)). +* Merge JOIN correctly handles empty set in the right. [#27078](https://github.com/ClickHouse/ClickHouse/pull/27078) ([Vladimir C](https://github.com/vdimir)). * Now functions can be shard-level constants, which means if it's executed in the context of some distributed table, it generates a normal column, otherwise it produces a constant value. Notable functions are: `hostName()`, `tcpPort()`, `version()`, `buildId()`, `uptime()`, etc. [#27020](https://github.com/ClickHouse/ClickHouse/pull/27020) ([Amos Bird](https://github.com/amosbird)). -* Updated extractAllGroupsHorizontal - upper limit on the number of matches per row can be set via optional third argument. ... [#26961](https://github.com/ClickHouse/ClickHouse/pull/26961) ([Vasily Nemkov](https://github.com/Enmk)). -* Expose rocksdb statistics via system.rocksdb table. Read rocksdb options from ClickHouse config (`rocksdb`/`rocksdb_TABLE` keys). [#26821](https://github.com/ClickHouse/ClickHouse/pull/26821) ([Azat Khuzhin](https://github.com/azat)). -* Less verbose internal RocksDB logs. This closes [#26252](https://github.com/ClickHouse/ClickHouse/issues/26252). [#26789](https://github.com/ClickHouse/ClickHouse/pull/26789) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Updated `extractAllGroupsHorizontal` - upper limit on the number of matches per row can be set via optional third argument. [#26961](https://github.com/ClickHouse/ClickHouse/pull/26961) ([Vasily Nemkov](https://github.com/Enmk)). +* Expose `RocksDB` statistics via system.rocksdb table. Read rocksdb options from ClickHouse config (`rocksdb...` keys). NOTE: ClickHouse does not rely on RocksDB, it is just one of the additional integration storage engines. [#26821](https://github.com/ClickHouse/ClickHouse/pull/26821) ([Azat Khuzhin](https://github.com/azat)). +* Less verbose internal RocksDB logs. NOTE: ClickHouse does not rely on RocksDB, it is just one of the additional integration storage engines. 
This closes [#26252](https://github.com/ClickHouse/ClickHouse/issues/26252). [#26789](https://github.com/ClickHouse/ClickHouse/pull/26789) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Changing default roles affects new sessions only. [#26759](https://github.com/ClickHouse/ClickHouse/pull/26759) ([Vitaly Baranov](https://github.com/vitlibar)). * Watchdog is disabled in docker by default. Fix for not handling ctrl+c. [#26757](https://github.com/ClickHouse/ClickHouse/pull/26757) ([Mikhail f. Shiryaev](https://github.com/Felixoid)). -* remove uncessary exception thrown. [#26740](https://github.com/ClickHouse/ClickHouse/pull/26740) ([Caspian](https://github.com/Cas-pian)). -* Support multiple keys for encrypted disk. Display error message if the key is probably wrong. (see https://github.com/ClickHouse/ClickHouse/pull/26465#issuecomment-882015970). [#26733](https://github.com/ClickHouse/ClickHouse/pull/26733) ([Vitaly Baranov](https://github.com/vitlibar)). -* SET PROFILE now applies constraints too if they're set for a passed profile. [#26730](https://github.com/ClickHouse/ClickHouse/pull/26730) ([Vitaly Baranov](https://github.com/vitlibar)). -* Improve handling of KILL QUERY requests. [#26675](https://github.com/ClickHouse/ClickHouse/pull/26675) ([Raúl Marín](https://github.com/Algunenano)). -* - `mapPopulatesSeries` function supports `Map` type. [#26663](https://github.com/ClickHouse/ClickHouse/pull/26663) ([Ildus Kurbangaliev](https://github.com/ildus)). -* Fix excessive (x2) connect attempts with skip_unavailable_shards. [#26658](https://github.com/ClickHouse/ClickHouse/pull/26658) ([Azat Khuzhin](https://github.com/azat)). -* Avoid hanging clickhouse-benchmark if connection fails (i.e. on EMFILE). [#26656](https://github.com/ClickHouse/ClickHouse/pull/26656) ([Azat Khuzhin](https://github.com/azat)). -* Improve the high performance machine to use the kafka engine. and it can recuce the query node work load. [#26642](https://github.com/ClickHouse/ClickHouse/pull/26642) ([feihengye](https://github.com/feihengye)). -* Add round-robin support for clickhouse-benchmark (it does not differ from the regular multi host/port run except for statistics report). [#26607](https://github.com/ClickHouse/ClickHouse/pull/26607) ([Azat Khuzhin](https://github.com/azat)). -* Executable dictionaries (ExecutableDictionarySource, ExecutablePoolDictionarySource) enable creation with DDL query using clickhouse-local. Closes [#22355](https://github.com/ClickHouse/ClickHouse/issues/22355). [#26510](https://github.com/ClickHouse/ClickHouse/pull/26510) ([Maksim Kita](https://github.com/kitaisreal)). -* - Set client query kind for mysql and postgresql handler. [#26498](https://github.com/ClickHouse/ClickHouse/pull/26498) ([anneji-dev](https://github.com/anneji-dev)). +* `SET PROFILE` now applies constraints too if they're set for a passed profile. [#26730](https://github.com/ClickHouse/ClickHouse/pull/26730) ([Vitaly Baranov](https://github.com/vitlibar)). +* Improve handling of `KILL QUERY` requests. [#26675](https://github.com/ClickHouse/ClickHouse/pull/26675) ([Raúl Marín](https://github.com/Algunenano)). +* `mapPopulatesSeries` function supports `Map` type. [#26663](https://github.com/ClickHouse/ClickHouse/pull/26663) ([Ildus Kurbangaliev](https://github.com/ildus)). +* Fix excessive (x2) connect attempts with `skip_unavailable_shards`. [#26658](https://github.com/ClickHouse/ClickHouse/pull/26658) ([Azat Khuzhin](https://github.com/azat)). 
+* Avoid hanging `clickhouse-benchmark` if connection fails (i.e. on EMFILE). [#26656](https://github.com/ClickHouse/ClickHouse/pull/26656) ([Azat Khuzhin](https://github.com/azat)). +* Allow more threads to be used by the Kafka engine. [#26642](https://github.com/ClickHouse/ClickHouse/pull/26642) ([feihengye](https://github.com/feihengye)). +* Add round-robin support for `clickhouse-benchmark` (it does not differ from the regular multi host/port run except for statistics report). [#26607](https://github.com/ClickHouse/ClickHouse/pull/26607) ([Azat Khuzhin](https://github.com/azat)). +* Executable dictionaries (`executable`, `executable_pool`) enable creation with DDL query using `clickhouse-local`. Closes [#22355](https://github.com/ClickHouse/ClickHouse/issues/22355). [#26510](https://github.com/ClickHouse/ClickHouse/pull/26510) ([Maksim Kita](https://github.com/kitaisreal)). +* Set client query kind for `mysql` and `postgresql` compatibility protocol handlers. [#26498](https://github.com/ClickHouse/ClickHouse/pull/26498) ([anneji-dev](https://github.com/anneji-dev)). * Apply `LIMIT` on the shards for queries like `SELECT * FROM dist ORDER BY key LIMIT 10` w/ `distributed_push_down_limit=1`. Avoid running `Distinct`/`LIMIT BY` steps for queries like `SELECT DISTINCT shading_key FROM dist ORDER BY key`. Now `distributed_push_down_limit` is respected by `optimize_distributed_group_by_sharding_key` optimization. [#26466](https://github.com/ClickHouse/ClickHouse/pull/26466) ([Azat Khuzhin](https://github.com/azat)). -* After https://github.com/ClickHouse/ClickHouse/pull/26377. Encryption algorithm now should be specified explicitly if it's not default (`aes_128_ctr`):. [#26465](https://github.com/ClickHouse/ClickHouse/pull/26465) ([Vitaly Baranov](https://github.com/vitlibar)). * Updated protobuf to 3.17.3. Changelogs are available on https://github.com/protocolbuffers/protobuf/releases. [#26424](https://github.com/ClickHouse/ClickHouse/pull/26424) ([Ilya Yatsishin](https://github.com/qoega)). * Enable `use_hedged_requests` setting that allows to mitigate tail latencies on large clusters. [#26380](https://github.com/ClickHouse/ClickHouse/pull/26380) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Added comments for the code written in https://github.com/ClickHouse/ClickHouse/pull/24206; the code has been improved in several places. [#26377](https://github.com/ClickHouse/ClickHouse/pull/26377) ([Vitaly Baranov](https://github.com/vitlibar)). -* Fix behaviour with non-existing host in user allowed host list. [#26368](https://github.com/ClickHouse/ClickHouse/pull/26368) ([ianton-ru](https://github.com/ianton-ru)). -* Add ability to set Distributed directory monitor settings via CREATE TABLE (i.e. `CREATE TABLE dist (key Int) Engine=Distributed(cluster, db, table) SETTINGS monitor_batch_inserts=1` and similar). [#26336](https://github.com/ClickHouse/ClickHouse/pull/26336) ([Azat Khuzhin](https://github.com/azat)). +* Improve behaviour with non-existing host in user allowed host list. [#26368](https://github.com/ClickHouse/ClickHouse/pull/26368) ([ianton-ru](https://github.com/ianton-ru)). +* Add ability to set `Distributed` directory monitor settings via CREATE TABLE (i.e. `CREATE TABLE dist (key Int) Engine=Distributed(cluster, db, table) SETTINGS monitor_batch_inserts=1` and similar). [#26336](https://github.com/ClickHouse/ClickHouse/pull/26336) ([Azat Khuzhin](https://github.com/azat)). * Save server address in history URLs in web UI if it differs from the origin of web UI. 
This closes [#26044](https://github.com/ClickHouse/ClickHouse/issues/26044). [#26322](https://github.com/ClickHouse/ClickHouse/pull/26322) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Add events to profile calls to sleep / sleepEachRow. [#26320](https://github.com/ClickHouse/ClickHouse/pull/26320) ([Raúl Marín](https://github.com/Algunenano)). +* Add events to profile calls to `sleep` / `sleepEachRow`. [#26320](https://github.com/ClickHouse/ClickHouse/pull/26320) ([Raúl Marín](https://github.com/Algunenano)). * Allow to reuse connections of shards among different clusters. It also avoids creating new connections when using `cluster` table function. [#26318](https://github.com/ClickHouse/ClickHouse/pull/26318) ([Amos Bird](https://github.com/amosbird)). * Control the execution period of clear old temporary directories by parameter with default value. [#26212](https://github.com/ClickHouse/ClickHouse/issues/26212). [#26313](https://github.com/ClickHouse/ClickHouse/pull/26313) ([fastio](https://github.com/fastio)). * Add a setting `function_range_max_elements_in_block` to tune the safety threshold for data volume generated by function `range`. This closes [#26303](https://github.com/ClickHouse/ClickHouse/issues/26303). [#26305](https://github.com/ClickHouse/ClickHouse/pull/26305) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* - When client connect to server, he receives information about all warnings that are already were collected by server. (It can be disabled by using option `--no-warnings`). [#26282](https://github.com/ClickHouse/ClickHouse/pull/26282) ([Filatenkov Artur](https://github.com/FArthur-cmd)). -* Make `toTimeZone` monotonicity when timeZone is a constant value to support partition puring when use sql like:. [#26261](https://github.com/ClickHouse/ClickHouse/pull/26261) ([huangzhaowei](https://github.com/SaintBacchus)). -* Check hash function at table creation, not at sampling. Add settings in MergeTreeSettings, if someone create a table with incorrect sampling column but sampling never be used, disable this settings for starting the server without exception. [#26256](https://github.com/ClickHouse/ClickHouse/pull/26256) ([zhaoyu](https://github.com/zxc111)). -* - Add `system.warnings` table to collect warnings about server configuration. [#26246](https://github.com/ClickHouse/ClickHouse/pull/26246) ([Filatenkov Artur](https://github.com/FArthur-cmd)). +* Check hash function at table creation, not at sampling. Add settings for MergeTree, if someone create a table with incorrect sampling column but sampling never be used, disable this settings for starting the server without exception. [#26256](https://github.com/ClickHouse/ClickHouse/pull/26256) ([zhaoyu](https://github.com/zxc111)). * Added `output_format_avro_string_column_pattern` setting to put specified String columns to Avro as string instead of default bytes. Implements [#22414](https://github.com/ClickHouse/ClickHouse/issues/22414). [#26245](https://github.com/ClickHouse/ClickHouse/pull/26245) ([Ilya Golshtein](https://github.com/ilejn)). * Add information about column sizes in `system.columns` table for `Log` and `TinyLog` tables. This closes [#9001](https://github.com/ClickHouse/ClickHouse/issues/9001). [#26241](https://github.com/ClickHouse/ClickHouse/pull/26241) ([Nikolay Degterinsky](https://github.com/evillique)). * Don't throw exception when querying `system.detached_parts` table if there is custom disk configuration and `detached` directory does not exist on some disks. 
This closes [#26078](https://github.com/ClickHouse/ClickHouse/issues/26078). [#26236](https://github.com/ClickHouse/ClickHouse/pull/26236) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Check for non-deterministic functions in keys, including constant expressions like `now()`, `today()`. This closes [#25875](https://github.com/ClickHouse/ClickHouse/issues/25875). This closes [#11333](https://github.com/ClickHouse/ClickHouse/issues/11333). [#26235](https://github.com/ClickHouse/ClickHouse/pull/26235) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* convert timestamp and timestamptz data types to DateTime64 in postgres engine. [#26234](https://github.com/ClickHouse/ClickHouse/pull/26234) ([jasine](https://github.com/jasine)). +* convert timestamp and timestamptz data types to `DateTime64` in PostgreSQL table engine. [#26234](https://github.com/ClickHouse/ClickHouse/pull/26234) ([jasine](https://github.com/jasine)). * Apply aggressive IN index analysis for projections so that better projection candidate can be selected. [#26218](https://github.com/ClickHouse/ClickHouse/pull/26218) ([Amos Bird](https://github.com/amosbird)). * Remove GLOBAL keyword for IN when scalar function is passed. In previous versions, if user specified `GLOBAL IN f(x)` exception was thrown. [#26217](https://github.com/ClickHouse/ClickHouse/pull/26217) ([Amos Bird](https://github.com/amosbird)). * Add error id (like `BAD_ARGUMENTS`) to exception messages. This closes [#25862](https://github.com/ClickHouse/ClickHouse/issues/25862). [#26172](https://github.com/ClickHouse/ClickHouse/pull/26172) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Fix incorrect output with --progress option for clickhouse-local. Progress bar will be cleared once it gets to 100% - same as it is done for clickhouse-client. Closes [#17484](https://github.com/ClickHouse/ClickHouse/issues/17484). [#26128](https://github.com/ClickHouse/ClickHouse/pull/26128) ([Kseniia Sumarokova](https://github.com/kssenii)). -* add merge_selecting_sleep_ms setting. [#26120](https://github.com/ClickHouse/ClickHouse/pull/26120) ([lthaooo](https://github.com/lthaooo)). +* Add `merge_selecting_sleep_ms` setting. [#26120](https://github.com/ClickHouse/ClickHouse/pull/26120) ([lthaooo](https://github.com/lthaooo)). * Remove complicated usage of Linux AIO with one block readahead and replace it with plain simple synchronous IO with O_DIRECT. In previous versions, the setting `min_bytes_to_use_direct_io` may not work correctly if `max_threads` is greater than one. Reading with direct IO (that is disabled by default for queries and enabled by default for large merges) will work in less efficient way. This closes [#25997](https://github.com/ClickHouse/ClickHouse/issues/25997). [#26003](https://github.com/ClickHouse/ClickHouse/pull/26003) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* If file descriptor in `File` table is regular file - allow to read multiple times from it. It allows `clickhouse-local` to read multiple times from stdin (with multiple SELECT queries or subqueries) if stdin is a regular file like `clickhouse-local --query "SELECT * FROM table UNION ALL SELECT * FROM table" ... < file`. This closes [#11124](https://github.com/ClickHouse/ClickHouse/issues/11124). Co-authored with @alexey-milovidov. [#25960](https://github.com/ClickHouse/ClickHouse/pull/25960) ([BoloniniD](https://github.com/BoloniniD)). -* - Flush `Distributed` table on `REPLACE TABLE` query. 
Resolves [#24566](https://github.com/ClickHouse/ClickHouse/issues/24566) - Do not replace (or create) table on `[CREATE OR] REPLACE TABLE ... AS SELECT` query if insertion into new table fails. Resolves [#23175](https://github.com/ClickHouse/ClickHouse/issues/23175). [#25895](https://github.com/ClickHouse/ClickHouse/pull/25895) ([tavplubix](https://github.com/tavplubix)). +* Flush `Distributed` table on `REPLACE TABLE` query. Resolves [#24566](https://github.com/ClickHouse/ClickHouse/issues/24566) - Do not replace (or create) table on `[CREATE OR] REPLACE TABLE ... AS SELECT` query if insertion into new table fails. Resolves [#23175](https://github.com/ClickHouse/ClickHouse/issues/23175). [#25895](https://github.com/ClickHouse/ClickHouse/pull/25895) ([tavplubix](https://github.com/tavplubix)). +* Add `views` column to system.query_log containing the names of the (materialized or live) views executed by the query. Adds a new log table (`system.query_views_log`) that contains information about each view executed during a query. Modifies view execution: when an exception is thrown while executing a view, any view that has already started will continue running until it finishes. This used to be the behaviour under parallel_view_processing=true and now it's always the same behaviour. Dependent views now report reading progress to the context (an example query is sketched at the end of these release notes). [#25714](https://github.com/ClickHouse/ClickHouse/pull/25714) ([Raúl Marín](https://github.com/Algunenano)). * Do connection draining asynchronously upon finishing executing distributed queries. A new server setting is added `max_threads_for_connection_collector` which specifies the number of workers to recycle connections in background. If the pool is full, the connection will be drained synchronously but a bit differently than before: it's drained after we send EOS to the client, the query will succeed immediately after receiving enough data, and any exception will be logged instead of being thrown to the client. Added setting `drain_timeout` (3 seconds by default). Connection draining will disconnect upon timeout. [#25674](https://github.com/ClickHouse/ClickHouse/pull/25674) ([Amos Bird](https://github.com/amosbird)). * Support for multiple includes in configuration. It is possible to include users configuration, remote servers configuration from multiple sources. Simply place `` element with `from_zk`, `from_env` or `incl` attribute and it will be replaced with the substitution. [#24404](https://github.com/ClickHouse/ClickHouse/pull/24404) ([nvartolomei](https://github.com/nvartolomei)). * Fix multiple block insertion into distributed table with `insert_distributed_one_random_shard = 1`. This is a marginal feature. Mark as improvement. [#23140](https://github.com/ClickHouse/ClickHouse/pull/23140) ([Amos Bird](https://github.com/amosbird)). * Support `LowCardinality` and `FixedString` keys/values for `Map` type. 
[#21543](https://github.com/ClickHouse/ClickHouse/pull/21543) ([hexiaoting](https://github.com/hexiaoting)). * Enable reloading of local disk config. [#19526](https://github.com/ClickHouse/ClickHouse/pull/19526) ([taiyang-li](https://github.com/taiyang-li)). -* Now we ran stateful and stateless tests in random timezones. Fixes [#12439](https://github.com/ClickHouse/ClickHouse/issues/12439). Reading String as DateTime and writing DateTime as String in Protobuf format now respect timezone. Reading UInt16 as DateTime in Arrow and Parquet formats now treat it as Date and then converts to DateTime with respect to DateTime's timezone, because Date is serialized in Arrow and Parquet as UInt16. GraphiteMergeTree now respect time zone for rounding of times. Fixes [#5098](https://github.com/ClickHouse/ClickHouse/issues/5098). Author: @alexey-milovidov. [#15408](https://github.com/ClickHouse/ClickHouse/pull/15408) ([alesapin](https://github.com/alesapin)). -* Now KeyConditions can correctly skip nullable keys, including isNull and isNotNull. https://github.com/ClickHouse/ClickHouse/pull/12433 Cond. [#12455](https://github.com/ClickHouse/ClickHouse/pull/12455) ([Amos Bird](https://github.com/amosbird)). +* Now KeyConditions can correctly skip nullable keys, including `isNull` and `isNotNull`. https://github.com/ClickHouse/ClickHouse/pull/12433. [#12455](https://github.com/ClickHouse/ClickHouse/pull/12455) ([Amos Bird](https://github.com/amosbird)). #### Bug Fix * Fix a couple of bugs that may cause replicas to diverge. [#27808](https://github.com/ClickHouse/ClickHouse/pull/27808) ([tavplubix](https://github.com/tavplubix)). * Fix a rare bug in `DROP PART` which can lead to the error `Unexpected merged part intersects drop range`. [#27807](https://github.com/ClickHouse/ClickHouse/pull/27807) ([alesapin](https://github.com/alesapin)). * Prevent crashes for some formats when NULL (tombstone) message was coming from Kafka. Closes [#19255](https://github.com/ClickHouse/ClickHouse/issues/19255). [#27794](https://github.com/ClickHouse/ClickHouse/pull/27794) ([filimonov](https://github.com/filimonov)). -* Fix possible crash when asynchronous connection draining is enabled and hedged connection is disabled. [#27774](https://github.com/ClickHouse/ClickHouse/pull/27774) ([Amos Bird](https://github.com/amosbird)). -* Remove duplicated source files in CMakeLists.txt in arrow-cmake. [#27736](https://github.com/ClickHouse/ClickHouse/pull/27736) ([taiyang-li](https://github.com/taiyang-li)). -* After https://github.com/ClickHouse/ClickHouse/pull/25687. Add backquotes for the default database shown in CREATE USER. [#27702](https://github.com/ClickHouse/ClickHouse/pull/27702) ([Vitaly Baranov](https://github.com/vitlibar)). -* After https://github.com/ClickHouse/ClickHouse/pull/26384. To execute `GRANT WITH REPLACE OPTION` now the current user should have `GRANT OPTION` for access rights it's going to grant AND for access rights it's going to revoke. [#27701](https://github.com/ClickHouse/ClickHouse/pull/27701) ([Vitaly Baranov](https://github.com/vitlibar)). * Fix column filtering with union distinct in subquery. Closes [#27578](https://github.com/ClickHouse/ClickHouse/issues/27578). [#27689](https://github.com/ClickHouse/ClickHouse/pull/27689) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix bad type cast when functions like `arrayHas` are applied to arrays of LowCardinality of Nullable of different non-numeric types like `DateTime` and `DateTime64`. In previous versions bad cast occurs. 
In new version it will lead to exception. This closes [#26330](https://github.com/ClickHouse/ClickHouse/issues/26330). [#27682](https://github.com/ClickHouse/ClickHouse/pull/27682) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Fix postgresql table function resulting in non-closing connections. Closes [#26088](https://github.com/ClickHouse/ClickHouse/issues/26088). [#27662](https://github.com/ClickHouse/ClickHouse/pull/27662) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fixed another case of `Unexpected merged part ... intersecting drop range ...` error. [#27656](https://github.com/ClickHouse/ClickHouse/pull/27656) ([tavplubix](https://github.com/tavplubix)). -* - Fix bug with aliased column in `Distributed` table. [#27652](https://github.com/ClickHouse/ClickHouse/pull/27652) ([Vladimir C](https://github.com/vdimir)). +* Fix an error with aliased column in `Distributed` table. [#27652](https://github.com/ClickHouse/ClickHouse/pull/27652) ([Vladimir C](https://github.com/vdimir)). * After setting `max_memory_usage*` to non-zero value it was not possible to reset it back to 0 (unlimited). It's fixed. [#27638](https://github.com/ClickHouse/ClickHouse/pull/27638) ([tavplubix](https://github.com/tavplubix)). * Fixed underflow of the time value when constructing it from components. Closes [#27193](https://github.com/ClickHouse/ClickHouse/issues/27193). [#27605](https://github.com/ClickHouse/ClickHouse/pull/27605) ([Vasily Nemkov](https://github.com/Enmk)). * Fix crash during projection materialization when some parts contain missing columns. This fixes [#27512](https://github.com/ClickHouse/ClickHouse/issues/27512). [#27528](https://github.com/ClickHouse/ClickHouse/pull/27528) ([Amos Bird](https://github.com/amosbird)). -* fix metric BackgroundMessageBrokerSchedulePoolTask, maybe mistyped。. [#27452](https://github.com/ClickHouse/ClickHouse/pull/27452) ([Ben](https://github.com/benbiti)). +* fix metric `BackgroundMessageBrokerSchedulePoolTask`, maybe mistyped. [#27452](https://github.com/ClickHouse/ClickHouse/pull/27452) ([Ben](https://github.com/benbiti)). * Fix distributed queries with zero shards and aggregation. [#27427](https://github.com/ClickHouse/ClickHouse/pull/27427) ([Azat Khuzhin](https://github.com/azat)). -* /proc/info contains metrics like. [#27361](https://github.com/ClickHouse/ClickHouse/pull/27361) ([Mike Kot](https://github.com/myrrc)). -* Fix incorrect result for query with row-level security, prewhere and LowCardinality filter. Fixes [#27179](https://github.com/ClickHouse/ClickHouse/issues/27179). [#27329](https://github.com/ClickHouse/ClickHouse/pull/27329) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Compatibility when `/proc/meminfo` does not contain KB suffix. [#27361](https://github.com/ClickHouse/ClickHouse/pull/27361) ([Mike Kot](https://github.com/myrrc)). +* Fix incorrect result for query with row-level security, PREWHERE and LowCardinality filter. Fixes [#27179](https://github.com/ClickHouse/ClickHouse/issues/27179). [#27329](https://github.com/ClickHouse/ClickHouse/pull/27329) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Fixed incorrect validation of partition id for MergeTree tables that created with old syntax. [#27328](https://github.com/ClickHouse/ClickHouse/pull/27328) ([tavplubix](https://github.com/tavplubix)). -* Fix Mysql protocol when using parallel formats (CSV / TSV). [#27326](https://github.com/ClickHouse/ClickHouse/pull/27326) ([Raúl Marín](https://github.com/Algunenano)). 
+* Fix MySQL protocol when using parallel formats (CSV / TSV). [#27326](https://github.com/ClickHouse/ClickHouse/pull/27326) ([Raúl Marín](https://github.com/Algunenano)). * Fix `Cannot find column` error for queries with sampling. Was introduced in [#24574](https://github.com/ClickHouse/ClickHouse/issues/24574). Fixes [#26522](https://github.com/ClickHouse/ClickHouse/issues/26522). [#27301](https://github.com/ClickHouse/ClickHouse/pull/27301) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix errors like `Expected ColumnLowCardinality, gotUInt8` or `Bad cast from type DB::ColumnVector to DB::ColumnLowCardinality` for some queries with `LowCardinality` in `PREWHERE`. Fixes [#23515](https://github.com/ClickHouse/ClickHouse/issues/23515). [#27298](https://github.com/ClickHouse/ClickHouse/pull/27298) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Fix `distributed_group_by_no_merge=2`+`distributed_push_down_limit=1` or `optimize_distributed_group_by_sharding_key=1` with `LIMIT BY` and `LIMIT OFFSET`. [#27249](https://github.com/ClickHouse/ClickHouse/pull/27249) ([Azat Khuzhin](https://github.com/azat)). +* Fix errors like `Expected ColumnLowCardinality, gotUInt8` or `Bad cast from type DB::ColumnVector to DB::ColumnLowCardinality` for some queries with `LowCardinality` in `PREWHERE`. And more importantly, fix the lack of whitespace in the error message. Fixes [#23515](https://github.com/ClickHouse/ClickHouse/issues/23515). [#27298](https://github.com/ClickHouse/ClickHouse/pull/27298) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). +* Fix `distributed_group_by_no_merge = 2` with `distributed_push_down_limit = 1` or `optimize_distributed_group_by_sharding_key = 1` with `LIMIT BY` and `LIMIT OFFSET`. [#27249](https://github.com/ClickHouse/ClickHouse/pull/27249) ([Azat Khuzhin](https://github.com/azat)). This is an obscure combination of settings that no one is using. * Fix mutation stuck on invalid partitions in non-replicated MergeTree. [#27248](https://github.com/ClickHouse/ClickHouse/pull/27248) ([Azat Khuzhin](https://github.com/azat)). * In case of ambiguity, lambda functions prefer their arguments to other aliases or identifiers. [#27235](https://github.com/ClickHouse/ClickHouse/pull/27235) ([Raúl Marín](https://github.com/Algunenano)). * Fix column structure in merge join, close [#27091](https://github.com/ClickHouse/ClickHouse/issues/27091). [#27217](https://github.com/ClickHouse/ClickHouse/pull/27217) ([Vladimir C](https://github.com/vdimir)). -* Fix on-disk format breakage for secondary indices over Nullable column (no stable release had been affected). [#27197](https://github.com/ClickHouse/ClickHouse/pull/27197) ([Azat Khuzhin](https://github.com/azat)). * In rare cases `system.detached_parts` table might contain incorrect information for some parts, it's fixed. Fixes [#27114](https://github.com/ClickHouse/ClickHouse/issues/27114). [#27183](https://github.com/ClickHouse/ClickHouse/pull/27183) ([tavplubix](https://github.com/tavplubix)). -* - Fix uninitialized memory in functions `multiSearch*` with empty array, close [#27169](https://github.com/ClickHouse/ClickHouse/issues/27169). [#27181](https://github.com/ClickHouse/ClickHouse/pull/27181) ([Vladimir C](https://github.com/vdimir)). -* Fix synchronization in GRPCServer This PR fixes [#27024](https://github.com/ClickHouse/ClickHouse/issues/27024). [#27064](https://github.com/ClickHouse/ClickHouse/pull/27064) ([Vitaly Baranov](https://github.com/vitlibar)). 
+* Fix uninitialized memory in functions `multiSearch*` with empty array, close [#27169](https://github.com/ClickHouse/ClickHouse/issues/27169). [#27181](https://github.com/ClickHouse/ClickHouse/pull/27181) ([Vladimir C](https://github.com/vdimir)). +* Fix synchronization in GRPCServer. This PR fixes [#27024](https://github.com/ClickHouse/ClickHouse/issues/27024). [#27064](https://github.com/ClickHouse/ClickHouse/pull/27064) ([Vitaly Baranov](https://github.com/vitlibar)). * Fixed `cache`, `complex_key_cache`, `ssd_cache`, `complex_key_ssd_cache` configuration parsing. Options `allow_read_expired_keys`, `max_update_queue_size`, `update_queue_push_timeout_milliseconds`, `query_wait_timeout_milliseconds` were not parsed for dictionaries with a non-`cache` type. [#27032](https://github.com/ClickHouse/ClickHouse/pull/27032) ([Maksim Kita](https://github.com/kitaisreal)). -* [RFC] Fix possible mutation stack due to race with DROP_RANGE. [#27002](https://github.com/ClickHouse/ClickHouse/pull/27002) ([Azat Khuzhin](https://github.com/azat)). +* Fix possible mutation being stuck due to a race with DROP_RANGE. [#27002](https://github.com/ClickHouse/ClickHouse/pull/27002) ([Azat Khuzhin](https://github.com/azat)). * Now partition ID in queries like `ALTER TABLE ... PARTITION ID xxx` is validated for correctness. Fixes [#25718](https://github.com/ClickHouse/ClickHouse/issues/25718). [#26963](https://github.com/ClickHouse/ClickHouse/pull/26963) ([alesapin](https://github.com/alesapin)). * Fix "Unknown column name" error with multiple JOINs in some cases, close [#26899](https://github.com/ClickHouse/ClickHouse/issues/26899). [#26957](https://github.com/ClickHouse/ClickHouse/pull/26957) ([Vladimir C](https://github.com/vdimir)). * Fix reading of custom TLDs (stops processing with lower buffer or bigger file). [#26948](https://github.com/ClickHouse/ClickHouse/pull/26948) ([Azat Khuzhin](https://github.com/azat)). * Fix error `Missing columns: 'xxx'` when `DEFAULT` column references another non-materialized column without `DEFAULT` expression. Fixes [#26591](https://github.com/ClickHouse/ClickHouse/issues/26591). [#26900](https://github.com/ClickHouse/ClickHouse/pull/26900) ([alesapin](https://github.com/alesapin)). -* Fix library-bridge ids load. [#26834](https://github.com/ClickHouse/ClickHouse/pull/26834) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix loading of dictionary keys in `library-bridge` for `library` dictionary source. [#26834](https://github.com/ClickHouse/ClickHouse/pull/26834) ([Kseniia Sumarokova](https://github.com/kssenii)). * Aggregate function parameters might be lost when applying some combinators causing exceptions like `Conversion from AggregateFunction(topKArray, Array(String)) to AggregateFunction(topKArray(10), Array(String)) is not supported`. It's fixed. Fixes [#26196](https://github.com/ClickHouse/ClickHouse/issues/26196) and [#26433](https://github.com/ClickHouse/ClickHouse/issues/26433). [#26814](https://github.com/ClickHouse/ClickHouse/pull/26814) ([tavplubix](https://github.com/tavplubix)). * Add `event_time_microseconds` value for `REMOVE_PART` in `system.part_log`. In previous versions it was not set. [#26720](https://github.com/ClickHouse/ClickHouse/pull/26720) ([Azat Khuzhin](https://github.com/azat)). * Do not remove data on ReplicatedMergeTree table shutdown to avoid creating data-to-metadata inconsistency. [#26716](https://github.com/ClickHouse/ClickHouse/pull/26716) ([nvartolomei](https://github.com/nvartolomei)). 
-* Sometimes SET ROLE could work incorrectly, this PR fixes that. [#26707](https://github.com/ClickHouse/ClickHouse/pull/26707) ([Vitaly Baranov](https://github.com/vitlibar)). -* ParallelFormattingOutputFormat: Use mutex to handle the join to the collector_thread (https://github.com/ClickHouse/ClickHouse/issues/26694). [#26703](https://github.com/ClickHouse/ClickHouse/pull/26703) ([Raúl Marín](https://github.com/Algunenano)). +* Sometimes `SET ROLE` could work incorrectly, this PR fixes that. [#26707](https://github.com/ClickHouse/ClickHouse/pull/26707) ([Vitaly Baranov](https://github.com/vitlibar)). +* Some fixes for parallel formatting (https://github.com/ClickHouse/ClickHouse/issues/26694). [#26703](https://github.com/ClickHouse/ClickHouse/pull/26703) ([Raúl Marín](https://github.com/Algunenano)). * Fix potential nullptr dereference in window functions. This fixes [#25276](https://github.com/ClickHouse/ClickHouse/issues/25276). [#26668](https://github.com/ClickHouse/ClickHouse/pull/26668) ([Alexander Kuzmenkov](https://github.com/akuzm)). -* Fix history file conversion if file is empty. [#26589](https://github.com/ClickHouse/ClickHouse/pull/26589) ([Azat Khuzhin](https://github.com/azat)). -* Fix incorrect function names of groupBitmapAnd/Or/Xor. This fixes. [#26557](https://github.com/ClickHouse/ClickHouse/pull/26557) ([Amos Bird](https://github.com/amosbird)). +* Fix clickhouse-client history file conversion (when upgrading from the format of 3 years old version of clickhouse-client) if file is empty. [#26589](https://github.com/ClickHouse/ClickHouse/pull/26589) ([Azat Khuzhin](https://github.com/azat)). +* Fix incorrect function names of groupBitmapAnd/Or/Xor (can be displayed in some occasions). This fixes. [#26557](https://github.com/ClickHouse/ClickHouse/pull/26557) ([Amos Bird](https://github.com/amosbird)). * Update `chown` cmd check in clickhouse-server docker entrypoint. It fixes the bug that cluster pod restart failed (or timeout) on kubernetes. [#26545](https://github.com/ClickHouse/ClickHouse/pull/26545) ([Ky Li](https://github.com/Kylinrix)). -* Fix crash in rabbitmq shutdown in case rabbitmq setup was not started. Closes [#26504](https://github.com/ClickHouse/ClickHouse/issues/26504). [#26529](https://github.com/ClickHouse/ClickHouse/pull/26529) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix crash in `RabbitMQ` shutdown in case `RabbitMQ` setup was not started. Closes [#26504](https://github.com/ClickHouse/ClickHouse/issues/26504). [#26529](https://github.com/ClickHouse/ClickHouse/pull/26529) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix issues with `CREATE DICTIONARY` query if dictionary name or database name was quoted. Closes [#26491](https://github.com/ClickHouse/ClickHouse/issues/26491). [#26508](https://github.com/ClickHouse/ClickHouse/pull/26508) ([Maksim Kita](https://github.com/kitaisreal)). -* Fix broken name resolution after rewriting column aliases. This fixes [#26432](https://github.com/ClickHouse/ClickHouse/issues/26432). [#26475](https://github.com/ClickHouse/ClickHouse/pull/26475) ([Amos Bird](https://github.com/amosbird)). +* Fix broken column name resolution after rewriting column aliases. This fixes [#26432](https://github.com/ClickHouse/ClickHouse/issues/26432). [#26475](https://github.com/ClickHouse/ClickHouse/pull/26475) ([Amos Bird](https://github.com/amosbird)). * Fix some fuzzed msan crash. Fixes [#22517](https://github.com/ClickHouse/ClickHouse/issues/22517). 
[#26428](https://github.com/ClickHouse/ClickHouse/pull/26428) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). -* Now, scalar subquery always returns `Nullable` result if it's type can be `Nullable`. It is needed because in case of empty subquery it's result should be `Null`. Previously, it was possible to get error about incompatible types (type deduction does not execute scalar subquery, and it could use not-nullable type). Scalar subquery with empty result which can't be converted to `Nullable` (like `Array` or `Tuple`) now throws error. Fixes [#25411](https://github.com/ClickHouse/ClickHouse/issues/25411). [#26423](https://github.com/ClickHouse/ClickHouse/pull/26423) ([Nikolai Kochetov](https://github.com/KochetovNicolai)). * Fix infinite non joined block stream in `partial_merge_join` close [#26325](https://github.com/ClickHouse/ClickHouse/issues/26325). [#26374](https://github.com/ClickHouse/ClickHouse/pull/26374) ([Vladimir C](https://github.com/vdimir)). * Fix possible crash when login as dropped user. This PR fixes [#26073](https://github.com/ClickHouse/ClickHouse/issues/26073). [#26363](https://github.com/ClickHouse/ClickHouse/pull/26363) ([Vitaly Baranov](https://github.com/vitlibar)). * Fix `optimize_distributed_group_by_sharding_key` for multiple columns (leads to incorrect result w/ `optimize_skip_unused_shards=1`/`allow_nondeterministic_optimize_skip_unused_shards=1` and multiple columns in sharding key expression). [#26353](https://github.com/ClickHouse/ClickHouse/pull/26353) ([Azat Khuzhin](https://github.com/azat)). * Fixed rare bug in lost replica recovery that may cause replicas to diverge. [#26321](https://github.com/ClickHouse/ClickHouse/pull/26321) ([tavplubix](https://github.com/tavplubix)). -* Fix zstd decompression in case there are escape sequences at the end of internal buffer. Closes [#26013](https://github.com/ClickHouse/ClickHouse/issues/26013). [#26314](https://github.com/ClickHouse/ClickHouse/pull/26314) ([Kseniia Sumarokova](https://github.com/kssenii)). +* Fix zstd decompression (for import/export in zstd framing format that is unrelated to tables data) in case there are escape sequences at the end of internal buffer. Closes [#26013](https://github.com/ClickHouse/ClickHouse/issues/26013). [#26314](https://github.com/ClickHouse/ClickHouse/pull/26314) ([Kseniia Sumarokova](https://github.com/kssenii)). * Fix logical error on join with totals, close [#26017](https://github.com/ClickHouse/ClickHouse/issues/26017). [#26250](https://github.com/ClickHouse/ClickHouse/pull/26250) ([Vladimir C](https://github.com/vdimir)). * Remove excessive newline in `thread_name` column in `system.stack_trace` table. This fixes [#24124](https://github.com/ClickHouse/ClickHouse/issues/24124). [#26210](https://github.com/ClickHouse/ClickHouse/pull/26210) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Fix potential crash if more than one `untuple` expression is used. [#26179](https://github.com/ClickHouse/ClickHouse/pull/26179) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Don't throw exception in `toString` for nullable enum, close [#25806](https://github.com/ClickHouse/ClickHouse/issues/25806). [#26123](https://github.com/ClickHouse/ClickHouse/pull/26123) ([Vladimir C](https://github.com/vdimir)). +* Don't throw exception in `toString` for Nullable Enum if Enum does not have a value for zero, close [#25806](https://github.com/ClickHouse/ClickHouse/issues/25806). 
[#26123](https://github.com/ClickHouse/ClickHouse/pull/26123) ([Vladimir C](https://github.com/vdimir)). * Fixed incorrect `sequence_id` in MySQL protocol packets that ClickHouse sends on exception during query execution. It might cause MySQL client to reset connection to ClickHouse server. Fixes [#21184](https://github.com/ClickHouse/ClickHouse/issues/21184). [#26051](https://github.com/ClickHouse/ClickHouse/pull/26051) ([tavplubix](https://github.com/tavplubix)). -* Before this patch `cutToFirstSignificantSubdomainCustom()`/`cutToFirstSignificantSubdomainCustomWithWWW()`/`firstSignificantSubdomainCustom()` returns incorrect type for consts, and hence `optimize_skip_unused_shards` does not work:. [#26041](https://github.com/ClickHouse/ClickHouse/pull/26041) ([Azat Khuzhin](https://github.com/azat)). +* Fix for the case that `cutToFirstSignificantSubdomainCustom()`/`cutToFirstSignificantSubdomainCustomWithWWW()`/`firstSignificantSubdomainCustom()` returns incorrect type for consts, and hence `optimize_skip_unused_shards` does not work:. [#26041](https://github.com/ClickHouse/ClickHouse/pull/26041) ([Azat Khuzhin](https://github.com/azat)). * Fix possible mismatched header when using normal projection with prewhere. This fixes [#26020](https://github.com/ClickHouse/ClickHouse/issues/26020). [#26038](https://github.com/ClickHouse/ClickHouse/pull/26038) ([Amos Bird](https://github.com/amosbird)). * Fix sharding_key from column w/o function for remote() (before `select * from remote('127.1', system.one, dummy)` leads to `Unknown column: dummy, there are only columns .` error). [#25824](https://github.com/ClickHouse/ClickHouse/pull/25824) ([Azat Khuzhin](https://github.com/azat)). * Fixed `Not found column ...` and `Missing column ...` errors when selecting from `MaterializeMySQL`. Fixes [#23708](https://github.com/ClickHouse/ClickHouse/issues/23708), [#24830](https://github.com/ClickHouse/ClickHouse/issues/24830), [#25794](https://github.com/ClickHouse/ClickHouse/issues/25794). [#25822](https://github.com/ClickHouse/ClickHouse/pull/25822) ([tavplubix](https://github.com/tavplubix)). @@ -209,20 +201,12 @@ #### Build/Testing/Packaging Improvement -* Improve support for build with `clang-13`. [#27777](https://github.com/ClickHouse/ClickHouse/pull/27777) ([Sergei Semin](https://github.com/syominsergey)). -* Add support for build with `clang-13`. This closes [#27705](https://github.com/ClickHouse/ClickHouse/issues/27705). [#27714](https://github.com/ClickHouse/ClickHouse/pull/27714) ([alexey-milovidov](https://github.com/alexey-milovidov)). +* Now we ran stateful and stateless tests in random timezones. Fixes [#12439](https://github.com/ClickHouse/ClickHouse/issues/12439). Reading String as DateTime and writing DateTime as String in Protobuf format now respect timezone. Reading UInt16 as DateTime in Arrow and Parquet formats now treat it as Date and then converts to DateTime with respect to DateTime's timezone, because Date is serialized in Arrow and Parquet as UInt16. GraphiteMergeTree now respect time zone for rounding of times. Fixes [#5098](https://github.com/ClickHouse/ClickHouse/issues/5098). Author: @alexey-milovidov. [#15408](https://github.com/ClickHouse/ClickHouse/pull/15408) ([alesapin](https://github.com/alesapin)). +* `clickhouse-test` supports SQL tests with [Jinja2](https://jinja.palletsprojects.com/en/3.0.x/templates/#synopsis) templates. [#26579](https://github.com/ClickHouse/ClickHouse/pull/26579) ([Vladimir C](https://github.com/vdimir)). +* Add support for build with `clang-13`. 
This closes [#27705](https://github.com/ClickHouse/ClickHouse/issues/27705). [#27714](https://github.com/ClickHouse/ClickHouse/pull/27714) ([alexey-milovidov](https://github.com/alexey-milovidov)). [#27777](https://github.com/ClickHouse/ClickHouse/pull/27777) ([Sergei Semin](https://github.com/syominsergey)) * Add CMake options to build with or without specific CPU instruction set. This is for [#17469](https://github.com/ClickHouse/ClickHouse/issues/17469) and [#27509](https://github.com/ClickHouse/ClickHouse/issues/27509). [#27508](https://github.com/ClickHouse/ClickHouse/pull/27508) ([alexey-milovidov](https://github.com/alexey-milovidov)). * Fix linking of auxiliary programs when using dynamic libraries. [#26958](https://github.com/ClickHouse/ClickHouse/pull/26958) ([Raúl Marín](https://github.com/Algunenano)). -* Tests: Fix CLICKHOUSE_CLIENT_SECURE with the default config. [#26901](https://github.com/ClickHouse/ClickHouse/pull/26901) ([Raúl Marín](https://github.com/Algunenano)). -* Enabling RBAC TestFlows tests and crossing out new fails. [#26747](https://github.com/ClickHouse/ClickHouse/pull/26747) ([vzakaznikov](https://github.com/vzakaznikov)). -* Fix /clickhouse/window functions/tests/non distributed/errors/error window function in join. [#26744](https://github.com/ClickHouse/ClickHouse/pull/26744) ([vzakaznikov](https://github.com/vzakaznikov)). -* `clickhouse-test` supports SQL tests with [Jinja2](https://jinja.palletsprojects.com/en/3.0.x/templates/#synopsis) templates. [#26579](https://github.com/ClickHouse/ClickHouse/pull/26579) ([Vladimir C](https://github.com/vdimir)). -* Update RocksDB to 2021-07-16 master. [#26411](https://github.com/ClickHouse/ClickHouse/pull/26411) ([alexey-milovidov](https://github.com/alexey-milovidov)). -* Enabling Testflows tests for Kerberos support. [#21659](https://github.com/ClickHouse/ClickHouse/pull/21659) ([Andrey](https://github.com/zvonand)). - -#### Other - -* Rename `MaterializeMySQL` to `MaterializedMySQL`. [#26822](https://github.com/ClickHouse/ClickHouse/pull/26822) ([tavplubix](https://github.com/tavplubix)). +* Update RocksDB to `2021-07-16` master. [#26411](https://github.com/ClickHouse/ClickHouse/pull/26411) ([alexey-milovidov](https://github.com/alexey-milovidov)). 
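To make the new `views` column in `system.query_log` (mentioned in the entry referenced earlier in these notes) concrete, here is a minimal, hedged sketch of how it might be inspected. The table name `src` and the assumption that a materialized view is defined over it are placeholders introduced only for illustration, and the filter on `query` is just one way to find the relevant log entry; none of this is taken verbatim from the release notes.

```sql
-- Assumes a table `src` with at least one materialized view defined over it (hypothetical names).
SET log_queries = 1;                       -- make sure the query is recorded in system.query_log

INSERT INTO src VALUES (1), (2), (3);      -- executing this also runs the dependent views

SYSTEM FLUSH LOGS;                         -- flush in-memory log buffers to the system tables

-- The `views` column lists the (materialized or live) views executed by the query;
-- per-view details are expected in system.query_views_log when that log is enabled.
SELECT query, views
FROM system.query_log
WHERE type = 'QueryFinish' AND query LIKE 'INSERT INTO src%'
ORDER BY event_time DESC
LIMIT 1;
```

The same approach can be used to look at `system.query_views_log`, which the release notes introduce for per-view timing and status.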
### ClickHouse release v21.8, 2021-08-12 From 91293c7449aa8f15b2e01960fccd12d5c38cad10 Mon Sep 17 00:00:00 2001 From: Amos Bird Date: Wed, 8 Sep 2021 23:30:48 +0800 Subject: [PATCH 169/177] Fix crash on exception with projection aggregate --- src/Interpreters/Aggregator.cpp | 20 +++++----- src/Interpreters/Aggregator.h | 27 +++++++------- .../Transforms/AggregatingTransform.cpp | 15 +++++--- .../Transforms/AggregatingTransform.h | 37 ++++++++++++++++--- .../MergeTree/MergeTreeDataSelectExecutor.cpp | 8 +++- ...ference => 01707_join_use_nulls.reference} | 0 ...use_nulls.sql => 01707_join_use_nulls.sql} | 0 ...0_projection_with_mixed_pipeline.reference | 0 .../01710_projection_with_mixed_pipeline.sql | 9 +++++ 9 files changed, 81 insertions(+), 35 deletions(-) rename tests/queries/0_stateless/{01710_join_use_nulls.reference => 01707_join_use_nulls.reference} (100%) rename tests/queries/0_stateless/{01710_join_use_nulls.sql => 01707_join_use_nulls.sql} (100%) create mode 100644 tests/queries/0_stateless/01710_projection_with_mixed_pipeline.reference create mode 100644 tests/queries/0_stateless/01710_projection_with_mixed_pipeline.sql diff --git a/src/Interpreters/Aggregator.cpp b/src/Interpreters/Aggregator.cpp index c26eb10e697..a2896127d04 100644 --- a/src/Interpreters/Aggregator.cpp +++ b/src/Interpreters/Aggregator.cpp @@ -786,7 +786,7 @@ void NO_INLINE Aggregator::executeWithoutKeyImpl( AggregatedDataWithoutKey & res, size_t rows, AggregateFunctionInstruction * aggregate_instructions, - Arena * arena) + Arena * arena) const { #if USE_EMBEDDED_COMPILER if constexpr (use_compiled_functions) @@ -865,7 +865,7 @@ void NO_INLINE Aggregator::executeOnIntervalWithoutKeyImpl( void Aggregator::prepareAggregateInstructions(Columns columns, AggregateColumns & aggregate_columns, Columns & materialized_columns, - AggregateFunctionInstructions & aggregate_functions_instructions, NestedColumnsHolder & nested_columns_holder) + AggregateFunctionInstructions & aggregate_functions_instructions, NestedColumnsHolder & nested_columns_holder) const { for (size_t i = 0; i < params.aggregates_size; ++i) aggregate_columns[i].resize(params.aggregates[i].arguments.size()); @@ -917,7 +917,7 @@ void Aggregator::prepareAggregateInstructions(Columns columns, AggregateColumns bool Aggregator::executeOnBlock(const Block & block, AggregatedDataVariants & result, - ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, bool & no_more_keys) + ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, bool & no_more_keys) const { UInt64 num_rows = block.rows(); return executeOnBlock(block.getColumns(), num_rows, result, key_columns, aggregate_columns, no_more_keys); @@ -925,7 +925,7 @@ bool Aggregator::executeOnBlock(const Block & block, AggregatedDataVariants & re bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedDataVariants & result, - ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, bool & no_more_keys) + ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, bool & no_more_keys) const { /// `result` will destroy the states of aggregate functions in the destructor result.aggregator = this; @@ -1058,7 +1058,7 @@ bool Aggregator::executeOnBlock(Columns columns, UInt64 num_rows, AggregatedData } -void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants, const String & tmp_path) +void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants, const String & tmp_path) const { Stopwatch watch; size_t rows = 
data_variants.size(); @@ -1130,7 +1130,7 @@ void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants, co } -void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants) +void Aggregator::writeToTemporaryFile(AggregatedDataVariants & data_variants) const { String tmp_path = params.tmp_volume->getDisk()->getPath(); return writeToTemporaryFile(data_variants, tmp_path); @@ -1192,7 +1192,7 @@ template void Aggregator::writeToTemporaryFileImpl( AggregatedDataVariants & data_variants, Method & method, - IBlockOutputStream & out) + IBlockOutputStream & out) const { size_t max_temporary_block_size_rows = 0; size_t max_temporary_block_size_bytes = 0; @@ -2311,7 +2311,7 @@ void NO_INLINE Aggregator::mergeWithoutKeyStreamsImpl( block.clear(); } -bool Aggregator::mergeBlock(Block block, AggregatedDataVariants & result, bool & no_more_keys) +bool Aggregator::mergeOnBlock(Block block, AggregatedDataVariants & result, bool & no_more_keys) const { /// `result` will destroy the states of aggregate functions in the destructor result.aggregator = this; @@ -2661,7 +2661,7 @@ void NO_INLINE Aggregator::convertBlockToTwoLevelImpl( } -std::vector Aggregator::convertBlockToTwoLevel(const Block & block) +std::vector Aggregator::convertBlockToTwoLevel(const Block & block) const { if (!block) return {}; @@ -2753,7 +2753,7 @@ void Aggregator::destroyWithoutKey(AggregatedDataVariants & result) const } -void Aggregator::destroyAllAggregateStates(AggregatedDataVariants & result) +void Aggregator::destroyAllAggregateStates(AggregatedDataVariants & result) const { if (result.empty()) return; diff --git a/src/Interpreters/Aggregator.h b/src/Interpreters/Aggregator.h index fde6ba219df..e72fe4baea3 100644 --- a/src/Interpreters/Aggregator.h +++ b/src/Interpreters/Aggregator.h @@ -506,7 +506,7 @@ struct AggregatedDataVariants : private boost::noncopyable * But this can hardly be done simply because it is planned to put variable-length strings into the same pool. * In this case, the pool will not be able to know with what offsets objects are stored. */ - Aggregator * aggregator = nullptr; + const Aggregator * aggregator = nullptr; size_t keys_size{}; /// Number of keys. NOTE do we need this field? Sizes key_sizes; /// Dimensions of keys, if keys of fixed length @@ -975,11 +975,14 @@ public: /// Process one block. Return false if the processing should be aborted (with group_by_overflow_mode = 'break'). bool executeOnBlock(const Block & block, AggregatedDataVariants & result, ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, /// Passed to not create them anew for each block - bool & no_more_keys); + bool & no_more_keys) const; bool executeOnBlock(Columns columns, UInt64 num_rows, AggregatedDataVariants & result, ColumnRawPtrs & key_columns, AggregateColumns & aggregate_columns, /// Passed to not create them anew for each block - bool & no_more_keys); + bool & no_more_keys) const; + + /// Used for aggregate projection. + bool mergeOnBlock(Block block, AggregatedDataVariants & result, bool & no_more_keys) const; /** Convert the aggregation data structure into a block. * If overflow_row = true, then aggregates for rows that are not included in max_rows_to_group_by are put in the first block. @@ -996,8 +999,6 @@ public: /// Merge partially aggregated blocks separated to buckets into one data structure. 
void mergeBlocks(BucketToBlocks bucket_to_blocks, AggregatedDataVariants & result, size_t max_threads); - bool mergeBlock(Block block, AggregatedDataVariants & result, bool & no_more_keys); - /// Merge several partially aggregated blocks into one. /// Precondition: for all blocks block.info.is_overflows flag must be the same. /// (either all blocks are from overflow data or none blocks are). @@ -1007,11 +1008,11 @@ public: /** Split block with partially-aggregated data to many blocks, as if two-level method of aggregation was used. * This is needed to simplify merging of that data with other results, that are already two-level. */ - std::vector convertBlockToTwoLevel(const Block & block); + std::vector convertBlockToTwoLevel(const Block & block) const; /// For external aggregation. - void writeToTemporaryFile(AggregatedDataVariants & data_variants, const String & tmp_path); - void writeToTemporaryFile(AggregatedDataVariants & data_variants); + void writeToTemporaryFile(AggregatedDataVariants & data_variants, const String & tmp_path) const; + void writeToTemporaryFile(AggregatedDataVariants & data_variants) const; bool hasTemporaryFiles() const { return !temporary_files.empty(); } @@ -1083,7 +1084,7 @@ private: Poco::Logger * log = &Poco::Logger::get("Aggregator"); /// For external aggregation. - TemporaryFiles temporary_files; + mutable TemporaryFiles temporary_files; #if USE_EMBEDDED_COMPILER std::shared_ptr compiled_aggregate_functions_holder; @@ -1106,7 +1107,7 @@ private: /** Call `destroy` methods for states of aggregate functions. * Used in the exception handler for aggregation, since RAII in this case is not applicable. */ - void destroyAllAggregateStates(AggregatedDataVariants & result); + void destroyAllAggregateStates(AggregatedDataVariants & result) const; /// Process one data block, aggregate the data into a hash table. @@ -1136,7 +1137,7 @@ private: AggregatedDataWithoutKey & res, size_t rows, AggregateFunctionInstruction * aggregate_instructions, - Arena * arena); + Arena * arena) const; static void executeOnIntervalWithoutKeyImpl( AggregatedDataWithoutKey & res, @@ -1149,7 +1150,7 @@ private: void writeToTemporaryFileImpl( AggregatedDataVariants & data_variants, Method & method, - IBlockOutputStream & out); + IBlockOutputStream & out) const; /// Merge NULL key data from hash table `src` into `dst`. 
template @@ -1304,7 +1305,7 @@ private: AggregateColumns & aggregate_columns, Columns & materialized_columns, AggregateFunctionInstructions & instructions, - NestedColumnsHolder & nested_columns_holder); + NestedColumnsHolder & nested_columns_holder) const; void addSingleKeyToAggregateColumns( const AggregatedDataVariants & data_variants, diff --git a/src/Processors/Transforms/AggregatingTransform.cpp b/src/Processors/Transforms/AggregatingTransform.cpp index 7802bf6e3bf..a8a93e53663 100644 --- a/src/Processors/Transforms/AggregatingTransform.cpp +++ b/src/Processors/Transforms/AggregatingTransform.cpp @@ -395,9 +395,14 @@ AggregatingTransform::AggregatingTransform(Block header, AggregatingTransformPar } AggregatingTransform::AggregatingTransform( - Block header, AggregatingTransformParamsPtr params_, ManyAggregatedDataPtr many_data_, - size_t current_variant, size_t max_threads_, size_t temporary_data_merge_threads_) - : IProcessor({std::move(header)}, {params_->getHeader()}), params(std::move(params_)) + Block header, + AggregatingTransformParamsPtr params_, + ManyAggregatedDataPtr many_data_, + size_t current_variant, + size_t max_threads_, + size_t temporary_data_merge_threads_) + : IProcessor({std::move(header)}, {params_->getHeader()}) + , params(std::move(params_)) , key_columns(params->params.keys_size) , aggregate_columns(params->params.aggregates_size) , many_data(std::move(many_data_)) @@ -525,7 +530,7 @@ void AggregatingTransform::consume(Chunk chunk) { auto block = getInputs().front().getHeader().cloneWithColumns(chunk.detachColumns()); block = materializeBlock(block); - if (!params->aggregator.mergeBlock(block, variants, no_more_keys)) + if (!params->aggregator.mergeOnBlock(block, variants, no_more_keys)) is_consume_finished = true; } else @@ -547,7 +552,7 @@ void AggregatingTransform::initGenerate() if (variants.empty() && params->params.keys_size == 0 && !params->params.empty_result_for_aggregation_by_empty_set) { if (params->only_merge) - params->aggregator.mergeBlock(getInputs().front().getHeader(), variants, no_more_keys); + params->aggregator.mergeOnBlock(getInputs().front().getHeader(), variants, no_more_keys); else params->aggregator.executeOnBlock(getInputs().front().getHeader(), variants, key_columns, aggregate_columns, no_more_keys); } diff --git a/src/Processors/Transforms/AggregatingTransform.h b/src/Processors/Transforms/AggregatingTransform.h index 9512a7a2811..1639bc4df4b 100644 --- a/src/Processors/Transforms/AggregatingTransform.h +++ b/src/Processors/Transforms/AggregatingTransform.h @@ -27,15 +27,38 @@ public: class IBlockInputStream; using BlockInputStreamPtr = std::shared_ptr; +using AggregatorList = std::list; +using AggregatorListPtr = std::shared_ptr; + struct AggregatingTransformParams { Aggregator::Params params; - Aggregator aggregator; + + /// Each params holds a list of aggregators which are used in the query. It's needed because we need + /// to use a pointer to the aggregator to properly destroy complex aggregation states on exception + /// (See comments in AggregatedDataVariants). However, this pointer might not be valid because + /// we can have two different aggregators at the same time due to a mixed pipeline of aggregate + /// projections, and one of them might get destroyed before it is used.
+ AggregatorListPtr aggregator_list_ptr; + Aggregator & aggregator; bool final; bool only_merge = false; AggregatingTransformParams(const Aggregator::Params & params_, bool final_) - : params(params_), aggregator(params), final(final_) {} + : params(params_) + , aggregator_list_ptr(std::make_shared()) + , aggregator(*aggregator_list_ptr->emplace(aggregator_list_ptr->end(), params)) + , final(final_) + { + } + + AggregatingTransformParams(const Aggregator::Params & params_, const AggregatorListPtr & aggregator_list_ptr_, bool final_) + : params(params_) + , aggregator_list_ptr(aggregator_list_ptr_) + , aggregator(*aggregator_list_ptr->emplace(aggregator_list_ptr->end(), params)) + , final(final_) + { + } Block getHeader() const { return aggregator.getHeader(final); } @@ -82,9 +105,13 @@ public: AggregatingTransform(Block header, AggregatingTransformParamsPtr params_); /// For Parallel aggregating. - AggregatingTransform(Block header, AggregatingTransformParamsPtr params_, - ManyAggregatedDataPtr many_data, size_t current_variant, - size_t max_threads, size_t temporary_data_merge_threads); + AggregatingTransform( + Block header, + AggregatingTransformParamsPtr params_, + ManyAggregatedDataPtr many_data, + size_t current_variant, + size_t max_threads, + size_t temporary_data_merge_threads); ~AggregatingTransform() override; String getName() const override { return "AggregatingTransform"; } diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 734a67da758..0433874ddaa 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -267,6 +267,8 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( auto many_data = std::make_shared(projection_pipe.numOutputPorts() + ordinary_pipe.numOutputPorts()); size_t counter = 0; + AggregatorListPtr aggregator_list_ptr = std::make_shared(); + // TODO apply in_order_optimization here auto build_aggregate_pipe = [&](Pipe & pipe, bool projection) { @@ -306,7 +308,8 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( settings.min_count_to_compile_aggregate_expression, header_before_aggregation); // The source header is also an intermediate header - transform_params = std::make_shared(std::move(params), query_info.projection->aggregate_final); + transform_params = std::make_shared( + std::move(params), aggregator_list_ptr, query_info.projection->aggregate_final); /// This part is hacky. /// We want AggregatingTransform to work with aggregate states instead of normal columns. 
@@ -336,7 +339,8 @@ QueryPlanPtr MergeTreeDataSelectExecutor::read( settings.compile_aggregate_expressions, settings.min_count_to_compile_aggregate_expression); - transform_params = std::make_shared(std::move(params), query_info.projection->aggregate_final); + transform_params = std::make_shared( + std::move(params), aggregator_list_ptr, query_info.projection->aggregate_final); } pipe.resize(pipe.numOutputPorts(), true, true); diff --git a/tests/queries/0_stateless/01710_join_use_nulls.reference b/tests/queries/0_stateless/01707_join_use_nulls.reference similarity index 100% rename from tests/queries/0_stateless/01710_join_use_nulls.reference rename to tests/queries/0_stateless/01707_join_use_nulls.reference diff --git a/tests/queries/0_stateless/01710_join_use_nulls.sql b/tests/queries/0_stateless/01707_join_use_nulls.sql similarity index 100% rename from tests/queries/0_stateless/01710_join_use_nulls.sql rename to tests/queries/0_stateless/01707_join_use_nulls.sql diff --git a/tests/queries/0_stateless/01710_projection_with_mixed_pipeline.reference b/tests/queries/0_stateless/01710_projection_with_mixed_pipeline.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01710_projection_with_mixed_pipeline.sql b/tests/queries/0_stateless/01710_projection_with_mixed_pipeline.sql new file mode 100644 index 00000000000..734aa659146 --- /dev/null +++ b/tests/queries/0_stateless/01710_projection_with_mixed_pipeline.sql @@ -0,0 +1,9 @@ +drop table if exists t; + +create table t (x UInt32) engine = MergeTree order by tuple() settings index_granularity = 8; +insert into t select number from numbers(100); +alter table t add projection p (select uniqHLL12(x)); +insert into t select number + 100 from numbers(100); +select uniqHLL12(x) from t settings allow_experimental_projection_optimization = 1, max_bytes_to_read=400, max_block_size=8; -- { serverError 307; } + +drop table if exists t; From 8bf0cc695d318f564eb13ab7c9049914c9002bf6 Mon Sep 17 00:00:00 2001 From: MagiaGroz <47081171+MagiaGroz@users.noreply.github.com> Date: Thu, 9 Sep 2021 12:02:18 +0600 Subject: [PATCH 170/177] Update index.md --- docs/ru/sql-reference/statements/alter/index/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/ru/sql-reference/statements/alter/index/index.md b/docs/ru/sql-reference/statements/alter/index/index.md index 1f6bbea5c4b..c3d1e75a77d 100644 --- a/docs/ru/sql-reference/statements/alter/index/index.md +++ b/docs/ru/sql-reference/statements/alter/index/index.md @@ -19,7 +19,7 @@ ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name Команда `ADD INDEX` добавляет описание индексов в метаданные, а `DROP INDEX` удаляет индекс из метаданных и стирает файлы индекса с диска, поэтому они легковесные и работают мгновенно. Если индекс появился в метаданных, то он начнет считаться в последующих слияниях и записях в таблицу, а не сразу после выполнения операции `ALTER`. -`MATERIALIZE INDEX` - перестраивает индекс в указанной партиции. Реализовано как мутация. +`MATERIALIZE INDEX` - перестраивает индекс в указанной партиции. Реализовано как мутация. В случае если нужно перестроить индекс над всеми данными то писать `IN PARTITION` не нужно. Запрос на изменение индексов реплицируется, сохраняя новые метаданные в ZooKeeper и применяя изменения на всех репликах. 
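For illustration only (not part of the patch), a minimal sketch of the two forms of `MATERIALIZE INDEX` described in the documentation change above; the table `t`, index `idx`, and partition value are hypothetical names, not taken from the patch:

``` sql
-- Hypothetical table with a data-skipping index (illustrative names).
CREATE TABLE t
(
    d Date,
    x UInt32,
    INDEX idx x TYPE minmax GRANULARITY 4
)
ENGINE = MergeTree
PARTITION BY toYYYYMM(d)
ORDER BY d;

-- Rebuild the index only in one partition (runs as a mutation).
ALTER TABLE t MATERIALIZE INDEX idx IN PARTITION 202109;

-- Rebuild the index over all data in the table: simply omit IN PARTITION.
ALTER TABLE t MATERIALIZE INDEX idx;
```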
From 2d49c77aca86b0981fdad7cab1899bbfa6de5c70 Mon Sep 17 00:00:00 2001 From: DuckSoft Date: Thu, 9 Sep 2021 14:19:44 +0800 Subject: [PATCH 171/177] Update formats.md remove excess characters from doc --- docs/en/interfaces/formats.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/interfaces/formats.md b/docs/en/interfaces/formats.md index 68a52dd702e..a533d9357e1 100644 --- a/docs/en/interfaces/formats.md +++ b/docs/en/interfaces/formats.md @@ -74,7 +74,7 @@ The `TabSeparated` format is convenient for processing data using custom program The `TabSeparated` format supports outputting total values (when using WITH TOTALS) and extreme values (when ‘extremes’ is set to 1). In these cases, the total values and extremes are output after the main data. The main result, total values, and extremes are separated from each other by an empty line. Example: ``` sql -SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated`` +SELECT EventDate, count() AS c FROM test.hits GROUP BY EventDate WITH TOTALS ORDER BY EventDate FORMAT TabSeparated ``` ``` text From c0b5e0f35ee8a84e57143116e491eaae816ec02f Mon Sep 17 00:00:00 2001 From: MagiaGroz <47081171+MagiaGroz@users.noreply.github.com> Date: Thu, 9 Sep 2021 13:45:06 +0600 Subject: [PATCH 172/177] update en-docs alter/index --- docs/en/sql-reference/statements/alter/index/index.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/statements/alter/index/index.md b/docs/en/sql-reference/statements/alter/index/index.md index fd5657c3666..4e2943d37f3 100644 --- a/docs/en/sql-reference/statements/alter/index/index.md +++ b/docs/en/sql-reference/statements/alter/index/index.md @@ -12,7 +12,7 @@ The following operations are available: - `ALTER TABLE [db].name DROP INDEX name` - Removes index description from tables metadata and deletes index files from disk. -- `ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name` - The query rebuilds the secondary index `name` in the partition `partition_name`. Implemented as a [mutation](../../../../sql-reference/statements/alter/index.md#mutations). +- `ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name` - The query rebuilds the secondary index `name` in the partition `partition_name`. Implemented as a [mutation](../../../../sql-reference/statements/alter/index.md#mutations). To rebuild the index over the whole data in the table, remove `IN PARTITION` from the query. The first two commands are lightweight in a sense that they only change metadata or remove files.
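The documentation change above notes that the rebuild is implemented as a mutation; as a hedged illustration (reusing the hypothetical table `t` from the earlier sketch, names not taken from the patch), its progress can be tracked through `system.mutations`:

``` sql
-- Each MATERIALIZE INDEX statement shows up as a mutation entry until it finishes.
SELECT mutation_id, command, parts_to_do, is_done, latest_fail_reason
FROM system.mutations
WHERE database = currentDatabase() AND table = 't'
ORDER BY create_time DESC;

-- To make ALTER statements in this session block until their mutations complete:
SET mutations_sync = 1;
```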
From adc66b455245900876acbe0aee94222eba688142 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 9 Sep 2021 11:09:53 +0300 Subject: [PATCH 173/177] Update version_date.tsv after release 21.9.2.17 --- utils/list-versions/version_date.tsv | 1 + 1 file changed, 1 insertion(+) diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv index c8885521437..fea66e5cb75 100644 --- a/utils/list-versions/version_date.tsv +++ b/utils/list-versions/version_date.tsv @@ -1,3 +1,4 @@ +v21.9.2.17-stable 2021-09-09 v21.8.5.7-lts 2021-09-02 v21.8.4.51-lts 2021-08-17 v21.8.3.44-lts 2021-08-12 From ba18ba1d5c7f2480f432bccb01dab18e746f20bb Mon Sep 17 00:00:00 2001 From: Nikita Mikhaylov Date: Thu, 9 Sep 2021 08:28:59 +0000 Subject: [PATCH 174/177] Increase stack size --- src/Common/FiberStack.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Common/FiberStack.h b/src/Common/FiberStack.h index aec4befb515..db825722da5 100644 --- a/src/Common/FiberStack.h +++ b/src/Common/FiberStack.h @@ -31,8 +31,8 @@ public: /// probably it worth to try to increase stack size for coroutines. /// /// Current value is just enough for all tests in our CI. It's not selected in some special - /// way. We will have 36 pages with 4KB page size. - static constexpr size_t default_stack_size = 144 * 1024; /// 64KB was not enough for tests + /// way. We will have 48 pages with 4KB page size. + static constexpr size_t default_stack_size = 192 * 1024; /// 64KB was not enough for tests explicit FiberStack(size_t stack_size_ = default_stack_size) : stack_size(stack_size_) { From 9d8509d37ed308fb6314e0dc85b8b9bac86c8927 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 9 Sep 2021 11:59:15 +0300 Subject: [PATCH 175/177] Auto version update to [21.10.1.8013] [54455] --- cmake/autogenerated_versions.txt | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 03247b4b3ea..030bf0a9fe0 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -6,7 +6,7 @@ SET(VERSION_REVISION 54455) SET(VERSION_MAJOR 21) SET(VERSION_MINOR 10) SET(VERSION_PATCH 1) -SET(VERSION_GITHASH 09df5018f95edcd0f759d4689ac5d029dd400c2a) -SET(VERSION_DESCRIBE v21.10.1.1-testing) -SET(VERSION_STRING 21.10.1.1) +SET(VERSION_GITHASH 7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7) +SET(VERSION_DESCRIBE v21.10.1.8013-prestable) +SET(VERSION_STRING 21.10.1.8013) # end of autochange From fc4f1194173c1c7282779c34b557a58a87637790 Mon Sep 17 00:00:00 2001 From: robot-clickhouse Date: Thu, 9 Sep 2021 12:03:29 +0300 Subject: [PATCH 176/177] Auto version update to [21.11.1.1] [54456] --- cmake/autogenerated_versions.txt | 8 +-- debian/changelog | 4 +- docker/client/Dockerfile | 2 +- docker/server/Dockerfile | 2 +- docker/test/Dockerfile | 2 +- .../StorageSystemContributors.generated.cpp | 67 +++++++++++++++++++ 6 files changed, 76 insertions(+), 9 deletions(-) diff --git a/cmake/autogenerated_versions.txt b/cmake/autogenerated_versions.txt index 030bf0a9fe0..dd6ead7d97f 100644 --- a/cmake/autogenerated_versions.txt +++ b/cmake/autogenerated_versions.txt @@ -2,11 +2,11 @@ # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION, # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
-SET(VERSION_REVISION 54455) +SET(VERSION_REVISION 54456) SET(VERSION_MAJOR 21) -SET(VERSION_MINOR 10) +SET(VERSION_MINOR 11) SET(VERSION_PATCH 1) SET(VERSION_GITHASH 7a4a0b0edef0ad6e0aa662cd3b90c3f4acf796e7) -SET(VERSION_DESCRIBE v21.10.1.8013-prestable) -SET(VERSION_STRING 21.10.1.8013) +SET(VERSION_DESCRIBE v21.11.1.1-prestable) +SET(VERSION_STRING 21.11.1.1) # end of autochange diff --git a/debian/changelog b/debian/changelog index f3e740d20cf..460424bdb36 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,5 +1,5 @@ -clickhouse (21.10.1.1) unstable; urgency=low +clickhouse (21.11.1.1) unstable; urgency=low * Modified source code - -- clickhouse-release Sat, 17 Jul 2021 08:45:03 +0300 + -- clickhouse-release Thu, 09 Sep 2021 12:03:26 +0300 diff --git a/docker/client/Dockerfile b/docker/client/Dockerfile index 1b398b43c32..962459ccead 100644 --- a/docker/client/Dockerfile +++ b/docker/client/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/" -ARG version=21.10.1.* +ARG version=21.11.1.* RUN sed -i 's|http://archive|http://ru.archive|g' /etc/apt/sources.list diff --git a/docker/server/Dockerfile b/docker/server/Dockerfile index 6307e74c633..7309574b16c 100644 --- a/docker/server/Dockerfile +++ b/docker/server/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:20.04 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/" -ARG version=21.10.1.* +ARG version=21.11.1.* ARG gosu_ver=1.10 # set non-empty deb_location_url url to create a docker image diff --git a/docker/test/Dockerfile b/docker/test/Dockerfile index 62cfcf9e896..b3a23bf9662 100644 --- a/docker/test/Dockerfile +++ b/docker/test/Dockerfile @@ -1,7 +1,7 @@ FROM ubuntu:18.04 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/" -ARG version=21.10.1.* +ARG version=21.11.1.* RUN apt-get update && \ apt-get install -y apt-transport-https dirmngr && \ diff --git a/src/Storages/System/StorageSystemContributors.generated.cpp b/src/Storages/System/StorageSystemContributors.generated.cpp index bed8eadc19c..a097e08871d 100644 --- a/src/Storages/System/StorageSystemContributors.generated.cpp +++ b/src/Storages/System/StorageSystemContributors.generated.cpp @@ -15,6 +15,7 @@ const char * auto_contributors[] { "adevyatova", "ageraab", "Ahmed Dardery", + "Aimiyoo", "akazz", "Akazz", "akonyaev", @@ -33,6 +34,7 @@ const char * auto_contributors[] { "alesapin", "Alex", "Alexander Avdonkin", + "Alexander Bezpiatov", "Alexander Burmak", "Alexander Ermolaev", "Alexander Gololobov", @@ -43,6 +45,7 @@ const char * auto_contributors[] { "Alexander Krasheninnikov", "Alexander Kuranoff", "Alexander Kuzmenkov", + "Alexander Lodin", "Alexander Lukin", "Alexander Makarov", "Alexander Marshalov", @@ -54,6 +57,7 @@ const char * auto_contributors[] { "Alexander Sapin", "Alexander Tokmakov", "Alexander Tretiakov", + "Alexandra", "Alexandra Latysheva", "Alexandre Snarskii", "Alexandr Kondratev", @@ -63,6 +67,7 @@ const char * auto_contributors[] { "Alexei Averchenko", "Alexey", "Alexey Arno", + "Alexey Boykov", "Alexey Dushechkin", "Alexey Elymanov", "Alexey Ilyukhov", @@ -95,12 +100,14 @@ const char * auto_contributors[] { "Anatoly Pugachev", "ana-uvarova", "AnaUvarova", + "Andr0901", "Andreas Hunkeler", "AndreevDm", "Andrei Bodrov", "Andrei Chulkov", "andrei-karpliuk", "Andrei Nekrashevich", + "ANDREI STAROVEROV", "Andrew Grigorev", "Andrew Onyshchuk", "andrewsg", @@ -115,10 +122,12 @@ const char * auto_contributors[] { "Andrey Skobtsov", "Andrey Urusov", "Andrey Z", + "Andy 
Liang", "Andy Yang", "Anmol Arora", "Anna", "Anna Shakhova", + "anneji-dev", "annvsh", "anrodigina", "Anthony N. Simon", @@ -140,6 +149,7 @@ const char * auto_contributors[] { "aprudaev", "Ariel Robaldo", "Arsen Hakobyan", + "Arslan G", "ArtCorp", "Artem Andreenko", "Artemeey", @@ -155,6 +165,7 @@ const char * auto_contributors[] { "artpaul", "Artur", "Artur Beglaryan", + "Artur Filatenkov", "AsiaKorushkina", "asiana21", "atereh", @@ -192,11 +203,14 @@ const char * auto_contributors[] { "Boris Granveaud", "Bowen Masco", "bo zeng", + "Braulio Valdivielso", "Brett Hoerner", "BSD_Conqueror", "bseng", "Bulat Gaifullin", "Carbyn", + "caspian", + "Caspian", "cekc", "centos7", "champtar", @@ -263,6 +277,7 @@ const char * auto_contributors[] { "Dmitrii Raev", "dmitriiut", "Dmitriy", + "Dmitriy Lushnikov", "Dmitry", "Dmitry Belyavtsev", "Dmitry Bilunov", @@ -281,6 +296,7 @@ const char * auto_contributors[] { "Dongdong Yang", "DoomzD", "Dr. Strange Looker", + "DuckSoft", "d.v.semenov", "eaxdev", "eejoin", @@ -328,6 +344,7 @@ const char * auto_contributors[] { "favstovol", "FawnD2", "FeehanG", + "feihengye", "felixoid", "felixxdu", "feng lv", @@ -335,6 +352,7 @@ const char * auto_contributors[] { "fessmage", "FgoDt", "fibersel", + "Filatenkov Artur", "filimonov", "filipe", "Filipe Caixeta", @@ -361,6 +379,7 @@ const char * auto_contributors[] { "George", "George3d6", "George G", + "Georgy Ginzburg", "Gervasio Varela", "ggerogery", "giordyb", @@ -386,15 +405,18 @@ const char * auto_contributors[] { "hchen9", "hcz", "heng zhao", + "hermano", "hexiaoting", "Hiroaki Nakamura", "hotid", + "huangzhaowei", "HuFuwang", "Hui Wang", "hustnn", "huzhichengdd", "ice1x", "idfer", + "igomac", "igor", "Igor", "Igor Hatarist", @@ -404,6 +426,7 @@ const char * auto_contributors[] { "Igr", "Igr Mineev", "ikarishinjieva", + "Ikko Ashimine", "ikopylov", "Ildar Musin", "Ildus Kurbangaliev", @@ -437,23 +460,29 @@ const char * auto_contributors[] { "Ivan Starkov", "ivanzhukov", "Ivan Zhukov", + "ivoleg", "Jack Song", "JackyWoo", "Jacob Hayes", "jakalletti", + "Jakub Kuklis", "JaosnHsieh", + "jasine", "Jason", + "Jason Keirstead", "javartisan", "javi", "javi santana", "Javi Santana", "Javi santana bot", "Jean Baptiste Favre", + "Jeffrey Dang", "jennyma", "jetgm", "Jiading Guo", "Jiang Tao", "jianmei zhang", + "jkuklis", "Jochen Schalanda", "John", "John Hummel", @@ -467,15 +496,18 @@ const char * auto_contributors[] { "Keiji Yoshida", "Ken Chen", "Kevin Chiang", + "Kevin Michel", "kevin wan", "Kiran", "Kirill Danshin", + "Kirill Ershov", "kirillikoff", "Kirill Malev", "Kirill Shvakov", "kmeaw", "Koblikov Mihail", "KochetovNicolai", + "kolsys", "Konstantin Grabar", "Konstantin Lebedev", "Konstantin Malanchev", @@ -492,6 +524,7 @@ const char * auto_contributors[] { "Kseniia Sumarokova", "kshvakov", "kssenii", + "Ky Li", "l", "l1tsolaiki", "lalex", @@ -502,6 +535,7 @@ const char * auto_contributors[] { "Leopold Schabel", "leozhang", "Lev Borodin", + "levie", "levushkin aleksej", "levysh", "Lewinma", @@ -509,6 +543,7 @@ const char * auto_contributors[] { "libenwang", "lichengxiang", "linceyou", + "listar", "litao91", "liu-bov", "Liu Cong", @@ -521,6 +556,7 @@ const char * auto_contributors[] { "long2ice", "Lopatin Konstantin", "Loud_Scream", + "lthaooo", "ltybc-coder", "luc1ph3r", "Lucid Dreams", @@ -529,12 +565,14 @@ const char * auto_contributors[] { "Lv Feng", "M0r64n", "madianjun", + "MagiaGroz", "maiha", "Maksim", "Maksim Fedotov", "Maksim Kita", "Maks Skorokhod", "malkfilipp", + "Malte", "manmitya", "maqroll", "Marat 
IDRISOV", @@ -595,6 +633,7 @@ const char * auto_contributors[] { "Michal Lisowski", "michon470", "MicrochipQ", + "Miguel Fernández", "miha-g", "Mihail Fandyushin", "Mikahil Nacharov", @@ -617,10 +656,12 @@ const char * auto_contributors[] { "MikuSugar", "Milad Arabi", "millb", + "Misko Lee", "mnkonkova", "Mohammad Hossein Sekhavat", "morty", "moscas", + "Mostafa Dahab", "MovElb", "Mr.General", "Murat Kabilov", @@ -634,11 +675,14 @@ const char * auto_contributors[] { "nauta", "nautaa", "Neeke Gao", + "neng.liu", "Neng Liu", + "NengLiu", "never lee", "NeZeD [Mac Pro]", "nicelulu", "Nickita", + "Nickita Taranov", "Nickolay Yastrebov", "nickzhwang", "Nicolae Vartolomei", @@ -684,6 +728,9 @@ const char * auto_contributors[] { "Olga Khvostikova", "olgarev", "Olga Revyakina", + "OmarBazaraa", + "Onehr7", + "OnePiece", "orantius", "Orivej Desh", "Oskar Wojciski", @@ -701,6 +748,7 @@ const char * auto_contributors[] { "Pavel Yakunin", "Pavlo Bashynskiy", "Pawel Rog", + "pdv-ru", "Peng Jian", "Persiyanov Dmitriy Andreevich", "Pervakov Grigorii", @@ -728,6 +776,7 @@ const char * auto_contributors[] { "Ramazan Polat", "Raúl Marín", "Ravengg", + "Realist007", "redclusive", "RedClusive", "RegulusZ", @@ -749,12 +798,15 @@ const char * auto_contributors[] { "Roman Peshkurov", "Roman Tsisyk", "romanzhukov", + "Roman Zhukov", "root", "roverxu", + "ruct", "Ruslan", "Ruslan Savchenko", "Russ Frank", "Ruzal Ibragimov", + "ryzuo", "Sabyanin Maxim", "SaltTan", "Sami Kerola", @@ -763,6 +815,7 @@ const char * auto_contributors[] { "satanson", "Saulius Valatka", "sdk2", + "Sean Haynes", "Sébastien Launay", "serebrserg", "Sergei Bocharov", @@ -839,12 +892,15 @@ const char * auto_contributors[] { "TCeason", "Tema Novikov", "templarzq", + "terrylin", "The-Alchemist", + "Thomas Berdy", "Tiaonmmn", "tiger.yan", "tison", "TiunovNN", "Tobias Adamson", + "Tobias Lins", "Tom Bombadil", "topvisor", "Tsarkova Anastasia", @@ -863,6 +919,7 @@ const char * auto_contributors[] { "VadimPE", "Vadim Plakhtinskiy", "Vadim Skipin", + "Vadim Volodin", "Val", "Valera Ryaboshapko", "Vasilyev Nikita", @@ -877,12 +934,15 @@ const char * auto_contributors[] { "Veloman Yunkan", "Veniamin Gvozdikov", "Veselkov Konstantin", + "vgocoder", "vic", "vicdashkov", + "vicgao", "Victor", "Victor Tarnavsky", "Viktor Taranenko", "vinity", + "Vitalii S", "Vitaliy Fedorchenko", "Vitaliy Karnienko", "Vitaliy Kozlovskiy", @@ -891,12 +951,14 @@ const char * auto_contributors[] { "Vitaly", "Vitaly Baranov", "Vitaly Samigullin", + "Vitaly Stoyan", "vitstn", "vivarum", "Vivien Maisonneuve", "Vlad Arkhipov", "Vladimir", "Vladimir Bunchuk", + "Vladimir C", "Vladimir Ch", "Vladimir Chebotarev", "vladimir golovchenko", @@ -925,6 +987,7 @@ const char * auto_contributors[] { "wzl", "Xianda Ke", "Xiang Zhou", + "xiedeyantu", "xPoSx", "Yağızcan Değirmenci", "yang", @@ -938,6 +1001,7 @@ const char * auto_contributors[] { "Yiğit Konur", "yiguolei", "Yingchun Lai", + "Yingfan Chen", "yingjinghan", "ylchou", "Y Lu", @@ -966,6 +1030,8 @@ const char * auto_contributors[] { "Zhichang Yu", "Zhichun Wu", "Zhipeng", + "ZhiYong Wang", + "zhongyuankai", "zhukai", "Zijie Lu", "zlx19950903", @@ -994,6 +1060,7 @@ const char * auto_contributors[] { "曲正鹏", "未来星___费", "极客青年", + "董海镔", "谢磊", "贾顺名(Jarvis)", "陈小玉", From 341a6c51d61c8ef00fd1b709d84f851f56b3adc8 Mon Sep 17 00:00:00 2001 From: tavplubix Date: Thu, 9 Sep 2021 12:37:51 +0300 Subject: [PATCH 177/177] Merging #24866 (#28691) * Add StorageSystemISTables.cpp/.h * Another attempt * Columns and Views * Add information schema db and 
fix information schema 'tables' table * fix build * remove copy-paste, add views to system tables * add test * fix * fix_tests Co-authored-by: Damir Petrov Co-authored-by: Damir Petrov <0442a403@verstehen.sas.yp-c.yandex.net> Co-authored-by: mergify[bot] <37929162+mergify[bot]@users.noreply.github.com> --- programs/local/LocalServer.cpp | 21 +++--- programs/server/Server.cpp | 3 + src/Access/ContextAccess.cpp | 4 +- src/Core/Defines.h | 3 + src/Core/Settings.h | 2 +- src/Interpreters/DatabaseCatalog.h | 2 + src/Interpreters/loadMetadata.cpp | 30 +++++--- src/Interpreters/loadMetadata.h | 3 +- src/Storages/System/CMakeLists.txt | 12 +++ .../System/InformationSchema/columns.sql | 74 +++++++++++++++++++ .../System/InformationSchema/schemata.sql | 26 +++++++ .../System/InformationSchema/tables.sql | 17 +++++ .../System/InformationSchema/views.sql | 36 +++++++++ src/Storages/System/StorageSystemColumns.cpp | 62 ++++++++++++++++ .../System/StorageSystemReplicatedFetches.h | 2 +- src/Storages/System/StorageSystemTables.cpp | 52 +++++++++---- .../System/attachInformationSchemaTables.cpp | 57 ++++++++++++++ .../System/attachInformationSchemaTables.h | 11 +++ src/Storages/System/attachSystemTablesImpl.h | 1 + src/Storages/ya.make | 1 + .../test_mysql_protocol/golang.reference | 1 + tests/integration/test_mysql_protocol/test.py | 8 +- tests/integration/test_quota/test.py | 4 +- .../01161_all_system_tables.reference | 0 .../0_stateless/01161_all_system_tables.sh | 27 +++++++ .../01161_information_schema.reference | 27 +++++++ .../0_stateless/01161_information_schema.sql | 19 +++++ ...temporary_table_in_system_tables.reference | 6 +- 28 files changed, 465 insertions(+), 46 deletions(-) create mode 100644 src/Storages/System/InformationSchema/columns.sql create mode 100644 src/Storages/System/InformationSchema/schemata.sql create mode 100644 src/Storages/System/InformationSchema/tables.sql create mode 100644 src/Storages/System/InformationSchema/views.sql create mode 100644 src/Storages/System/attachInformationSchemaTables.cpp create mode 100644 src/Storages/System/attachInformationSchemaTables.h create mode 100644 tests/queries/0_stateless/01161_all_system_tables.reference create mode 100755 tests/queries/0_stateless/01161_all_system_tables.sh create mode 100644 tests/queries/0_stateless/01161_information_schema.reference create mode 100644 tests/queries/0_stateless/01161_information_schema.sql diff --git a/programs/local/LocalServer.cpp b/programs/local/LocalServer.cpp index d9758f7b6d1..d2661874beb 100644 --- a/programs/local/LocalServer.cpp +++ b/programs/local/LocalServer.cpp @@ -8,6 +8,7 @@ #include #include #include +#include #include #include #include @@ -179,20 +180,18 @@ void LocalServer::tryInitPath() } -static void attachSystemTables(ContextPtr context) +static DatabasePtr createMemoryDatabaseIfNotExists(ContextPtr context, const String & database_name) { - DatabasePtr system_database = DatabaseCatalog::instance().tryGetDatabase(DatabaseCatalog::SYSTEM_DATABASE); + DatabasePtr system_database = DatabaseCatalog::instance().tryGetDatabase(database_name); if (!system_database) { /// TODO: add attachTableDelayed into DatabaseMemory to speedup loading - system_database = std::make_shared(DatabaseCatalog::SYSTEM_DATABASE, context); - DatabaseCatalog::instance().attachDatabase(DatabaseCatalog::SYSTEM_DATABASE, system_database); + system_database = std::make_shared(database_name, context); + DatabaseCatalog::instance().attachDatabase(database_name, system_database); } - - 
attachSystemTablesLocal(*system_database); + return system_database; } - int LocalServer::main(const std::vector & /*args*/) try { @@ -303,14 +302,18 @@ try fs::create_directories(fs::path(path) / "data/"); fs::create_directories(fs::path(path) / "metadata/"); loadMetadataSystem(global_context); - attachSystemTables(global_context); + attachSystemTablesLocal(*createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::SYSTEM_DATABASE)); + attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA)); + attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE)); loadMetadata(global_context); DatabaseCatalog::instance().loadDatabases(); LOG_DEBUG(log, "Loaded metadata."); } else if (!config().has("no-system-tables")) { - attachSystemTables(global_context); + attachSystemTablesLocal(*createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::SYSTEM_DATABASE)); + attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA)); + attachInformationSchema(global_context, *createMemoryDatabaseIfNotExists(global_context, DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE)); } processQueries(); diff --git a/programs/server/Server.cpp b/programs/server/Server.cpp index 55df9963e8e..4a6d1e206e7 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -56,6 +56,7 @@ #include #include #include +#include #include #include #include @@ -1131,6 +1132,8 @@ if (ThreadFuzzer::instance().isEffective()) global_context->setSystemZooKeeperLogAfterInitializationIfNeeded(); /// After the system database is created, attach virtual system tables (in addition to query_log and part_log) attachSystemTablesServer(*database_catalog.getSystemDatabase(), has_zookeeper); + attachInformationSchema(global_context, *database_catalog.getDatabase(DatabaseCatalog::INFORMATION_SCHEMA)); + attachInformationSchema(global_context, *database_catalog.getDatabase(DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE)); /// Firstly remove partially dropped databases, to avoid race with MaterializedMySQLSyncThread, /// that may execute DROP before loadMarkedAsDroppedTables() in background, /// and so loadMarkedAsDroppedTables() will find it and try to add, and UUID will overlap. diff --git a/src/Access/ContextAccess.cpp b/src/Access/ContextAccess.cpp index 39b57a40e7a..5a615ad8fd6 100644 --- a/src/Access/ContextAccess.cpp +++ b/src/Access/ContextAccess.cpp @@ -119,8 +119,10 @@ namespace AccessRights res = access; res.modifyFlags(modifier); - /// Anyone has access to the "system" database. + /// Anyone has access to the "system" and "information_schema" database. res.grant(AccessType::SELECT, DatabaseCatalog::SYSTEM_DATABASE); + res.grant(AccessType::SELECT, DatabaseCatalog::INFORMATION_SCHEMA); + res.grant(AccessType::SELECT, DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE); return res; } diff --git a/src/Core/Defines.h b/src/Core/Defines.h index 8244a0fc815..ce5c9098399 100644 --- a/src/Core/Defines.h +++ b/src/Core/Defines.h @@ -128,5 +128,8 @@ /// Default limit on recursion depth of recursive descend parser. #define DBMS_DEFAULT_MAX_PARSER_DEPTH 1000 +/// Default limit on query size. 
+#define DBMS_DEFAULT_MAX_QUERY_SIZE 262144 + /// Max depth of hierarchical dictionary #define DBMS_HIERARCHICAL_DICTIONARY_MAX_DEPTH 1000 diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 09dfd347423..f331ad2d7d3 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -48,7 +48,7 @@ class IColumn; M(MaxThreads, max_alter_threads, 0, "The maximum number of threads to execute the ALTER requests. By default, it is determined automatically.", 0) \ M(UInt64, max_read_buffer_size, DBMS_DEFAULT_BUFFER_SIZE, "The maximum size of the buffer to read from the filesystem.", 0) \ M(UInt64, max_distributed_connections, 1024, "The maximum number of connections for distributed processing of one query (should be greater than max_threads).", 0) \ - M(UInt64, max_query_size, 262144, "Which part of the query can be read into RAM for parsing (the remaining data for INSERT, if any, is read later)", 0) \ + M(UInt64, max_query_size, DBMS_DEFAULT_MAX_QUERY_SIZE, "Which part of the query can be read into RAM for parsing (the remaining data for INSERT, if any, is read later)", 0) \ M(UInt64, interactive_delay, 100000, "The interval in microseconds to check if the request is cancelled, and to send progress info.", 0) \ M(Seconds, connect_timeout, DBMS_DEFAULT_CONNECT_TIMEOUT_SEC, "Connection timeout if there are no replicas.", 0) \ M(Milliseconds, connect_timeout_with_failover_ms, DBMS_DEFAULT_CONNECT_TIMEOUT_WITH_FAILOVER_MS, "Connection timeout for selecting first healthy replica.", 0) \ diff --git a/src/Interpreters/DatabaseCatalog.h b/src/Interpreters/DatabaseCatalog.h index 071b80690df..f2063e4199f 100644 --- a/src/Interpreters/DatabaseCatalog.h +++ b/src/Interpreters/DatabaseCatalog.h @@ -123,6 +123,8 @@ class DatabaseCatalog : boost::noncopyable, WithMutableContext public: static constexpr const char * TEMPORARY_DATABASE = "_temporary_and_external_tables"; static constexpr const char * SYSTEM_DATABASE = "system"; + static constexpr const char * INFORMATION_SCHEMA = "information_schema"; + static constexpr const char * INFORMATION_SCHEMA_UPPERCASE = "INFORMATION_SCHEMA"; static DatabaseCatalog & init(ContextMutablePtr global_context_); static DatabaseCatalog & instance(); diff --git a/src/Interpreters/loadMetadata.cpp b/src/Interpreters/loadMetadata.cpp index 458e17ac16b..230831a6674 100644 --- a/src/Interpreters/loadMetadata.cpp +++ b/src/Interpreters/loadMetadata.cpp @@ -47,6 +47,12 @@ static void executeCreateQuery( interpreter.execute(); } +static bool isSystemOrInformationSchema(const String & database_name) +{ + return database_name == DatabaseCatalog::SYSTEM_DATABASE || + database_name == DatabaseCatalog::INFORMATION_SCHEMA || + database_name == DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE; +} static void loadDatabase( ContextMutablePtr context, @@ -116,7 +122,7 @@ void loadMetadata(ContextMutablePtr context, const String & default_database_nam if (fs::path(current_file).extension() == ".sql") { String db_name = fs::path(current_file).stem(); - if (db_name != DatabaseCatalog::SYSTEM_DATABASE) + if (!isSystemOrInformationSchema(db_name)) databases.emplace(unescapeForFileName(db_name), fs::path(path) / db_name); } @@ -142,7 +148,7 @@ void loadMetadata(ContextMutablePtr context, const String & default_database_nam if (current_file.at(0) == '.') continue; - if (current_file == DatabaseCatalog::SYSTEM_DATABASE) + if (isSystemOrInformationSchema(current_file)) continue; databases.emplace(unescapeForFileName(current_file), it->path().string()); @@ -171,25 +177,31 @@ void 
loadMetadata(ContextMutablePtr context, const String & default_database_nam } } - -void loadMetadataSystem(ContextMutablePtr context) +static void loadSystemDatabaseImpl(ContextMutablePtr context, const String & database_name, const String & default_engine) { - String path = context->getPath() + "metadata/" + DatabaseCatalog::SYSTEM_DATABASE; + String path = context->getPath() + "metadata/" + database_name; String metadata_file = path + ".sql"; if (fs::exists(fs::path(path)) || fs::exists(fs::path(metadata_file))) { /// 'has_force_restore_data_flag' is true, to not fail on loading query_log table, if it is corrupted. - loadDatabase(context, DatabaseCatalog::SYSTEM_DATABASE, path, true); + loadDatabase(context, database_name, path, true); } else { /// Initialize system database manually String database_create_query = "CREATE DATABASE "; - database_create_query += DatabaseCatalog::SYSTEM_DATABASE; - database_create_query += " ENGINE=Atomic"; - executeCreateQuery(database_create_query, context, DatabaseCatalog::SYSTEM_DATABASE, "", true); + database_create_query += database_name; + database_create_query += " ENGINE="; + database_create_query += default_engine; + executeCreateQuery(database_create_query, context, database_name, "", true); } +} +void loadMetadataSystem(ContextMutablePtr context) +{ + loadSystemDatabaseImpl(context, DatabaseCatalog::SYSTEM_DATABASE, "Atomic"); + loadSystemDatabaseImpl(context, DatabaseCatalog::INFORMATION_SCHEMA, "Memory"); + loadSystemDatabaseImpl(context, DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE, "Memory"); } } diff --git a/src/Interpreters/loadMetadata.h b/src/Interpreters/loadMetadata.h index cf038a42855..529d2e43fc8 100644 --- a/src/Interpreters/loadMetadata.h +++ b/src/Interpreters/loadMetadata.h @@ -10,7 +10,8 @@ namespace DB /// You should first load system database, then attach system tables that you need into it, then load other databases. void loadMetadataSystem(ContextMutablePtr context); -/// Load tables from databases and add them to context. Database 'system' is ignored. Use separate function to load system tables. +/// Load tables from databases and add them to context. Database 'system' and 'information_schema' is ignored. +/// Use separate function to load system tables. 
void loadMetadata(ContextMutablePtr context, const String & default_database_name = {}); } diff --git a/src/Storages/System/CMakeLists.txt b/src/Storages/System/CMakeLists.txt index 7e350932038..2e909c03f5d 100644 --- a/src/Storages/System/CMakeLists.txt +++ b/src/Storages/System/CMakeLists.txt @@ -2,6 +2,8 @@ # You can also regenerate it manually this way: # execute_process(COMMAND "${CMAKE_CURRENT_SOURCE_DIR}/StorageSystemContributors.sh") +include(${ClickHouse_SOURCE_DIR}/cmake/embed_binary.cmake) + set (CONFIG_BUILD "${CMAKE_CURRENT_BINARY_DIR}/StorageSystemBuildOptions.generated.cpp") get_property (BUILD_COMPILE_DEFINITIONS DIRECTORY ${ClickHouse_SOURCE_DIR} PROPERTY COMPILE_DEFINITIONS) get_property (BUILD_INCLUDE_DIRECTORIES DIRECTORY ${ClickHouse_SOURCE_DIR} PROPERTY INCLUDE_DIRECTORIES) @@ -41,7 +43,16 @@ list (APPEND storages_system_sources ${GENERATED_TIMEZONES_SRC}) # Overlength strings set_source_files_properties(${GENERATED_LICENSES_SRC} PROPERTIES COMPILE_FLAGS -w) +clickhouse_embed_binaries( + TARGET information_schema_metadata + RESOURCE_DIR "${CMAKE_CURRENT_SOURCE_DIR}/InformationSchema/" + RESOURCES schemata.sql tables.sql views.sql columns.sql +) + add_library(clickhouse_storages_system ${storages_system_headers} ${storages_system_sources}) + +add_dependencies(clickhouse_storages_system information_schema_metadata) + target_link_libraries(clickhouse_storages_system PRIVATE dbms common @@ -49,4 +60,5 @@ target_link_libraries(clickhouse_storages_system PRIVATE clickhouse_common_zookeeper clickhouse_parsers Poco::JSON + INTERFACE "-Wl,${WHOLE_ARCHIVE} $ -Wl,${NO_WHOLE_ARCHIVE}" ) diff --git a/src/Storages/System/InformationSchema/columns.sql b/src/Storages/System/InformationSchema/columns.sql new file mode 100644 index 00000000000..1b71ea58f57 --- /dev/null +++ b/src/Storages/System/InformationSchema/columns.sql @@ -0,0 +1,74 @@ +ATTACH VIEW columns +( + `table_catalog` String, + `table_schema` String, + `table_name` String, + `column_name` String, + `ordinal_position` UInt64, + `column_default` String, + `is_nullable` UInt8, + `data_type` String, + `character_maximum_length` Nullable(UInt64), + `character_octet_length` Nullable(UInt64), + `numeric_precision` Nullable(UInt64), + `numeric_precision_radix` Nullable(UInt64), + `numeric_scale` Nullable(UInt64), + `datetime_precision` Nullable(UInt64), + `character_set_catalog` Nullable(String), + `character_set_schema` Nullable(String), + `character_set_name` Nullable(String), + `collation_catalog` Nullable(String), + `collation_schema` Nullable(String), + `collation_name` Nullable(String), + `domain_catalog` Nullable(String), + `domain_schema` Nullable(String), + `domain_name` Nullable(String), + `TABLE_CATALOG` String ALIAS table_catalog, + `TABLE_SCHEMA` String ALIAS table_schema, + `TABLE_NAME` String ALIAS table_name, + `COLUMN_NAME` String ALIAS column_name, + `ORDINAL_POSITION` UInt64 ALIAS ordinal_position, + `COLUMN_DEFAULT` String ALIAS column_default, + `IS_NULLABLE` UInt8 ALIAS is_nullable, + `DATA_TYPE` String ALIAS data_type, + `CHARACTER_MAXIMUM_LENGTH` Nullable(UInt64) ALIAS character_maximum_length, + `CHARACTER_OCTET_LENGTH` Nullable(UInt64) ALIAS character_octet_length, + `NUMERIC_PRECISION` Nullable(UInt64) ALIAS numeric_precision, + `NUMERIC_PRECISION_RADIX` Nullable(UInt64) ALIAS numeric_precision_radix, + `NUMERIC_SCALE` Nullable(UInt64) ALIAS numeric_scale, + `DATETIME_PRECISION` Nullable(UInt64) ALIAS datetime_precision, + `CHARACTER_SET_CATALOG` Nullable(String) ALIAS character_set_catalog, + 
`CHARACTER_SET_SCHEMA` Nullable(String) ALIAS character_set_schema, + `CHARACTER_SET_NAME` Nullable(String) ALIAS character_set_name, + `COLLATION_CATALOG` Nullable(String) ALIAS collation_catalog, + `COLLATION_SCHEMA` Nullable(String) ALIAS collation_schema, + `COLLATION_NAME` Nullable(String) ALIAS collation_name, + `DOMAIN_CATALOG` Nullable(String) ALIAS domain_catalog, + `DOMAIN_SCHEMA` Nullable(String) ALIAS domain_schema, + `DOMAIN_NAME` Nullable(String) ALIAS domain_name +) AS +SELECT + database AS table_catalog, + database AS table_schema, + table AS table_name, + name AS column_name, + position AS ordinal_position, + default_expression AS column_default, + type LIKE 'Nullable(%)' AS is_nullable, + type AS data_type, + character_octet_length AS character_maximum_length, + character_octet_length, + numeric_precision, + numeric_precision_radix, + numeric_scale, + datetime_precision, + NULL AS character_set_catalog, + NULL AS character_set_schema, + NULL AS character_set_name, + NULL AS collation_catalog, + NULL AS collation_schema, + NULL AS collation_name, + NULL AS domain_catalog, + NULL AS domain_schema, + NULL AS domain_name +FROM system.columns diff --git a/src/Storages/System/InformationSchema/schemata.sql b/src/Storages/System/InformationSchema/schemata.sql new file mode 100644 index 00000000000..9686fcbf4fa --- /dev/null +++ b/src/Storages/System/InformationSchema/schemata.sql @@ -0,0 +1,26 @@ +ATTACH VIEW schemata +( + `catalog_name` String, + `schema_name` String, + `schema_owner` String, + `default_character_set_catalog` Nullable(String), + `default_character_set_schema` Nullable(String), + `default_character_set_name` Nullable(String), + `sql_path` Nullable(String), + `CATALOG_NAME` String ALIAS catalog_name, + `SCHEMA_NAME` String ALIAS schema_name, + `SCHEMA_OWNER` String ALIAS schema_owner, + `DEFAULT_CHARACTER_SET_CATALOG` Nullable(String) ALIAS default_character_set_catalog, + `DEFAULT_CHARACTER_SET_SCHEMA` Nullable(String) ALIAS default_character_set_schema, + `DEFAULT_CHARACTER_SET_NAME` Nullable(String) ALIAS default_character_set_name, + `SQL_PATH` Nullable(String) ALIAS sql_path +) AS +SELECT + name AS catalog_name, + name AS schema_name, + 'default' AS schema_owner, + NULL AS default_character_set_catalog, + NULL AS default_character_set_schema, + NULL AS default_character_set_name, + NULL AS sql_path +FROM system.databases diff --git a/src/Storages/System/InformationSchema/tables.sql b/src/Storages/System/InformationSchema/tables.sql new file mode 100644 index 00000000000..8eea3713923 --- /dev/null +++ b/src/Storages/System/InformationSchema/tables.sql @@ -0,0 +1,17 @@ +ATTACH VIEW tables +( + `table_catalog` String, + `table_schema` String, + `table_name` String, + `table_type` Enum8('BASE TABLE' = 1, 'VIEW' = 2, 'FOREIGN TABLE' = 3, 'LOCAL TEMPORARY' = 4, 'SYSTEM VIEW' = 5), + `TABLE_CATALOG` String ALIAS table_catalog, + `TABLE_SCHEMA` String ALIAS table_schema, + `TABLE_NAME` String ALIAS table_name, + `TABLE_TYPE` Enum8('BASE TABLE' = 1, 'VIEW' = 2, 'FOREIGN TABLE' = 3, 'LOCAL TEMPORARY' = 4, 'SYSTEM VIEW' = 5) ALIAS table_type +) AS +SELECT + database AS table_catalog, + database AS table_schema, + name AS table_name, + multiIf(is_temporary, 4, engine like '%View', 2, engine LIKE 'System%', 5, has_own_data = 0, 3, 1) AS table_type +FROM system.tables diff --git a/src/Storages/System/InformationSchema/views.sql b/src/Storages/System/InformationSchema/views.sql new file mode 100644 index 00000000000..c5ecebfceac --- /dev/null +++ 
b/src/Storages/System/InformationSchema/views.sql @@ -0,0 +1,36 @@ +ATTACH VIEW views +( + `table_catalog` String, + `table_schema` String, + `table_name` String, + `view_definition` String, + `check_option` String, + `is_updatable` Enum8('NO' = 0, 'YES' = 1), + `is_insertable_into` Enum8('NO' = 0, 'YES' = 1), + `is_trigger_updatable` Enum8('NO' = 0, 'YES' = 1), + `is_trigger_deletable` Enum8('NO' = 0, 'YES' = 1), + `is_trigger_insertable_into` Enum8('NO' = 0, 'YES' = 1), + `TABLE_CATALOG` String ALIAS table_catalog, + `TABLE_SCHEMA` String ALIAS table_schema, + `TABLE_NAME` String ALIAS table_name, + `VIEW_DEFINITION` String ALIAS view_definition, + `CHECK_OPTION` String ALIAS check_option, + `IS_UPDATABLE` Enum8('NO' = 0, 'YES' = 1) ALIAS is_updatable, + `IS_INSERTABLE_INTO` Enum8('NO' = 0, 'YES' = 1) ALIAS is_insertable_into, + `IS_TRIGGER_UPDATABLE` Enum8('NO' = 0, 'YES' = 1) ALIAS is_trigger_updatable, + `IS_TRIGGER_DELETABLE` Enum8('NO' = 0, 'YES' = 1) ALIAS is_trigger_deletable, + `IS_TRIGGER_INSERTABLE_INTO` Enum8('NO' = 0, 'YES' = 1) ALIAS is_trigger_insertable_into +) AS +SELECT + database AS table_catalog, + database AS table_schema, + name AS table_name, + as_select AS view_definition, + 'NONE' AS check_option, + 0 AS is_updatable, + engine = 'MaterializedView' AS is_insertable_into, + 0 AS is_trigger_updatable, + 0 AS is_trigger_deletable, + 0 AS is_trigger_insertable_into +FROM system.tables +WHERE engine LIKE '%View' diff --git a/src/Storages/System/StorageSystemColumns.cpp b/src/Storages/System/StorageSystemColumns.cpp index 0058b58f537..cefc5587014 100644 --- a/src/Storages/System/StorageSystemColumns.cpp +++ b/src/Storages/System/StorageSystemColumns.cpp @@ -5,6 +5,8 @@ #include #include #include +#include +#include #include #include #include @@ -44,6 +46,12 @@ StorageSystemColumns::StorageSystemColumns(const StorageID & table_id_) { "is_in_primary_key", std::make_shared() }, { "is_in_sampling_key", std::make_shared() }, { "compression_codec", std::make_shared() }, + { "character_octet_length", std::make_shared(std::make_shared()) }, + { "numeric_precision", std::make_shared(std::make_shared()) }, + { "numeric_precision_radix", std::make_shared(std::make_shared()) }, + { "numeric_scale", std::make_shared(std::make_shared()) }, + { "datetime_precision", std::make_shared(std::make_shared()) }, + })); setInMemoryMetadata(storage_metadata); } @@ -218,6 +226,60 @@ protected: res_columns[res_index++]->insertDefault(); } + /// character_octet_length makes sense for FixedString only + DataTypePtr not_nullable_type = removeNullable(column.type); + if (columns_mask[src_index++]) + { + if (isFixedString(not_nullable_type)) + res_columns[res_index++]->insert(not_nullable_type->getSizeOfValueInMemory()); + else + res_columns[res_index++]->insertDefault(); + } + + /// numeric_precision + if (columns_mask[src_index++]) + { + if (isInteger(not_nullable_type)) + res_columns[res_index++]->insert(not_nullable_type->getSizeOfValueInMemory() * 8); /// radix is 2 + else if (isDecimal(not_nullable_type)) + res_columns[res_index++]->insert(getDecimalPrecision(*not_nullable_type)); /// radix is 10 + else + res_columns[res_index++]->insertDefault(); + } + + /// numeric_precision_radix + if (columns_mask[src_index++]) + { + if (isInteger(not_nullable_type)) + res_columns[res_index++]->insert(2); + else if (isDecimal(not_nullable_type)) + res_columns[res_index++]->insert(10); + else + res_columns[res_index++]->insertDefault(); + } + + /// numeric_scale + if (columns_mask[src_index++]) + { + if 
(isInteger(not_nullable_type)) + res_columns[res_index++]->insert(0); + else if (isDecimal(not_nullable_type)) + res_columns[res_index++]->insert(getDecimalScale(*not_nullable_type)); + else + res_columns[res_index++]->insertDefault(); + } + + /// datetime_precision + if (columns_mask[src_index++]) + { + if (isDateTime64(not_nullable_type)) + res_columns[res_index++]->insert(assert_cast(*not_nullable_type).getScale()); + else if (isDateOrDate32(not_nullable_type) || isDateTime(not_nullable_type) || isDateTime64(not_nullable_type)) + res_columns[res_index++]->insert(0); + else + res_columns[res_index++]->insertDefault(); + } + ++rows_count; } } diff --git a/src/Storages/System/StorageSystemReplicatedFetches.h b/src/Storages/System/StorageSystemReplicatedFetches.h index b7336dbf437..1e17ac4941c 100644 --- a/src/Storages/System/StorageSystemReplicatedFetches.h +++ b/src/Storages/System/StorageSystemReplicatedFetches.h @@ -11,7 +11,7 @@ namespace DB class Context; /// system.replicated_fetches table. Takes data from context.getReplicatedFetchList() -class StorageSystemReplicatedFetches final : public shared_ptr_helper, public IStorageSystemOneBlock +class StorageSystemReplicatedFetches final : public shared_ptr_helper, public IStorageSystemOneBlock { friend struct shared_ptr_helper; public: diff --git a/src/Storages/System/StorageSystemTables.cpp b/src/Storages/System/StorageSystemTables.cpp index 1d38ca6ebbb..f1f7fa4fa08 100644 --- a/src/Storages/System/StorageSystemTables.cpp +++ b/src/Storages/System/StorageSystemTables.cpp @@ -47,6 +47,7 @@ StorageSystemTables::StorageSystemTables(const StorageID & table_id_) {"dependencies_table", std::make_shared(std::make_shared())}, {"create_table_query", std::make_shared()}, {"engine_full", std::make_shared()}, + {"as_select", std::make_shared()}, {"partition_key", std::make_shared()}, {"sorting_key", std::make_shared()}, {"primary_key", std::make_shared()}, @@ -57,6 +58,7 @@ StorageSystemTables::StorageSystemTables(const StorageID & table_id_) {"lifetime_rows", std::make_shared(std::make_shared())}, {"lifetime_bytes", std::make_shared(std::make_shared())}, {"comment", std::make_shared()}, + {"has_own_data", std::make_shared()}, })); setInMemoryMetadata(storage_metadata); } @@ -209,6 +211,10 @@ protected: if (columns_mask[src_index++]) res_columns[res_index++]->insert(table.second->getName()); + // as_select + if (columns_mask[src_index++]) + res_columns[res_index++]->insertDefault(); + // partition_key if (columns_mask[src_index++]) res_columns[res_index++]->insertDefault(); @@ -248,6 +254,10 @@ protected: // comment if (columns_mask[src_index++]) res_columns[res_index++]->insertDefault(); + + // has_own_data + if (columns_mask[src_index++]) + res_columns[res_index++]->insertDefault(); } } @@ -355,15 +365,15 @@ protected: res_columns[res_index++]->insert(dependencies_table_name_array); } - if (columns_mask[src_index] || columns_mask[src_index + 1]) + if (columns_mask[src_index] || columns_mask[src_index + 1] || columns_mask[src_index + 2]) { ASTPtr ast = database->tryGetCreateTableQuery(table_name, context); + auto * ast_create = ast ? 
ast->as() : nullptr; - if (ast && !context->getSettingsRef().show_table_uuid_in_table_create_query_if_not_nil) + if (ast_create && !context->getSettingsRef().show_table_uuid_in_table_create_query_if_not_nil) { - auto & create = ast->as(); - create.uuid = UUIDHelpers::Nil; - create.to_inner_uuid = UUIDHelpers::Nil; + ast_create->uuid = UUIDHelpers::Nil; + ast_create->to_inner_uuid = UUIDHelpers::Nil; } if (columns_mask[src_index++]) @@ -373,24 +383,28 @@ protected: { String engine_full; - if (ast) + if (ast_create && ast_create->storage) { - const auto & ast_create = ast->as(); - if (ast_create.storage) - { - engine_full = queryToString(*ast_create.storage); + engine_full = queryToString(*ast_create->storage); - static const char * const extra_head = " ENGINE = "; - if (startsWith(engine_full, extra_head)) - engine_full = engine_full.substr(strlen(extra_head)); - } + static const char * const extra_head = " ENGINE = "; + if (startsWith(engine_full, extra_head)) + engine_full = engine_full.substr(strlen(extra_head)); } res_columns[res_index++]->insert(engine_full); } + + if (columns_mask[src_index++]) + { + String as_select; + if (ast_create && ast_create->select) + as_select = queryToString(*ast_create->select); + res_columns[res_index++]->insert(as_select); + } } else - src_index += 2; + src_index += 3; StorageMetadataPtr metadata_snapshot; if (table) @@ -483,6 +497,14 @@ protected: else res_columns[res_index++]->insertDefault(); } + + if (columns_mask[src_index++]) + { + if (table) + res_columns[res_index++]->insert(table->storesDataOnDisk()); + else + res_columns[res_index++]->insertDefault(); + } } } diff --git a/src/Storages/System/attachInformationSchemaTables.cpp b/src/Storages/System/attachInformationSchemaTables.cpp new file mode 100644 index 00000000000..a94b15ba3b4 --- /dev/null +++ b/src/Storages/System/attachInformationSchemaTables.cpp @@ -0,0 +1,57 @@ +#include +#include +#include +#include +#include +#include + +namespace DB +{ + +/// View structures are taken from http://www.contrib.andrew.cmu.edu/~shadow/sql/sql1992.txt + +static void createInformationSchemaView(ContextMutablePtr context, IDatabase & database, const String & view_name) +{ + try + { + assert(database.getDatabaseName() == DatabaseCatalog::INFORMATION_SCHEMA || + database.getDatabaseName() == DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE); + if (database.getEngineName() != "Memory") + return; + bool is_uppercase = database.getDatabaseName() == DatabaseCatalog::INFORMATION_SCHEMA_UPPERCASE; + + String metadata_resource_name = view_name + ".sql"; + auto attach_query = getResource(metadata_resource_name); + if (attach_query.empty()) + return; + + ParserCreateQuery parser; + ASTPtr ast = parseQuery(parser, attach_query.data(), attach_query.data() + attach_query.size(), + "Attach query from embedded resource " + metadata_resource_name, + DBMS_DEFAULT_MAX_QUERY_SIZE, DBMS_DEFAULT_MAX_PARSER_DEPTH); + + auto & ast_create = ast->as(); + assert(view_name == ast_create.table); + if (is_uppercase) + ast_create.table = Poco::toUpper(view_name); + + StoragePtr view = createTableFromAST(ast_create, database.getDatabaseName(), + database.getTableDataPath(ast_create), context, true).second; + + database.createTable(context, ast_create.table, view, ast); + } + catch (...) 
+ { + tryLogCurrentException(__PRETTY_FUNCTION__); + } +} + +void attachInformationSchema(ContextMutablePtr context, IDatabase & information_schema_database) +{ + createInformationSchemaView(context, information_schema_database, "schemata"); + createInformationSchemaView(context, information_schema_database, "tables"); + createInformationSchemaView(context, information_schema_database, "views"); + createInformationSchemaView(context, information_schema_database, "columns"); +} + +} diff --git a/src/Storages/System/attachInformationSchemaTables.h b/src/Storages/System/attachInformationSchemaTables.h new file mode 100644 index 00000000000..9bb5623128a --- /dev/null +++ b/src/Storages/System/attachInformationSchemaTables.h @@ -0,0 +1,11 @@ +#pragma once +#include + +namespace DB +{ + +class IDatabase; + +void attachInformationSchema(ContextMutablePtr context, IDatabase & information_schema_database); + +} diff --git a/src/Storages/System/attachSystemTablesImpl.h b/src/Storages/System/attachSystemTablesImpl.h index 78e944f7d27..1fdf677699a 100644 --- a/src/Storages/System/attachSystemTablesImpl.h +++ b/src/Storages/System/attachSystemTablesImpl.h @@ -9,6 +9,7 @@ namespace DB template void attach(IDatabase & system_database, const String & table_name, StorageArgs && ... args) { + assert(system_database.getDatabaseName() == DatabaseCatalog::SYSTEM_DATABASE); if (system_database.getUUID() == UUIDHelpers::Nil) { /// Attach to Ordinary database diff --git a/src/Storages/ya.make b/src/Storages/ya.make index 4ea5cb14db4..c85a0dbd902 100644 --- a/src/Storages/ya.make +++ b/src/Storages/ya.make @@ -218,6 +218,7 @@ SRCS( System/StorageSystemWarnings.cpp System/StorageSystemZeros.cpp System/StorageSystemZooKeeper.cpp + System/attachInformationSchemaTables.cpp System/attachSystemTables.cpp TTLDescription.cpp VirtualColumnUtils.cpp diff --git a/tests/integration/test_mysql_protocol/golang.reference b/tests/integration/test_mysql_protocol/golang.reference index 082149b4644..db16a5a6925 100644 --- a/tests/integration/test_mysql_protocol/golang.reference +++ b/tests/integration/test_mysql_protocol/golang.reference @@ -13,6 +13,7 @@ name CHAR a TINYINT Result: tables 1 +tables 1 Columns: a b diff --git a/tests/integration/test_mysql_protocol/test.py b/tests/integration/test_mysql_protocol/test.py index 070aa9967fc..0b3f6ea95af 100644 --- a/tests/integration/test_mysql_protocol/test.py +++ b/tests/integration/test_mysql_protocol/test.py @@ -407,24 +407,24 @@ def test_php_client(started_cluster, php_container): code, (stdout, stderr) = php_container.exec_run( 'php -f test.php {host} {port} default 123'.format(host=started_cluster.get_instance_ip('node'), port=server_port), demux=True) assert code == 0 - assert stdout.decode() == 'tables\n' + assert stdout.decode() == 'tables\ntables\n' code, (stdout, stderr) = php_container.exec_run( 'php -f test_ssl.php {host} {port} default 123'.format(host=started_cluster.get_instance_ip('node'), port=server_port), demux=True) assert code == 0 - assert stdout.decode() == 'tables\n' + assert stdout.decode() == 'tables\ntables\n' code, (stdout, stderr) = php_container.exec_run( 'php -f test.php {host} {port} user_with_double_sha1 abacaba'.format(host=started_cluster.get_instance_ip('node'), port=server_port), demux=True) assert code == 0 - assert stdout.decode() == 'tables\n' + assert stdout.decode() == 'tables\ntables\n' code, (stdout, stderr) = php_container.exec_run( 'php -f test_ssl.php {host} {port} user_with_double_sha1 
abacaba'.format(host=started_cluster.get_instance_ip('node'), port=server_port), demux=True) assert code == 0 - assert stdout.decode() == 'tables\n' + assert stdout.decode() == 'tables\ntables\n' def test_mysqljs_client(started_cluster, nodejs_container): diff --git a/tests/integration/test_quota/test.py b/tests/integration/test_quota/test.py index 5f43d581178..4149987996b 100644 --- a/tests/integration/test_quota/test.py +++ b/tests/integration/test_quota/test.py @@ -399,9 +399,9 @@ def test_consumption_of_show_tables(): instance.query("SHOW QUOTA")) def test_consumption_of_show_databases(): - assert instance.query("SHOW DATABASES") == "default\nsystem\n" + assert instance.query("SHOW DATABASES") == "INFORMATION_SCHEMA\ndefault\ninformation_schema\nsystem\n" assert re.match( - "myQuota\\tdefault\\t.*\\t31556952\\t1\\t1000\\t1\\t500\\t0\\t500\\t0\\t\\\\N\\t2\\t\\\\N.*", + "myQuota\\tdefault\\t.*\\t31556952\\t1\\t1000\\t1\\t500\\t0\\t500\\t0\\t\\\\N\\t4\\t\\\\N.*", instance.query("SHOW QUOTA")) def test_consumption_of_show_clusters(): diff --git a/tests/queries/0_stateless/01161_all_system_tables.reference b/tests/queries/0_stateless/01161_all_system_tables.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01161_all_system_tables.sh b/tests/queries/0_stateless/01161_all_system_tables.sh new file mode 100755 index 00000000000..9b19cc97d16 --- /dev/null +++ b/tests/queries/0_stateless/01161_all_system_tables.sh @@ -0,0 +1,27 @@ +#!/usr/bin/env bash + +CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) +# shellcheck source=../shell_config.sh +. "$CURDIR"/../shell_config.sh + +THREADS=8 +RAND=$(($RANDOM)) +LIMIT=10000 + +function run_selects() +{ + thread_num=$1 + readarray -t tables_arr < <(${CLICKHOUSE_CLIENT} -q "SELECT database || '.' || name FROM system.tables + WHERE database in ('system', 'information_schema', 'INFORMATION_SCHEMA') and name!='zookeeper' + AND sipHash64(name || toString($RAND)) % $THREADS = $thread_num") + + for t in "${tables_arr[@]}" + do + ${CLICKHOUSE_CLIENT} -q "SELECT * FROM $t LIMIT $LIMIT FORMAT Null" # Suppress style check: database=$CLICKHOUSE_DATABASEs + done +} + +for ((i=0; i