diff --git a/CMakeLists.txt b/CMakeLists.txt
index dafa5f03429..3c846cdd51e 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -401,17 +401,6 @@ else ()
     option(WERROR "Enable -Werror compiler option" ON)
 endif ()
 
-if (WERROR)
-    # Don't pollute CMAKE_CXX_FLAGS with -Werror as it will break some CMake checks.
-    # Instead, adopt modern cmake usage requirement.
-    target_compile_options(global-libs INTERFACE "-Werror")
-endif ()
-
-# Make this extra-checks for correct library dependencies.
-if (OS_LINUX AND NOT SANITIZE)
-    target_link_options(global-libs INTERFACE "-Wl,--no-undefined")
-endif ()
-
 # Increase stack size on Musl. We need big stack for our recursive-descend parser.
 if (USE_MUSL)
     set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} -Wl,-z,stack-size=2097152")
@@ -419,6 +408,7 @@ endif ()
 
 include(cmake/dbms_glob_sources.cmake)
 
+add_library(global-group INTERFACE)
 if (OS_LINUX OR OS_ANDROID)
     include(cmake/linux/default_libs.cmake)
 elseif (OS_DARWIN)
@@ -426,6 +416,18 @@ elseif (OS_DARWIN)
 elseif (OS_FREEBSD)
     include(cmake/freebsd/default_libs.cmake)
 endif ()
+link_libraries(global-group)
+
+if (WERROR)
+    # Don't pollute CMAKE_CXX_FLAGS with -Werror as it will break some CMake checks.
+    # Instead, adopt modern cmake usage requirement.
+    target_compile_options(global-group INTERFACE "-Werror")
+endif ()
+
+# Make this extra-checks for correct library dependencies.
+if (OS_LINUX AND NOT SANITIZE)
+    target_link_options(global-group INTERFACE "-Wl,--no-undefined")
+endif ()
 
 ######################################
 ### Add targets below this comment ###
diff --git a/base/glibc-compatibility/musl/getauxval.c b/base/glibc-compatibility/musl/getauxval.c
index b534678d76f..dad7aa938d7 100644
--- a/base/glibc-compatibility/musl/getauxval.c
+++ b/base/glibc-compatibility/musl/getauxval.c
@@ -1,80 +1,59 @@
-#include "atomic.h"
 #include <sys/auxv.h>
-#include <fcntl.h> // open
-#include <sys/stat.h> // O_RDONLY
-#include <unistd.h> // read, close
-#include <stdlib.h> // ssize_t
-#include <stdio.h> // perror, fprintf
-#include <link.h> // ElfW
+#include "atomic.h"
+#include <unistd.h> // __environ
 #include <errno.h>
 
-#define ARRAY_SIZE(a) sizeof((a))/sizeof((a[0]))
-
-// We don't have libc struct available here.
-// Compute aux vector manually (from /proc/self/auxv).
-//
-// Right now there is only 51 AT_* constants,
-// so 64 should be enough until this implementation will be replaced with musl.
-static unsigned long __auxv[64];
+// We don't have libc struct available here. Compute aux vector manually.
+static unsigned long * __auxv = NULL;
 static unsigned long __auxv_secure = 0;
 
+static size_t __find_auxv(unsigned long type)
+{
+    size_t i;
+    for (i = 0; __auxv[i]; i += 2)
+    {
+        if (__auxv[i] == type)
+            return i + 1;
+    }
+    return (size_t) -1;
+}
+
 unsigned long __getauxval(unsigned long type)
 {
     if (type == AT_SECURE)
         return __auxv_secure;
 
-    if (type >= ARRAY_SIZE(__auxv))
+    if (__auxv)
     {
-        errno = ENOENT;
-        return 0;
+        size_t index = __find_auxv(type);
+        if (index != ((size_t) -1))
+            return __auxv[index];
     }
 
-    return __auxv[type];
+    errno = ENOENT;
+    return 0;
 }
 
 static void * volatile getauxval_func;
 
-ssize_t __retry_read(int fd, void *buf, size_t count)
-{
-    for (;;)
-    {
-        ssize_t ret = read(fd, buf, count);
-        if (ret == -1)
-        {
-            if (errno == EINTR)
-                continue;
-            perror("Cannot read /proc/self/auxv");
-            abort();
-        }
-        return ret;
-    }
-}
 static unsigned long  __auxv_init(unsigned long type)
 {
-    // od -t dL /proc/self/auxv
-    int fd = open("/proc/self/auxv", O_RDONLY);
-    if (fd == -1) {
-        perror("Cannot read /proc/self/auxv (likely kernel is too old or procfs is not mounted)");
-        abort();
-    }
-
-    ElfW(auxv_t) aux;
-
-    /// NOTE: sizeof(aux) is very small (less then PAGE_SIZE), so partial read should not be possible.
-    _Static_assert(sizeof(aux) < 4096, "Unexpected sizeof(aux)");
-    while (__retry_read(fd, &aux, sizeof(aux)) == sizeof(aux))
+    if (!__environ)
     {
-        if (aux.a_type >= ARRAY_SIZE(__auxv))
-        {
-            fprintf(stderr, "AT_* is out of range: %li (maximum allowed is %zu)\n", aux.a_type, ARRAY_SIZE(__auxv));
-            abort();
-        }
-        __auxv[aux.a_type] = aux.a_un.a_val;
+        // __environ is not initialized yet so we can't initialize __auxv right now.
+        // That's normally occurred only when getauxval() is called from some sanitizer's internal code.
+        errno = ENOENT;
+        return 0;
     }
-    close(fd);
 
-    // AT_SECURE
-    __auxv_secure = __getauxval(AT_SECURE);
+    // Initialize __auxv and __auxv_secure.
+    size_t i;
+    for (i = 0; __environ[i]; i++);
+    __auxv = (unsigned long *) (__environ + i + 1);
+
+    size_t secure_idx = __find_auxv(AT_SECURE);
+    if (secure_idx != ((size_t) -1))
+        __auxv_secure = __auxv[secure_idx];
 
     // Now we've initialized __auxv, next time getauxval() will only call __get_auxval().
     a_cas_p(&getauxval_func, (void *)__auxv_init, (void *)__getauxval);
diff --git a/cmake/darwin/default_libs.cmake b/cmake/darwin/default_libs.cmake
index a6ee800d59b..ca4beaea8b6 100644
--- a/cmake/darwin/default_libs.cmake
+++ b/cmake/darwin/default_libs.cmake
@@ -24,14 +24,10 @@ find_package(Threads REQUIRED)
 
 include (cmake/find/cxx.cmake)
 
-add_library(global-group INTERFACE)
-
 target_link_libraries(global-group INTERFACE
     $<TARGET_PROPERTY:global-libs,INTERFACE_LINK_LIBRARIES>
 )
 
-link_libraries(global-group)
-
 # FIXME: remove when all contribs will get custom cmake lists
 install(
     TARGETS global-group global-libs
diff --git a/cmake/freebsd/default_libs.cmake b/cmake/freebsd/default_libs.cmake
index a5847c95387..f7a333df6e6 100644
--- a/cmake/freebsd/default_libs.cmake
+++ b/cmake/freebsd/default_libs.cmake
@@ -25,14 +25,10 @@ find_package(Threads REQUIRED)
 include (cmake/find/unwind.cmake)
 include (cmake/find/cxx.cmake)
 
-add_library(global-group INTERFACE)
-
 target_link_libraries(global-group INTERFACE
     $<TARGET_PROPERTY:global-libs,INTERFACE_LINK_LIBRARIES>
 )
 
-link_libraries(global-group)
-
 # FIXME: remove when all contribs will get custom cmake lists
 install(
     TARGETS global-group global-libs
diff --git a/cmake/linux/default_libs.cmake b/cmake/linux/default_libs.cmake
index 426ae482ea3..98951822015 100644
--- a/cmake/linux/default_libs.cmake
+++ b/cmake/linux/default_libs.cmake
@@ -45,15 +45,12 @@ endif ()
 include (cmake/find/unwind.cmake)
 include (cmake/find/cxx.cmake)
 
-add_library(global-group INTERFACE)
 target_link_libraries(global-group INTERFACE
     -Wl,--start-group
     $<TARGET_PROPERTY:global-libs,INTERFACE_LINK_LIBRARIES>
     -Wl,--end-group
 )
 
-link_libraries(global-group)
-
 # FIXME: remove when all contribs will get custom cmake lists
 install(
     TARGETS global-group global-libs
diff --git a/docs/en/interfaces/http.md b/docs/en/interfaces/http.md
index f8f6f26d208..d72fb4d6f17 100644
--- a/docs/en/interfaces/http.md
+++ b/docs/en/interfaces/http.md
@@ -23,11 +23,13 @@ Web UI can be accessed here: `http://localhost:8123/play`.
 ![Web UI](../images/play.png)
 
 
-In health-check scripts use `GET /ping` request. This handler always returns “Ok.” (with a line feed at the end). Available from version 18.12.13.
+In health-check scripts use `GET /ping` request. This handler always returns “Ok.” (with a line feed at the end). Available from version 18.12.13. See also `/replicas_status` to check replica's delay.
 
 ``` bash
 $ curl 'http://localhost:8123/ping'
 Ok.
+$ curl 'http://localhost:8123/replicas_status'
+Ok.
 ```
 
 Send the request as a URL ‘query’ parameter, or as a POST. Or send the beginning of the query in the ‘query’ parameter, and the rest in the POST (we’ll explain later why this is necessary). The size of the URL is limited to 16 KB, so keep this in mind when sending large queries.
diff --git a/docs/en/sql-reference/statements/use.md b/docs/en/sql-reference/statements/use.md
index 41cba58bb9d..841c23d333d 100644
--- a/docs/en/sql-reference/statements/use.md
+++ b/docs/en/sql-reference/statements/use.md
@@ -3,14 +3,14 @@ toc_priority: 53
 toc_title: USE
 ---
 
-# USE 语句 {#use}
+# USE Statement {#use}
 
 ``` sql
 USE db
 ```
 
-用于设置会话的当前数据库。
+Lets you set the current database for the session.
 
-如果查询语句中没有在表名前面以加点的方式指明数据库名， 则用当前数据库进行搜索。
+The current database is used for searching for tables if the database is not explicitly defined in the query with a dot before the table name.
 
-使用 HTTP 协议时无法进行此查询，因为没有会话的概念。
+This query can’t be made when using the HTTP protocol, since there is no concept of a session.
diff --git a/docs/ko/images/column-oriented.gif b/docs/ko/images/column-oriented.gif
new file mode 100644
index 00000000000..d5ac7c82848
Binary files /dev/null and b/docs/ko/images/column-oriented.gif differ
diff --git a/docs/ko/images/logo.svg b/docs/ko/images/logo.svg
new file mode 100644
index 00000000000..b5ab923ff65
--- /dev/null
+++ b/docs/ko/images/logo.svg
@@ -0,0 +1 @@
+<svg xmlns="http://www.w3.org/2000/svg" width="54" height="48" markdown="1" viewBox="0 0 9 8"><style>.o{fill:#fc0}.r{fill:red}</style><path d="M0,7 h1 v1 h-1 z" class="r"/><path d="M0,0 h1 v7 h-1 z" class="o"/><path d="M2,0 h1 v8 h-1 z" class="o"/><path d="M4,0 h1 v8 h-1 z" class="o"/><path d="M6,0 h1 v8 h-1 z" class="o"/><path d="M8,3.25 h1 v1.5 h-1 z" class="o"/></svg>
\ No newline at end of file
diff --git a/docs/ko/images/play.png b/docs/ko/images/play.png
new file mode 100644
index 00000000000..b75aebe4089
Binary files /dev/null and b/docs/ko/images/play.png differ
diff --git a/docs/ko/images/row-oriented.gif b/docs/ko/images/row-oriented.gif
new file mode 100644
index 00000000000..41395b5693e
Binary files /dev/null and b/docs/ko/images/row-oriented.gif differ
diff --git a/docs/ko/index.md b/docs/ko/index.md
new file mode 100644
index 00000000000..f2a6396c069
--- /dev/null
+++ b/docs/ko/index.md
@@ -0,0 +1,94 @@
+---
+toc_priority: 0
+toc_title: 목차
+---
+
+# ClickHouse란? {#what-is-clickhouse}
+
+ClickHouse® 는 query의 온라인 분석 처리(OLAP)를 위한 열 지향(column-oriented) 데이터베이스 관리 시스템(DBMS)입니다. 
+
+"보통의" 행 지향(row-oriented) DMBS에서는 데이터가 다음과 같은 순서로 저장됩니다.
+
+| row | WatchID     | JavaEnable | Title              | GoodEvent | EventTime           |
+|-----|-------------|------------|--------------------|-----------|---------------------|
+| #0 | 89354350662 | 1          | Investor Relations | 1         | 2016-05-18 05:19:20 |
+| #1 | 90329509958 | 0          | Contact us         | 1         | 2016-05-18 08:10:20 |
+| #2 | 89953706054 | 1          | Mission            | 1         | 2016-05-18 07:38:00 |
+| #N | …           | …          | …                  | …         | …                   |
+
+즉, 행과 관련된 모든 값들은 물리적으로 나란히 저장됩니다.
+
+행 지향(row-oriented) DMBS의 예시로는 MySQL, Postgres, 그리고 MS SQL 서버 등이 있습니다.
+
+열 지향 (column-oriented) DBMS에서는 데이터가 아래와 같은 방식으로 저장됩니다:
+
+| Row:        | #0                 | #1                 | #2                 | #N |
+|-------------|---------------------|---------------------|---------------------|-----|
+| WatchID:    | 89354350662         | 90329509958         | 89953706054         | …   |
+| JavaEnable: | 1                   | 0                   | 1                   | …   |
+| Title:      | Investor Relations  | Contact us          | Mission             | …   |
+| GoodEvent:  | 1                   | 1                   | 1                   | …   |
+| EventTime:  | 2016-05-18 05:19:20 | 2016-05-18 08:10:20 | 2016-05-18 07:38:00 | …   |
+
+이 예에서는 데이터가 정렬된 순서만을 보여줍니다. 다른 열의 값들은 서로 분리되어 저장되고, 같은 열의 정보들은 함께 저장됩니다.
+
+열 지향(column-oriented) DBMS 의 종류는 Vertica, Paraccel (Actian Matrix and Amazon Redshift), Sybase IQ, Exasol, Infobright, InfiniDB, MonetDB (VectorWise and Actian Vector), LucidDB, SAP HANA, Google Dremel, Google PowerDrill, Druid, 그리고 kdb+ 등이 있습니다.
+
+데이터를 저장하기 위한 서로 다른 순서는 다른 시나리오에 더 적합합니다. 데이터 접근 시나리오는 쿼리가 수행되는 빈도, 비율 및 비율을 나타내거나, 각 쿼리 유형(행, 열 및 바이트)에 대해 읽은 데이터의 양 데이터 읽기와 업데이트 사이의 관계, 데이터의 작업 크기 및 로컬에서 사용되는 방법 트랜잭션이 사용되는지 여부, 트랜잭션이 얼마나 격리되어 있는지, 데이터 복제 및 논리적 무결성에 대한 요구 사항, 각 쿼리 유형에 대한 대기 시간 및 처리량 요구 사항 등이 있습니다.
+
+시스템의 부하가 높을수록 사용 시나리오의 요구 사항에 맞게 시스템 설정을 사용자 지정하는 것이 더 중요하며 이 사용자 지정은 더욱 세분화됩니다. 상당히 다른 시나리오에 똑같이 적합한 시스템은 없습니다. 만약 높은 부하에서 시스템이 넓은 시나리오 집합에 대해 적응한다면 시스템은 모든 시나리오를 모두 제대로 처리하지 못하거나 가능한 시나리오 중 하나 또는 몇 개에 대해서만 잘 작동할 것입니다.
+
+## OLAP 시나리오의 중요 속성들 {#key-properties-of-olap-scenario}
+
+-   요청(request)의 대부분은 읽기 접근에 관한 것입니다.
+-   데이터는 단일 행이 아니라 상당히 큰 일괄 처리(\> 1000개 행)로 업데이트됩니다. 또는 전혀 업데이트되지 않습니다.
+-   데이터는 DB에 추가되지만 수정되지는 않습니다.
+-   읽기의 경우 DB에서 상당히 많은 수의 행이 추출되지만 열은 일부만 추출됩니다.
+-   테이블은 "넓습니다". 이는 열의 수가 많다는 것을 의미합니다.
+-   쿼리는 상대적으로 드뭅니다(일반적으로 서버당 수백 또는 초당 쿼리 미만).
+-   간단한 쿼리의 경우 약 50ms의 대기 시간이 허용됩니다.
+-   열 값은 숫자와 짧은 문자열(예: URL당 60바이트)과 같이 상당히 작습니다
+-   단일 쿼리를 처리할 때 높은 처리량이 필요합니다(서버당 초당 최대 수십억 행).
+-   트랜잭션이 필요하지 않습니다.
+-   데이터 일관성에 대한 요구 사항이 낮습니다.
+-   쿼리당 하나의 큰 테이블이 존재하고 하나를 제외한 모든 테이블은 작습니다.
+-   쿼리 결과가 원본 데이터보다 훨씬 작습니다. 즉, 데이터가 필터링되거나 집계되므로 결과가 단일 서버의 RAM에 꼭 들어맞습니다.
+
+OLAP 시나리오가 다른 일반적인 시나리오(OLTP 또는 키-값 액세스와 같은)와 매우 다르다는 것을 쉽게 알 수 있습니다. 따라서 적절한 성능을 얻으려면 분석 쿼리를 처리하기 위해 OLTP 또는 키-값 DB를 사용하는 것은 의미가 없습니다. 예를 들어 분석에 MongoDB나 Redis를 사용하려고 하면 OLAP 데이터베이스에 비해 성능이 매우 저하됩니다.
+
+## 왜 열 지향 데이터베이스가 OLAP 시나리오에 적합한가{#why-column-oriented-databases-work-better-in-the-olap-scenario}
+
+열 지향(column-oriented) 데이터베이스는 OLAP 시나리오에 더 적합합니다. 대부분의 쿼리를 처리하는 데 있어서 행 지향(row-oriented) 데이터베이스보다 100배 이상 빠릅니다. 그 이유는 아래에 자세히 설명되어 있지만 사실은 시각적으로 더 쉽게 설명할 수 있습니다.
+
+**행 지향 DBMS**
+
+![Row-oriented](images/row-oriented.gif#)
+
+**열 지향 DBMS**
+
+![Column-oriented](images/column-oriented.gif#)
+
+차이가 보이시나요?
+
+### 입출력 {#inputoutput}
+
+1.  분석 쿼리의 경우 적은 수의 테이블 열만 읽어야 합니다. 열 지향 데이터베이스에서는 필요한 데이터만 읽을 수 있습니다. 예를 들어 100개 중 5개의 열이 필요한 경우 I/O가 20배 감소할 것으로 예상할 수 있습니다. 
+2.  데이터는 패킷으로 읽히므로 압축하기가 더 쉽습니다. 열의 데이터도 압축하기 쉽습니다. 이것은 I/O의 볼륨을 더욱 감소시킵니다.
+3.  감소된 I/O로 인해 시스템 캐시에 더 많은 데이터가 들어갑니다.
+
+예를 들어, "각 광고 플랫폼에 대한 레코드 수 계산" 쿼리는 압축되지 않은 1바이트를 차지하는 하나의 "광고 플랫폼 ID" 열을 읽어야 합니다. 트래픽의 대부분이 광고 플랫폼에서 발생하지 않은 경우 이 열의 최소 10배 압축을 기대할 수 있습니다. 빠른 압축 알고리즘을 사용하면 초당 최소 몇 기가바이트의 압축되지 않은 데이터의 속도로 데이터 압축 해제가 가능합니다. 즉, 이 쿼리는 단일 서버에서 초당 약 수십억 행의 속도로 처리될 수 있습니다. 이 속도는 정말 실제로 달성됩니다.
+
+### CPU {#cpu}
+
+쿼리를 수행하려면 많은 행을 처리해야 하므로 별도의 행이 아닌 전체 벡터에 대한 모든 연산을 디스패치하거나 쿼리 엔진을 구현하여 디스패치 비용이 거의 들지 않습니다. 반쯤 괜찮은 디스크 하위 시스템에서 이렇게 하지 않으면 쿼리 인터프리터가 불가피하게 CPU를 정지시킵니다. 데이터를 열에 저장하고 가능한 경우 열별로 처리하는 것이 좋습니다.
+
+이를 수행하기위한 두가지 방법이 있습니다.
+
+1.  벡터 엔진. 모든 연산은 별도의 값 대신 벡터에 대해 작성됩니다. 즉, 작업을 자주 호출할 필요가 없으며 파견 비용도 무시할 수 있습니다. 작업 코드에는 최적화된 내부 주기가 포함되어 있습니다.
+2.  코드 생성. 쿼리에 대해 생성된 코드에는 모든 간접 호출이 있습니다.
+
+이것은 단순한 쿼리를 실행할 때 의미가 없기 때문에 "일반" 데이터베이스에서는 수행되지 않습니다. 그러나 예외가 있습니다. 예를 들어 MemSQL은 코드 생성을 사용하여 SQL 쿼리를 처리할 때 대기 시간을 줄입니다. (비교되게, 분석 DBMS는 대기 시간이 아닌 처리량 최적화가 필요합니다.)
+
+CPU 효율성을 위해 쿼리 언어는 선언적(SQL 또는 MDX)이거나 최소한 벡터(J, K)여야 합니다. 쿼리는 최적화를 허용하는 암시적 루프만 포함해야 합니다.
+
+{## [원문](https://clickhouse.com/docs/en/) ##}
diff --git a/docs/zh/faq/general/dbms-naming.md b/docs/zh/faq/general/dbms-naming.md
deleted file mode 120000
index 0df856af0ca..00000000000
--- a/docs/zh/faq/general/dbms-naming.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../en/faq/general/dbms-naming.md
\ No newline at end of file
diff --git a/docs/zh/faq/general/dbms-naming.md b/docs/zh/faq/general/dbms-naming.md
new file mode 100644
index 00000000000..8d4353f9322
--- /dev/null
+++ b/docs/zh/faq/general/dbms-naming.md
@@ -0,0 +1,17 @@
+---
+title: "\u201CClickHouse\u201D 有什么含义?"
+toc_hidden: true
+toc_priority: 10
+---
+
+# “ClickHouse” 有什么含义? {#what-does-clickhouse-mean}
+
+它是“**点击**流”和“数据**仓库**”的组合。它来自于Yandex最初的用例。在Metrica网站上，ClickHouse本应该保存人们在互联网上的所有点击记录，现在它仍然在做这项工作。你可以在[ClickHouse history](../../introduction/history.md)页面上阅读更多关于这个用例的信息。
+
+这个由两部分组成的意思有两个结果:
+
+- 唯一正确的写“Click**H** house”的方式是用大写H。
+- 如果需要缩写，请使用“**CH**”。由于一些历史原因，缩写CK在中国也很流行，主要是因为中文中最早的一个关于ClickHouse的演讲使用了这种形式。
+
+!!! info “有趣的事实”
+    多年后ClickHouse闻名于世, 这种命名方法：结合各有深意的两个词被赞扬为最好的数据库命名方式, 卡内基梅隆大学数据库副教授[Andy Pavlo做的研究](https://www.cs.cmu.edu/~pavlo/blog/2020/03/on-naming-a-database-management-system.html) 。ClickHouse与Postgres共同获得“史上最佳数据库名”奖。
diff --git a/docs/zh/operations/requirements.md b/docs/zh/operations/requirements.md
index c3013f738a2..964d7aa34f4 100644
--- a/docs/zh/operations/requirements.md
+++ b/docs/zh/operations/requirements.md
@@ -1,59 +1,59 @@
 ---
 toc_priority: 44
-toc_title: "要求"
+toc_title: "必备条件"
 ---
 
-# 要求 {#requirements}
+# 必备条件 {#requirements}
 
 ## CPU {#cpu}
 
-对于从预构建的deb包进行安装，请使用具有x86_64架构并支持SSE4.2指令的CPU。 要使用不支持SSE4.2或具有AArch64或PowerPC64LE体系结构的处理器运行ClickHouse，您应该从源代码构建ClickHouse。
+如果您使用预编译的DEB/RPM包安装ClickHouse，请使用支持SSE4.2指令集的x86_64架构的CPU。如果需要在不支持SSE4.2指令集的CPU上，或者在AArch64（ARM）和PowerPC64LE（IBM Power）架构上运行ClickHouse，您应该从源码编译ClickHouse。
 
-ClickHouse实现并行数据处理并使用所有可用的硬件资源。 在选择处理器时，考虑到ClickHouse在具有大量内核但时钟速率较低的配置中的工作效率要高于具有较少内核和较高时钟速率的配置。 例如，具有2600MHz的16核心优于具有3600MHz的8核心。
+ClickHouse实现了并行数据处理，处理时会使用所有的可用资源。在选择处理器时，请注意：ClickHouse在具有大量计算核、时钟频率稍低的平台上比计算核少、时钟频率高的平台上效率更高。例如，ClickHouse在16核 2.6GHz的CPU上运行速度高于8核 3.6GHz的CPU。
 
-建议使用 **睿频加速** 和 **超线程** 技术。 它显着提高了典型工作负载的性能。
+建议使用 **睿频加速** 和 **超线程** 技术。 它显着提高了正常工作负载的性能。
 
 ## RAM {#ram}
 
-我们建议使用至少4GB的RAM来执行重要的查询。 ClickHouse服务器可以使用少得多的RAM运行，但它需要处理查询的内存。
+我们建议使用至少4GB的内存来执行重要的查询。 ClickHouse服务器可以使用很少的内存运行，但它需要一定量的内存用于处理查询。
 
-RAM所需的体积取决于:
+ClickHouse所需内存取决于:
 
--   查询的复杂性。
--   查询中处理的数据量。
+- 查询的复杂程度。
+- 查询处理的数据量。
 
-要计算所需的RAM体积，您应该估计临时数据的大小 [GROUP BY](../sql-reference/statements/select/group-by.md#select-group-by-clause), [DISTINCT](../sql-reference/statements/select/distinct.md#select-distinct), [JOIN](../sql-reference/statements/select/join.md#select-join) 和您使用的其他操作。
+要计算所需的内存大小，您应该考虑用于[GROUP BY](../sql-reference/statements/select/group-by.md#select-group-by-clause)、[DISTINCT](../sql-reference/statements/select/distinct.md#select-distinct)、[JOIN](../sql-reference/statements/select/join.md#select-join) 和其他操作所需的临时数据量。
 
-ClickHouse可以使用外部存储器来存储临时数据。看 [在外部存储器中分组](../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory) 有关详细信息。
+ClickHouse可以使用外部存储器来存储临时数据。详情请见[在外部存储器中分组](../sql-reference/statements/select/group-by.md#select-group-by-in-external-memory)。
 
 ## 交换文件 {#swap-file}
 
-禁用生产环境的交换文件。
+请在生产环境禁用交换文件。
 
 ## 存储子系统 {#storage-subsystem}
 
 您需要有2GB的可用磁盘空间来安装ClickHouse。
 
-数据所需的存储量应单独计算。 评估应包括:
+数据所需的存储空间应单独计算。预估存储容量时请考虑:
 
--   估计数据量。
+- 数据量
 
-    您可以采取数据的样本并从中获取行的平均大小。 然后将该值乘以计划存储的行数。
+    您可以对数据进行采样并计算每行的平均占用空间。然后将该值乘以计划存储的行数。
 
--   数据压缩系数。
+- 数据压缩比
 
-    要估计数据压缩系数，请将数据的样本加载到ClickHouse中，并将数据的实际大小与存储的表的大小进行比较。 例如，点击流数据通常被压缩6-10倍。
+    要计算数据压缩比，请将样本数据写入ClickHouse，并将原始数据大小与ClickHouse实际存储的数据进行比较。例如，用户点击行为的原始数据压缩比通常为6-10。
 
-要计算要存储的最终数据量，请将压缩系数应用于估计的数据量。 如果计划将数据存储在多个副本中，则将估计的量乘以副本数。
+请将原始数据的大小除以压缩比来获得实际所需存储的大小。如果您打算将数据存放于几个副本中，请将存储容量乘上副本数。
 
 ## 网络 {#network}
 
-如果可能的话，使用10G或更高级别的网络。
+如果可能的话，请使用10G或更高级别的网络。
 
-网络带宽对于处理具有大量中间结果数据的分布式查询至关重要。 此外，网络速度会影响复制过程。
+网络带宽对于处理具有大量中间结果数据的分布式查询至关重要。此外，网络速度会影响复制过程。
 
 ## 软件 {#software}
 
-ClickHouse主要是为Linux系列操作系统开发的。 推荐的Linux发行版是Ubuntu。 `tzdata` 软件包应安装在系统中。
+ClickHouse主要是为Linux系列操作系统开发的。推荐的Linux发行版是Ubuntu。您需要检查`tzdata`（对于Ubuntu）软件包是否在安装ClickHouse之前已经安装。
 
-ClickHouse也可以在其他操作系统系列中工作。 查看详细信息 [开始](../getting-started/index.md) 文档的部分。
+ClickHouse也可以在其他操作系统系列中工作。详情请查看[开始](../getting-started/index.md)。
diff --git a/docs/zh/sql-reference/statements/alter/index.md b/docs/zh/sql-reference/statements/alter/index.md
index 2f60dbb262e..f7d983cab4e 100644
--- a/docs/zh/sql-reference/statements/alter/index.md
+++ b/docs/zh/sql-reference/statements/alter/index.md
@@ -1,23 +1,74 @@
 ---
-toc_hidden_folder: true
-toc_priority: 42
-toc_title: INDEX
+toc_priority: 35
+toc_title: ALTER
 ---
 
-# 操作数据跳过索引 {#manipulations-with-data-skipping-indices}
+## ALTER {#query_language_queries_alter}
 
-可以使用以下操作：
+大多数 `ALTER TABLE` 查询修改表设置或数据:
 
--   `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]` - 向表元数据添加索引描述。
+-   [COLUMN](../../../sql-reference/statements/alter/column.md)
+-   [PARTITION](../../../sql-reference/statements/alter/partition.md)
+-   [DELETE](../../../sql-reference/statements/alter/delete.md)
+-   [UPDATE](../../../sql-reference/statements/alter/update.md)
+-   [ORDER BY](../../../sql-reference/statements/alter/order-by.md)
+-   [INDEX](../../../sql-reference/statements/alter/index/index.md)
+-   [CONSTRAINT](../../../sql-reference/statements/alter/constraint.md)
+-   [TTL](../../../sql-reference/statements/alter/ttl.md)
 
--   `ALTER TABLE [db].name DROP INDEX name` - 从表元数据中删除索引描述并从磁盘中删除索引文件。
+!!! note "备注"
+    大多数 `ALTER TABLE` 查询只支持[\*MergeTree](../../../engines/table-engines/mergetree-family/index.md)表，以及[Merge](../../../engines/table-engines/special/merge.md)和[Distributed](../../../engines/table-engines/special/distributed.md)。
 
--   `ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name` - 查询在分区`partition_name`中重建二级索引`name`。 操作为[mutation](../../../sql-reference/statements/alter/index.md#mutations).
+这些 `ALTER` 语句操作视图:
 
-前两个命令是轻量级的，它们只更改元数据或删除文件。
+-   [ALTER TABLE ... MODIFY QUERY](../../../sql-reference/statements/alter/view.md) — 修改一个 [Materialized view](../create/view.md#materialized) 结构.
+-   [ALTER LIVE VIEW](../../../sql-reference/statements/alter/view.md#alter-live-view) — 刷新一个 [Live view](../create/view.md#live-view).
 
-Also, they are replicated, syncing indices metadata via ZooKeeper.
-此外，它们会被复制，会通过ZooKeeper同步索引元数据。
+这些 `ALTER` 语句修改与基于角色的访问控制相关的实体:
 
-!!! note "注意"
-索引操作仅支持具有以下特征的表 [`*MergeTree`](../../../engines/table-engines/mergetree-family/mergetree.md)引擎 (包括[replicated](../../../engines/table-engines/mergetree-family/replication.md)).
+-   [USER](../../../sql-reference/statements/alter/user.md)
+-   [ROLE](../../../sql-reference/statements/alter/role.md)
+-   [QUOTA](../../../sql-reference/statements/alter/quota.md)
+-   [ROW POLICY](../../../sql-reference/statements/alter/row-policy.md)
+-   [SETTINGS PROFILE](../../../sql-reference/statements/alter/settings-profile.md)
+
+[ALTER TABLE ... MODIFY COMMENT](../../../sql-reference/statements/alter/comment.md) 语句添加、修改或删除表中的注释，无论之前是否设置过。
+
+## Mutations 突变 {#mutations}
+
+用来操作表数据的ALTER查询是通过一种叫做“突变”的机制来实现的，最明显的是[ALTER TABLE … DELETE](../../../sql-reference/statements/alter/delete.md)和[ALTER TABLE … UPDATE](../../../sql-reference/statements/alter/update.md)。它们是异步的后台进程，类似于[MergeTree](../../../engines/table-engines/mergetree-family/index.md)表的合并，产生新的“突变”版本的部件。
+
+
+
+对于 `*MergeTree` 表，通过重写整个数据部分来执行突变。没有原子性——一旦突变的部件准备好，部件就会被替换，并且在突变期间开始执行的 `SELECT` 查询将看到来自已经突变的部件的数据，以及来自尚未突变的部件的数据。
+
+
+
+突变完全按照它们的产生顺序排列，并按此顺序应用于每个部分。突变还与“INSERT INTO”查询进行部分排序:在提交突变之前插入表中的数据将被突变，而在此之后插入的数据将不会被突变。注意，突变不会以任何方式阻止插入。
+
+
+
+突变查询在添加突变条目后立即返回(对于复制表到ZooKeeper，对于非复制表到文件系统)。突变本身使用系统配置文件设置异步执行。要跟踪突变的进程，可以使用[`system.mutations`](../../../operations/system-tables/mutations.md#system_tables-mutations) 表。成功提交的变异将继续执行，即使ClickHouse服务器重新启动。没有办法回滚突变一旦提交，但如果突变卡住了，它可以取消与[`KILL MUTATION`](../../../sql-reference/statements/misc.md#kill-mutation) 查询。
+
+
+
+完成突变的条目不会立即删除(保留条目的数量由 `finished_mutations_to_keep` 存储引擎参数决定)。删除旧的突变条目。
+
+## ALTER 查询的同步性 {#synchronicity-of-alter-queries}
+
+
+对于非复制表，所有的 `ALTER` 查询都是同步执行的。对于复制表，查询只是向“ZooKeeper”添加相应动作的指令，动作本身会尽快执行。但是，查询可以等待所有副本上的这些操作完成。
+
+对于所有的“ALTER”查询，您可以使用[replication_alter_partitions_sync](../../../operations/settings/settings.md#replication-alter-partitions-sync)设置等待。
+
+通过[replication_wait_for_inactive_replica_timeout](../../../operations/settings/settings.md#replication-wait-for-inactive-replica-timeout]设置，可以指定不活动的副本执行所有 `ALTER` 查询的等待时间(以秒为单位)。
+
+
+
+!!! info "备注"
+
+    对于所有的 `ALTER` 查询，如果  `replication_alter_partitions_sync = 2`  和一些副本的不激活时间超过时间(在 `replication_wait_for_inactive_replica_timeout` 设置中指定)，那么将抛出一个异常 `UNFINISHED`。
+
+
+
+对于 `ALTER TABLE ... UPDATE|DELETE` 查询由 [mutations_sync](../../../operations/settings/settings.md#mutations_sync) 设置定义的同步度。
diff --git a/docs/zh/sql-reference/statements/alter/index/index.md b/docs/zh/sql-reference/statements/alter/index/index.md
deleted file mode 120000
index b754fa71b83..00000000000
--- a/docs/zh/sql-reference/statements/alter/index/index.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../../../en/sql-reference/statements/alter/index/index.md
\ No newline at end of file
diff --git a/docs/zh/sql-reference/statements/alter/index/index.md b/docs/zh/sql-reference/statements/alter/index/index.md
new file mode 100644
index 00000000000..16f48e55b2f
--- /dev/null
+++ b/docs/zh/sql-reference/statements/alter/index/index.md
@@ -0,0 +1,23 @@
+---
+toc_hidden_folder: true
+toc_priority: 42
+toc_title: INDEX
+---
+
+# 操作数据跳过索引 {#manipulations-with-data-skipping-indices}
+
+可以使用以下操作：
+
+-   `ALTER TABLE [db].name ADD INDEX name expression TYPE type GRANULARITY value [FIRST|AFTER name]` - 向表元数据添加索引描述。
+
+-   `ALTER TABLE [db].name DROP INDEX name` - 从表元数据中删除索引描述并从磁盘中删除索引文件。
+
+-   `ALTER TABLE [db.]table MATERIALIZE INDEX name IN PARTITION partition_name` - 查询在分区`partition_name`中重建二级索引`name`。 操作为[mutation](../../../../sql-reference/statements/alter/index.md#mutations).
+
+前两个命令是轻量级的，它们只更改元数据或删除文件。
+
+Also, they are replicated, syncing indices metadata via ZooKeeper.
+此外，它们会被复制，会通过ZooKeeper同步索引元数据。
+
+!!! note "注意"
+    索引操作仅支持具有以下特征的表 [`*MergeTree`](../../../../engines/table-engines/mergetree-family/mergetree.md)引擎 (包括[replicated](../../../../engines/table-engines/mergetree-family/replication.md)).
diff --git a/docs/zh/sql-reference/statements/exists.md b/docs/zh/sql-reference/statements/exists.md
deleted file mode 120000
index d69e8224fe6..00000000000
--- a/docs/zh/sql-reference/statements/exists.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../en/sql-reference/statements/exists.md
\ No newline at end of file
diff --git a/docs/zh/sql-reference/statements/exists.md b/docs/zh/sql-reference/statements/exists.md
new file mode 100644
index 00000000000..69b26fea918
--- /dev/null
+++ b/docs/zh/sql-reference/statements/exists.md
@@ -0,0 +1,12 @@
+---
+toc_priority: 45
+toc_title: EXISTS
+---
+
+# EXISTS 语句 {#exists-statement}
+
+``` sql
+EXISTS [TEMPORARY] [TABLE|DICTIONARY] [db.]name [INTO OUTFILE filename] [FORMAT format]
+```
+
+返回一个单独的 `UInt8`类型的列，如果表或数据库不存在，则包含一个值 `0`，如果表在指定的数据库中存在，则包含一个值 `1`。
\ No newline at end of file
diff --git a/docs/zh/sql-reference/statements/set.md b/docs/zh/sql-reference/statements/set.md
deleted file mode 120000
index 02e106afc9f..00000000000
--- a/docs/zh/sql-reference/statements/set.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../en/sql-reference/statements/set.md
\ No newline at end of file
diff --git a/docs/zh/sql-reference/statements/set.md b/docs/zh/sql-reference/statements/set.md
new file mode 100644
index 00000000000..a9888a7080e
--- /dev/null
+++ b/docs/zh/sql-reference/statements/set.md
@@ -0,0 +1,23 @@
+---
+toc_priority: 50
+toc_title: SET
+---
+
+# SET 语句 {#query-set}
+
+``` sql
+SET param = value
+```
+
+给当前会话的 `param` [配置项](../../operations/settings/index.md)赋值。你不能用这样的方式修改[服务器相关设置](../../operations/server-configuration-parameters/index.md)。
+
+
+您还可以在单个查询中设置指定设置配置文件中的所有值。
+
+
+
+``` sql
+SET profile = 'profile-name-from-the-settings-file'
+```
+
+更多详情, 详见 [配置项](../../operations/settings/settings.md).
diff --git a/docs/zh/sql-reference/statements/use.md b/docs/zh/sql-reference/statements/use.md
deleted file mode 120000
index 7bdbf049326..00000000000
--- a/docs/zh/sql-reference/statements/use.md
+++ /dev/null
@@ -1 +0,0 @@
-../../../en/sql-reference/statements/use.md
\ No newline at end of file
diff --git a/docs/zh/sql-reference/statements/use.md b/docs/zh/sql-reference/statements/use.md
new file mode 100644
index 00000000000..41cba58bb9d
--- /dev/null
+++ b/docs/zh/sql-reference/statements/use.md
@@ -0,0 +1,16 @@
+---
+toc_priority: 53
+toc_title: USE
+---
+
+# USE 语句 {#use}
+
+``` sql
+USE db
+```
+
+用于设置会话的当前数据库。
+
+如果查询语句中没有在表名前面以加点的方式指明数据库名， 则用当前数据库进行搜索。
+
+使用 HTTP 协议时无法进行此查询，因为没有会话的概念。
diff --git a/src/AggregateFunctions/AggregateFunctionIf.cpp b/src/AggregateFunctions/AggregateFunctionIf.cpp
index 5ba54ff8505..ce71e76de43 100644
--- a/src/AggregateFunctions/AggregateFunctionIf.cpp
+++ b/src/AggregateFunctions/AggregateFunctionIf.cpp
@@ -72,7 +72,7 @@ private:
     using Base = AggregateFunctionNullBase<result_is_nullable, serialize_flag,
         AggregateFunctionIfNullUnary<result_is_nullable, serialize_flag>>;
 
-    inline bool singleFilter(const IColumn ** columns, size_t row_num, size_t num_arguments) const
+    inline bool singleFilter(const IColumn ** columns, size_t row_num) const
     {
         const IColumn * filter_column = columns[num_arguments - 1];
 
@@ -112,7 +112,7 @@ public:
     {
         const ColumnNullable * column = assert_cast<const ColumnNullable *>(columns[0]);
         const IColumn * nested_column = &column->getNestedColumn();
-        if (!column->isNullAt(row_num) && singleFilter(columns, row_num, num_arguments))
+        if (!column->isNullAt(row_num) && singleFilter(columns, row_num))
         {
             this->setFlag(place);
             this->nested_function->add(this->nestedPlace(place), &nested_column, row_num, arena);
diff --git a/src/AggregateFunctions/AggregateFunctionSumMap.h b/src/AggregateFunctions/AggregateFunctionSumMap.h
index 7e661a92c5b..295258cd8cf 100644
--- a/src/AggregateFunctions/AggregateFunctionSumMap.h
+++ b/src/AggregateFunctions/AggregateFunctionSumMap.h
@@ -226,7 +226,7 @@ public:
                 {
                     // FIXME why is storing NearestFieldType not enough, and we
                     // have to check for decimals again here?
-                    UInt32 scale = static_cast<const ColumnDecimal<T> &>(key_column).getData().getScale();
+                    UInt32 scale = static_cast<const ColumnDecimal<T> &>(key_column).getScale();
                     it = merged_maps.find(DecimalField<T>(key, scale));
                 }
                 else
@@ -251,7 +251,7 @@ public:
 
                     if constexpr (is_decimal<T>)
                     {
-                        UInt32 scale = static_cast<const ColumnDecimal<T> &>(key_column).getData().getScale();
+                        UInt32 scale = static_cast<const ColumnDecimal<T> &>(key_column).getScale();
                         merged_maps.emplace(DecimalField<T>(key, scale), std::move(new_values));
                     }
                     else
diff --git a/src/Columns/ColumnArray.cpp b/src/Columns/ColumnArray.cpp
index 929c0153a0a..3abc60f33bc 100644
--- a/src/Columns/ColumnArray.cpp
+++ b/src/Columns/ColumnArray.cpp
@@ -1,5 +1,3 @@
-#include <string.h> // memcpy
-
 #include <Columns/ColumnArray.h>
 #include <Columns/ColumnsNumber.h>
 #include <Columns/ColumnString.h>
@@ -9,12 +7,7 @@
 #include <Columns/ColumnsCommon.h>
 #include <Columns/ColumnCompressed.h>
 #include <Columns/MaskOperations.h>
-
-#include <base/unaligned.h>
-#include <base/sort.h>
-
 #include <Processors/Transforms/ColumnGathererTransform.h>
-
 #include <Common/Exception.h>
 #include <Common/Arena.h>
 #include <Common/SipHash.h>
@@ -22,6 +15,8 @@
 #include <Common/assert_cast.h>
 #include <Common/WeakHash.h>
 #include <Common/HashTable/Hash.h>
+#include <base/unaligned.h>
+#include <cstring> // memcpy
 
 
 namespace DB
@@ -127,18 +122,8 @@ size_t ColumnArray::size() const
 
 Field ColumnArray::operator[](size_t n) const
 {
-    size_t offset = offsetAt(n);
-    size_t size = sizeAt(n);
-
-    if (size > max_array_size_as_field)
-        throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array of size {} is too large to be manipulated as single field, maximum size {}",
-            size, max_array_size_as_field);
-
-    Array res(size);
-
-    for (size_t i = 0; i < size; ++i)
-        res[i] = getData()[offset + i];
-
+    Field res;
+    get(n, res);
     return res;
 }
 
@@ -152,11 +137,12 @@ void ColumnArray::get(size_t n, Field & res) const
         throw Exception(ErrorCodes::TOO_LARGE_ARRAY_SIZE, "Array of size {} is too large to be manipulated as single field, maximum size {}",
             size, max_array_size_as_field);
 
-    res = Array(size);
+    res = Array();
     Array & res_arr = DB::get<Array &>(res);
+    res_arr.reserve(size);
 
     for (size_t i = 0; i < size; ++i)
-        getData().get(offset + i, res_arr[i]);
+        res_arr.push_back(getData()[offset + i]);
 }
 
 
diff --git a/src/Columns/ColumnDecimal.cpp b/src/Columns/ColumnDecimal.cpp
index 99085f0f976..4941585f8dd 100644
--- a/src/Columns/ColumnDecimal.cpp
+++ b/src/Columns/ColumnDecimal.cpp
@@ -32,12 +32,6 @@ namespace ErrorCodes
     extern const int LOGICAL_ERROR;
 }
 
-template class DecimalPaddedPODArray<Decimal32>;
-template class DecimalPaddedPODArray<Decimal64>;
-template class DecimalPaddedPODArray<Decimal128>;
-template class DecimalPaddedPODArray<Decimal256>;
-template class DecimalPaddedPODArray<DateTime64>;
-
 template <is_decimal T>
 int ColumnDecimal<T>::compareAt(size_t n, size_t m, const IColumn & rhs_, int) const
 {
@@ -131,19 +125,6 @@ void ColumnDecimal<T>::updateHashFast(SipHash & hash) const
 template <is_decimal T>
 void ColumnDecimal<T>::getPermutation(bool reverse, size_t limit, int , IColumn::Permutation & res) const
 {
-#if 1 /// TODO: perf test
-    if (data.size() <= std::numeric_limits<UInt32>::max())
-    {
-        PaddedPODArray<UInt32> tmp_res;
-        permutation(reverse, limit, tmp_res);
-
-        res.resize(tmp_res.size());
-        for (size_t i = 0; i < tmp_res.size(); ++i)
-            res[i] = tmp_res[i];
-        return;
-    }
-#endif
-
     permutation(reverse, limit, res);
 }
 
diff --git a/src/Columns/ColumnDecimal.h b/src/Columns/ColumnDecimal.h
index b55083cd671..1a4b06b46e4 100644
--- a/src/Columns/ColumnDecimal.h
+++ b/src/Columns/ColumnDecimal.h
@@ -1,66 +1,21 @@
 #pragma once
 
+#include <cmath>
+
+#include <base/sort.h>
+#include <base/TypeName.h>
+#include <Core/Field.h>
+#include <Core/DecimalFunctions.h>
+#include <Core/TypeId.h>
+#include <Common/typeid_cast.h>
 #include <Columns/ColumnVectorHelper.h>
 #include <Columns/IColumn.h>
 #include <Columns/IColumnImpl.h>
-#include <Core/Field.h>
-#include <Core/DecimalFunctions.h>
-#include <Common/typeid_cast.h>
-#include <base/sort.h>
-#include <Core/TypeId.h>
-#include <base/TypeName.h>
-
-#include <cmath>
 
 
 namespace DB
 {
 
-/// PaddedPODArray extended by Decimal scale
-template <typename T>
-class DecimalPaddedPODArray : public PaddedPODArray<T>
-{
-public:
-    using Base = PaddedPODArray<T>;
-    using Base::operator[];
-
-    DecimalPaddedPODArray(size_t size, UInt32 scale_)
-    :   Base(size),
-        scale(scale_)
-    {}
-
-    DecimalPaddedPODArray(const DecimalPaddedPODArray & other)
-    :   Base(other.begin(), other.end()),
-        scale(other.scale)
-    {}
-
-    DecimalPaddedPODArray(DecimalPaddedPODArray && other)
-    {
-        this->swap(other);
-        std::swap(scale, other.scale);
-    }
-
-    DecimalPaddedPODArray & operator=(DecimalPaddedPODArray && other)
-    {
-        this->swap(other);
-        std::swap(scale, other.scale);
-        return *this;
-    }
-
-    UInt32 getScale() const { return scale; }
-
-private:
-    UInt32 scale;
-};
-
-/// Prevent implicit template instantiation of DecimalPaddedPODArray for common decimal types
-
-extern template class DecimalPaddedPODArray<Decimal32>;
-extern template class DecimalPaddedPODArray<Decimal64>;
-extern template class DecimalPaddedPODArray<Decimal128>;
-extern template class DecimalPaddedPODArray<Decimal256>;
-extern template class DecimalPaddedPODArray<DateTime64>;
-
 /// A ColumnVector for Decimals
 template <is_decimal T>
 class ColumnDecimal final : public COWHelper<ColumnVectorHelper, ColumnDecimal<T>>
@@ -72,16 +27,16 @@ private:
 public:
     using ValueType = T;
     using NativeT = typename T::NativeType;
-    using Container = DecimalPaddedPODArray<T>;
+    using Container = PaddedPODArray<T>;
 
 private:
     ColumnDecimal(const size_t n, UInt32 scale_)
-    :   data(n, scale_),
+    :   data(n),
         scale(scale_)
     {}
 
     ColumnDecimal(const ColumnDecimal & src)
-    :   data(src.data),
+    :   data(src.data.begin(), src.data.end()),
         scale(src.scale)
     {}
 
@@ -195,7 +150,7 @@ public:
     const T & getElement(size_t n) const { return data[n]; }
     T & getElement(size_t n) { return data[n]; }
 
-    UInt32 getScale() const {return scale;}
+    UInt32 getScale() const { return scale; }
 
 protected:
     Container data;
@@ -206,8 +161,8 @@ protected:
     {
         size_t s = data.size();
         res.resize(s);
-        for (U i = 0; i < s; ++i)
-            res[i] = i;
+        for (size_t i = 0; i < s; ++i)
+            res[i] = static_cast<U>(i);
 
         auto sort_end = res.end();
         if (limit && limit < s)
diff --git a/src/Columns/ColumnMap.cpp b/src/Columns/ColumnMap.cpp
index e595525d9e8..ef5d96da0f7 100644
--- a/src/Columns/ColumnMap.cpp
+++ b/src/Columns/ColumnMap.cpp
@@ -4,8 +4,6 @@
 #include <Processors/Transforms/ColumnGathererTransform.h>
 #include <IO/WriteBufferFromString.h>
 #include <IO/Operators.h>
-#include <base/map.h>
-#include <base/range.h>
 #include <Common/typeid_cast.h>
 #include <Common/assert_cast.h>
 #include <Common/WeakHash.h>
@@ -64,8 +62,9 @@ MutableColumnPtr ColumnMap::cloneResized(size_t new_size) const
 
 Field ColumnMap::operator[](size_t n) const
 {
-    auto array = DB::get<Array>((*nested)[n]);
-    return Map(std::make_move_iterator(array.begin()), std::make_move_iterator(array.end()));
+    Field res;
+    get(n, res);
+    return res;
 }
 
 void ColumnMap::get(size_t n, Field & res) const
@@ -74,11 +73,12 @@ void ColumnMap::get(size_t n, Field & res) const
     size_t offset = offsets[n - 1];
     size_t size = offsets[n] - offsets[n - 1];
 
-    res = Map(size);
+    res = Map();
     auto & map = DB::get<Map &>(res);
+    map.reserve(size);
 
     for (size_t i = 0; i < size; ++i)
-        getNestedData().get(offset + i, map[i]);
+        map.push_back(getNestedData()[offset + i]);
 }
 
 bool ColumnMap::isDefaultAt(size_t n) const
diff --git a/src/Columns/ColumnTuple.cpp b/src/Columns/ColumnTuple.cpp
index d667b264d55..0310eca7adc 100644
--- a/src/Columns/ColumnTuple.cpp
+++ b/src/Columns/ColumnTuple.cpp
@@ -9,9 +9,6 @@
 #include <Common/WeakHash.h>
 #include <Common/assert_cast.h>
 #include <Common/typeid_cast.h>
-#include <base/sort.h>
-#include <base/map.h>
-#include <base/range.h>
 #include <DataTypes/Serializations/SerializationInfoTuple.h>
 
 
@@ -101,17 +98,21 @@ MutableColumnPtr ColumnTuple::cloneResized(size_t new_size) const
 
 Field ColumnTuple::operator[](size_t n) const
 {
-    return collections::map<Tuple>(columns, [n] (const auto & column) { return (*column)[n]; });
+    Field res;
+    get(n, res);
+    return res;
 }
 
 void ColumnTuple::get(size_t n, Field & res) const
 {
     const size_t tuple_size = columns.size();
-    Tuple tuple(tuple_size);
-    for (const auto i : collections::range(0, tuple_size))
-        columns[i]->get(n, tuple[i]);
 
-    res = tuple;
+    res = Tuple();
+    Tuple & res_tuple = DB::get<Tuple &>(res);
+    res_tuple.reserve(tuple_size);
+
+    for (size_t i = 0; i < tuple_size; ++i)
+        res_tuple.push_back((*columns[i])[n]);
 }
 
 bool ColumnTuple::isDefaultAt(size_t n) const
@@ -483,7 +484,7 @@ void ColumnTuple::getExtremes(Field & min, Field & max) const
     Tuple min_tuple(tuple_size);
     Tuple max_tuple(tuple_size);
 
-    for (const auto i : collections::range(0, tuple_size))
+    for (size_t i = 0; i < tuple_size; ++i)
         columns[i]->getExtremes(min_tuple[i], max_tuple[i]);
 
     min = min_tuple;
@@ -504,7 +505,7 @@ bool ColumnTuple::structureEquals(const IColumn & rhs) const
         if (tuple_size != rhs_tuple->columns.size())
             return false;
 
-        for (const auto i : collections::range(0, tuple_size))
+        for (size_t i = 0; i < tuple_size; ++i)
             if (!columns[i]->structureEquals(*rhs_tuple->columns[i]))
                 return false;
 
diff --git a/src/Common/PoolBase.h b/src/Common/PoolBase.h
index 85d4e84abca..a82a6efc4c1 100644
--- a/src/Common/PoolBase.h
+++ b/src/Common/PoolBase.h
@@ -41,6 +41,7 @@ private:
 
         ObjectPtr object;
         bool in_use = false;
+        std::atomic<bool> is_expired = false;
         PoolBase & pool;
     };
 
@@ -87,6 +88,14 @@ public:
         Object & operator*() &              { return *data->data.object; }
         const Object & operator*() const &  { return *data->data.object; }
 
+        /**
+         * Expire an object to make it reallocated later.
+         */
+        void expire()
+        {
+            data->data.is_expired = true;
+        }
+
         bool isNull() const { return data == nullptr; }
 
         PoolBase * getPool() const
@@ -112,9 +121,22 @@ public:
         while (true)
         {
             for (auto & item : items)
+            {
                 if (!item->in_use)
-                    return Entry(*item);
-
+                {
+                    if (likely(!item->is_expired))
+                    {
+                        return Entry(*item);
+                    }
+                    else
+                    {
+                        expireObject(item->object);
+                        item->object = allocObject();
+                        item->is_expired = false;
+                        return Entry(*item);
+                    }
+                }
+            }
             if (items.size() < max_items)
             {
                 ObjectPtr object = allocObject();
@@ -139,6 +161,12 @@ public:
             items.emplace_back(std::make_shared<PooledObject>(allocObject(), *this));
     }
 
+    inline size_t size()
+    {
+        std::unique_lock lock(mutex);
+        return items.size();
+    }
+
 private:
     /** The maximum size of the pool. */
     unsigned max_items;
@@ -162,4 +190,5 @@ protected:
 
     /** Creates a new object to put into the pool. */
     virtual ObjectPtr allocObject() = 0;
+    virtual void expireObject(ObjectPtr) {}
 };
diff --git a/src/Common/tests/gtest_poolbase.cpp b/src/Common/tests/gtest_poolbase.cpp
new file mode 100644
index 00000000000..20c3281c964
--- /dev/null
+++ b/src/Common/tests/gtest_poolbase.cpp
@@ -0,0 +1,52 @@
+#include <memory>
+#include <gtest/gtest.h>
+#include <Common/PoolBase.h>
+#include <Poco/Logger.h>
+using namespace DB;
+
+class PoolObject
+{
+public:
+    int x = 0;
+};
+
+class MyPoolBase : public PoolBase<PoolObject>
+{
+public:
+    using Object = PoolBase<PoolObject>::Object;
+    using ObjectPtr = std::shared_ptr<Object>;
+    using Ptr = PoolBase<PoolObject>::Ptr;
+
+    int last_destroy_value = 0;
+    MyPoolBase() : PoolBase<PoolObject>(100, &Poco::Logger::get("MyPoolBase")) { }
+
+protected:
+    ObjectPtr allocObject() override { return std::make_shared<Object>(); }
+
+    void expireObject(ObjectPtr obj) override
+    {
+        LOG_TRACE(log, "expire object");
+        ASSERT_TRUE(obj->x == 100);
+        last_destroy_value = obj->x;
+    }
+};
+
+TEST(PoolBase, testDestroy1)
+{
+    MyPoolBase pool;
+    {
+        auto obj_entry = pool.get(-1);
+        ASSERT_TRUE(!obj_entry.isNull());
+        obj_entry->x = 100;
+        obj_entry.expire();
+    }
+    ASSERT_EQ(1, pool.size());
+
+    {
+        auto obj_entry = pool.get(-1);
+        ASSERT_TRUE(!obj_entry.isNull());
+        ASSERT_EQ(obj_entry->x, 0);
+        ASSERT_EQ(1, pool.size());
+    }
+    ASSERT_EQ(100, pool.last_destroy_value);
+}
diff --git a/src/Core/BackgroundSchedulePool.cpp b/src/Core/BackgroundSchedulePool.cpp
index 9a42f752db2..18c43d8c45f 100644
--- a/src/Core/BackgroundSchedulePool.cpp
+++ b/src/Core/BackgroundSchedulePool.cpp
@@ -5,7 +5,6 @@
 #include <Common/CurrentThread.h>
 #include <base/logger_useful.h>
 #include <chrono>
-#include <base/scope_guard.h>
 
 
 namespace DB
@@ -246,7 +245,6 @@ void BackgroundSchedulePool::threadFunction()
     setThreadName(thread_name.c_str());
 
     attachToThreadGroup();
-    SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); });
 
     while (!shutdown)
     {
@@ -273,7 +271,6 @@ void BackgroundSchedulePool::delayExecutionThreadFunction()
     setThreadName((thread_name + "/D").c_str());
 
     attachToThreadGroup();
-    SCOPE_EXIT({ CurrentThread::detachQueryIfNotDetached(); });
 
     while (!shutdown)
     {
diff --git a/src/DataTypes/Serializations/SerializationArray.cpp b/src/DataTypes/Serializations/SerializationArray.cpp
index e3b535a2a11..30ee5e98b74 100644
--- a/src/DataTypes/Serializations/SerializationArray.cpp
+++ b/src/DataTypes/Serializations/SerializationArray.cpp
@@ -37,10 +37,11 @@ void SerializationArray::deserializeBinary(Field & field, ReadBuffer & istr) con
 {
     size_t size;
     readVarUInt(size, istr);
-    field = Array(size);
+    field = Array();
     Array & arr = get<Array &>(field);
+    arr.reserve(size);
     for (size_t i = 0; i < size; ++i)
-        nested->deserializeBinary(arr[i], istr);
+        nested->deserializeBinary(arr.emplace_back(), istr);
 }
 
 
diff --git a/src/DataTypes/Serializations/SerializationMap.cpp b/src/DataTypes/Serializations/SerializationMap.cpp
index 3f17061a744..24d06d8f3b2 100644
--- a/src/DataTypes/Serializations/SerializationMap.cpp
+++ b/src/DataTypes/Serializations/SerializationMap.cpp
@@ -53,13 +53,15 @@ void SerializationMap::deserializeBinary(Field & field, ReadBuffer & istr) const
 {
     size_t size;
     readVarUInt(size, istr);
-    field = Map(size);
-    for (auto & elem : field.get<Map &>())
+    field = Map();
+    Map & map = field.get<Map &>();
+    map.reserve(size);
+    for (size_t i = 0; i < size; ++i)
     {
         Tuple tuple(2);
         key->deserializeBinary(tuple[0], istr);
         value->deserializeBinary(tuple[1], istr);
-        elem = std::move(tuple);
+        map.push_back(std::move(tuple));
     }
 }
 
diff --git a/src/DataTypes/Serializations/SerializationTuple.cpp b/src/DataTypes/Serializations/SerializationTuple.cpp
index cd5a6b65a3c..8dc15fc9841 100644
--- a/src/DataTypes/Serializations/SerializationTuple.cpp
+++ b/src/DataTypes/Serializations/SerializationTuple.cpp
@@ -1,4 +1,3 @@
-#include <base/range.h>
 #include <DataTypes/Serializations/SerializationTuple.h>
 #include <DataTypes/Serializations/SerializationInfoTuple.h>
 #include <DataTypes/DataTypeTuple.h>
@@ -44,11 +43,11 @@ void SerializationTuple::deserializeBinary(Field & field, ReadBuffer & istr) con
 {
     const size_t size = elems.size();
 
-    Tuple tuple(size);
-    for (const auto i : collections::range(0, size))
-        elems[i]->deserializeBinary(tuple[i], istr);
-
-    field = tuple;
+    field = Tuple();
+    Tuple & tuple = get<Tuple &>(field);
+    tuple.reserve(size);
+    for (size_t i = 0; i < size; ++i)
+        elems[i]->deserializeBinary(tuple.emplace_back(), istr);
 }
 
 void SerializationTuple::serializeBinary(const IColumn & column, size_t row_num, WriteBuffer & ostr) const
@@ -73,7 +72,7 @@ static void addElementSafe(size_t num_elems, IColumn & column, F && impl)
 
         // Check that all columns now have the same size.
         size_t new_size = column.size();
-        for (auto i : collections::range(1, num_elems))
+        for (size_t i = 1; i < num_elems; ++i)
         {
             const auto & element_column = extractElementColumn(column, i);
             if (element_column.size() != new_size)
@@ -87,7 +86,7 @@ static void addElementSafe(size_t num_elems, IColumn & column, F && impl)
     }
     catch (...)
     {
-        for (const auto & i : collections::range(0, num_elems))
+        for (size_t i = 0; i < num_elems; ++i)
         {
             auto & element_column = extractElementColumn(column, i);
             if (element_column.size() > old_size)
@@ -102,7 +101,7 @@ void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr)
 {
     addElementSafe(elems.size(), column, [&]
     {
-        for (const auto & i : collections::range(0, elems.size()))
+        for (size_t i = 0; i < elems.size(); ++i)
             elems[i]->deserializeBinary(extractElementColumn(column, i), istr);
     });
 }
@@ -110,7 +109,7 @@ void SerializationTuple::deserializeBinary(IColumn & column, ReadBuffer & istr)
 void SerializationTuple::serializeText(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     writeChar('(', ostr);
-    for (const auto i : collections::range(0, elems.size()))
+    for (size_t i = 0; i < elems.size(); ++i)
     {
         if (i != 0)
             writeChar(',', ostr);
@@ -126,7 +125,7 @@ void SerializationTuple::deserializeText(IColumn & column, ReadBuffer & istr, co
 
     addElementSafe(elems.size(), column, [&]
     {
-        for (const auto i : collections::range(0, size))
+        for (size_t i = 0; i < size; ++i)
         {
             skipWhitespaceIfAny(istr);
             if (i != 0)
@@ -158,7 +157,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu
         && have_explicit_names)
     {
         writeChar('{', ostr);
-        for (const auto i : collections::range(0, elems.size()))
+        for (size_t i = 0; i < elems.size(); ++i)
         {
             if (i != 0)
             {
@@ -173,7 +172,7 @@ void SerializationTuple::serializeTextJSON(const IColumn & column, size_t row_nu
     else
     {
         writeChar('[', ostr);
-        for (const auto i : collections::range(0, elems.size()))
+        for (size_t i = 0; i < elems.size(); ++i)
         {
             if (i != 0)
                 writeChar(',', ostr);
@@ -195,7 +194,7 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
         addElementSafe(elems.size(), column, [&]
         {
             // Require all elements but in arbitrary order.
-            for (auto i : collections::range(0, elems.size()))
+            for (size_t i = 0; i < elems.size(); ++i)
             {
                 if (i > 0)
                 {
@@ -226,7 +225,7 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
 
         addElementSafe(elems.size(), column, [&]
         {
-            for (const auto i : collections::range(0, size))
+            for (size_t i = 0; i < size; ++i)
             {
                 skipWhitespaceIfAny(istr);
                 if (i != 0)
@@ -246,7 +245,7 @@ void SerializationTuple::deserializeTextJSON(IColumn & column, ReadBuffer & istr
 void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
     writeCString("<tuple>", ostr);
-    for (const auto i : collections::range(0, elems.size()))
+    for (size_t i = 0; i < elems.size(); ++i)
     {
         writeCString("<elem>", ostr);
         elems[i]->serializeTextXML(extractElementColumn(column, i), row_num, ostr, settings);
@@ -257,7 +256,7 @@ void SerializationTuple::serializeTextXML(const IColumn & column, size_t row_num
 
 void SerializationTuple::serializeTextCSV(const IColumn & column, size_t row_num, WriteBuffer & ostr, const FormatSettings & settings) const
 {
-    for (const auto i : collections::range(0, elems.size()))
+    for (size_t i = 0; i < elems.size(); ++i)
     {
         if (i != 0)
             writeChar(settings.csv.tuple_delimiter, ostr);
@@ -270,7 +269,7 @@ void SerializationTuple::deserializeTextCSV(IColumn & column, ReadBuffer & istr,
     addElementSafe(elems.size(), column, [&]
     {
         const size_t size = elems.size();
-        for (const auto i : collections::range(0, size))
+        for (size_t i = 0; i < size; ++i)
         {
             if (i != 0)
             {
@@ -362,7 +361,7 @@ void SerializationTuple::serializeBinaryBulkWithMultipleStreams(
 {
     auto * tuple_state = checkAndGetState<SerializeBinaryBulkStateTuple>(state);
 
-    for (const auto i : collections::range(0, elems.size()))
+    for (size_t i = 0; i < elems.size(); ++i)
     {
         const auto & element_col = extractElementColumn(column, i);
         elems[i]->serializeBinaryBulkWithMultipleStreams(element_col, offset, limit, settings, tuple_state->states[i]);
@@ -382,7 +381,7 @@ void SerializationTuple::deserializeBinaryBulkWithMultipleStreams(
     auto & column_tuple = assert_cast<ColumnTuple &>(*mutable_column);
 
     settings.avg_value_size_hint = 0;
-    for (const auto i : collections::range(0, elems.size()))
+    for (size_t i = 0; i < elems.size(); ++i)
         elems[i]->deserializeBinaryBulkWithMultipleStreams(column_tuple.getColumnPtr(i), limit, settings, tuple_state->states[i], cache);
 }
 
diff --git a/src/Functions/FunctionMathUnary.h b/src/Functions/FunctionMathUnary.h
index e0b9355e0a6..fa10c004e87 100644
--- a/src/Functions/FunctionMathUnary.h
+++ b/src/Functions/FunctionMathUnary.h
@@ -125,7 +125,7 @@ private:
     {
         const auto & src_data = col->getData();
         const size_t size = src_data.size();
-        UInt32 scale = src_data.getScale();
+        UInt32 scale = col->getScale();
 
         auto dst = ColumnVector<ReturnType>::create();
         auto & dst_data = dst->getData();
diff --git a/src/Functions/FunctionsConversion.h b/src/Functions/FunctionsConversion.h
index d65d0604547..909803d7cd7 100644
--- a/src/Functions/FunctionsConversion.h
+++ b/src/Functions/FunctionsConversion.h
@@ -152,9 +152,11 @@ struct ConvertImpl
         if (const ColVecFrom * col_from = checkAndGetColumn<ColVecFrom>(named_from.column.get()))
         {
             typename ColVecTo::MutablePtr col_to = nullptr;
+
             if constexpr (IsDataTypeDecimal<ToDataType>)
             {
                 UInt32 scale;
+
                 if constexpr (std::is_same_v<Additions, AccurateConvertStrategyAdditions>
                     || std::is_same_v<Additions, AccurateOrNullConvertStrategyAdditions>)
                 {
@@ -208,11 +210,11 @@ struct ConvertImpl
                             bool convert_result = false;
 
                             if constexpr (IsDataTypeDecimal<FromDataType> && IsDataTypeDecimal<ToDataType>)
-                                convert_result = tryConvertDecimals<FromDataType, ToDataType>(vec_from[i], vec_from.getScale(), vec_to.getScale(), result);
+                                convert_result = tryConvertDecimals<FromDataType, ToDataType>(vec_from[i], col_from->getScale(), col_to->getScale(), result);
                             else if constexpr (IsDataTypeDecimal<FromDataType> && IsDataTypeNumber<ToDataType>)
-                                convert_result = tryConvertFromDecimal<FromDataType, ToDataType>(vec_from[i], vec_from.getScale(), result);
+                                convert_result = tryConvertFromDecimal<FromDataType, ToDataType>(vec_from[i], col_from->getScale(), result);
                             else if constexpr (IsDataTypeNumber<FromDataType> && IsDataTypeDecimal<ToDataType>)
-                                convert_result = tryConvertToDecimal<FromDataType, ToDataType>(vec_from[i], vec_to.getScale(), result);
+                                convert_result = tryConvertToDecimal<FromDataType, ToDataType>(vec_from[i], col_to->getScale(), result);
 
                             if (convert_result)
                                 vec_to[i] = result;
@@ -225,11 +227,11 @@ struct ConvertImpl
                         else
                         {
                             if constexpr (IsDataTypeDecimal<FromDataType> && IsDataTypeDecimal<ToDataType>)
-                                vec_to[i] = convertDecimals<FromDataType, ToDataType>(vec_from[i], vec_from.getScale(), vec_to.getScale());
+                                vec_to[i] = convertDecimals<FromDataType, ToDataType>(vec_from[i], col_from->getScale(), col_to->getScale());
                             else if constexpr (IsDataTypeDecimal<FromDataType> && IsDataTypeNumber<ToDataType>)
-                                vec_to[i] = convertFromDecimal<FromDataType, ToDataType>(vec_from[i], vec_from.getScale());
+                                vec_to[i] = convertFromDecimal<FromDataType, ToDataType>(vec_from[i], col_from->getScale());
                             else if constexpr (IsDataTypeNumber<FromDataType> && IsDataTypeDecimal<ToDataType>)
-                                vec_to[i] = convertToDecimal<FromDataType, ToDataType>(vec_from[i], vec_to.getScale());
+                                vec_to[i] = convertToDecimal<FromDataType, ToDataType>(vec_from[i], col_to->getScale());
                             else
                                 throw Exception("Unsupported data type in conversion function", ErrorCodes::CANNOT_CONVERT_TYPE);
                         }
@@ -820,7 +822,7 @@ struct ConvertImpl<FromDataType, std::enable_if_t<!std::is_same_v<FromDataType,
             else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime>)
                 data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss") + 1));
             else if constexpr (std::is_same_v<FromDataType, DataTypeDateTime64>)
-                data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + vec_from.getScale() + 1));
+                data_to.resize(size * (strlen("YYYY-MM-DD hh:mm:ss.") + col_from->getScale() + 1));
             else
                 data_to.resize(size * 3);   /// Arbitrary
 
@@ -1169,7 +1171,7 @@ struct ConvertThroughParsing
                     if constexpr (to_datetime64)
                     {
                         DateTime64 res = 0;
-                        parseDateTime64BestEffort(res, vec_to.getScale(), read_buffer, *local_time_zone, *utc_time_zone);
+                        parseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone);
                         vec_to[i] = res;
                     }
                     else
@@ -1184,7 +1186,7 @@ struct ConvertThroughParsing
                     if constexpr (to_datetime64)
                     {
                         DateTime64 res = 0;
-                        parseDateTime64BestEffortUS(res, vec_to.getScale(), read_buffer, *local_time_zone, *utc_time_zone);
+                        parseDateTime64BestEffortUS(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone);
                         vec_to[i] = res;
                     }
                     else
@@ -1199,12 +1201,12 @@ struct ConvertThroughParsing
                     if constexpr (to_datetime64)
                     {
                         DateTime64 value = 0;
-                        readDateTime64Text(value, vec_to.getScale(), read_buffer, *local_time_zone);
+                        readDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone);
                         vec_to[i] = value;
                     }
                     else if constexpr (IsDataTypeDecimal<ToDataType>)
                         SerializationDecimal<typename ToDataType::FieldType>::readText(
-                            vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale());
+                            vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale());
                     else
                     {
                         parseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone);
@@ -1223,7 +1225,7 @@ struct ConvertThroughParsing
                     if constexpr (to_datetime64)
                     {
                         DateTime64 res = 0;
-                        parsed = tryParseDateTime64BestEffort(res, vec_to.getScale(), read_buffer, *local_time_zone, *utc_time_zone);
+                        parsed = tryParseDateTime64BestEffort(res, col_to->getScale(), read_buffer, *local_time_zone, *utc_time_zone);
                         vec_to[i] = res;
                     }
                     else
@@ -1244,12 +1246,12 @@ struct ConvertThroughParsing
                     if constexpr (to_datetime64)
                     {
                         DateTime64 value = 0;
-                        parsed = tryReadDateTime64Text(value, vec_to.getScale(), read_buffer, *local_time_zone);
+                        parsed = tryReadDateTime64Text(value, col_to->getScale(), read_buffer, *local_time_zone);
                         vec_to[i] = value;
                     }
                     else if constexpr (IsDataTypeDecimal<ToDataType>)
                         parsed = SerializationDecimal<typename ToDataType::FieldType>::tryReadText(
-                            vec_to[i], read_buffer, ToDataType::maxPrecision(), vec_to.getScale());
+                            vec_to[i], read_buffer, ToDataType::maxPrecision(), col_to->getScale());
                     else
                         parsed = tryParseImpl<ToDataType>(vec_to[i], read_buffer, local_time_zone);
                 }
diff --git a/src/Functions/FunctionsRound.h b/src/Functions/FunctionsRound.h
index edba82a5b4e..8c30885d5dd 100644
--- a/src/Functions/FunctionsRound.h
+++ b/src/Functions/FunctionsRound.h
@@ -422,9 +422,9 @@ private:
     using Container = typename ColumnDecimal<T>::Container;
 
 public:
-    static NO_INLINE void apply(const Container & in, Container & out, Scale scale_arg)
+    static NO_INLINE void apply(const Container & in, UInt32 in_scale, Container & out, Scale scale_arg)
     {
-        scale_arg = in.getScale() - scale_arg;
+        scale_arg = in_scale - scale_arg;
         if (scale_arg > 0)
         {
             size_t scale = intExp10(scale_arg);
@@ -498,11 +498,11 @@ public:
         const auto * const col = checkAndGetColumn<ColumnDecimal<T>>(col_general);
         const typename ColumnDecimal<T>::Container & vec_src = col->getData();
 
-        auto col_res = ColumnDecimal<T>::create(vec_src.size(), vec_src.getScale());
+        auto col_res = ColumnDecimal<T>::create(vec_src.size(), col->getScale());
         auto & vec_res = col_res->getData();
 
         if (!vec_res.empty())
-            DecimalRoundingImpl<T, rounding_mode, tie_breaking_mode>::apply(col->getData(), vec_res, scale_arg);
+            DecimalRoundingImpl<T, rounding_mode, tie_breaking_mode>::apply(col->getData(), col->getScale(), vec_res, scale_arg);
 
         return col_res;
     }
diff --git a/src/Functions/array/arrayAggregation.cpp b/src/Functions/array/arrayAggregation.cpp
index da2304e1bb6..ee08c4f7f37 100644
--- a/src/Functions/array/arrayAggregation.cpp
+++ b/src/Functions/array/arrayAggregation.cpp
@@ -157,11 +157,11 @@ struct ArrayAggregateImpl
                 return false;
 
             const AggregationType x = column_const->template getValue<Element>(); // NOLINT
-            const auto & data = checkAndGetColumn<ColVecType>(&column_const->getDataColumn())->getData();
+            const ColVecType * column_typed = checkAndGetColumn<ColVecType>(&column_const->getDataColumn());
 
             typename ColVecResultType::MutablePtr res_column;
             if constexpr (is_decimal<Element>)
-                res_column = ColVecResultType::create(offsets.size(), data.getScale());
+                res_column = ColVecResultType::create(offsets.size(), column_typed->getScale());
             else
                 res_column = ColVecResultType::create(offsets.size());
 
@@ -185,7 +185,7 @@ struct ArrayAggregateImpl
                 {
                     if constexpr (is_decimal<Element>)
                     {
-                        res[i] = DecimalUtils::convertTo<ResultType>(x, data.getScale());
+                        res[i] = DecimalUtils::convertTo<ResultType>(x, column_typed->getScale());
                     }
                     else
                     {
@@ -210,11 +210,11 @@ struct ArrayAggregateImpl
                                 throw Exception(ErrorCodes::DECIMAL_OVERFLOW, "Decimal math overflow");
                         }
 
-                        auto result_scale = data.getScale() * array_size;
+                        auto result_scale = column_typed->getScale() * array_size;
                         if (unlikely(result_scale > DecimalUtils::max_precision<AggregationType>))
                             throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale {} is out of bounds", result_scale);
 
-                        res[i] = DecimalUtils::convertTo<ResultType>(product, data.getScale() * array_size);
+                        res[i] = DecimalUtils::convertTo<ResultType>(product, result_scale);
                     }
                     else
                     {
@@ -236,7 +236,7 @@ struct ArrayAggregateImpl
 
         typename ColVecResultType::MutablePtr res_column;
         if constexpr (is_decimal<Element>)
-            res_column = ColVecResultType::create(offsets.size(), data.getScale());
+            res_column = ColVecResultType::create(offsets.size(), column->getScale());
         else
             res_column = ColVecResultType::create(offsets.size());
 
@@ -309,7 +309,7 @@ struct ArrayAggregateImpl
                 if constexpr (is_decimal<Element>)
                 {
                     aggregate_value = aggregate_value / AggregationType(count);
-                    res[i] = DecimalUtils::convertTo<ResultType>(aggregate_value, data.getScale());
+                    res[i] = DecimalUtils::convertTo<ResultType>(aggregate_value, column->getScale());
                 }
                 else
                 {
@@ -318,7 +318,7 @@ struct ArrayAggregateImpl
             }
             else if constexpr (aggregate_operation == AggregateOperation::product && is_decimal<Element>)
             {
-                auto result_scale = data.getScale() * count;
+                auto result_scale = column->getScale() * count;
 
                 if (unlikely(result_scale > DecimalUtils::max_precision<AggregationType>))
                     throw Exception(ErrorCodes::ARGUMENT_OUT_OF_BOUND, "Scale {} is out of bounds", result_scale);
diff --git a/src/Functions/array/arrayCompact.cpp b/src/Functions/array/arrayCompact.cpp
index 7914b9a154e..c2908e37e12 100644
--- a/src/Functions/array/arrayCompact.cpp
+++ b/src/Functions/array/arrayCompact.cpp
@@ -40,7 +40,7 @@ struct ArrayCompactImpl
 
         typename ColVecType::MutablePtr res_values_column;
         if constexpr (is_decimal<T>)
-            res_values_column = ColVecType::create(src_values.size(), src_values.getScale());
+            res_values_column = ColVecType::create(src_values.size(), src_values_column->getScale());
         else
             res_values_column = ColVecType::create(src_values.size());
 
diff --git a/src/Functions/array/arrayCumSum.cpp b/src/Functions/array/arrayCumSum.cpp
index da8ef3d7852..467d9ad3951 100644
--- a/src/Functions/array/arrayCumSum.cpp
+++ b/src/Functions/array/arrayCumSum.cpp
@@ -101,9 +101,8 @@ struct ArrayCumSumImpl
             typename ColVecResult::MutablePtr res_nested;
             if constexpr (is_decimal<Element>)
             {
-                const typename ColVecType::Container & data =
-                    checkAndGetColumn<ColVecType>(&column_const->getDataColumn())->getData();
-                res_nested = ColVecResult::create(0, data.getScale());
+                const ColVecType * column_typed = checkAndGetColumn<ColVecType>(&column_const->getDataColumn());
+                res_nested = ColVecResult::create(0, column_typed->getScale());
             }
             else
                 res_nested = ColVecResult::create();
@@ -120,7 +119,7 @@ struct ArrayCumSumImpl
 
         typename ColVecResult::MutablePtr res_nested;
         if constexpr (is_decimal<Element>)
-            res_nested = ColVecResult::create(0, data.getScale());
+            res_nested = ColVecResult::create(0, column->getScale());
         else
             res_nested = ColVecResult::create();
 
diff --git a/src/Functions/array/arrayCumSumNonNegative.cpp b/src/Functions/array/arrayCumSumNonNegative.cpp
index c40df27c1cc..476bbd08163 100644
--- a/src/Functions/array/arrayCumSumNonNegative.cpp
+++ b/src/Functions/array/arrayCumSumNonNegative.cpp
@@ -83,7 +83,7 @@ struct ArrayCumSumNonNegativeImpl
 
         typename ColVecResult::MutablePtr res_nested;
         if constexpr (is_decimal<Element>)
-            res_nested = ColVecResult::create(0, data.getScale());
+            res_nested = ColVecResult::create(0, column->getScale());
         else
             res_nested = ColVecResult::create();
 
diff --git a/src/Functions/array/arrayDifference.cpp b/src/Functions/array/arrayDifference.cpp
index 97243f2cf74..c5fdf27100b 100644
--- a/src/Functions/array/arrayDifference.cpp
+++ b/src/Functions/array/arrayDifference.cpp
@@ -105,7 +105,7 @@ struct ArrayDifferenceImpl
 
         typename ColVecResult::MutablePtr res_nested;
         if constexpr (is_decimal<Element>)
-            res_nested = ColVecResult::create(0, data.getScale());
+            res_nested = ColVecResult::create(0, column->getScale());
         else
             res_nested = ColVecResult::create();
 
diff --git a/src/Functions/dateName.cpp b/src/Functions/dateName.cpp
index c89a7f80dfd..eef9bc3955b 100644
--- a/src/Functions/dateName.cpp
+++ b/src/Functions/dateName.cpp
@@ -148,7 +148,7 @@ public:
         UInt32 scale [[maybe_unused]] = 0;
         if constexpr (std::is_same_v<DataType, DataTypeDateTime64>)
         {
-            scale = times_data.getScale();
+            scale = times->getScale();
         }
 
         auto result_column = ColumnString::create();
diff --git a/src/Functions/formatDateTime.cpp b/src/Functions/formatDateTime.cpp
index 9f303b86ad3..e2ec90f4e61 100644
--- a/src/Functions/formatDateTime.cpp
+++ b/src/Functions/formatDateTime.cpp
@@ -440,7 +440,7 @@ public:
         UInt32 scale [[maybe_unused]] = 0;
         if constexpr (std::is_same_v<DataType, DataTypeDateTime64>)
         {
-            scale = vec.getScale();
+            scale = times->getScale();
         }
 
         auto col_res = ColumnString::create();
diff --git a/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp b/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp
index 2117eec0063..a81d4204565 100644
--- a/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp
+++ b/src/Interpreters/ExecuteScalarSubqueriesVisitor.cpp
@@ -68,7 +68,7 @@ void ExecuteScalarSubqueriesMatcher::visit(ASTPtr & ast, Data & data)
 static bool worthConvertingToLiteral(const Block & scalar)
 {
     const auto * scalar_type_name = scalar.safeGetByPosition(0).type->getFamilyName();
-    std::set<String> useless_literal_types = {"Array", "Tuple", "AggregateFunction", "Function", "Set", "LowCardinality"};
+    static const std::set<std::string_view> useless_literal_types = {"Array", "Tuple", "AggregateFunction", "Function", "Set", "LowCardinality"};
     return !useless_literal_types.count(scalar_type_name);
 }
 
diff --git a/src/Interpreters/ExternalLoader.cpp b/src/Interpreters/ExternalLoader.cpp
index b2cd9495feb..aab3a9e7437 100644
--- a/src/Interpreters/ExternalLoader.cpp
+++ b/src/Interpreters/ExternalLoader.cpp
@@ -966,14 +966,14 @@ private:
     /// Does the loading, possibly in the separate thread.
     void doLoading(const String & name, size_t loading_id, bool forced_to_reload, size_t min_id_to_finish_loading_dependencies_, bool async, ThreadGroupStatusPtr thread_group = {})
     {
-        if (thread_group)
-            CurrentThread::attachTo(thread_group);
-
         SCOPE_EXIT_SAFE(
             if (thread_group)
                 CurrentThread::detachQueryIfNotDetached();
         );
 
+        if (thread_group)
+            CurrentThread::attachTo(thread_group);
+
         LOG_TRACE(log, "Start loading object '{}'", name);
         try
         {
diff --git a/src/Interpreters/sortBlock.cpp b/src/Interpreters/sortBlock.cpp
index edf911fa61c..c8a2d0903f2 100644
--- a/src/Interpreters/sortBlock.cpp
+++ b/src/Interpreters/sortBlock.cpp
@@ -1,13 +1,10 @@
 #include <Interpreters/sortBlock.h>
 
-#include <Columns/ColumnString.h>
 #include <Columns/ColumnConst.h>
 #include <Columns/ColumnNullable.h>
-#include <Columns/ColumnLowCardinality.h>
-#include <Common/typeid_cast.h>
+#include <Columns/ColumnTuple.h>
 #include <Functions/FunctionHelpers.h>
 
-#include <pdqsort.h>
 
 namespace DB
 {
@@ -17,66 +14,34 @@ namespace ErrorCodes
     extern const int BAD_COLLATION;
 }
 
-static bool isCollationRequired(const SortColumnDescription & description)
+/// Column with description for sort
+struct ColumnWithSortDescription
+{
+    const IColumn * column = nullptr;
+    SortColumnDescription description;
+
+    /// It means, that this column is ColumnConst
+    bool column_const = false;
+};
+
+using ColumnsWithSortDescriptions = std::vector<ColumnWithSortDescription>;
+
+namespace
+{
+
+inline bool isCollationRequired(const SortColumnDescription & description)
 {
     return description.collator != nullptr;
 }
 
-
-ColumnsWithSortDescriptions getColumnsWithSortDescription(const Block & block, const SortDescription & description)
-{
-    size_t size = description.size();
-    ColumnsWithSortDescriptions res;
-    res.reserve(size);
-
-    for (size_t i = 0; i < size; ++i)
-    {
-        const IColumn * column = !description[i].column_name.empty()
-            ? block.getByName(description[i].column_name).column.get()
-            : block.safeGetByPosition(description[i].column_number).column.get();
-
-        res.emplace_back(ColumnWithSortDescription{column, description[i], isColumnConst(*column)});
-    }
-
-    return res;
-}
-
-
-struct PartialSortingLess
+template <bool check_collation>
+struct PartialSortingLessImpl
 {
     const ColumnsWithSortDescriptions & columns;
 
-    explicit PartialSortingLess(const ColumnsWithSortDescriptions & columns_) : columns(columns_) {}
+    explicit PartialSortingLessImpl(const ColumnsWithSortDescriptions & columns_) : columns(columns_) { }
 
-    bool operator() (size_t a, size_t b) const
-    {
-        for (const auto & elem : columns)
-        {
-            int res;
-            if (elem.column_const)
-                res = 0;
-            else
-                res = elem.description.direction * elem.column->compareAt(a, b, *elem.column, elem.description.nulls_direction);
-            if (res < 0)
-                return true;
-            else if (res > 0)
-                return false;
-        }
-        return false;
-    }
-};
-
-
-struct PartialSortingLessWithCollation
-{
-    const ColumnsWithSortDescriptions & columns;
-
-    explicit PartialSortingLessWithCollation(const ColumnsWithSortDescriptions & columns_)
-        : columns(columns_)
-    {
-    }
-
-    bool operator() (size_t a, size_t b) const
+    inline bool operator()(size_t a, size_t b) const
     {
         for (const auto & elem : columns)
         {
@@ -85,13 +50,25 @@ struct PartialSortingLessWithCollation
             if (elem.column_const)
             {
                 res = 0;
+                continue;
             }
-            else if (isCollationRequired(elem.description))
+
+            if constexpr (check_collation)
             {
-                res = elem.column->compareAtWithCollation(a, b, *elem.column, elem.description.nulls_direction, *elem.description.collator);
+                if (isCollationRequired(elem.description))
+                {
+                    res = elem.column->compareAtWithCollation(a, b, *elem.column, elem.description.nulls_direction, *elem.description.collator);
+                }
+                else
+                {
+                    res = elem.column->compareAt(a, b, *elem.column, elem.description.nulls_direction);
+                }
             }
             else
+            {
                 res = elem.column->compareAt(a, b, *elem.column, elem.description.nulls_direction);
+            }
+
             res *= elem.description.direction;
             if (res < 0)
                 return true;
@@ -102,124 +79,148 @@ struct PartialSortingLessWithCollation
     }
 };
 
+using PartialSortingLess = PartialSortingLessImpl<false>;
+using PartialSortingLessWithCollation = PartialSortingLessImpl<true>;
+
+}
+
+void convertTupleColumnIntoSortDescriptions(
+    const ColumnTuple * tuple, const SortColumnDescription & description, ColumnsWithSortDescriptions & result)
+{
+    for (const auto & column : tuple->getColumns())
+    {
+        if (const auto * subtuple = typeid_cast<const ColumnTuple *>(column.get()))
+        {
+            convertTupleColumnIntoSortDescriptions(subtuple, description, result);
+        }
+        else
+        {
+            result.emplace_back(ColumnWithSortDescription{column.get(), description, isColumnConst(*column)});
+
+            if (isCollationRequired(description) && !result.back().column->isCollationSupported())
+                result.back().description.collator = nullptr;
+        }
+    }
+}
+
+ColumnsWithSortDescriptions getColumnsWithSortDescription(const Block & block, const SortDescription & description)
+{
+    size_t size = description.size();
+
+    ColumnsWithSortDescriptions result;
+    result.reserve(size);
+
+    for (size_t i = 0; i < size; ++i)
+    {
+        const auto & sort_column_description = description[i];
+
+        const IColumn * column = !sort_column_description.column_name.empty()
+            ? block.getByName(sort_column_description.column_name).column.get()
+            : block.safeGetByPosition(sort_column_description.column_number).column.get();
+
+        if (isCollationRequired(sort_column_description))
+        {
+            if (!column->isCollationSupported())
+                throw Exception(
+                    "Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, "
+                    "containing them.",
+                    ErrorCodes::BAD_COLLATION);
+        }
+
+        if (const auto * tuple = typeid_cast<const ColumnTuple *>(column))
+            convertTupleColumnIntoSortDescriptions(tuple, sort_column_description, result);
+        else
+            result.emplace_back(ColumnWithSortDescription{column, sort_column_description, isColumnConst(*column)});
+    }
+
+    return result;
+}
+
 void sortBlock(Block & block, const SortDescription & description, UInt64 limit)
 {
     if (!block)
         return;
 
-    /// If only one column to sort by
-    if (description.size() == 1)
+    ColumnsWithSortDescriptions columns_with_sort_descriptions = getColumnsWithSortDescription(block, description);
+
+    bool all_const = true;
+    for (const auto & column : columns_with_sort_descriptions)
     {
-        IColumn::Permutation perm;
-        bool reverse = description[0].direction == -1;
-
-        const IColumn * column = !description[0].column_name.empty()
-            ? block.getByName(description[0].column_name).column.get()
-            : block.safeGetByPosition(description[0].column_number).column.get();
-
-        bool is_column_const = false;
-        if (isCollationRequired(description[0]))
+        if (!column.column_const)
         {
-            if (!column->isCollationSupported())
-                throw Exception("Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, containing them.", ErrorCodes::BAD_COLLATION);
+            all_const = false;
+            break;
+        }
+    }
+    if (all_const)
+        return;
 
-            if (isColumnConst(*column))
-                is_column_const = true;
-            else
-                column->getPermutationWithCollation(*description[0].collator, reverse, limit, description[0].nulls_direction, perm);
-        }
-        else if (!isColumnConst(*column))
-        {
-            int nan_direction_hint = description[0].nulls_direction;
-            column->getPermutation(reverse, limit, nan_direction_hint, perm);
-        }
+    IColumn::Permutation permutation;
+
+    /// If only one column to sort by
+    if (columns_with_sort_descriptions.size() == 1)
+    {
+        auto & column_with_sort_description = columns_with_sort_descriptions[0];
+
+        bool reverse = column_with_sort_description.description.direction == -1;
+        int nan_direction_hint = column_with_sort_description.description.nulls_direction;
+        const auto & column = column_with_sort_description.column;
+
+        if (isCollationRequired(column_with_sort_description.description))
+            column->getPermutationWithCollation(
+                *column_with_sort_description.description.collator, reverse, limit, nan_direction_hint, permutation);
         else
-            /// we don't need to do anything with const column
-            is_column_const = true;
-
-        size_t columns = block.columns();
-        for (size_t i = 0; i < columns; ++i)
-        {
-            if (!is_column_const)
-                block.getByPosition(i).column = block.getByPosition(i).column->permute(perm, limit);
-        }
+            column->getPermutation(reverse, limit, nan_direction_hint, permutation);
     }
     else
     {
         size_t size = block.rows();
-        IColumn::Permutation perm(size);
+        permutation.resize(size);
         for (size_t i = 0; i < size; ++i)
-            perm[i] = i;
+            permutation[i] = i;
 
         if (limit >= size)
             limit = 0;
 
-        bool need_collation = false;
-        ColumnsWithSortDescriptions columns_with_sort_desc = getColumnsWithSortDescription(block, description);
+        EqualRanges ranges;
+        ranges.emplace_back(0, permutation.size());
 
-        for (size_t i = 0, num_sort_columns = description.size(); i < num_sort_columns; ++i)
+        for (const auto & column_with_sort_description : columns_with_sort_descriptions)
         {
-            const IColumn * column = columns_with_sort_desc[i].column;
-            if (isCollationRequired(description[i]))
-            {
-                if (!column->isCollationSupported())
-                    throw Exception("Collations could be specified only for String, LowCardinality(String), Nullable(String) or for Array or Tuple, containing them.", ErrorCodes::BAD_COLLATION);
+            while (!ranges.empty() && limit && limit <= ranges.back().first)
+                ranges.pop_back();
 
-                need_collation = true;
+            if (ranges.empty())
+                break;
+
+            if (column_with_sort_description.column_const)
+                continue;
+
+            bool is_collation_required = isCollationRequired(column_with_sort_description.description);
+            bool reverse = column_with_sort_description.description.direction < 0;
+            int nan_direction_hint = column_with_sort_description.description.nulls_direction;
+            const auto & column = column_with_sort_description.column;
+
+            if (is_collation_required)
+            {
+                column->updatePermutationWithCollation(
+                    *column_with_sort_description.description.collator, reverse, limit, nan_direction_hint, permutation, ranges);
+            }
+            else
+            {
+                column->updatePermutation(reverse, limit, nan_direction_hint, permutation, ranges);
             }
         }
+    }
 
-        if (need_collation)
-        {
-            EqualRanges ranges;
-            ranges.emplace_back(0, perm.size());
-            for (const auto & column : columns_with_sort_desc)
-            {
-                while (!ranges.empty() && limit && limit <= ranges.back().first)
-                    ranges.pop_back();
-
-                if (ranges.empty())
-                    break;
-
-                if (column.column_const)
-                    continue;
-
-                if (isCollationRequired(column.description))
-                {
-                    column.column->updatePermutationWithCollation(
-                        *column.description.collator, column.description.direction < 0, limit, column.description.nulls_direction, perm, ranges);
-                }
-                else
-                {
-                    column.column->updatePermutation(
-                        column.description.direction < 0, limit, column.description.nulls_direction, perm, ranges);
-                }
-            }
-        }
-        else
-        {
-            EqualRanges ranges;
-            ranges.emplace_back(0, perm.size());
-            for (const auto & column : columns_with_sort_desc)
-            {
-                while (!ranges.empty() && limit && limit <= ranges.back().first)
-                    ranges.pop_back();
-
-                if (ranges.empty())
-                    break;
-
-                column.column->updatePermutation(
-                    column.description.direction < 0, limit, column.description.nulls_direction, perm, ranges);
-            }
-        }
-
-        size_t columns = block.columns();
-        for (size_t i = 0; i < columns; ++i)
-            block.getByPosition(i).column = block.getByPosition(i).column->permute(perm, limit);
+    size_t columns = block.columns();
+    for (size_t i = 0; i < columns; ++i)
+    {
+        auto & column_to_sort = block.getByPosition(i).column;
+        column_to_sort = column_to_sort->permute(permutation, limit);
     }
 }
 
-
 void stableGetPermutation(const Block & block, const SortDescription & description, IColumn::Permutation & out_permutation)
 {
     if (!block)
@@ -235,7 +236,6 @@ void stableGetPermutation(const Block & block, const SortDescription & descripti
     std::stable_sort(out_permutation.begin(), out_permutation.end(), PartialSortingLess(columns_with_sort_desc));
 }
 
-
 bool isAlreadySorted(const Block & block, const SortDescription & description)
 {
     if (!block)
@@ -276,12 +276,15 @@ void stableSortBlock(Block & block, const SortDescription & description)
     if (!block)
         return;
 
-    IColumn::Permutation perm;
-    stableGetPermutation(block, description, perm);
+    IColumn::Permutation permutation;
+    stableGetPermutation(block, description, permutation);
 
     size_t columns = block.columns();
     for (size_t i = 0; i < columns; ++i)
-        block.safeGetByPosition(i).column = block.safeGetByPosition(i).column->permute(perm, 0);
+    {
+        auto & column_to_sort = block.safeGetByPosition(i).column;
+        column_to_sort = column_to_sort->permute(permutation, 0);
+    }
 }
 
 }
diff --git a/src/Interpreters/sortBlock.h b/src/Interpreters/sortBlock.h
index faf9384901b..31ae78e90b0 100644
--- a/src/Interpreters/sortBlock.h
+++ b/src/Interpreters/sortBlock.h
@@ -10,7 +10,6 @@ namespace DB
 /// Sort one block by `description`. If limit != 0, then the partial sort of the first `limit` rows is produced.
 void sortBlock(Block & block, const SortDescription & description, UInt64 limit = 0);
 
-
 /** Used only in StorageMergeTree to sort the data with INSERT.
   * Sorting is stable. This is important for keeping the order of rows in the CollapsingMergeTree engine
   *  - because based on the order of rows it is determined whether to delete or leave groups of rows when collapsing.
@@ -23,24 +22,9 @@ void stableSortBlock(Block & block, const SortDescription & description);
   */
 void stableGetPermutation(const Block & block, const SortDescription & description, IColumn::Permutation & out_permutation);
 
-
 /** Quickly check whether the block is already sorted. If the block is not sorted - returns false as fast as possible.
   * Collations are not supported.
   */
 bool isAlreadySorted(const Block & block, const SortDescription & description);
 
-/// Column with description for sort
-struct ColumnWithSortDescription
-{
-    const IColumn * column = nullptr;
-    SortColumnDescription description;
-
-    /// It means, that this column is ColumnConst
-    bool column_const = false;
-};
-
-using ColumnsWithSortDescriptions = std::vector<ColumnWithSortDescription>;
-
-ColumnsWithSortDescriptions getColumnsWithSortDescription(const Block & block, const SortDescription & description);
-
 }
diff --git a/src/Parsers/ParserDictionary.cpp b/src/Parsers/ParserDictionary.cpp
index 399dda08911..ef914e2264a 100644
--- a/src/Parsers/ParserDictionary.cpp
+++ b/src/Parsers/ParserDictionary.cpp
@@ -188,8 +188,19 @@ bool ParserDictionary::parseImpl(Pos & pos, ASTPtr & node, Expected & expected)
     ASTPtr ast_settings;
 
     /// Primary is required to be the first in dictionary definition
-    if (primary_key_keyword.ignore(pos) && !expression_list_p.parse(pos, primary_key, expected))
-        return false;
+    if (primary_key_keyword.ignore(pos))
+    {
+        bool was_open = false;
+
+        if (open.ignore(pos, expected))
+            was_open = true;
+
+        if (!expression_list_p.parse(pos, primary_key, expected))
+            return false;
+
+        if (was_open && !close.ignore(pos, expected))
+            return false;
+    }
 
     /// Loop is used to avoid strict order of dictionary properties
     while (true)
diff --git a/src/Processors/Executors/CompletedPipelineExecutor.cpp b/src/Processors/Executors/CompletedPipelineExecutor.cpp
index 45b02cba298..8ec1916f4ce 100644
--- a/src/Processors/Executors/CompletedPipelineExecutor.cpp
+++ b/src/Processors/Executors/CompletedPipelineExecutor.cpp
@@ -4,7 +4,6 @@
 #include <Poco/Event.h>
 #include <Common/setThreadName.h>
 #include <Common/ThreadPool.h>
-#include <base/scope_guard_safe.h>
 #include <iostream>
 
 namespace DB
@@ -40,11 +39,6 @@ static void threadFunction(CompletedPipelineExecutor::Data & data, ThreadGroupSt
         if (thread_group)
             CurrentThread::attachTo(thread_group);
 
-        SCOPE_EXIT_SAFE(
-            if (thread_group)
-                CurrentThread::detachQueryIfNotDetached();
-        );
-
         data.executor->execute(num_threads);
     }
     catch (...)
diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp
index e722f8718f7..80aacf14fe6 100644
--- a/src/Processors/Executors/PipelineExecutor.cpp
+++ b/src/Processors/Executors/PipelineExecutor.cpp
@@ -301,11 +301,6 @@ void PipelineExecutor::executeImpl(size_t num_threads)
                 if (thread_group)
                     CurrentThread::attachTo(thread_group);
 
-                SCOPE_EXIT_SAFE(
-                    if (thread_group)
-                        CurrentThread::detachQueryIfNotDetached();
-                );
-
                 try
                 {
                     executeSingleThread(thread_num);
diff --git a/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp b/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp
index 0ba07df95a6..198d5ce5d8d 100644
--- a/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp
+++ b/src/Processors/Executors/PullingAsyncPipelineExecutor.cpp
@@ -4,9 +4,7 @@
 #include <Processors/Transforms/AggregatingTransform.h>
 #include <Processors/Sources/NullSource.h>
 #include <QueryPipeline/QueryPipeline.h>
-
 #include <Common/setThreadName.h>
-#include <base/scope_guard_safe.h>
 
 namespace DB
 {
@@ -77,11 +75,6 @@ static void threadFunction(PullingAsyncPipelineExecutor::Data & data, ThreadGrou
         if (thread_group)
             CurrentThread::attachTo(thread_group);
 
-        SCOPE_EXIT_SAFE(
-            if (thread_group)
-                CurrentThread::detachQueryIfNotDetached();
-        );
-
         data.executor->execute(num_threads);
     }
     catch (...)
diff --git a/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp b/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp
index 68898bdc2c2..6c2e62b77dc 100644
--- a/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp
+++ b/src/Processors/Executors/PushingAsyncPipelineExecutor.cpp
@@ -2,11 +2,8 @@
 #include <Processors/Executors/PipelineExecutor.h>
 #include <Processors/ISource.h>
 #include <QueryPipeline/QueryPipeline.h>
-#include <iostream>
-
 #include <Common/ThreadPool.h>
 #include <Common/setThreadName.h>
-#include <base/scope_guard_safe.h>
 #include <Poco/Event.h>
 
 namespace DB
@@ -107,11 +104,6 @@ static void threadFunction(PushingAsyncPipelineExecutor::Data & data, ThreadGrou
         if (thread_group)
             CurrentThread::attachTo(thread_group);
 
-        SCOPE_EXIT_SAFE(
-            if (thread_group)
-                CurrentThread::detachQueryIfNotDetached();
-        );
-
         data.executor->execute(num_threads);
     }
     catch (...)
diff --git a/src/Processors/Formats/Impl/NativeFormat.cpp b/src/Processors/Formats/Impl/NativeFormat.cpp
index 19e2ede6b65..bd95cfd6376 100644
--- a/src/Processors/Formats/Impl/NativeFormat.cpp
+++ b/src/Processors/Formats/Impl/NativeFormat.cpp
@@ -15,21 +15,22 @@ namespace DB
 class NativeInputFormat final : public IInputFormat
 {
 public:
-    NativeInputFormat(ReadBuffer & buf, const Block & header)
-        : IInputFormat(header, buf)
-        , reader(buf, header, 0) {}
+    NativeInputFormat(ReadBuffer & buf, const Block & header_)
+        : IInputFormat(header_, buf)
+        , reader(std::make_unique<NativeReader>(buf, header_, 0))
+        , header(header_) {}
 
     String getName() const override { return "Native"; }
 
     void resetParser() override
     {
         IInputFormat::resetParser();
-        reader.resetParser();
+        reader->resetParser();
     }
 
     Chunk generate() override
     {
-        auto block = reader.read();
+        auto block = reader->read();
         if (!block)
             return {};
 
@@ -40,8 +41,15 @@ public:
         return Chunk(block.getColumns(), num_rows);
     }
 
+    void setReadBuffer(ReadBuffer & in_) override
+    {
+        reader = std::make_unique<NativeReader>(in_, header, 0);
+        IInputFormat::setReadBuffer(in_);
+    }
+
 private:
-    NativeReader reader;
+    std::unique_ptr<NativeReader> reader;
+    Block header;
 };
 
 class NativeOutputFormat final : public IOutputFormat
diff --git a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp
index 213226c9d68..bfdb9de7d26 100644
--- a/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp
+++ b/src/Processors/Formats/Impl/ParallelParsingInputFormat.cpp
@@ -2,17 +2,12 @@
 #include <IO/ReadHelpers.h>
 #include <Common/CurrentThread.h>
 #include <Common/setThreadName.h>
-#include <base/scope_guard_safe.h>
 
 namespace DB
 {
 
 void ParallelParsingInputFormat::segmentatorThreadFunction(ThreadGroupStatusPtr thread_group)
 {
-    SCOPE_EXIT_SAFE(
-        if (thread_group)
-            CurrentThread::detachQueryIfNotDetached();
-    );
     if (thread_group)
         CurrentThread::attachTo(thread_group);
 
@@ -59,12 +54,8 @@ void ParallelParsingInputFormat::segmentatorThreadFunction(ThreadGroupStatusPtr
 
 void ParallelParsingInputFormat::parserThreadFunction(ThreadGroupStatusPtr thread_group, size_t current_ticket_number)
 {
-    SCOPE_EXIT_SAFE(
-        if (thread_group)
-            CurrentThread::detachQueryIfNotDetached();
-    );
     if (thread_group)
-        CurrentThread::attachTo(thread_group);
+        CurrentThread::attachToIfDetached(thread_group);
 
     const auto parser_unit_number = current_ticket_number % processing_units.size();
     auto & unit = processing_units[parser_unit_number];
diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp
index dc2f8b14b6c..9a124ad5143 100644
--- a/src/Storages/MergeTree/MergeTreeData.cpp
+++ b/src/Storages/MergeTree/MergeTreeData.cpp
@@ -67,7 +67,6 @@
 #include <boost/algorithm/string/replace.hpp>
 
 #include <base/insertAtEnd.h>
-#include <base/scope_guard_safe.h>
 
 #include <algorithm>
 #include <iomanip>
@@ -1590,12 +1589,8 @@ void MergeTreeData::clearPartsFromFilesystem(const DataPartsVector & parts_to_re
         {
             pool.scheduleOrThrowOnError([&, thread_group = CurrentThread::getGroup()]
             {
-                SCOPE_EXIT_SAFE(
-                    if (thread_group)
-                        CurrentThread::detachQueryIfNotDetached();
-                );
                 if (thread_group)
-                    CurrentThread::attachTo(thread_group);
+                    CurrentThread::attachToIfDetached(thread_group);
 
                 LOG_DEBUG(log, "Removing part from filesystem {}", part->name);
                 part->remove();
diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
index cdedd37e14a..a277cda9e50 100644
--- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@@ -1,5 +1,4 @@
 #include <boost/rational.hpp>   /// For calculations related to sampling coefficients.
-#include <base/scope_guard_safe.h>
 #include <optional>
 #include <unordered_set>
 
@@ -988,9 +987,8 @@ RangesInDataParts MergeTreeDataSelectExecutor::filterPartsByPrimaryKeyAndSkipInd
             for (size_t part_index = 0; part_index < parts.size(); ++part_index)
                 pool.scheduleOrThrowOnError([&, part_index, thread_group = CurrentThread::getGroup()]
                 {
-                    SCOPE_EXIT_SAFE(if (thread_group) CurrentThread::detachQueryIfNotDetached(););
                     if (thread_group)
-                        CurrentThread::attachTo(thread_group);
+                        CurrentThread::attachToIfDetached(thread_group);
 
                     process_part(part_index);
                 });
diff --git a/tests/performance/order_by_tuple.xml b/tests/performance/order_by_tuple.xml
new file mode 100644
index 00000000000..72fb1812bbc
--- /dev/null
+++ b/tests/performance/order_by_tuple.xml
@@ -0,0 +1,8 @@
+<test>
+    <tags>
+        <tag>sorting</tag>
+        <tag>comparison</tag>
+    </tags>
+
+    <query>select * from numbers(300000000) order by (1 - number , number + 1 , number) limit 10;</query>
+</test>
diff --git a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh
index c5aaa794ac9..f5ab71d8d34 100755
--- a/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh
+++ b/tests/queries/0_stateless/01810_max_part_removal_threads_long.sh
@@ -16,22 +16,54 @@ $CLICKHOUSE_CLIENT -nm -q "create database ordinary_$CLICKHOUSE_DATABASE engine=
 $CLICKHOUSE_CLIENT -nm -q """
     use ordinary_$CLICKHOUSE_DATABASE;
     drop table if exists data_01810;
-    create table data_01810 (key Int) Engine=MergeTree() order by key partition by key settings max_part_removal_threads=10, concurrent_part_removal_threshold=49;
-    insert into data_01810 select * from numbers(50);
+
+    create table data_01810 (key Int)
+    Engine=MergeTree()
+    order by key
+    partition by key%100
+    settings max_part_removal_threads=10, concurrent_part_removal_threshold=99, min_bytes_for_wide_part=0;
+
+    insert into data_01810 select * from numbers(100);
     drop table data_01810 settings log_queries=1;
     system flush logs;
-    select throwIf(length(thread_ids)<50) from system.query_log where event_date >= yesterday() and current_database = currentDatabase() and query = 'drop table data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null;
+
+    -- sometimes the same thread can be used to remove part, due to ThreadPool,
+    -- hence we cannot compare strictly.
+    select throwIf(not(length(thread_ids) between 6 and 11))
+    from system.query_log
+    where
+        event_date >= yesterday() and
+        current_database = currentDatabase() and
+        query = 'drop table data_01810 settings log_queries=1;' and
+        type = 'QueryFinish'
+    format Null;
 """
 
 # ReplicatedMergeTree
 $CLICKHOUSE_CLIENT -nm -q """
     use ordinary_$CLICKHOUSE_DATABASE;
     drop table if exists rep_data_01810;
-    create table rep_data_01810 (key Int) Engine=ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/rep_data_01810', '1') order by key partition by key settings max_part_removal_threads=10, concurrent_part_removal_threshold=49;
-    insert into rep_data_01810 select * from numbers(50);
+
+    create table rep_data_01810 (key Int)
+    Engine=ReplicatedMergeTree('/clickhouse/tables/$CLICKHOUSE_TEST_ZOOKEEPER_PREFIX/rep_data_01810', '1')
+    order by key
+    partition by key%100
+    settings max_part_removal_threads=10, concurrent_part_removal_threshold=99, min_bytes_for_wide_part=0;
+
+    insert into rep_data_01810 select * from numbers(100);
     drop table rep_data_01810 settings log_queries=1;
     system flush logs;
-    select throwIf(length(thread_ids)<50) from system.query_log where event_date >= yesterday() and current_database = currentDatabase() and query = 'drop table rep_data_01810 settings log_queries=1;' and type = 'QueryFinish' format Null;
+
+    -- sometimes the same thread can be used to remove part, due to ThreadPool,
+    -- hence we cannot compare strictly.
+    select throwIf(not(length(thread_ids) between 6 and 11))
+    from system.query_log
+    where
+        event_date >= yesterday() and
+        current_database = currentDatabase() and
+        query = 'drop table rep_data_01810 settings log_queries=1;' and
+        type = 'QueryFinish'
+    format Null;
 """
 
 $CLICKHOUSE_CLIENT -nm -q "drop database ordinary_$CLICKHOUSE_DATABASE"
diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats.python b/tests/queries/0_stateless/02187_async_inserts_all_formats.python
new file mode 100644
index 00000000000..0a909451259
--- /dev/null
+++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.python
@@ -0,0 +1,50 @@
+#!/usr/bin/env python3
+import os
+import sys
+
+CURDIR = os.path.dirname(os.path.realpath(__file__))
+sys.path.insert(0, os.path.join(CURDIR, 'helpers'))
+
+CLICKHOUSE_URL = os.environ.get('CLICKHOUSE_URL')
+CLICKHOUSE_TMP = os.environ.get('CLICKHOUSE_TMP')
+
+from pure_http_client import ClickHouseClient
+
+client = ClickHouseClient()
+
+def run_test(data_format, gen_data_template, settings):
+    print(data_format)
+    client.query("TRUNCATE TABLE t_async_insert")
+
+    expected = client.query(gen_data_template.format("TSV")).strip()
+    data = client.query(gen_data_template.format(data_format), settings=settings,binary_result=True)
+
+    insert_query = "INSERT INTO t_async_insert FORMAT {}".format(data_format)
+    client.query_with_data(insert_query, data, settings=settings)
+
+    result = client.query("SELECT * FROM t_async_insert FORMAT TSV").strip()
+    if result != expected:
+        print("Failed for format {}.\nExpected:\n{}\nGot:\n{}\n".format(data_format, expected, result))
+        exit(1)
+
+formats = client.query("SELECT name FROM system.formats WHERE is_input AND is_output \
+    AND name NOT IN ('CapnProto', 'RawBLOB', 'Template', 'ProtobufSingle', 'LineAsString', 'Protobuf') ORDER BY name").strip().split('\n')
+
+# Generic formats
+client.query("DROP TABLE IF EXISTS t_async_insert")
+client.query("CREATE TABLE t_async_insert (id UInt64, s String, arr Array(UInt64)) ENGINE = Memory")
+gen_data_query = "SELECT number AS id, toString(number) AS s, range(number) AS arr FROM numbers(10) FORMAT {}"
+
+for data_format in formats:
+    run_test(data_format, gen_data_query, settings={"async_insert": 1, "wait_for_async_insert": 1})
+
+# LineAsString
+client.query("DROP TABLE IF EXISTS t_async_insert")
+client.query("CREATE TABLE t_async_insert (s String) ENGINE = Memory")
+gen_data_query = "SELECT toString(number) AS s FROM numbers(10) FORMAT {}"
+
+run_test('LineAsString', gen_data_query, settings={"async_insert": 1, "wait_for_async_insert": 1})
+
+# TODO: add CapnProto and Protobuf
+
+print("OK")
diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats.reference b/tests/queries/0_stateless/02187_async_inserts_all_formats.reference
new file mode 100644
index 00000000000..b4a5b6c3a42
--- /dev/null
+++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.reference
@@ -0,0 +1,40 @@
+Arrow
+ArrowStream
+Avro
+CSV
+CSVWithNames
+CSVWithNamesAndTypes
+CustomSeparated
+CustomSeparatedWithNames
+CustomSeparatedWithNamesAndTypes
+JSONCompactEachRow
+JSONCompactEachRowWithNames
+JSONCompactEachRowWithNamesAndTypes
+JSONCompactStringsEachRow
+JSONCompactStringsEachRowWithNames
+JSONCompactStringsEachRowWithNamesAndTypes
+JSONEachRow
+JSONStringsEachRow
+MsgPack
+Native
+ORC
+Parquet
+RowBinary
+RowBinaryWithNames
+RowBinaryWithNamesAndTypes
+TSKV
+TSV
+TSVRaw
+TSVRawWithNames
+TSVRawWithNamesAndTypes
+TSVWithNames
+TSVWithNamesAndTypes
+TabSeparated
+TabSeparatedRaw
+TabSeparatedRawWithNames
+TabSeparatedRawWithNamesAndTypes
+TabSeparatedWithNames
+TabSeparatedWithNamesAndTypes
+Values
+LineAsString
+OK
diff --git a/tests/queries/0_stateless/02187_async_inserts_all_formats.sh b/tests/queries/0_stateless/02187_async_inserts_all_formats.sh
new file mode 100755
index 00000000000..4b0b8d84c58
--- /dev/null
+++ b/tests/queries/0_stateless/02187_async_inserts_all_formats.sh
@@ -0,0 +1,9 @@
+#!/usr/bin/env bash
+# Tags: no-fasttest, long
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+# shellcheck source=../shell_config.sh
+. "$CURDIR"/../shell_config.sh
+
+# We should have correct env vars from shell_config.sh to run this test
+python3 "$CURDIR"/02187_async_inserts_all_formats.python
diff --git a/tests/queries/0_stateless/02188_parser_dictionary_primary_key.reference b/tests/queries/0_stateless/02188_parser_dictionary_primary_key.reference
new file mode 100644
index 00000000000..0e4e614d264
--- /dev/null
+++ b/tests/queries/0_stateless/02188_parser_dictionary_primary_key.reference
@@ -0,0 +1,8 @@
+Dictionary output
+0	Value
+Dictionary output
+0	Value
+Dictionary output
+0	Value
+Dictionary output
+0	Value
diff --git a/tests/queries/0_stateless/02188_parser_dictionary_primary_key.sql b/tests/queries/0_stateless/02188_parser_dictionary_primary_key.sql
new file mode 100644
index 00000000000..a939c30b57b
--- /dev/null
+++ b/tests/queries/0_stateless/02188_parser_dictionary_primary_key.sql
@@ -0,0 +1,65 @@
+DROP TABLE IF EXISTS 02188_test_dictionary_source;
+CREATE TABLE 02188_test_dictionary_source
+(
+    id UInt64,
+    value String
+)
+ENGINE=TinyLog;
+
+INSERT INTO 02188_test_dictionary_source VALUES (0, 'Value');
+
+DROP DICTIONARY IF EXISTS 02188_test_dictionary_simple_primary_key;
+CREATE DICTIONARY 02188_test_dictionary_simple_primary_key
+(
+    id UInt64,
+    value String
+)
+PRIMARY KEY id
+SOURCE(CLICKHOUSE(TABLE '02188_test_dictionary_source'))
+LAYOUT(DIRECT());
+
+SELECT 'Dictionary output';
+SELECT * FROM 02188_test_dictionary_simple_primary_key;
+DROP DICTIONARY 02188_test_dictionary_simple_primary_key;
+
+CREATE DICTIONARY 02188_test_dictionary_simple_primary_key
+(
+    id UInt64,
+    value String
+)
+PRIMARY KEY (id)
+SOURCE(CLICKHOUSE(TABLE '02188_test_dictionary_source'))
+LAYOUT(DIRECT());
+
+SELECT 'Dictionary output';
+SELECT * FROM 02188_test_dictionary_simple_primary_key;
+DROP DICTIONARY 02188_test_dictionary_simple_primary_key;
+
+DROP DICTIONARY IF EXISTS 02188_test_dictionary_complex_primary_key;
+CREATE DICTIONARY 02188_test_dictionary_complex_primary_key
+(
+    id UInt64,
+    value String
+)
+PRIMARY KEY id, value
+SOURCE(CLICKHOUSE(TABLE '02188_test_dictionary_source'))
+LAYOUT(COMPLEX_KEY_DIRECT());
+
+SELECT 'Dictionary output';
+SELECT * FROM 02188_test_dictionary_complex_primary_key;
+DROP DICTIONARY 02188_test_dictionary_complex_primary_key;
+
+CREATE DICTIONARY 02188_test_dictionary_complex_primary_key
+(
+    id UInt64,
+    value String
+)
+PRIMARY KEY (id, value)
+SOURCE(CLICKHOUSE(TABLE '02188_test_dictionary_source'))
+LAYOUT(COMPLEX_KEY_DIRECT());
+
+SELECT 'Dictionary output';
+SELECT * FROM 02188_test_dictionary_complex_primary_key;
+DROP DICTIONARY 02188_test_dictionary_complex_primary_key;
+
+DROP TABLE 02188_test_dictionary_source;
diff --git a/tests/queries/0_stateless/helpers/pure_http_client.py b/tests/queries/0_stateless/helpers/pure_http_client.py
index 9f79c4ac529..3335f141bb5 100644
--- a/tests/queries/0_stateless/helpers/pure_http_client.py
+++ b/tests/queries/0_stateless/helpers/pure_http_client.py
@@ -14,22 +14,23 @@ class ClickHouseClient:
     def __init__(self, host = CLICKHOUSE_SERVER_URL_STR):
         self.host = host
 
-    def query(self, query, connection_timeout = 1500):
+    def query(self, query, connection_timeout=1500, settings=dict(), binary_result=False):
         NUMBER_OF_TRIES = 30
         DELAY = 10
 
+        params = {
+            'timeout_before_checking_execution_speed': 120,
+            'max_execution_time': 6000,
+            'database': CLICKHOUSE_DATABASE,
+        }
+
+        # Add extra settings to params
+        params = {**params, **settings}
+
         for i in range(NUMBER_OF_TRIES):
-            r = requests.post(
-                self.host,
-                params = {
-                    'timeout_before_checking_execution_speed': 120,
-                    'max_execution_time': 6000,
-                    'database': CLICKHOUSE_DATABASE
-                },
-                timeout = connection_timeout,
-                data = query)
+            r = requests.post(self.host, params=params, timeout=connection_timeout, data=query)
             if r.status_code == 200:
-                return r.text
+                return r.content if binary_result else r.text
             else:
                 print('ATTENTION: try #%d failed' % i)
                 if i != (NUMBER_OF_TRIES-1):
@@ -44,9 +45,22 @@ class ClickHouseClient:
         df = pd.read_csv(io.StringIO(data), sep = '\t')
         return df
 
-    def query_with_data(self, query, content):
-        content = content.encode('utf-8')
-        r = requests.post(self.host, data=content)
+    def query_with_data(self, query, data, connection_timeout=1500, settings=dict()):
+        params = {
+            'query': query,
+            'timeout_before_checking_execution_speed': 120,
+            'max_execution_time': 6000,
+            'database': CLICKHOUSE_DATABASE,
+        }
+
+        headers = {
+            "Content-Type": "application/binary"
+        }
+
+        # Add extra settings to params
+        params = {**params, **settings}
+
+        r = requests.post(self.host, params=params, timeout=connection_timeout, data=data, headers=headers)
         result = r.text
         if r.status_code == 200:
             return result
diff --git a/utils/c++expr b/utils/c++expr
index ec24b97ebc9..c498e780d05 100755
--- a/utils/c++expr
+++ b/utils/c++expr
@@ -3,7 +3,7 @@ set -e
 
 usage() {
     cat <<EOF >&2
-USAGE: c++expr [-c CXX | -C | -I] [-i INCLUDE] [-b STEPS] [-t TESTS] [-o FILE] [-O CXX_OPTS...] [-g 'GLOBAL CODE'] 'MAIN CODE'
+USAGE: c++expr [-c CXX | -C | -I] [-i INCLUDE] [-l LIB] [-b STEPS] [-t TESTS] [-o FILE] [-O CXX_OPTS...] [-g 'GLOBAL CODE'] 'MAIN CODE'
 OPTIONS:
     -c CXX          use specified c++ compiler
     -C              use cmake
@@ -15,6 +15,19 @@ OPTIONS:
     -t TESTS_NUM    make program to benchmark specified code snippet and run TESTS_NUM tests
     -o FILE         do not run, just save binary executable file
     -O CXX_OPTS     forward option compiler (e.g. -O "-O3 -std=c++20")
+EXAMPLES:
+  $ c++expr -g 'int fib(int n) { return n < 2 ? n : fib(n-2) + fib(n-1); }' 'OUT(fib(10)) OUT(fib(20)) OUT(fib(30))'
+    fib(10) -> 55
+    fib(20) -> 6765
+    fib(30) -> 832040
+  $ c++expr -I -i Interpreters/Context.h 'OUT(sizeof(DB::Context))'
+    sizeof(DB::Context) -> 7776
+  $ c++expr -I -i Common/Stopwatch.h -b 10000 'Stopwatch sw;'
+    Steps per test: 10000
+    Test #0: 0.0178 us      5.61798e+07 sps
+    ...
+    Test #4: 0.0179 us      5.58659e+07 sps
+    Average: 0.0179 us      5.58659e+07 sps
 EOF
     exit 1
 }
@@ -37,7 +50,7 @@ CMD_PARAMS=
 # Parse command line
 #
 
-if [ "$1" == "--help" ]; then usage; fi
+if [ "$1" == "--help" ] || [ -z "$1" ]; then usage; fi
 while getopts "vc:CIi:l:b:t:o:O:g:" OPT; do
     case "$OPT" in
     v)      set -x; ;;
diff --git a/utils/clickhouse-diagnostics/clickhouse-diagnostics b/utils/clickhouse-diagnostics/clickhouse-diagnostics
index ffddee0bdc4..83c0af9cd11 100644
--- a/utils/clickhouse-diagnostics/clickhouse-diagnostics
+++ b/utils/clickhouse-diagnostics/clickhouse-diagnostics
@@ -953,7 +953,7 @@ def parse_version(version):
     """
     Parse version string.
     """
-    return [int(x) for x in version.strip().split('.')]
+    return [int(x) for x in version.strip().split('.') if x.isnumeric()]
 
 
 if __name__ == '__main__':