Commit 6f70feed2f: Merge branch 'master' into issue-16775
(mirror of https://github.com/ClickHouse/ClickHouse.git)
@ -36,7 +36,7 @@ option(FAIL_ON_UNSUPPORTED_OPTIONS_COMBINATION
if(FAIL_ON_UNSUPPORTED_OPTIONS_COMBINATION)
    set(RECONFIGURE_MESSAGE_LEVEL FATAL_ERROR)
else()
    set(RECONFIGURE_MESSAGE_LEVEL STATUS)
    set(RECONFIGURE_MESSAGE_LEVEL WARNING)
endif()

enable_language(C CXX ASM)

@ -504,7 +504,6 @@ include (cmake/find/libuv.cmake) # for amqpcpp and cassandra
include (cmake/find/amqpcpp.cmake)
include (cmake/find/capnp.cmake)
include (cmake/find/llvm.cmake)
include (cmake/find/termcap.cmake) # for external static llvm
include (cmake/find/h3.cmake)
include (cmake/find/libxml2.cmake)
include (cmake/find/brotli.cmake)
@ -1,98 +1,31 @@
|
||||
if (APPLE OR SPLIT_SHARED_LIBRARIES OR NOT ARCH_AMD64)
|
||||
if (APPLE OR SPLIT_SHARED_LIBRARIES OR NOT ARCH_AMD64 OR SANITIZE STREQUAL "undefined")
|
||||
set (ENABLE_EMBEDDED_COMPILER OFF CACHE INTERNAL "")
|
||||
endif()
|
||||
|
||||
option (ENABLE_EMBEDDED_COMPILER "Enable support for 'compile_expressions' option for query execution" ON)
|
||||
# Broken in macos. TODO: update clang, re-test, enable on Apple
|
||||
if (ENABLE_EMBEDDED_COMPILER AND NOT SPLIT_SHARED_LIBRARIES AND ARCH_AMD64 AND NOT (SANITIZE STREQUAL "undefined"))
|
||||
option (USE_INTERNAL_LLVM_LIBRARY "Use bundled or system LLVM library." ${NOT_UNBUNDLED})
|
||||
endif()
|
||||
|
||||
if (NOT ENABLE_EMBEDDED_COMPILER)
|
||||
if(USE_INTERNAL_LLVM_LIBRARY)
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Cannot use internal LLVM library with ENABLE_EMBEDDED_COMPILER=OFF")
|
||||
endif()
|
||||
set (USE_EMBEDDED_COMPILER 0)
|
||||
return()
|
||||
endif()
|
||||
|
||||
if (NOT EXISTS "${ClickHouse_SOURCE_DIR}/contrib/llvm/llvm/CMakeLists.txt")
|
||||
if (USE_INTERNAL_LLVM_LIBRARY)
|
||||
message (WARNING "submodule contrib/llvm is missing. to fix try run: \n git submodule update --init --recursive")
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find internal LLVM library")
|
||||
endif()
|
||||
set (MISSING_INTERNAL_LLVM_LIBRARY 1)
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "submodule /contrib/llvm is missing. to fix try run: \n git submodule update --init --recursive")
|
||||
endif ()
|
||||
|
||||
if (NOT USE_INTERNAL_LLVM_LIBRARY)
|
||||
set (LLVM_PATHS "/usr/local/lib/llvm" "/usr/lib/llvm")
|
||||
set (USE_EMBEDDED_COMPILER 1)
|
||||
|
||||
foreach(llvm_v 11.1 11)
|
||||
if (NOT LLVM_FOUND)
|
||||
find_package (LLVM ${llvm_v} CONFIG PATHS ${LLVM_PATHS})
|
||||
endif ()
|
||||
endforeach ()
|
||||
set (LLVM_FOUND 1)
|
||||
set (LLVM_VERSION "12.0.0bundled")
|
||||
set (LLVM_INCLUDE_DIRS
|
||||
"${ClickHouse_SOURCE_DIR}/contrib/llvm/llvm/include"
|
||||
"${ClickHouse_BINARY_DIR}/contrib/llvm/llvm/include"
|
||||
)
|
||||
set (LLVM_LIBRARY_DIRS "${ClickHouse_BINARY_DIR}/contrib/llvm/llvm")
|
||||
|
||||
if (LLVM_FOUND)
|
||||
# Remove dynamically-linked zlib and libedit from LLVM's dependencies:
|
||||
set_target_properties(LLVMSupport PROPERTIES INTERFACE_LINK_LIBRARIES "-lpthread;LLVMDemangle;${ZLIB_LIBRARIES}")
|
||||
set_target_properties(LLVMLineEditor PROPERTIES INTERFACE_LINK_LIBRARIES "LLVMSupport")
|
||||
|
||||
option(LLVM_HAS_RTTI "Enable if LLVM was built with RTTI enabled" ON)
|
||||
set (USE_EMBEDDED_COMPILER 1)
|
||||
else()
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't find system LLVM")
|
||||
set (USE_EMBEDDED_COMPILER 0)
|
||||
endif()
|
||||
|
||||
if (LLVM_FOUND AND OS_LINUX AND USE_LIBCXX AND NOT FORCE_LLVM_WITH_LIBCXX)
|
||||
message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is not set but the LLVM library from OS packages "
|
||||
"in Linux is incompatible with libc++ ABI. LLVM Will be disabled. Force: -DFORCE_LLVM_WITH_LIBCXX=ON")
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Unsupported LLVM configuration, cannot enable LLVM")
|
||||
set (LLVM_FOUND 0)
|
||||
set (USE_EMBEDDED_COMPILER 0)
|
||||
endif ()
|
||||
endif()
|
||||
|
||||
if(NOT LLVM_FOUND AND NOT MISSING_INTERNAL_LLVM_LIBRARY)
|
||||
if (CMAKE_CURRENT_SOURCE_DIR STREQUAL CMAKE_CURRENT_BINARY_DIR)
|
||||
message(WARNING "Option ENABLE_EMBEDDED_COMPILER is set but the internal LLVM library cannot be built if the build directory is the same as the source directory.")
|
||||
set (LLVM_FOUND 0)
|
||||
set (USE_EMBEDDED_COMPILER 0)
|
||||
elseif (SPLIT_SHARED_LIBRARIES)
|
||||
# llvm-tablegen cannot find shared libraries that we build. Probably can be easily fixed.
|
||||
message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is not compatible with SPLIT_SHARED_LIBRARIES. Build of LLVM will be disabled.")
|
||||
set (LLVM_FOUND 0)
|
||||
set (USE_EMBEDDED_COMPILER 0)
|
||||
elseif (NOT ARCH_AMD64)
|
||||
# It's not supported yet, but you can help.
|
||||
message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY is only available for x86_64. Build of LLVM will be disabled.")
|
||||
set (LLVM_FOUND 0)
|
||||
set (USE_EMBEDDED_COMPILER 0)
|
||||
elseif (SANITIZE STREQUAL "undefined")
|
||||
# llvm-tblgen, that is used during LLVM build, doesn't work with UBSan.
|
||||
message(WARNING "Option USE_INTERNAL_LLVM_LIBRARY does not work with UBSan, because 'llvm-tblgen' tool from LLVM has undefined behaviour. Build of LLVM will be disabled.")
|
||||
set (LLVM_FOUND 0)
|
||||
set (USE_EMBEDDED_COMPILER 0)
|
||||
else ()
|
||||
set (USE_INTERNAL_LLVM_LIBRARY ON)
|
||||
set (LLVM_FOUND 1)
|
||||
set (USE_EMBEDDED_COMPILER 1)
|
||||
set (LLVM_VERSION "9.0.0bundled")
|
||||
set (LLVM_INCLUDE_DIRS
|
||||
"${ClickHouse_SOURCE_DIR}/contrib/llvm/llvm/include"
|
||||
"${ClickHouse_BINARY_DIR}/contrib/llvm/llvm/include"
|
||||
)
|
||||
set (LLVM_LIBRARY_DIRS "${ClickHouse_BINARY_DIR}/contrib/llvm/llvm")
|
||||
endif()
|
||||
endif()
|
||||
|
||||
if (LLVM_FOUND)
|
||||
message(STATUS "LLVM include Directory: ${LLVM_INCLUDE_DIRS}")
|
||||
message(STATUS "LLVM library Directory: ${LLVM_LIBRARY_DIRS}")
|
||||
message(STATUS "LLVM C++ compiler flags: ${LLVM_CXXFLAGS}")
|
||||
else()
|
||||
message (${RECONFIGURE_MESSAGE_LEVEL} "Can't enable LLVM")
|
||||
endif()
|
||||
message(STATUS "LLVM include Directory: ${LLVM_INCLUDE_DIRS}")
|
||||
message(STATUS "LLVM library Directory: ${LLVM_LIBRARY_DIRS}")
|
||||
message(STATUS "LLVM C++ compiler flags: ${LLVM_CXXFLAGS}")
|
||||
|
||||
# This list was generated by listing all LLVM libraries, compiling the binary and removing all libraries while it still compiles.
|
||||
set (REQUIRED_LLVM_LIBRARIES
|
||||
|
@ -1,17 +0,0 @@
|
||||
if (ENABLE_EMBEDDED_COMPILER AND NOT USE_INTERNAL_LLVM_LIBRARY AND USE_STATIC_LIBRARIES)
|
||||
find_library (TERMCAP_LIBRARY tinfo)
|
||||
if (NOT TERMCAP_LIBRARY)
|
||||
find_library (TERMCAP_LIBRARY ncurses)
|
||||
endif()
|
||||
if (NOT TERMCAP_LIBRARY)
|
||||
find_library (TERMCAP_LIBRARY termcap)
|
||||
endif()
|
||||
|
||||
if (NOT TERMCAP_LIBRARY)
|
||||
message (FATAL_ERROR "Statically Linking external LLVM requires termcap")
|
||||
endif()
|
||||
|
||||
target_link_libraries(LLVMSupport INTERFACE ${TERMCAP_LIBRARY})
|
||||
|
||||
message (STATUS "Using termcap: ${TERMCAP_LIBRARY}")
|
||||
endif()
|
contrib/CMakeLists.txt (vendored): 5 changed lines
@ -220,11 +220,12 @@ elseif(GTEST_SRC_DIR)
    target_compile_definitions(gtest INTERFACE GTEST_HAS_POSIX_RE=0)
endif()

if (USE_EMBEDDED_COMPILER AND USE_INTERNAL_LLVM_LIBRARY)
if (USE_EMBEDDED_COMPILER)
    # ld: unknown option: --color-diagnostics
    if (APPLE)
        set (LINKER_SUPPORTS_COLOR_DIAGNOSTICS 0 CACHE INTERNAL "")
    endif ()

    set (LLVM_ENABLE_EH 1 CACHE INTERNAL "")
    set (LLVM_ENABLE_RTTI 1 CACHE INTERNAL "")
    set (LLVM_ENABLE_PIC 0 CACHE INTERNAL "")
@ -239,8 +240,6 @@ if (USE_EMBEDDED_COMPILER AND USE_INTERNAL_LLVM_LIBRARY)

    set (CMAKE_CXX_STANDARD ${CMAKE_CXX_STANDARD_bak})
    unset (CMAKE_CXX_STANDARD_bak)

    target_include_directories(LLVMSupport SYSTEM BEFORE PRIVATE ${ZLIB_INCLUDE_DIR})
endif ()

if (USE_INTERNAL_LIBGSASL_LIBRARY)
contrib/llvm (vendored): 2 changed lines
@ -1 +1 @@
Subproject commit cfaf365cf96918999d09d976ec736b4518cf5d02
Subproject commit a7198805de67374eb3fb4c6b89797fa2d1cd7e50
@ -5,9 +5,9 @@ toc_title: Configuration Files

# Configuration Files {#configuration_files}

ClickHouse supports multi-file configuration management. The main server configuration file is `/etc/clickhouse-server/config.xml`. Other files must be in the `/etc/clickhouse-server/config.d` directory.
ClickHouse supports multi-file configuration management. The main server configuration file is `/etc/clickhouse-server/config.xml` or `/etc/clickhouse-server/config.yaml`. Other files must be in the `/etc/clickhouse-server/config.d` directory. Note that any configuration file can be written in either XML or YAML, but mixing the two formats in one file is not supported. For example, you can have the main configs as `config.xml` and `users.xml` and write the additional files in the `config.d` and `users.d` directories in `.yaml`.

All the configuration files should be in XML format. Also, they should have the same root element, usually `<yandex>`.
All the configuration files should be in XML or YAML format. All XML files should have the same root element, usually `<yandex>`. As for YAML, `yandex:` should not be present; the parser inserts it automatically.
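For instance, a minimal override placed in `config.d` could be written in YAML like the sketch below (the file name and values here are only an illustration, not part of the shipped defaults):

``` yaml
# /etc/clickhouse-server/config.d/listen.yaml (hypothetical example)
# No `yandex:` root key is needed; the parser adds it automatically.
listen_host: '::'
max_connections: 2048
```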
## Override {#override}

@ -32,7 +32,7 @@ Users configuration can be split into separate files similar to `config.xml`
Directory name is defined as the `users_config` setting without the `.xml` postfix, concatenated with `.d`.
Directory `users.d` is used by default, as `users_config` defaults to `users.xml`.

## Example {#example}
## XML example {#example}

For example, you can have a separate config file for each user like this:

@ -55,6 +55,70 @@ $ cat /etc/clickhouse-server/users.d/alice.xml
</yandex>
```
## YAML examples {#yaml-examples}

Here you can see the default config written in YAML: [config-example.yaml](https://github.com/ClickHouse/ClickHouse/blob/master/programs/server/config-example.yaml).

There are some differences between the YAML and XML formats for ClickHouse configuration. Here are some tips for writing a configuration in YAML format.

You should use a Scalar node to write a key-value pair:
``` yaml
key: value
```

To create a node containing other nodes, you should use a Map:
``` yaml
map_key:
  key1: val1
  key2: val2
  key3: val3
```

To create a list of values or nodes assigned to one tag, you should use a Sequence:
``` yaml
seq_key:
  - val1
  - val2
  - key1: val3
  - map:
      key2: val4
      key3: val5
```

If you want to write an attribute for a Sequence or Map node, you should use an @ prefix before the attribute key. Note that @ is reserved by the YAML standard, so you should also wrap it in double quotes:

``` yaml
map:
  "@attr1": value1
  "@attr2": value2
  key: 123
```

From that Map we will get these XML nodes:

``` xml
<map attr1="value1" attr2="value2">
    <key>123</key>
</map>
```

You can also set attributes for a Sequence:

``` yaml
seq:
  - "@attr1": value1
  - "@attr2": value2
  - 123
  - abc
```

So, we can get a YAML config equal to this XML one:

``` xml
<seq attr1="value1" attr2="value2">123</seq>
<seq attr1="value1" attr2="value2">abc</seq>
```

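Putting these rules together, a complete per-user override file written in YAML might look like the following sketch (the user name and settings are illustrative only, mirroring the XML example above):

``` yaml
# /etc/clickhouse-server/users.d/alice.yaml (hypothetical counterpart of alice.xml)
users:
  alice:
    profile: default
    networks:
      ip: '::/0'
    quota: default
```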
## Implementation Details {#implementation-details}

For each config file, the server also generates `file-preprocessed.xml` files when starting. These files contain all the completed substitutions and overrides, and they are intended for informational use. If ZooKeeper substitutions were used in the config files but ZooKeeper is not available at server start, the server loads the configuration from the preprocessed file.
@ -16,6 +16,9 @@ A query may simultaneously specify `PREWHERE` and `WHERE`. In this case, `PREWHE

If the `optimize_move_to_prewhere` setting is set to 0, heuristics to automatically move parts of expressions from `WHERE` to `PREWHERE` are disabled.

!!! note "Attention"
    The `PREWHERE` section is executed before `FINAL`, so the results of `FROM FINAL` queries may be skewed when using `PREWHERE` with fields not in the `ORDER BY` section of a table.

## Limitations {#limitations}

`PREWHERE` is only supported by tables from the `*MergeTree` family.
@ -12,6 +12,5 @@ ClickHouse supports special functions for

ClickHouse supports:

- [Built-in dictionaries](internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/dictionaries/external-dictionaries/index.md).
- [Plug-in (external) dictionaries](external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/dictionaries/external-dictionaries/index.md).

- [Built-in dictionaries](internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ext-dict-functions.md).
- [Plug-in (external) dictionaries](external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md).
@ -16,6 +16,9 @@ Prewhere is an optimization for more efficient

If the `optimize_move_to_prewhere` setting is set to 0, the heuristic that automatically moves parts of expressions from `WHERE` to `PREWHERE` is disabled.

!!! note "Attention"
    The `PREWHERE` section is executed before `FINAL`, so the results of `FROM FINAL` queries may be skewed when using `PREWHERE` with fields that are not in the table's `ORDER BY`.

## Limitations {#limitations}

`PREWHERE` is only supported by table engines from the `*MergeTree` family.
@ -8,15 +8,15 @@ toc_title: "\u5BFC\u8A00"

# Dictionaries {#dictionaries}

A dictionary is a mapping (`key -> attributes`) that is convenient for all sorts of reference lists.
A dictionary is a mapping (`键 -> 属性`), convenient for all sorts of reference lists.

ClickHouse supports special functions for using dictionaries that can be used in queries. It is easier and more efficient to use dictionaries with functions than a `JOIN` with reference tables.
ClickHouse supports some special functions for using dictionaries in queries. Combining dictionaries with functions is simpler and more efficient than combining a `JOIN` operation with reference tables.

[NULL](../../sql-reference/syntax.md#null-literal) values cannot be stored in a dictionary.

ClickHouse supports:

- [Built-in dictionaries](internal-dicts.md#internal_dicts) with a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md).
- [Plug-in (external) dictionaries](external-dictionaries/external-dicts.md#dicts-external-dicts) with a [set of functions](../../sql-reference/functions/ext-dict-functions.md).
- [Built-in dictionaries](internal-dicts.md#internal_dicts), which have a specific [set of functions](../../sql-reference/functions/ym-dict-functions.md).
- [Plug-in (external) dictionaries](external-dictionaries/external-dicts.md#dicts-external-dicts), which have a [set of functions](../../sql-reference/functions/ext-dict-functions.md).

[Original article](https://clickhouse.tech/docs/en/query_language/dicts/) <!--hide-->
@ -49,6 +49,12 @@ option (ENABLE_CLICKHOUSE_GIT_IMPORT "A tool to analyze Git repositories"
|
||||
|
||||
|
||||
option (ENABLE_CLICKHOUSE_KEEPER "ClickHouse alternative to ZooKeeper" ${ENABLE_CLICKHOUSE_ALL})
|
||||
if (NOT USE_NURAFT)
|
||||
# RECONFIGURE_MESSAGE_LEVEL should not be used here,
|
||||
# since USE_NURAFT is set to OFF for FreeBSD and Darwin.
|
||||
message (STATUS "clickhouse-keeper will not be built (lack of NuRaft)")
|
||||
set(ENABLE_CLICKHOUSE_KEEPER OFF)
|
||||
endif()
|
||||
|
||||
if (CLICKHOUSE_SPLIT_BINARY)
|
||||
option(ENABLE_CLICKHOUSE_INSTALL "Install ClickHouse without .deb/.rpm/.tgz packages (having the binary only)" OFF)
|
||||
@ -259,7 +265,10 @@ add_subdirectory (obfuscator)
|
||||
add_subdirectory (install)
|
||||
add_subdirectory (git-import)
|
||||
add_subdirectory (bash-completion)
|
||||
add_subdirectory (keeper)
|
||||
|
||||
if (ENABLE_CLICKHOUSE_KEEPER)
|
||||
add_subdirectory (keeper)
|
||||
endif()
|
||||
|
||||
if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
|
||||
add_subdirectory (odbc-bridge)
|
||||
@ -278,7 +287,18 @@ if (CLICKHOUSE_ONE_SHARED)
|
||||
endif()
|
||||
|
||||
if (CLICKHOUSE_SPLIT_BINARY)
|
||||
set (CLICKHOUSE_ALL_TARGETS clickhouse-server clickhouse-client clickhouse-local clickhouse-benchmark clickhouse-extract-from-config clickhouse-compressor clickhouse-format clickhouse-obfuscator clickhouse-git-import clickhouse-copier clickhouse-keeper)
|
||||
set (CLICKHOUSE_ALL_TARGETS
|
||||
clickhouse-server
|
||||
clickhouse-client
|
||||
clickhouse-local
|
||||
clickhouse-benchmark
|
||||
clickhouse-extract-from-config
|
||||
clickhouse-compressor
|
||||
clickhouse-format
|
||||
clickhouse-obfuscator
|
||||
clickhouse-git-import
|
||||
clickhouse-copier
|
||||
)
|
||||
|
||||
if (ENABLE_CLICKHOUSE_ODBC_BRIDGE)
|
||||
list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-odbc-bridge)
|
||||
@ -288,6 +308,10 @@ if (CLICKHOUSE_SPLIT_BINARY)
|
||||
list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-library-bridge)
|
||||
endif ()
|
||||
|
||||
if (ENABLE_CLICKHOUSE_KEEPER)
|
||||
list (APPEND CLICKHOUSE_ALL_TARGETS clickhouse-keeper)
|
||||
endif ()
|
||||
|
||||
set_target_properties(${CLICKHOUSE_ALL_TARGETS} PROPERTIES RUNTIME_OUTPUT_DIRECTORY ..)
|
||||
|
||||
add_custom_target (clickhouse-bundle ALL DEPENDS ${CLICKHOUSE_ALL_TARGETS})
|
||||
|
@ -30,9 +30,7 @@
|
||||
# include <Poco/Net/SecureServerSocket.h>
|
||||
#endif
|
||||
|
||||
#if USE_NURAFT
|
||||
# include <Server/KeeperTCPHandlerFactory.h>
|
||||
#endif
|
||||
#include <Server/KeeperTCPHandlerFactory.h>
|
||||
|
||||
#if defined(OS_LINUX)
|
||||
# include <unistd.h>
|
||||
@ -357,7 +355,6 @@ int Keeper::main(const std::vector<std::string> & /*args*/)
|
||||
|
||||
auto servers = std::make_shared<std::vector<ProtocolServerAdapter>>();
|
||||
|
||||
#if USE_NURAFT
|
||||
/// Initialize test keeper RAFT. Do nothing if no nu_keeper_server in config.
|
||||
global_context->initializeKeeperStorageDispatcher();
|
||||
for (const auto & listen_host : listen_hosts)
|
||||
@ -398,9 +395,6 @@ int Keeper::main(const std::vector<std::string> & /*args*/)
|
||||
#endif
|
||||
});
|
||||
}
|
||||
#else
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED, "ClickHouse keeper built without NuRaft library. Cannot use coordination.");
|
||||
#endif
|
||||
|
||||
for (auto & server : *servers)
|
||||
server.start();
|
||||
|
programs/server/config-example.yaml (new file): 945 lines
@ -0,0 +1,945 @@
|
||||
# NOTE: User and query level settings are set up in "users.xml" file.
|
||||
# If you have accidentally specified user-level settings here, server won't start.
|
||||
# You can either move the settings to the right place inside "users.xml" file
|
||||
# or add skip_check_for_incorrect_settings: 1 here.
|
||||
logger:
|
||||
# Possible levels [1]:
|
||||
# - none (turns off logging)
|
||||
# - fatal
|
||||
# - critical
|
||||
# - error
|
||||
# - warning
|
||||
# - notice
|
||||
# - information
|
||||
# - debug
|
||||
# - trace
|
||||
# [1]: https://github.com/pocoproject/poco/blob/poco-1.9.4-release/Foundation/include/Poco/Logger.h#L105-L114
|
||||
level: trace
|
||||
log: /var/log/clickhouse-server/clickhouse-server.log
|
||||
errorlog: /var/log/clickhouse-server/clickhouse-server.err.log
|
||||
# Rotation policy
|
||||
# See https://github.com/pocoproject/poco/blob/poco-1.9.4-release/Foundation/include/Poco/FileChannel.h#L54-L85
|
||||
size: 1000M
|
||||
count: 10
|
||||
# console: 1
|
||||
# Default behavior is autodetection (log to console if not daemon mode and is tty)
|
||||
|
||||
# Per level overrides (legacy):
|
||||
# For example to suppress logging of the ConfigReloader you can use:
|
||||
# NOTE: levels.logger is reserved, see below.
|
||||
# levels:
|
||||
# ConfigReloader: none
|
||||
|
||||
# Per level overrides:
|
||||
# For example to suppress logging of the RBAC for default user you can use:
|
||||
# (But please note that the logger name maybe changed from version to version, even after minor upgrade)
|
||||
# levels:
|
||||
# - logger:
|
||||
# name: 'ContextAccess (default)'
|
||||
# level: none
|
||||
# - logger:
|
||||
# name: 'DatabaseOrdinary (test)'
|
||||
# level: none
|
||||
|
||||
# It is the name that will be shown in the clickhouse-client.
|
||||
# By default, anything with "production" will be highlighted in red in query prompt.
|
||||
# display_name: production
|
||||
|
||||
# Port for HTTP API. See also 'https_port' for secure connections.
|
||||
# This interface is also used by ODBC and JDBC drivers (DataGrip, Dbeaver, ...)
|
||||
# and by most of web interfaces (embedded UI, Grafana, Redash, ...).
|
||||
http_port: 8123
|
||||
|
||||
# Port for interaction by native protocol with:
|
||||
# - clickhouse-client and other native ClickHouse tools (clickhouse-benchmark, clickhouse-copier);
|
||||
# - clickhouse-server with other clickhouse-servers for distributed query processing;
|
||||
# - ClickHouse drivers and applications supporting native protocol
|
||||
# (this protocol is also informally called as "the TCP protocol");
|
||||
# See also 'tcp_port_secure' for secure connections.
|
||||
tcp_port: 9000
|
||||
|
||||
# Compatibility with MySQL protocol.
|
||||
# ClickHouse will pretend to be MySQL for applications connecting to this port.
|
||||
mysql_port: 9004
|
||||
|
||||
# Compatibility with PostgreSQL protocol.
|
||||
# ClickHouse will pretend to be PostgreSQL for applications connecting to this port.
|
||||
postgresql_port: 9005
|
||||
|
||||
# HTTP API with TLS (HTTPS).
|
||||
# You have to configure certificate to enable this interface.
|
||||
# See the openSSL section below.
|
||||
# https_port: 8443
|
||||
|
||||
# Native interface with TLS.
|
||||
# You have to configure certificate to enable this interface.
|
||||
# See the openSSL section below.
|
||||
# tcp_port_secure: 9440
|
||||
|
||||
# Native interface wrapped with PROXYv1 protocol
|
||||
# PROXYv1 header sent for every connection.
|
||||
# ClickHouse will extract information about proxy-forwarded client address from the header.
|
||||
# tcp_with_proxy_port: 9011
|
||||
|
||||
# Port for communication between replicas. Used for data exchange.
|
||||
# It provides low-level data access between servers.
|
||||
# This port should not be accessible from untrusted networks.
|
||||
# See also 'interserver_http_credentials'.
|
||||
# Data transferred over connections to this port should not go through untrusted networks.
|
||||
# See also 'interserver_https_port'.
|
||||
interserver_http_port: 9009
|
||||
|
||||
# Port for communication between replicas with TLS.
|
||||
# You have to configure certificate to enable this interface.
|
||||
# See the openSSL section below.
|
||||
# See also 'interserver_http_credentials'.
|
||||
# interserver_https_port: 9010
|
||||
|
||||
# Hostname that is used by other replicas to request this server.
|
||||
# If not specified, then it is determined in the same way as the 'hostname -f' command.
|
||||
# This setting could be used to switch replication to another network interface
|
||||
# (the server may be connected to multiple networks via multiple addresses)
|
||||
# interserver_http_host: example.yandex.ru
|
||||
|
||||
# You can specify credentials for authentication between replicas.
|
||||
# This is required when interserver_https_port is accessible from untrusted networks,
|
||||
# and also recommended to avoid SSRF attacks from possibly compromised services in your network.
|
||||
# interserver_http_credentials:
|
||||
# user: interserver
|
||||
# password: ''
|
||||
|
||||
# Listen specified address.
|
||||
# Use :: (wildcard IPv6 address), if you want to accept connections both with IPv4 and IPv6 from everywhere.
|
||||
# Notes:
|
||||
# If you open connections from wildcard address, make sure that at least one of the following measures applied:
|
||||
# - server is protected by firewall and not accessible from untrusted networks;
|
||||
# - all users are restricted to subset of network addresses (see users.xml);
|
||||
# - all users have strong passwords, only secure (TLS) interfaces are accessible, or connections are only made via TLS interfaces.
|
||||
# - users without password have readonly access.
|
||||
# See also: https://www.shodan.io/search?query=clickhouse
|
||||
# listen_host: '::'
|
||||
|
||||
# Same for hosts without support for IPv6:
|
||||
# listen_host: 0.0.0.0
|
||||
|
||||
# Default values - try listen localhost on IPv4 and IPv6.
|
||||
# listen_host: '::1'
|
||||
# listen_host: 127.0.0.1
|
||||
|
||||
# Don't exit if IPv6 or IPv4 networks are unavailable while trying to listen.
|
||||
# listen_try: 0
|
||||
|
||||
# Allow multiple servers to listen on the same address:port. This is not recommended.
|
||||
# listen_reuse_port: 0
|
||||
|
||||
# listen_backlog: 64
|
||||
max_connections: 4096
|
||||
|
||||
# For 'Connection: keep-alive' in HTTP 1.1
|
||||
keep_alive_timeout: 3
|
||||
|
||||
# gRPC protocol (see src/Server/grpc_protos/clickhouse_grpc.proto for the API)
|
||||
# grpc_port: 9100
|
||||
grpc:
|
||||
enable_ssl: false
|
||||
|
||||
# The following two files are used only if enable_ssl=1
|
||||
ssl_cert_file: /path/to/ssl_cert_file
|
||||
ssl_key_file: /path/to/ssl_key_file
|
||||
|
||||
# Whether server will request client for a certificate
|
||||
ssl_require_client_auth: false
|
||||
|
||||
# The following file is used only if ssl_require_client_auth=1
|
||||
ssl_ca_cert_file: /path/to/ssl_ca_cert_file
|
||||
|
||||
# Default compression algorithm (applied if client doesn't specify another algorithm).
|
||||
# Supported algorithms: none, deflate, gzip, stream_gzip
|
||||
compression: deflate
|
||||
|
||||
# Default compression level (applied if client doesn't specify another level).
|
||||
# Supported levels: none, low, medium, high
|
||||
compression_level: medium
|
||||
|
||||
# Send/receive message size limits in bytes. -1 means unlimited
|
||||
max_send_message_size: -1
|
||||
max_receive_message_size: -1
|
||||
|
||||
# Enable if you want very detailed logs
|
||||
verbose_logs: false
|
||||
|
||||
# Used with https_port and tcp_port_secure. Full ssl options list: https://github.com/ClickHouse-Extras/poco/blob/master/NetSSL_OpenSSL/include/Poco/Net/SSLManager.h#L71
|
||||
openSSL:
|
||||
server:
|
||||
# Used for https server AND secure tcp port
|
||||
# openssl req -subj "/CN=localhost" -new -newkey rsa:2048 -days 365 -nodes -x509 -keyout /etc/clickhouse-server/server.key -out /etc/clickhouse-server/server.crt
|
||||
certificateFile: /etc/clickhouse-server/server.crt
|
||||
privateKeyFile: /etc/clickhouse-server/server.key
|
||||
|
||||
# dhparams are optional. You can delete the dhParamsFile: element.
|
||||
# To generate dhparams, use the following command:
|
||||
# openssl dhparam -out /etc/clickhouse-server/dhparam.pem 4096
|
||||
# Only file format with BEGIN DH PARAMETERS is supported.
|
||||
dhParamsFile: /etc/clickhouse-server/dhparam.pem
|
||||
verificationMode: none
|
||||
loadDefaultCAFile: true
|
||||
cacheSessions: true
|
||||
disableProtocols: 'sslv2,sslv3'
|
||||
preferServerCiphers: true
|
||||
client:
|
||||
# Used for connecting to https dictionary source and secured Zookeeper communication
|
||||
loadDefaultCAFile: true
|
||||
cacheSessions: true
|
||||
disableProtocols: 'sslv2,sslv3'
|
||||
preferServerCiphers: true
|
||||
|
||||
# Use for self-signed: verificationMode: none
|
||||
invalidCertificateHandler:
|
||||
# Use for self-signed: name: AcceptCertificateHandler
|
||||
name: RejectCertificateHandler
|
||||
|
||||
# Default root page on http[s] server. For example load UI from https://tabix.io/ when opening http://localhost:8123
|
||||
# http_server_default_response: |-
|
||||
# <html ng-app="SMI2"><head><base href="http://ui.tabix.io/"></head><body><div ui-view="" class="content-ui"></div><script src="http://loader.tabix.io/master.js"></script></body></html>
|
||||
|
||||
# Maximum number of concurrent queries.
|
||||
max_concurrent_queries: 100
|
||||
|
||||
# Maximum memory usage (resident set size) for server process.
|
||||
# Zero value or unset means default. Default is "max_server_memory_usage_to_ram_ratio" of available physical RAM.
|
||||
# If the value is larger than "max_server_memory_usage_to_ram_ratio" of available physical RAM, it will be cut down.
|
||||
|
||||
# The constraint is checked on query execution time.
|
||||
# If a query tries to allocate memory and the current memory usage plus allocation is greater
|
||||
# than specified threshold, exception will be thrown.
|
||||
|
||||
# It is not practical to set this constraint to small values like just a few gigabytes,
|
||||
# because memory allocator will keep this amount of memory in caches and the server will deny service of queries.
|
||||
max_server_memory_usage: 0
|
||||
|
||||
# Maximum number of threads in the Global thread pool.
|
||||
# This will default to a maximum of 10000 threads if not specified.
|
||||
# This setting will be useful in scenarios where there are a large number
|
||||
# of distributed queries that are running concurrently but are idling most
|
||||
# of the time, in which case a higher number of threads might be required.
|
||||
max_thread_pool_size: 10000
|
||||
|
||||
# On memory-constrained environments you may have to set this to a value larger than 1.
|
||||
max_server_memory_usage_to_ram_ratio: 0.9
|
||||
|
||||
# Simple server-wide memory profiler. Collect a stack trace at every peak allocation step (in bytes).
|
||||
# Data will be stored in system.trace_log table with query_id = empty string.
|
||||
# Zero means disabled.
|
||||
total_memory_profiler_step: 4194304
|
||||
|
||||
# Collect random allocations and deallocations and write them into system.trace_log with 'MemorySample' trace_type.
|
||||
# The probability is for every alloc/free regardless to the size of the allocation.
|
||||
# Note that sampling happens only when the amount of untracked memory exceeds the untracked memory limit,
|
||||
# which is 4 MiB by default but can be lowered if 'total_memory_profiler_step' is lowered.
|
||||
# You may want to set 'total_memory_profiler_step' to 1 for extra fine grained sampling.
|
||||
total_memory_tracker_sample_probability: 0
|
||||
|
||||
# Set limit on number of open files (default: maximum). This setting makes sense on Mac OS X because getrlimit() fails to retrieve
|
||||
# correct maximum value.
|
||||
# max_open_files: 262144
|
||||
|
||||
# Size of cache of uncompressed blocks of data, used in tables of MergeTree family.
|
||||
# In bytes. Cache is single for server. Memory is allocated only on demand.
|
||||
# Cache is used when 'use_uncompressed_cache' user setting turned on (off by default).
|
||||
# Uncompressed cache is advantageous only for very short queries and in rare cases.
|
||||
|
||||
# Note: uncompressed cache can be pointless for lz4, because memory bandwidth
|
||||
# is slower than multi-core decompression on some server configurations.
|
||||
# Enabling it can sometimes paradoxically make queries slower.
|
||||
uncompressed_cache_size: 8589934592
|
||||
|
||||
# Approximate size of mark cache, used in tables of MergeTree family.
|
||||
# In bytes. Cache is single for server. Memory is allocated only on demand.
|
||||
# You should not lower this value.
|
||||
mark_cache_size: 5368709120
|
||||
|
||||
# If you enable the `min_bytes_to_use_mmap_io` setting,
|
||||
# the data in MergeTree tables can be read with mmap to avoid copying from kernel to userspace.
|
||||
# It makes sense only for large files and helps only if data reside in page cache.
|
||||
# To avoid frequent open/mmap/munmap/close calls (which are very expensive due to consequent page faults)
|
||||
# and to reuse mappings from several threads and queries,
|
||||
# the cache of mapped files is maintained. Its size is the number of mapped regions (usually equal to the number of mapped files).
|
||||
# The amount of data in mapped files can be monitored
|
||||
# in system.metrics, system.metric_log by the MMappedFiles, MMappedFileBytes metrics
|
||||
# and in system.asynchronous_metrics, system.asynchronous_metrics_log by the MMapCacheCells metric,
|
||||
# and also in system.events, system.processes, system.query_log, system.query_thread_log by the
|
||||
# CreatedReadBufferMMap, CreatedReadBufferMMapFailed, MMappedFileCacheHits, MMappedFileCacheMisses events.
|
||||
# Note that the amount of data in mapped files does not consume memory directly and is not accounted
|
||||
# in query or server memory usage - because this memory can be discarded similar to OS page cache.
|
||||
# The cache is dropped (the files are closed) automatically on removal of old parts in MergeTree,
|
||||
# also it can be dropped manually by the SYSTEM DROP MMAP CACHE query.
|
||||
mmap_cache_size: 1000
|
||||
|
||||
# Cache size for compiled expressions.
|
||||
compiled_expression_cache_size: 1073741824
|
||||
|
||||
# Path to data directory, with trailing slash.
|
||||
path: /var/lib/clickhouse/
|
||||
|
||||
# Path to temporary data for processing hard queries.
|
||||
tmp_path: /var/lib/clickhouse/tmp/
|
||||
|
||||
# Policy from the <storage_configuration> for the temporary files.
|
||||
# If not set <tmp_path> is used, otherwise <tmp_path> is ignored.
|
||||
|
||||
# Notes:
|
||||
# - move_factor is ignored
|
||||
# - keep_free_space_bytes is ignored
|
||||
# - max_data_part_size_bytes is ignored
|
||||
# - you must have exactly one volume in that policy
|
||||
# tmp_policy: tmp
|
||||
|
||||
# Directory with user provided files that are accessible by 'file' table function.
|
||||
user_files_path: /var/lib/clickhouse/user_files/
|
||||
|
||||
# LDAP server definitions.
|
||||
ldap_servers: ''
|
||||
|
||||
# List LDAP servers with their connection parameters here to later 1) use them as authenticators for dedicated local users,
|
||||
# who have 'ldap' authentication mechanism specified instead of 'password', or to 2) use them as remote user directories.
|
||||
# Parameters:
|
||||
# host - LDAP server hostname or IP, this parameter is mandatory and cannot be empty.
|
||||
# port - LDAP server port, default is 636 if enable_tls is set to true, 389 otherwise.
|
||||
# bind_dn - template used to construct the DN to bind to.
|
||||
# The resulting DN will be constructed by replacing all '{user_name}' substrings of the template with the actual
|
||||
# user name during each authentication attempt.
|
||||
# user_dn_detection - section with LDAP search parameters for detecting the actual user DN of the bound user.
|
||||
# This is mainly used in search filters for further role mapping when the server is Active Directory. The
|
||||
# resulting user DN will be used when replacing '{user_dn}' substrings wherever they are allowed. By default,
|
||||
# user DN is set equal to bind DN, but once search is performed, it will be updated with to the actual detected
|
||||
# user DN value.
|
||||
# base_dn - template used to construct the base DN for the LDAP search.
|
||||
# The resulting DN will be constructed by replacing all '{user_name}' and '{bind_dn}' substrings
|
||||
# of the template with the actual user name and bind DN during the LDAP search.
|
||||
# scope - scope of the LDAP search.
|
||||
# Accepted values are: 'base', 'one_level', 'children', 'subtree' (the default).
|
||||
# search_filter - template used to construct the search filter for the LDAP search.
|
||||
# The resulting filter will be constructed by replacing all '{user_name}', '{bind_dn}', and '{base_dn}'
|
||||
# substrings of the template with the actual user name, bind DN, and base DN during the LDAP search.
|
||||
# Note, that the special characters must be escaped properly in XML.
|
||||
# verification_cooldown - a period of time, in seconds, after a successful bind attempt, during which a user will be assumed
|
||||
# to be successfully authenticated for all consecutive requests without contacting the LDAP server.
|
||||
# Specify 0 (the default) to disable caching and force contacting the LDAP server for each authentication request.
|
||||
# enable_tls - flag to trigger use of secure connection to the LDAP server.
|
||||
# Specify 'no' for plain text (ldap://) protocol (not recommended).
|
||||
# Specify 'yes' for LDAP over SSL/TLS (ldaps://) protocol (recommended, the default).
|
||||
# Specify 'starttls' for legacy StartTLS protocol (plain text (ldap://) protocol, upgraded to TLS).
|
||||
# tls_minimum_protocol_version - the minimum protocol version of SSL/TLS.
|
||||
# Accepted values are: 'ssl2', 'ssl3', 'tls1.0', 'tls1.1', 'tls1.2' (the default).
|
||||
# tls_require_cert - SSL/TLS peer certificate verification behavior.
|
||||
# Accepted values are: 'never', 'allow', 'try', 'demand' (the default).
|
||||
# tls_cert_file - path to certificate file.
|
||||
# tls_key_file - path to certificate key file.
|
||||
# tls_ca_cert_file - path to CA certificate file.
|
||||
# tls_ca_cert_dir - path to the directory containing CA certificates.
|
||||
# tls_cipher_suite - allowed cipher suite (in OpenSSL notation).
|
||||
# Example:
|
||||
# my_ldap_server:
|
||||
# host: localhost
|
||||
# port: 636
|
||||
# bind_dn: 'uid={user_name},ou=users,dc=example,dc=com'
|
||||
# verification_cooldown: 300
|
||||
# enable_tls: yes
|
||||
# tls_minimum_protocol_version: tls1.2
|
||||
# tls_require_cert: demand
|
||||
# tls_cert_file: /path/to/tls_cert_file
|
||||
# tls_key_file: /path/to/tls_key_file
|
||||
# tls_ca_cert_file: /path/to/tls_ca_cert_file
|
||||
# tls_ca_cert_dir: /path/to/tls_ca_cert_dir
|
||||
# tls_cipher_suite: ECDHE-ECDSA-AES256-GCM-SHA384:ECDHE-RSA-AES256-GCM-SHA384:AES256-GCM-SHA384
|
||||
|
||||
# Example (typical Active Directory with configured user DN detection for further role mapping):
|
||||
# my_ad_server:
|
||||
# host: localhost
|
||||
# port: 389
|
||||
# bind_dn: 'EXAMPLE\{user_name}'
|
||||
# user_dn_detection:
|
||||
# base_dn: CN=Users,DC=example,DC=com
|
||||
# search_filter: '(&(objectClass=user)(sAMAccountName={user_name}))'
|
||||
# enable_tls: no
|
||||
|
||||
# To enable Kerberos authentication support for HTTP requests (GSS-SPNEGO), for those users who are explicitly configured
|
||||
# to authenticate via Kerberos, define a single 'kerberos' section here.
|
||||
# Parameters:
|
||||
# principal - canonical service principal name, that will be acquired and used when accepting security contexts.
|
||||
# This parameter is optional, if omitted, the default principal will be used.
|
||||
# This parameter cannot be specified together with 'realm' parameter.
|
||||
# realm - a realm, that will be used to restrict authentication to only those requests whose initiator's realm matches it.
|
||||
# This parameter is optional, if omitted, no additional filtering by realm will be applied.
|
||||
# This parameter cannot be specified together with 'principal' parameter.
|
||||
# Example:
|
||||
# kerberos: ''
|
||||
|
||||
# Example:
|
||||
# kerberos:
|
||||
# principal: HTTP/clickhouse.example.com@EXAMPLE.COM
|
||||
|
||||
# Example:
|
||||
# kerberos:
|
||||
# realm: EXAMPLE.COM
|
||||
|
||||
# Sources to read users, roles, access rights, profiles of settings, quotas.
|
||||
user_directories:
|
||||
users_xml:
|
||||
# Path to configuration file with predefined users.
|
||||
path: users.yaml
|
||||
local_directory:
|
||||
# Path to folder where users created by SQL commands are stored.
|
||||
path: /var/lib/clickhouse/access/
|
||||
# To add an LDAP server as a remote user directory of users that are not defined locally, define a single 'ldap' section
|
||||
# with the following parameters:
|
||||
# server - one of LDAP server names defined in 'ldap_servers' config section above.
|
||||
# This parameter is mandatory and cannot be empty.
|
||||
# roles - section with a list of locally defined roles that will be assigned to each user retrieved from the LDAP server.
|
||||
# If no roles are specified here or assigned during role mapping (below), user will not be able to perform any
|
||||
# actions after authentication.
|
||||
# role_mapping - section with LDAP search parameters and mapping rules.
|
||||
# When a user authenticates, while still bound to LDAP, an LDAP search is performed using search_filter and the
|
||||
# name of the logged in user. For each entry found during that search, the value of the specified attribute is
|
||||
# extracted. For each attribute value that has the specified prefix, the prefix is removed, and the rest of the
|
||||
# value becomes the name of a local role defined in ClickHouse, which is expected to be created beforehand by
|
||||
# CREATE ROLE command.
|
||||
# There can be multiple 'role_mapping' sections defined inside the same 'ldap' section. All of them will be
|
||||
# applied.
|
||||
# base_dn - template used to construct the base DN for the LDAP search.
|
||||
# The resulting DN will be constructed by replacing all '{user_name}', '{bind_dn}', and '{user_dn}'
|
||||
# substrings of the template with the actual user name, bind DN, and user DN during each LDAP search.
|
||||
# scope - scope of the LDAP search.
|
||||
# Accepted values are: 'base', 'one_level', 'children', 'subtree' (the default).
|
||||
# search_filter - template used to construct the search filter for the LDAP search.
|
||||
# The resulting filter will be constructed by replacing all '{user_name}', '{bind_dn}', '{user_dn}', and
|
||||
# '{base_dn}' substrings of the template with the actual user name, bind DN, user DN, and base DN during
|
||||
# each LDAP search.
|
||||
# Note, that the special characters must be escaped properly in XML.
|
||||
# attribute - attribute name whose values will be returned by the LDAP search. 'cn', by default.
|
||||
# prefix - prefix, that will be expected to be in front of each string in the original list of strings returned by
|
||||
# the LDAP search. Prefix will be removed from the original strings and resulting strings will be treated
|
||||
# as local role names. Empty, by default.
|
||||
# Example:
|
||||
# ldap:
|
||||
# server: my_ldap_server
|
||||
# roles:
|
||||
# my_local_role1: ''
|
||||
# my_local_role2: ''
|
||||
# role_mapping:
|
||||
# base_dn: 'ou=groups,dc=example,dc=com'
|
||||
# scope: subtree
|
||||
# search_filter: '(&(objectClass=groupOfNames)(member={bind_dn}))'
|
||||
# attribute: cn
|
||||
# prefix: clickhouse_
|
||||
# Example (typical Active Directory with role mapping that relies on the detected user DN):
|
||||
# ldap:
|
||||
# server: my_ad_server
|
||||
# role_mapping:
|
||||
# base_dn: 'CN=Users,DC=example,DC=com'
|
||||
# attribute: CN
|
||||
# scope: subtree
|
||||
# search_filter: '(&(objectClass=group)(member={user_dn}))'
|
||||
# prefix: clickhouse_
|
||||
|
||||
# Default profile of settings.
|
||||
default_profile: default
|
||||
|
||||
# Comma-separated list of prefixes for user-defined settings.
|
||||
# custom_settings_prefixes: ''
|
||||
# System profile of settings. These settings are used by internal processes (Distributed DDL worker and so on).
|
||||
# system_profile: default
|
||||
|
||||
# Buffer profile of settings.
|
||||
# These settings are used by the Buffer storage engine to flush data to the underlying table.
|
||||
# Default: used from system_profile directive.
|
||||
# buffer_profile: default
|
||||
|
||||
# Default database.
|
||||
default_database: default
|
||||
|
||||
# Server time zone could be set here.
|
||||
|
||||
# Time zone is used when converting between String and DateTime types,
|
||||
# when printing DateTime in text formats and parsing DateTime from text,
|
||||
# it is used in date and time related functions, if specific time zone was not passed as an argument.
|
||||
|
||||
# Time zone is specified as identifier from IANA time zone database, like UTC or Africa/Abidjan.
|
||||
# If not specified, system time zone at server startup is used.
|
||||
|
||||
# Please note, that server could display time zone alias instead of specified name.
|
||||
# Example: W-SU is an alias for Europe/Moscow and Zulu is an alias for UTC.
|
||||
# timezone: Europe/Moscow
|
||||
|
||||
# You can specify umask here (see "man umask"). Server will apply it on startup.
|
||||
# Number is always parsed as octal. Default umask is 027 (other users cannot read logs, data files, etc; group can only read).
|
||||
# umask: 022
|
||||
|
||||
# Perform mlockall after startup to lower first queries latency
|
||||
# and to prevent clickhouse executable from being paged out under high IO load.
|
||||
# Enabling this option is recommended but will lead to increased startup time for up to a few seconds.
|
||||
mlock_executable: true
|
||||
|
||||
# Reallocate memory for machine code ("text") using huge pages. Highly experimental.
|
||||
remap_executable: false
|
||||
|
||||
# Uncomment below in order to use JDBC table engine and function.
|
||||
# To install and run JDBC bridge in background:
|
||||
# * [Debian/Ubuntu]
|
||||
# export MVN_URL=https://repo1.maven.org/maven2/ru/yandex/clickhouse/clickhouse-jdbc-bridge
|
||||
# export PKG_VER=$(curl -sL $MVN_URL/maven-metadata.xml | grep '<release>' | sed -e 's|.*>\(.*\)<.*|\1|')
|
||||
# wget https://github.com/ClickHouse/clickhouse-jdbc-bridge/releases/download/v$PKG_VER/clickhouse-jdbc-bridge_$PKG_VER-1_all.deb
|
||||
# apt install --no-install-recommends -f ./clickhouse-jdbc-bridge_$PKG_VER-1_all.deb
|
||||
# clickhouse-jdbc-bridge &
|
||||
# * [CentOS/RHEL]
|
||||
# export MVN_URL=https://repo1.maven.org/maven2/ru/yandex/clickhouse/clickhouse-jdbc-bridge
|
||||
# export PKG_VER=$(curl -sL $MVN_URL/maven-metadata.xml | grep '<release>' | sed -e 's|.*>\(.*\)<.*|\1|')
|
||||
# wget https://github.com/ClickHouse/clickhouse-jdbc-bridge/releases/download/v$PKG_VER/clickhouse-jdbc-bridge-$PKG_VER-1.noarch.rpm
|
||||
# yum localinstall -y clickhouse-jdbc-bridge-$PKG_VER-1.noarch.rpm
|
||||
# clickhouse-jdbc-bridge &
|
||||
# Please refer to https://github.com/ClickHouse/clickhouse-jdbc-bridge#usage for more information.
|
||||
|
||||
# jdbc_bridge:
|
||||
# host: 127.0.0.1
|
||||
# port: 9019
|
||||
|
||||
# Configuration of clusters that could be used in Distributed tables.
|
||||
# https://clickhouse.tech/docs/en/operations/table_engines/distributed/
|
||||
remote_servers:
|
||||
# Test only shard config for testing distributed storage
|
||||
test_shard_localhost:
|
||||
# Inter-server per-cluster secret for Distributed queries
|
||||
# default: no secret (no authentication will be performed)
|
||||
|
||||
# If set, then Distributed queries will be validated on shards, so at least:
|
||||
# - such cluster should exist on the shard,
|
||||
# - such cluster should have the same secret.
|
||||
|
||||
# And also (and which is more important), the initial_user will
|
||||
# be used as current user for the query.
|
||||
|
||||
# Right now the protocol is pretty simple and it only takes into account:
|
||||
# - cluster name
|
||||
# - query
|
||||
|
||||
# Also it will be nice if the following will be implemented:
|
||||
# - source hostname (see interserver_http_host), but then it will depend on DNS;
#   it can use an IP address instead, but then you need to get it right on the initiator node.
|
||||
# - target hostname / ip address (same notes as for source hostname)
|
||||
# - time-based security tokens
|
||||
# secret: ''
|
||||
shard:
|
||||
# Optional. Whether to write data to just one of the replicas. Default: false (write data to all replicas).
|
||||
# internal_replication: false
|
||||
# Optional. Shard weight when writing data. Default: 1.
|
||||
# weight: 1
|
||||
replica:
|
||||
host: localhost
|
||||
port: 9000
|
||||
# Optional. Priority of the replica for load_balancing. Default: 1 (less value has more priority).
|
||||
# priority: 1
|
||||
test_cluster_two_shards_localhost:
|
||||
shard:
|
||||
- replica:
|
||||
host: localhost
|
||||
port: 9000
|
||||
- replica:
|
||||
host: localhost
|
||||
port: 9000
|
||||
test_cluster_two_shards:
|
||||
shard:
|
||||
- replica:
|
||||
host: 127.0.0.1
|
||||
port: 9000
|
||||
- replica:
|
||||
host: 127.0.0.2
|
||||
port: 9000
|
||||
test_cluster_two_shards_internal_replication:
|
||||
shard:
|
||||
- internal_replication: true
|
||||
replica:
|
||||
host: 127.0.0.1
|
||||
port: 9000
|
||||
- internal_replication: true
|
||||
replica:
|
||||
host: 127.0.0.2
|
||||
port: 9000
|
||||
test_shard_localhost_secure:
|
||||
shard:
|
||||
replica:
|
||||
host: localhost
|
||||
port: 9440
|
||||
secure: 1
|
||||
test_unavailable_shard:
|
||||
shard:
|
||||
- replica:
|
||||
host: localhost
|
||||
port: 9000
|
||||
- replica:
|
||||
host: localhost
|
||||
port: 1
|
||||
|
||||
# The list of hosts allowed to use in URL-related storage engines and table functions.
|
||||
# If this section is not present in configuration, all hosts are allowed.
|
||||
# remote_url_allow_hosts:
|
||||
|
||||
# Host should be specified exactly as in URL. The name is checked before DNS resolution.
|
||||
# Example: "yandex.ru", "yandex.ru." and "www.yandex.ru" are different hosts.
|
||||
# If port is explicitly specified in URL, the host:port is checked as a whole.
|
||||
# If host specified here without port, any port with this host allowed.
|
||||
# "yandex.ru" -> "yandex.ru:443", "yandex.ru:80" etc. is allowed, but "yandex.ru:80" -> only "yandex.ru:80" is allowed.
|
||||
# If the host is specified as IP address, it is checked as specified in URL. Example: "[2a02:6b8:a::a]".
|
||||
# If there are redirects and support for redirects is enabled, every redirect (the Location field) is checked.
|
||||
|
||||
# Regular expression can be specified. RE2 engine is used for regexps.
|
||||
# Regexps are not aligned: don't forget to add ^ and $. Also don't forget to escape dot (.) metacharacter
|
||||
# (forgetting to do so is a common source of error).
|
||||
|
||||
# If an element has an 'incl' attribute, then the corresponding substitution from another file will be used as its value.
|
||||
# By default, path to file with substitutions is /etc/metrika.xml. It could be changed in config in 'include_from' element.
|
||||
# Values for substitutions are specified in /yandex/name_of_substitution elements in that file.
|
||||
|
||||
# ZooKeeper is used to store metadata about replicas, when using Replicated tables.
|
||||
# Optional. If you don't use replicated tables, you could omit that.
|
||||
# See https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/replication/
|
||||
|
||||
# zookeeper:
|
||||
# - node:
|
||||
# host: example1
|
||||
# port: 2181
|
||||
# - node:
|
||||
# host: example2
|
||||
# port: 2181
|
||||
# - node:
|
||||
# host: example3
|
||||
# port: 2181
|
||||
|
||||
# Substitutions for parameters of replicated tables.
|
||||
# Optional. If you don't use replicated tables, you could omit that.
|
||||
# See https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/replication/#creating-replicated-tables
|
||||
# macros:
|
||||
# shard: 01
|
||||
# replica: example01-01-1
|
||||
|
||||
# Reloading interval for embedded dictionaries, in seconds. Default: 3600.
|
||||
builtin_dictionaries_reload_interval: 3600
|
||||
|
||||
# Maximum session timeout, in seconds. Default: 3600.
|
||||
max_session_timeout: 3600
|
||||
|
||||
# Default session timeout, in seconds. Default: 60.
|
||||
default_session_timeout: 60
|
||||
|
||||
# Sending data to Graphite for monitoring. Several sections can be defined.
|
||||
# interval - send every X second
|
||||
# root_path - prefix for keys
|
||||
# hostname_in_path - append hostname to root_path (default = true)
|
||||
# metrics - send data from table system.metrics
|
||||
# events - send data from table system.events
|
||||
# asynchronous_metrics - send data from table system.asynchronous_metrics
|
||||
|
||||
# graphite:
|
||||
# host: localhost
|
||||
# port: 42000
|
||||
# timeout: 0.1
|
||||
# interval: 60
|
||||
# root_path: one_min
|
||||
# hostname_in_path: true
|
||||
|
||||
# metrics: true
|
||||
# events: true
|
||||
# events_cumulative: false
|
||||
# asynchronous_metrics: true
|
||||
|
||||
# graphite:
|
||||
# host: localhost
|
||||
# port: 42000
|
||||
# timeout: 0.1
|
||||
# interval: 1
|
||||
# root_path: one_sec
|
||||
|
||||
# metrics: true
|
||||
# events: true
|
||||
# events_cumulative: false
|
||||
# asynchronous_metrics: false
|
||||
|
||||
# Serve endpoint for Prometheus monitoring.
|
||||
# endpoint - metrics path (relative to root, starting with "/")
# port - port to set up the server on. If not defined or 0, then http_port is used
|
||||
# metrics - send data from table system.metrics
|
||||
# events - send data from table system.events
|
||||
# asynchronous_metrics - send data from table system.asynchronous_metrics
|
||||
# status_info - send data from different component from CH, ex: Dictionaries status
|
||||
|
||||
# prometheus:
|
||||
# endpoint: /metrics
|
||||
# port: 9363
|
||||
|
||||
# metrics: true
|
||||
# events: true
|
||||
# asynchronous_metrics: true
|
||||
# status_info: true
|
||||
|
||||
# Query log. Used only for queries with setting log_queries = 1.
|
||||
query_log:
|
||||
# What table to insert data into. If the table does not exist, it will be created.
|
||||
# When query log structure is changed after system update,
|
||||
# then old table will be renamed and new table will be created automatically.
|
||||
database: system
|
||||
table: query_log
|
||||
|
||||
# PARTITION BY expr: https://clickhouse.yandex/docs/en/table_engines/mergetree-family/custom_partitioning_key/
|
||||
# Example:
|
||||
# event_date
|
||||
# toMonday(event_date)
|
||||
# toYYYYMM(event_date)
|
||||
# toStartOfHour(event_time)
|
||||
partition_by: toYYYYMM(event_date)
|
||||
|
||||
# Table TTL specification: https://clickhouse.tech/docs/en/engines/table-engines/mergetree-family/mergetree/#mergetree-table-ttl
|
||||
# Example:
|
||||
# event_date + INTERVAL 1 WEEK
|
||||
# event_date + INTERVAL 7 DAY DELETE
|
||||
# event_date + INTERVAL 2 WEEK TO DISK 'bbb'
|
||||
|
||||
# ttl: 'event_date + INTERVAL 30 DAY DELETE'
|
||||
|
||||
# Instead of partition_by, you can provide full engine expression (starting with ENGINE = ) with parameters,
|
||||
# Example: engine: 'ENGINE = MergeTree PARTITION BY toYYYYMM(event_date) ORDER BY (event_date, event_time) SETTINGS index_granularity = 1024'
|
||||
|
||||
# Interval of flushing data.
|
||||
flush_interval_milliseconds: 7500
|
||||
|
||||
# Trace log. Stores stack traces collected by query profilers.
|
||||
# See query_profiler_real_time_period_ns and query_profiler_cpu_time_period_ns settings.
|
||||
trace_log:
|
||||
database: system
|
||||
table: trace_log
|
||||
partition_by: toYYYYMM(event_date)
|
||||
flush_interval_milliseconds: 7500
|
||||
|
||||
# Query thread log. Has information about all threads participated in query execution.
|
||||
# Used only for queries with setting log_query_threads = 1.
|
||||
query_thread_log:
|
||||
database: system
|
||||
table: query_thread_log
|
||||
partition_by: toYYYYMM(event_date)
|
||||
flush_interval_milliseconds: 7500
|
||||
|
||||
# Uncomment if use part log.
|
||||
# Part log contains information about all actions with parts in MergeTree tables (creation, deletion, merges, downloads).
|
||||
# part_log:
|
||||
# database: system
|
||||
# table: part_log
|
||||
# flush_interval_milliseconds: 7500
|
||||
|
||||
# Uncomment to write text log into table.
|
||||
# Text log contains all information from usual server log but stores it in structured and efficient way.
|
||||
# The level of the messages that goes to the table can be limited (<level>), if not specified all messages will go to the table.
|
||||
# text_log:
|
||||
# database: system
|
||||
# table: text_log
|
||||
# flush_interval_milliseconds: 7500
|
||||
# level: ''
|
||||
|
||||
# Metric log contains rows with current values of ProfileEvents, CurrentMetrics collected with "collect_interval_milliseconds" interval.
|
||||
metric_log:
|
||||
database: system
|
||||
table: metric_log
|
||||
flush_interval_milliseconds: 7500
|
||||
collect_interval_milliseconds: 1000
|
||||
|
||||
# Asynchronous metric log contains values of metrics from
|
||||
# system.asynchronous_metrics.
|
||||
asynchronous_metric_log:
|
||||
database: system
|
||||
table: asynchronous_metric_log
|
||||
|
||||
# Asynchronous metrics are updated once a minute, so there is
|
||||
# no need to flush more often.
|
||||
flush_interval_milliseconds: 60000
|
||||
|
||||
# OpenTelemetry log contains OpenTelemetry trace spans.
|
||||
opentelemetry_span_log:
|
||||
|
||||
# The default table creation code is insufficient, this <engine> spec
|
||||
# is a workaround. There is no 'event_time' for this log, but two times,
|
||||
# start and finish. It is sorted by finish time, to avoid inserting
|
||||
# data too far away in the past (probably we can sometimes insert a span
|
||||
# that is seconds earlier than the last span in the table, due to a race
|
||||
# between several spans inserted in parallel). This gives the spans a
|
||||
# global order that we can use to e.g. retry insertion into some external
|
||||
# system.
|
||||
engine: |-
|
||||
engine MergeTree
|
||||
partition by toYYYYMM(finish_date)
|
||||
order by (finish_date, finish_time_us, trace_id)
|
||||
database: system
|
||||
table: opentelemetry_span_log
|
||||
flush_interval_milliseconds: 7500
|
||||
|
||||
# Crash log. Stores stack traces for fatal errors.
|
||||
# This table is normally empty.
|
||||
crash_log:
|
||||
database: system
|
||||
table: crash_log
|
||||
partition_by: ''
|
||||
flush_interval_milliseconds: 1000
|
||||
|
||||
# Parameters for embedded dictionaries, used in Yandex.Metrica.
|
||||
# See https://clickhouse.yandex/docs/en/dicts/internal_dicts/
|
||||
|
||||
# Path to file with region hierarchy.
|
||||
# path_to_regions_hierarchy_file: /opt/geo/regions_hierarchy.txt
|
||||
|
||||
# Path to directory with files containing names of regions
|
||||
# path_to_regions_names_files: /opt/geo/
|
||||
|
||||
|
||||
# top_level_domains_path: /var/lib/clickhouse/top_level_domains/
|
||||
# Custom TLD lists.
|
||||
# Format: name: /path/to/file
|
||||
|
||||
# Changes will not be applied w/o server restart.
|
||||
# Path to the list is under top_level_domains_path (see above).
|
||||
top_level_domains_lists: ''
|
||||
|
||||
# public_suffix_list: /path/to/public_suffix_list.dat
|
||||
|
||||
# Configuration of external dictionaries. See:
|
||||
# https://clickhouse.tech/docs/en/sql-reference/dictionaries/external-dictionaries/external-dicts
|
||||
dictionaries_config: '*_dictionary.xml'
|
||||
|
||||
# Uncomment if you want data to be compressed 30-100% better.
|
||||
# Don't do that if you just started using ClickHouse.
|
||||
|
||||
# compression:
|
||||
# # Set of variants. Checked in order. Last matching case wins. If nothing matches, lz4 will be used.
|
||||
# case:
|
||||
# # Conditions. All must be satisfied. Some conditions may be omitted.
|
||||
# # min_part_size: 10000000000 # Min part size in bytes.
|
||||
# # min_part_size_ratio: 0.01 # Min size of part relative to whole table size.
|
||||
# # What compression method to use.
|
||||
# method: zstd
|
||||
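The comments above describe the matching rule in prose: cases are checked in order, the last matching case wins, and lz4 is the fallback when nothing matches. A minimal C++ sketch of that selection logic (the CompressionCase type and chooseCompressionMethod function are hypothetical illustrations, not ClickHouse's implementation):

#include <cstdint>
#include <optional>
#include <string>
#include <vector>

struct CompressionCase
{
    std::optional<uint64_t> min_part_size;        // bytes; an absent condition always matches
    std::optional<double>   min_part_size_ratio;  // part size relative to the whole table size
    std::string             method;               // e.g. "zstd"
};

// "Checked in order. Last matching case wins. If nothing matches, lz4 will be used."
std::string chooseCompressionMethod(const std::vector<CompressionCase> & cases, uint64_t part_size, double part_size_ratio)
{
    std::string method = "lz4";   // default when no case matches
    for (const auto & c : cases)
    {
        bool matches = (!c.min_part_size || part_size >= *c.min_part_size)
            && (!c.min_part_size_ratio || part_size_ratio >= *c.min_part_size_ratio);
        if (matches)
            method = c.method;    // a later matching case overrides earlier ones
    }
    return method;
}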
|
||||
# Allow to execute distributed DDL queries (CREATE, DROP, ALTER, RENAME) on cluster.
|
||||
# Works only if ZooKeeper is enabled. Comment it if such functionality isn't required.
|
||||
distributed_ddl:
|
||||
# Path in ZooKeeper to queue with DDL queries
|
||||
path: /clickhouse/task_queue/ddl
|
||||
|
||||
# Settings from this profile will be used to execute DDL queries
|
||||
# profile: default
|
||||
|
||||
# Controls how many ON CLUSTER queries can be run simultaneously.
|
||||
# pool_size: 1
|
||||
|
||||
# Cleanup settings (active tasks will not be removed)
|
||||
|
||||
# Controls task TTL (default 1 week)
|
||||
# task_max_lifetime: 604800
|
||||
|
||||
# Controls how often cleanup should be performed (in seconds)
|
||||
# cleanup_delay_period: 60
|
||||
|
||||
# Controls how many tasks could be in the queue
|
||||
# max_tasks_in_queue: 1000
|
||||
|
||||
# Settings to fine tune MergeTree tables. See documentation in source code, in MergeTreeSettings.h
|
||||
# merge_tree:
|
||||
# max_suspicious_broken_parts: 5
|
||||
|
||||
# Protection from accidental DROP.
|
||||
# If the size of a MergeTree table is greater than max_table_size_to_drop (in bytes), then the table cannot be dropped with any DROP query.
|
||||
# If you want to delete one table and don't want to change the clickhouse-server config, you can create the special file <clickhouse-path>/flags/force_drop_table and perform the DROP once.
|
||||
# By default max_table_size_to_drop is 50GB; max_table_size_to_drop=0 allows dropping any table.
|
||||
# The same for max_partition_size_to_drop.
|
||||
# Uncomment to disable protection.
|
||||
|
||||
# max_table_size_to_drop: 0
|
||||
# max_partition_size_to_drop: 0
|
||||
|
||||
# Example of parameters for GraphiteMergeTree table engine
|
||||
graphite_rollup_example:
|
||||
pattern:
|
||||
regexp: click_cost
|
||||
function: any
|
||||
retention:
|
||||
- age: 0
|
||||
precision: 3600
|
||||
- age: 86400
|
||||
precision: 60
|
||||
default:
|
||||
function: max
|
||||
retention:
|
||||
- age: 0
|
||||
precision: 60
|
||||
- age: 3600
|
||||
precision: 300
|
||||
- age: 86400
|
||||
precision: 3600
|
||||
|
||||
# Directory in <clickhouse-path> containing schema files for various input formats.
|
||||
# The directory will be created if it doesn't exist.
|
||||
format_schema_path: /var/lib/clickhouse/format_schemas/
|
||||
|
||||
# Default query masking rules: matching lines will be replaced with something else in the logs
|
||||
# (both text logs and system.query_log).
|
||||
# name - name for the rule (optional)
|
||||
# regexp - RE2 compatible regular expression (mandatory)
|
||||
# replace - substitution string for sensitive data (optional, by default - six asterisks)
|
||||
query_masking_rules:
|
||||
rule:
|
||||
name: hide encrypt/decrypt arguments
|
||||
regexp: '((?:aes_)?(?:encrypt|decrypt)(?:_mysql)?)\s*\(\s*(?:''(?:\\''|.)+''|.*?)\s*\)'
|
||||
# or more secure, but also more invasive:
|
||||
# (aes_\w+)\s*\(.*\)
|
||||
replace: \1(???)
|
||||
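Because the rules are RE2-compatible, the effect of the simpler variant mentioned above can be reproduced outside ClickHouse. A standalone C++ sketch (a hypothetical test program using the re2 library, not part of this configuration), applying the '(aes_\w+)\s*\(.*\)' pattern with the '\1(???)' replacement:

#include <iostream>
#include <string>
#include <re2/re2.h>

int main()
{
    // The "more secure, but also more invasive" rule variant from the comment above.
    RE2 rule(R"((aes_\w+)\s*\(.*\))");
    std::string query = "SELECT aes_encrypt_mysql('aes-256-cbc', 'secret text', 'key')";

    // \1 in the rewrite keeps the captured function name; the arguments are masked.
    RE2::GlobalReplace(&query, rule, R"(\1(???))");

    std::cout << query << std::endl;   // expected: SELECT aes_encrypt_mysql(???)
    return 0;
}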
|
||||
# Uncomment to use custom http handlers.
|
||||
# Rules are checked from top to bottom; the first match runs the handler
|
||||
# url - to match request URL, you can use 'regex:' prefix to use regex match (optional)
|
||||
# methods - to match request method, you can use commas to separate multiple method matches (optional)
|
||||
# headers - to match request headers; match each child element (the child element name is the header name), you can use 'regex:' prefix to use regex match (optional)
|
||||
# handler - the request handler
|
||||
# type - supported types: static, dynamic_query_handler, predefined_query_handler
|
||||
# query - use with predefined_query_handler type, executes query when the handler is called
|
||||
# query_param_name - use with dynamic_query_handler type, extracts and executes the value corresponding to the <query_param_name> value in HTTP request params
|
||||
# status - use with static type, response status code
|
||||
# content_type - use with static type, response content-type
|
||||
# response_content - use with static type, response content sent to the client; when using the 'file://' or 'config://' prefix, the content is read from the file or configuration and sent to the client.
|
||||
|
||||
# http_handlers:
|
||||
# - rule:
|
||||
# url: /
|
||||
# methods: POST,GET
|
||||
# headers:
|
||||
# pragma: no-cache
|
||||
# handler:
|
||||
# type: dynamic_query_handler
|
||||
# query_param_name: query
|
||||
# - rule:
|
||||
# url: /predefined_query
|
||||
# methods: POST,GET
|
||||
# handler:
|
||||
# type: predefined_query_handler
|
||||
# query: 'SELECT * FROM system.settings'
|
||||
# - rule:
|
||||
# handler:
|
||||
# type: static
|
||||
# status: 200
|
||||
# content_type: 'text/plain; charset=UTF-8'
|
||||
# response_content: config://http_server_default_response
|
||||
|
||||
send_crash_reports:
|
||||
# Changing <enabled> to true allows sending crash reports to
|
||||
# the ClickHouse core developers team via Sentry https://sentry.io
|
||||
# Doing so at least in pre-production environments is highly appreciated
|
||||
enabled: false
|
||||
# Change <anonymize> to true if you don't feel comfortable attaching the server hostname to the crash report
|
||||
anonymize: false
|
||||
# Default endpoint should be changed to different Sentry DSN only if you have
|
||||
# some in-house engineers or hired consultants who're going to debug ClickHouse issues for you
|
||||
endpoint: 'https://6f33034cfe684dd7a3ab9875e57b1c8d@o388870.ingest.sentry.io/5226277'
|
||||
# Uncomment to disable ClickHouse internal DNS caching.
|
||||
# disable_internal_dns_cache: 1
|
@ -1,86 +0,0 @@
|
||||
# We can use 3 main node types in YAML: Scalar, Map and Sequence.
|
||||
|
||||
|
||||
|
||||
# A Scalar is a simple key-value pair:
|
||||
|
||||
scalar: 123
|
||||
|
||||
# Here we have a key "scalar" and value "123"
|
||||
# If we rewrite this in XML, we will get <scalar>123</scalar>
|
||||
|
||||
# We can also represent an empty value with '':
|
||||
|
||||
key: ''
|
||||
|
||||
|
||||
|
||||
# A Map is a node, which contains other nodes:
|
||||
|
||||
map:
|
||||
key1: value1
|
||||
key2: value2
|
||||
small_map:
|
||||
key3: value3
|
||||
|
||||
# This map can be converted into:
|
||||
# <map>
|
||||
# <key1>value1</key1>
|
||||
# <key2>value2</key2>
|
||||
# <small_map>
|
||||
# <key3>value3</key3>
|
||||
# </small_map>
|
||||
# </map>
|
||||
|
||||
|
||||
|
||||
# A Sequence is a node, which contains also other nodes.
|
||||
# The main difference from Map is that Sequence can also contain simple values.
|
||||
|
||||
sequence:
|
||||
- val1
|
||||
- val2
|
||||
- key: 123
|
||||
- map:
|
||||
mkey1: foo
|
||||
mkey2: bar
|
||||
|
||||
# We can represent it in XML this way:
|
||||
# <sequence>val1</sequence>
|
||||
# <sequence>val2</sequence>
|
||||
# <sequence>
|
||||
# <key>123</key>
|
||||
# </sequence>
|
||||
# <sequence>
|
||||
# <map>
|
||||
# <mkey1>foo</mkey1>
|
||||
# <mkey2>bar</mkey2>
|
||||
# </map>
|
||||
# </sequence>
|
||||
|
||||
|
||||
|
||||
# YAML does not have direct support for structures like XML attributes.
|
||||
# We represent them as nodes with @ prefix in key. Note, that @ is reserved by YAML standard,
|
||||
# so you will need to write double quotes around the key. Both Map and Sequence can have
|
||||
# attributes as children nodes
|
||||
|
||||
map:
|
||||
"@attr1": value1
|
||||
"@attr2": value2
|
||||
key: 123
|
||||
|
||||
# This gives us:
|
||||
# <map attr1="value1" attr2="value2">
|
||||
# <key>123</key>
|
||||
# </map>
|
||||
|
||||
sequence:
|
||||
- "@attr1": value1
|
||||
- "@attr2": value2
|
||||
- 123
|
||||
- abc
|
||||
|
||||
# And this gives us:
|
||||
# <map attr1="value1" attr2="value2">123</map>
|
||||
# <map attr1="value1" attr2="value2">abc</map>
|
@ -226,6 +226,10 @@ bool KeeperStorageDispatcher::putRequest(const Coordination::ZooKeeperRequestPtr
|
||||
request_info.session_id = session_id;
|
||||
|
||||
std::lock_guard lock(push_request_mutex);
|
||||
|
||||
if (shutdown_called)
|
||||
return false;
|
||||
|
||||
/// Put close requests without timeouts
|
||||
if (request->getOpNum() == Coordination::OpNum::Close)
|
||||
requests_queue->push(std::move(request_info));
|
||||
@ -316,6 +320,8 @@ void KeeperStorageDispatcher::shutdown()
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::lock_guard lock(session_to_response_callback_mutex);
|
||||
session_to_response_callback.clear();
|
||||
}
|
||||
catch (...)
|
||||
|
@ -443,7 +443,12 @@ class IColumn;
|
||||
\
|
||||
M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) functions to countIf() function when logically equivalent", 0) \
|
||||
M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \
|
||||
M(Bool, allow_experimental_query_deduplication, false, "Allow sending parts' UUIDs for a query in order to deduplicate data parts if any", 0) \
|
||||
\
|
||||
/** Experimental feature for moving data between shards. */ \
|
||||
\
|
||||
M(Bool, allow_experimental_query_deduplication, false, "Experimental data deduplication for SELECT queries based on part UUIDs", 0) \
|
||||
M(Bool, experimental_query_deduplication_send_all_part_uuids, false, "If false only part UUIDs for currently moving parts are sent. If true all read part UUIDs are sent (useful only for testing).", 0) \
|
||||
\
|
||||
M(Bool, engine_file_empty_if_not_exists, false, "Allows selecting data from a file engine table when the file does not exist", 0) \
|
||||
M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \
|
||||
M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \
|
||||
|
@ -9,7 +9,6 @@
|
||||
|
||||
# include <DataTypes/IDataType.h>
|
||||
# include <DataTypes/DataTypeNullable.h>
|
||||
# include <DataTypes/DataTypeFixedString.h>
|
||||
# include <Columns/ColumnConst.h>
|
||||
# include <Columns/ColumnNullable.h>
|
||||
# pragma GCC diagnostic push
|
||||
@ -41,7 +40,8 @@ static inline llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const IDa
|
||||
{
|
||||
const auto & data_type_nullable = static_cast<const DataTypeNullable&>(type);
|
||||
auto * wrapped = toNativeType(builder, *data_type_nullable.getNestedType());
|
||||
return wrapped ? llvm::StructType::get(wrapped, /* is null = */ builder.getInt1Ty()) : nullptr;
|
||||
auto * is_null_type = builder.getInt1Ty();
|
||||
return wrapped ? llvm::StructType::get(wrapped, is_null_type) : nullptr;
|
||||
}
|
||||
|
||||
/// LLVM doesn't have unsigned types, it has unsigned instructions.
|
||||
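That comment is the key to the integer mapping here: Int32 and UInt32 both become the same LLVM i32, and signedness only appears when an instruction is emitted. A minimal standalone sketch (an illustrative helper, not ClickHouse code):

#include <llvm/IR/IRBuilder.h>

/// Both Int32 and UInt32 map to the same LLVM integer type; only the emitted
/// instruction (SDiv vs UDiv, SExt vs ZExt, ...) encodes signedness.
static llvm::Value * emitDivision(llvm::IRBuilderBase & b, llvm::Value * lhs, llvm::Value * rhs, bool is_signed)
{
    return is_signed ? b.CreateSDiv(lhs, rhs) : b.CreateUDiv(lhs, rhs);
}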
@ -57,11 +57,6 @@ static inline llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const IDa
|
||||
return builder.getFloatTy();
|
||||
else if (data_type.isFloat64())
|
||||
return builder.getDoubleTy();
|
||||
else if (data_type.isFixedString())
|
||||
{
|
||||
const auto & data_type_fixed_string = static_cast<const DataTypeFixedString &>(type);
|
||||
return llvm::VectorType::get(builder.getInt8Ty(), data_type_fixed_string.getN());
|
||||
}
|
||||
|
||||
return nullptr;
|
||||
}
|
||||
@ -76,7 +71,7 @@ static inline bool canBeNativeType(const IDataType & type)
|
||||
return canBeNativeType(*data_type_nullable.getNestedType());
|
||||
}
|
||||
|
||||
return data_type.isNativeInt() || data_type.isNativeUInt() || data_type.isFloat() || data_type.isFixedString() || data_type.isDate();
|
||||
return data_type.isNativeInt() || data_type.isNativeUInt() || data_type.isFloat() || data_type.isDate();
|
||||
}
|
||||
|
||||
static inline llvm::Type * toNativeType(llvm::IRBuilderBase & builder, const DataTypePtr & type)
|
||||
|
@ -87,7 +87,7 @@ static void compileFunction(llvm::Module & module, const IFunctionBase & functio
|
||||
for (size_t i = 0; i <= arg_types.size(); ++i)
|
||||
{
|
||||
const auto & type = i == arg_types.size() ? function.getResultType() : arg_types[i];
|
||||
auto * data = b.CreateLoad(b.CreateConstInBoundsGEP1_32(data_type, columns_arg, i));
|
||||
auto * data = b.CreateLoad(data_type, b.CreateConstInBoundsGEP1_32(data_type, columns_arg, i));
|
||||
columns[i].data_init = b.CreatePointerCast(b.CreateExtractValue(data, {0}), toNativeType(b, removeNullable(type))->getPointerTo());
|
||||
columns[i].null_init = type->isNullable() ? b.CreateExtractValue(data, {1}) : nullptr;
|
||||
}
|
||||
@ -122,14 +122,14 @@ static void compileFunction(llvm::Module & module, const IFunctionBase & functio
|
||||
auto & column = columns[i];
|
||||
auto type = arg_types[i];
|
||||
|
||||
auto * value = b.CreateLoad(column.data);
|
||||
if (!column.null)
|
||||
auto * value = b.CreateLoad(toNativeType(b, removeNullable(type)), column.data);
|
||||
if (!type->isNullable())
|
||||
{
|
||||
arguments.emplace_back(value);
|
||||
continue;
|
||||
}
|
||||
|
||||
auto * is_null = b.CreateICmpNE(b.CreateLoad(column.null), b.getInt8(0));
|
||||
auto * is_null = b.CreateICmpNE(b.CreateLoad(b.getInt8Ty(), column.null), b.getInt8(0));
|
||||
auto * nullable_uninitialized = llvm::Constant::getNullValue(toNativeType(b, type));
|
||||
auto * nullable_value = b.CreateInsertValue(b.CreateInsertValue(nullable_uninitialized, value, {0}), is_null, {1});
|
||||
arguments.emplace_back(nullable_value);
|
||||
|
@ -38,7 +38,7 @@ int main(int argc, char **argv)
|
||||
|
||||
// b.CreateCall(func_declaration);
|
||||
|
||||
// auto * load_argument = b.CreateLoad(argument);
|
||||
// auto * load_argument = b.CreateLoad(value_type, argument);
|
||||
// auto * value = b.CreateAdd(load_argument, load_argument);
|
||||
// b.CreateRet(value);
|
||||
// });
|
||||
|
@ -84,6 +84,7 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
|
||||
ParserKeyword s_to_disk("TO DISK");
|
||||
ParserKeyword s_to_volume("TO VOLUME");
|
||||
ParserKeyword s_to_table("TO TABLE");
|
||||
ParserKeyword s_to_shard("TO SHARD");
|
||||
|
||||
ParserKeyword s_delete("DELETE");
|
||||
ParserKeyword s_update("UPDATE");
|
||||
@ -366,6 +367,10 @@ bool ParserAlterCommand::parseImpl(Pos & pos, ASTPtr & node, Expected & expected
|
||||
return false;
|
||||
command->move_destination_type = DataDestinationType::TABLE;
|
||||
}
|
||||
else if (s_to_shard.ignore(pos))
|
||||
{
|
||||
command->move_destination_type = DataDestinationType::SHARD;
|
||||
}
|
||||
else
|
||||
return false;
|
||||
|
||||
|
@ -194,7 +194,7 @@ KeeperTCPHandler::KeeperTCPHandler(IServer & server_, const Poco::Net::StreamSoc
|
||||
, server(server_)
|
||||
, log(&Poco::Logger::get("NuKeeperTCPHandler"))
|
||||
, global_context(Context::createCopy(server.context()))
|
||||
, nu_keeper_storage_dispatcher(global_context->getKeeperStorageDispatcher())
|
||||
, keeper_dispatcher(global_context->getKeeperStorageDispatcher())
|
||||
, operation_timeout(0, global_context->getConfigRef().getUInt("test_keeper_server.operation_timeout_ms", Coordination::DEFAULT_OPERATION_TIMEOUT_MS) * 1000)
|
||||
, session_timeout(0, global_context->getConfigRef().getUInt("test_keeper_server.session_timeout_ms", Coordination::DEFAULT_SESSION_TIMEOUT_MS) * 1000)
|
||||
, poll_wrapper(std::make_unique<SocketInterruptablePollWrapper>(socket_))
|
||||
@ -286,12 +286,12 @@ void KeeperTCPHandler::runImpl()
|
||||
return;
|
||||
}
|
||||
|
||||
if (nu_keeper_storage_dispatcher->hasLeader())
|
||||
if (keeper_dispatcher->hasLeader())
|
||||
{
|
||||
try
|
||||
{
|
||||
LOG_INFO(log, "Requesting session ID for the new client");
|
||||
session_id = nu_keeper_storage_dispatcher->getSessionID(session_timeout.totalMilliseconds());
|
||||
session_id = keeper_dispatcher->getSessionID(session_timeout.totalMilliseconds());
|
||||
LOG_INFO(log, "Received session ID {}", session_id);
|
||||
}
|
||||
catch (const Exception & e)
|
||||
@ -318,7 +318,7 @@ void KeeperTCPHandler::runImpl()
|
||||
UInt8 single_byte = 1;
|
||||
[[maybe_unused]] int result = write(response_fd, &single_byte, sizeof(single_byte));
|
||||
};
|
||||
nu_keeper_storage_dispatcher->registerSession(session_id, response_callback);
|
||||
keeper_dispatcher->registerSession(session_id, response_callback);
|
||||
|
||||
session_stopwatch.start();
|
||||
bool close_received = false;
|
||||
@ -368,7 +368,7 @@ void KeeperTCPHandler::runImpl()
|
||||
if (response->error == Coordination::Error::ZSESSIONEXPIRED)
|
||||
{
|
||||
LOG_DEBUG(log, "Session #{} expired because server shutting down or quorum is not alive", session_id);
|
||||
nu_keeper_storage_dispatcher->finishSession(session_id);
|
||||
keeper_dispatcher->finishSession(session_id);
|
||||
return;
|
||||
}
|
||||
|
||||
@ -381,7 +381,7 @@ void KeeperTCPHandler::runImpl()
|
||||
if (session_stopwatch.elapsedMicroseconds() > static_cast<UInt64>(session_timeout.totalMicroseconds()))
|
||||
{
|
||||
LOG_DEBUG(log, "Session #{} expired", session_id);
|
||||
nu_keeper_storage_dispatcher->finishSession(session_id);
|
||||
keeper_dispatcher->finishSession(session_id);
|
||||
break;
|
||||
}
|
||||
}
|
||||
@ -389,7 +389,7 @@ void KeeperTCPHandler::runImpl()
|
||||
catch (const Exception & ex)
|
||||
{
|
||||
LOG_INFO(log, "Got exception processing session #{}: {}", session_id, getExceptionMessage(ex, true));
|
||||
nu_keeper_storage_dispatcher->finishSession(session_id);
|
||||
keeper_dispatcher->finishSession(session_id);
|
||||
}
|
||||
}
|
||||
|
||||
@ -407,7 +407,7 @@ std::pair<Coordination::OpNum, Coordination::XID> KeeperTCPHandler::receiveReque
|
||||
request->xid = xid;
|
||||
request->readImpl(*in);
|
||||
|
||||
if (!nu_keeper_storage_dispatcher->putRequest(request, session_id))
|
||||
if (!keeper_dispatcher->putRequest(request, session_id))
|
||||
throw Exception(ErrorCodes::TIMEOUT_EXCEEDED, "Session {} already disconnected", session_id);
|
||||
return std::make_pair(opnum, xid);
|
||||
}
|
||||
|
@ -38,7 +38,7 @@ private:
|
||||
IServer & server;
|
||||
Poco::Logger * log;
|
||||
ContextPtr global_context;
|
||||
std::shared_ptr<KeeperStorageDispatcher> nu_keeper_storage_dispatcher;
|
||||
std::shared_ptr<KeeperStorageDispatcher> keeper_dispatcher;
|
||||
Poco::Timespan operation_timeout;
|
||||
Poco::Timespan session_timeout;
|
||||
int64_t session_id{-1};
|
||||
|
@ -10,6 +10,7 @@ enum class DataDestinationType
|
||||
VOLUME,
|
||||
TABLE,
|
||||
DELETE,
|
||||
SHARD,
|
||||
};
|
||||
|
||||
}
|
||||
|
@ -152,6 +152,7 @@ MergeTreeData::MergeTreeData(
|
||||
, log_name(table_id_.getNameForLogs())
|
||||
, log(&Poco::Logger::get(log_name))
|
||||
, storage_settings(std::move(storage_settings_))
|
||||
, pinned_part_uuids(std::make_shared<PinnedPartUUIDs>())
|
||||
, data_parts_by_info(data_parts_indexes.get<TagByInfo>())
|
||||
, data_parts_by_state_and_info(data_parts_indexes.get<TagByStateAndInfo>())
|
||||
, parts_mover(this)
|
||||
@ -2997,6 +2998,11 @@ void MergeTreeData::movePartitionToVolume(const ASTPtr & partition, const String
|
||||
throw Exception("Cannot move parts because moves are manually disabled", ErrorCodes::ABORTED);
|
||||
}
|
||||
|
||||
void MergeTreeData::movePartitionToShard(const ASTPtr & /*partition*/, bool /*move_part*/, const String & /*to*/, ContextPtr /*query_context*/)
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "MOVE PARTITION TO SHARD is not supported by storage {}", getName());
|
||||
}
|
||||
|
||||
void MergeTreeData::fetchPartition(
|
||||
const ASTPtr & /*partition*/,
|
||||
const StorageMetadataPtr & /*metadata_snapshot*/,
|
||||
@ -3046,11 +3052,23 @@ Pipe MergeTreeData::alterPartition(
|
||||
break;
|
||||
|
||||
case PartitionCommand::MoveDestinationType::TABLE:
|
||||
{
|
||||
checkPartitionCanBeDropped(command.partition);
|
||||
String dest_database = query_context->resolveDatabase(command.to_database);
|
||||
auto dest_storage = DatabaseCatalog::instance().getTable({dest_database, command.to_table}, query_context);
|
||||
movePartitionToTable(dest_storage, command.partition, query_context);
|
||||
break;
|
||||
}
|
||||
break;
|
||||
|
||||
case PartitionCommand::MoveDestinationType::SHARD:
|
||||
{
|
||||
if (!getSettings()->part_moves_between_shards_enable)
|
||||
throw Exception(ErrorCodes::SUPPORT_IS_DISABLED,
|
||||
"Moving parts between shards is experimental and work in progress"
|
||||
", see part_moves_between_shards_enable setting");
|
||||
movePartitionToShard(command.partition, command.part, command.move_destination_name, query_context);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
break;
|
||||
@ -4581,6 +4599,12 @@ catch (...)
|
||||
tryLogCurrentException(log, __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
StorageMergeTree::PinnedPartUUIDsPtr MergeTreeData::getPinnedPartUUIDs() const
|
||||
{
|
||||
std::lock_guard lock(pinned_part_uuids_mutex);
|
||||
return pinned_part_uuids;
|
||||
}
|
||||
|
||||
MergeTreeData::CurrentlyMovingPartsTagger::CurrentlyMovingPartsTagger(MergeTreeMovingParts && moving_parts_, MergeTreeData & data_)
|
||||
: parts_to_move(std::move(moving_parts_)), data(data_)
|
||||
{
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <Storages/IndicesDescription.h>
|
||||
#include <Storages/MergeTree/MergeTreePartsMover.h>
|
||||
#include <Storages/MergeTree/MergeTreeWriteAheadLog.h>
|
||||
#include <Storages/MergeTree/PinnedPartUUIDs.h>
|
||||
#include <Interpreters/PartLog.h>
|
||||
#include <Disks/StoragePolicy.h>
|
||||
#include <Interpreters/Aggregator.h>
|
||||
@ -128,6 +129,8 @@ public:
|
||||
using DataPartStates = std::initializer_list<DataPartState>;
|
||||
using DataPartStateVector = std::vector<DataPartState>;
|
||||
|
||||
using PinnedPartUUIDsPtr = std::shared_ptr<const PinnedPartUUIDs>;
|
||||
|
||||
constexpr static auto FORMAT_VERSION_FILE_NAME = "format_version.txt";
|
||||
constexpr static auto DETACHED_DIR_NAME = "detached";
|
||||
|
||||
@ -801,6 +804,8 @@ public:
|
||||
/// Mutex for currently_moving_parts
|
||||
mutable std::mutex moving_parts_mutex;
|
||||
|
||||
PinnedPartUUIDsPtr getPinnedPartUUIDs() const;
|
||||
|
||||
/// Return main processing background job, like merge/mutate/fetch and so on
|
||||
virtual std::optional<JobAndPool> getDataProcessingJob() = 0;
|
||||
/// Return job to move parts between disks/volumes and so on.
|
||||
@ -855,6 +860,10 @@ protected:
|
||||
/// Use get and set to receive readonly versions.
|
||||
MultiVersion<MergeTreeSettings> storage_settings;
|
||||
|
||||
/// Used to determine which UUIDs to send to root query executor for deduplication.
|
||||
mutable std::shared_mutex pinned_part_uuids_mutex;
|
||||
PinnedPartUUIDsPtr pinned_part_uuids;
|
||||
|
||||
/// Work with data parts
|
||||
|
||||
struct TagByInfo{};
|
||||
@ -989,7 +998,6 @@ protected:
|
||||
virtual void replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr context) = 0;
|
||||
virtual void movePartitionToTable(const StoragePtr & dest_table, const ASTPtr & partition, ContextPtr context) = 0;
|
||||
|
||||
/// Makes sense only for replicated tables
|
||||
virtual void fetchPartition(
|
||||
const ASTPtr & partition,
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
@ -997,6 +1005,8 @@ protected:
|
||||
bool fetch_part,
|
||||
ContextPtr query_context);
|
||||
|
||||
virtual void movePartitionToShard(const ASTPtr & partition, bool move_part, const String & to, ContextPtr query_context);
|
||||
|
||||
void writePartLog(
|
||||
PartLogElement::Type type,
|
||||
const ExecutionStatus & execution_status,
|
||||
|
@ -528,6 +528,7 @@ QueryPlanPtr MergeTreeDataSelectExecutor::readFromParts(
|
||||
selectPartsToReadWithUUIDFilter(
|
||||
parts,
|
||||
part_values,
|
||||
data.getPinnedPartUUIDs(),
|
||||
minmax_idx_condition,
|
||||
minmax_columns_types,
|
||||
partition_pruner,
|
||||
@ -2246,6 +2247,7 @@ void MergeTreeDataSelectExecutor::selectPartsToRead(
|
||||
void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
|
||||
MergeTreeData::DataPartsVector & parts,
|
||||
const std::unordered_set<String> & part_values,
|
||||
MergeTreeData::PinnedPartUUIDsPtr pinned_part_uuids,
|
||||
const std::optional<KeyCondition> & minmax_idx_condition,
|
||||
const DataTypes & minmax_columns_types,
|
||||
std::optional<PartitionPruner> & partition_pruner,
|
||||
@ -2253,6 +2255,8 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
|
||||
ContextPtr query_context,
|
||||
PartFilterCounters & counters) const
|
||||
{
|
||||
const Settings & settings = query_context->getSettings();
|
||||
|
||||
/// select_parts prepares the parts that have to be read for the query,
|
||||
/// returns false if duplicated parts' UUIDs have been encountered
|
||||
auto select_parts = [&] (MergeTreeData::DataPartsVector & selected_parts) -> bool
|
||||
@ -2309,9 +2313,12 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
|
||||
/// populate UUIDs and exclude ignored parts if enabled
|
||||
if (part->uuid != UUIDHelpers::Nil)
|
||||
{
|
||||
auto result = temp_part_uuids.insert(part->uuid);
|
||||
if (!result.second)
|
||||
throw Exception("Found a part with the same UUID on the same replica.", ErrorCodes::LOGICAL_ERROR);
|
||||
if (settings.experimental_query_deduplication_send_all_part_uuids || pinned_part_uuids->contains(part->uuid))
|
||||
{
|
||||
auto result = temp_part_uuids.insert(part->uuid);
|
||||
if (!result.second)
|
||||
throw Exception("Found a part with the same UUID on the same replica.", ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
}
|
||||
|
||||
selected_parts.push_back(part_or_projection);
|
||||
@ -2335,7 +2342,8 @@ void MergeTreeDataSelectExecutor::selectPartsToReadWithUUIDFilter(
|
||||
/// Process parts that have to be read for a query.
|
||||
auto needs_retry = !select_parts(parts);
|
||||
|
||||
/// If any duplicated part UUIDs met during the first step, try to ignore them in second pass
|
||||
/// If any duplicated part UUIDs met during the first step, try to ignore them in second pass.
|
||||
/// This may happen when `prefer_localhost_replica` is set and "distributed" stage runs in the same process with "remote" stage.
|
||||
if (needs_retry)
|
||||
{
|
||||
LOG_DEBUG(log, "Found duplicate uuids locally, will retry part selection without them");
|
||||
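The comments above describe a two-pass selection: the first pass collects part UUIDs and bails out as soon as it meets a UUID another replica already claimed, and the second pass simply skips those parts. A self-contained sketch of that pattern (the Part type and function name are hypothetical stand-ins, not the actual MergeTree interfaces):

#include <string>
#include <unordered_set>
#include <vector>

struct Part { std::string name; std::string uuid; };

std::vector<Part> selectPartsWithUUIDFilter(const std::vector<Part> & all_parts,
                                            const std::unordered_set<std::string> & duplicated_uuids)
{
    std::vector<Part> selected;

    auto select = [&](bool ignore_duplicates) -> bool
    {
        selected.clear();
        for (const auto & part : all_parts)
        {
            if (duplicated_uuids.count(part.uuid))
            {
                if (!ignore_duplicates)
                    return false;   // duplicate met -> caller retries with the second pass
                continue;           // second pass: silently drop the duplicated part
            }
            selected.push_back(part);
        }
        return true;
    };

    if (!select(/*ignore_duplicates=*/ false))
        select(/*ignore_duplicates=*/ true);

    return selected;
}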
|
@ -166,6 +166,7 @@ private:
|
||||
void selectPartsToReadWithUUIDFilter(
|
||||
MergeTreeData::DataPartsVector & parts,
|
||||
const std::unordered_set<String> & part_values,
|
||||
MergeTreeData::PinnedPartUUIDsPtr pinned_part_uuids,
|
||||
const std::optional<KeyCondition> & minmax_idx_condition,
|
||||
const DataTypes & minmax_columns_types,
|
||||
std::optional<PartitionPruner> & partition_pruner,
|
||||
|
@ -128,6 +128,10 @@ struct Settings;
|
||||
M(UInt64, min_marks_to_honor_max_concurrent_queries, 0, "Minimal number of marks to honor the MergeTree-level's max_concurrent_queries (0 - disabled). Queries will still be limited by other max_concurrent_queries settings.", 0) \
|
||||
M(UInt64, min_bytes_to_rebalance_partition_over_jbod, 0, "Minimal amount of bytes to enable part rebalance over JBOD array (0 - disabled).", 0) \
|
||||
\
|
||||
/** Experimental/work in progress feature. Unsafe for production. */ \
|
||||
M(UInt64, part_moves_between_shards_enable, 0, "Experimental/Incomplete feature to move parts between shards. Does not take into account sharding expressions.", 0) \
|
||||
M(UInt64, part_moves_between_shards_delay_seconds, 30, "Time to wait before/after moving parts between shards.", 0) \
|
||||
\
|
||||
/** Obsolete settings. Kept for backward compatibility only. */ \
|
||||
M(UInt64, min_relative_delay_to_yield_leadership, 120, "Obsolete setting, does nothing.", 0) \
|
||||
M(UInt64, check_delay_period, 60, "Obsolete setting, does nothing.", 0) \
|
||||
|
459
src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.cpp
Normal file
@ -0,0 +1,459 @@
|
||||
#include <Storages/MergeTree/PartMovesBetweenShardsOrchestrator.h>
|
||||
#include <Storages/MergeTree/PinnedPartUUIDs.h>
|
||||
#include <Storages/StorageReplicatedMergeTree.h>
|
||||
#include <boost/range/adaptor/map.hpp>
|
||||
#include <Poco/JSON/JSON.h>
|
||||
#include <Poco/JSON/Object.h>
|
||||
#include <Poco/JSON/Parser.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
PartMovesBetweenShardsOrchestrator::PartMovesBetweenShardsOrchestrator(StorageReplicatedMergeTree & storage_)
|
||||
: storage(storage_)
|
||||
, zookeeper_path(storage.zookeeper_path)
|
||||
, logger_name(storage.getStorageID().getFullTableName() + " (PartMovesBetweenShardsOrchestrator)")
|
||||
, log(&Poco::Logger::get(logger_name))
|
||||
, entries_znode_path(zookeeper_path + "/part_moves_shard")
|
||||
{
|
||||
/// Schedule pool is not designed for long-running tasks. TODO replace with a separate thread?
|
||||
task = storage.getContext()->getSchedulePool().createTask(logger_name, [this]{ run(); });
|
||||
}
|
||||
|
||||
void PartMovesBetweenShardsOrchestrator::run()
|
||||
{
|
||||
if (!storage.getSettings()->part_moves_between_shards_enable)
|
||||
return;
|
||||
|
||||
if (need_stop)
|
||||
return;
|
||||
|
||||
auto sleep_ms = 10;
|
||||
|
||||
try
|
||||
{
|
||||
fetchStateFromZK();
|
||||
|
||||
if (step())
|
||||
fetchStateFromZK();
|
||||
else
|
||||
sleep_ms = 3 * 1000;
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
tryLogCurrentException(log, __PRETTY_FUNCTION__);
|
||||
}
|
||||
|
||||
|
||||
task->scheduleAfter(sleep_ms);
|
||||
}
|
||||
|
||||
void PartMovesBetweenShardsOrchestrator::shutdown()
|
||||
{
|
||||
need_stop = true;
|
||||
task->deactivate();
|
||||
LOG_TRACE(log, "PartMovesBetweenShardsOrchestrator thread finished");
|
||||
}
|
||||
|
||||
void PartMovesBetweenShardsOrchestrator::fetchStateFromZK()
|
||||
{
|
||||
std::lock_guard lock(state_mutex);
|
||||
|
||||
entries.clear();
|
||||
|
||||
auto zk = storage.getZooKeeper();
|
||||
|
||||
Strings task_names = zk->getChildren(entries_znode_path);
|
||||
for (auto const & task_name : task_names)
|
||||
{
|
||||
PartMovesBetweenShardsOrchestrator::Entry e;
|
||||
Coordination::Stat stat;
|
||||
|
||||
e.znode_path = entries_znode_path + "/" + task_name;
|
||||
|
||||
auto entry_str = zk->get(e.znode_path, &stat);
|
||||
e.fromString(entry_str);
|
||||
|
||||
e.version = stat.version;
|
||||
e.znode_name = task_name;
|
||||
|
||||
entries[task_name] = std::move(e);
|
||||
}
|
||||
}
|
||||
|
||||
bool PartMovesBetweenShardsOrchestrator::step()
|
||||
{
|
||||
if (!storage.is_leader)
|
||||
return false;
|
||||
|
||||
auto zk = storage.getZooKeeper();
|
||||
|
||||
std::optional<Entry> entry_to_process;
|
||||
|
||||
/// Try find an entry to process and copy it.
|
||||
{
|
||||
std::lock_guard lock(state_mutex);
|
||||
|
||||
for (auto const & entry : entries | boost::adaptors::map_values)
|
||||
{
|
||||
if (entry.state.value == EntryState::DONE || entry.state.value == EntryState::CANCELLED)
|
||||
continue;
|
||||
|
||||
entry_to_process.emplace(entry);
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!entry_to_process.has_value())
|
||||
return false;
|
||||
|
||||
/// Since some state transitions are long running (waiting on replicas' acknowledgement) we create this lock to avoid
|
||||
/// other replicas trying to do the same work. All state transitions should be idempotent so it is safe to lose the
|
||||
/// lock and have another replica retry.
|
||||
///
|
||||
/// Note: This blocks all other entries from being executed. Technical debt.
|
||||
zkutil::EphemeralNodeHolder::Ptr entry_node_holder;
|
||||
|
||||
try
|
||||
{
|
||||
entry_node_holder = zkutil::EphemeralNodeHolder::create(entry_to_process->znode_path + "/lock_holder", *zk, storage.replica_name);
|
||||
}
|
||||
catch (const Coordination::Exception & e)
|
||||
{
|
||||
if (e.code == Coordination::Error::ZNODEEXISTS)
|
||||
{
|
||||
LOG_DEBUG(log, "Task {} is being processed by another replica", entry_to_process->znode_name);
|
||||
return false;
|
||||
}
|
||||
|
||||
throw;
|
||||
}
|
||||
|
||||
try
|
||||
{
|
||||
/// Use the same ZooKeeper connection. If we had lost the lock then the connection
|
||||
/// would have expired and all subsequent operations would fail.
|
||||
stepEntry(entry_to_process.value(), zk);
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
tryLogCurrentException(log, __PRETTY_FUNCTION__);
|
||||
|
||||
Entry entry_copy = entry_to_process.value();
|
||||
entry_copy.last_exception_msg = getCurrentExceptionMessage(false);
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
|
||||
return false;
|
||||
}
|
||||
|
||||
return true;
|
||||
}
|
||||
|
||||
void PartMovesBetweenShardsOrchestrator::stepEntry(const Entry & entry, zkutil::ZooKeeperPtr zk)
|
||||
{
|
||||
switch (entry.state.value)
|
||||
{
|
||||
case EntryState::DONE:
|
||||
break;
|
||||
|
||||
case EntryState::CANCELLED:
|
||||
break;
|
||||
|
||||
case EntryState::TODO:
|
||||
{
|
||||
/// State transition.
|
||||
Entry entry_copy = entry;
|
||||
entry_copy.state = EntryState::SYNC_SOURCE;
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
}
|
||||
break;
|
||||
|
||||
case EntryState::SYNC_SOURCE:
|
||||
{
|
||||
{
|
||||
/// Log entry.
|
||||
Coordination::Requests ops;
|
||||
ops.emplace_back(zkutil::makeCheckRequest(entry.znode_path, entry.version));
|
||||
|
||||
ReplicatedMergeTreeLogEntryData log_entry;
|
||||
log_entry.type = ReplicatedMergeTreeLogEntryData::SYNC_PINNED_PART_UUIDS;
|
||||
log_entry.create_time = std::time(nullptr);
|
||||
log_entry.source_replica = storage.replica_name;
|
||||
ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/log", "", -1));
|
||||
ops.emplace_back(zkutil::makeCreateRequest(
|
||||
zookeeper_path + "/log/log-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
|
||||
|
||||
Coordination::Responses responses;
|
||||
Coordination::Error rc = zk->tryMulti(ops, responses);
|
||||
zkutil::KeeperMultiException::check(rc, ops, responses);
|
||||
|
||||
String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
|
||||
log_entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);
|
||||
|
||||
/// This wait in background schedule pool is useless. It'd be
|
||||
/// better to have some notification which will call `step`
|
||||
/// function when all replicas finish. TODO.
|
||||
storage.waitForAllReplicasToProcessLogEntry(log_entry, true);
|
||||
}
|
||||
|
||||
{
|
||||
/// State transition.
|
||||
Entry entry_copy = entry;
|
||||
entry_copy.state = EntryState::SYNC_DESTINATION;
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case EntryState::SYNC_DESTINATION:
|
||||
{
|
||||
{
|
||||
/// Log entry.
|
||||
Coordination::Requests ops;
|
||||
ops.emplace_back(zkutil::makeCheckRequest(entry.znode_path, entry.version));
|
||||
|
||||
ReplicatedMergeTreeLogEntryData log_entry;
|
||||
log_entry.type = ReplicatedMergeTreeLogEntryData::SYNC_PINNED_PART_UUIDS;
|
||||
log_entry.create_time = std::time(nullptr);
|
||||
log_entry.source_replica = storage.replica_name;
|
||||
log_entry.source_shard = zookeeper_path;
|
||||
|
||||
ops.emplace_back(zkutil::makeSetRequest(entry.to_shard + "/log", "", -1));
|
||||
ops.emplace_back(zkutil::makeCreateRequest(
|
||||
entry.to_shard + "/log/log-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
|
||||
|
||||
Coordination::Responses responses;
|
||||
Coordination::Error rc = zk->tryMulti(ops, responses);
|
||||
zkutil::KeeperMultiException::check(rc, ops, responses);
|
||||
|
||||
String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
|
||||
log_entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);
|
||||
|
||||
storage.waitForAllTableReplicasToProcessLogEntry(entry.to_shard, log_entry, true);
|
||||
}
|
||||
|
||||
{
|
||||
/// State transition.
|
||||
Entry entry_copy = entry;
|
||||
entry_copy.state = EntryState::DESTINATION_FETCH;
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case EntryState::DESTINATION_FETCH:
|
||||
{
|
||||
/// Make sure table structure doesn't change when there are part movements in progress.
|
||||
{
|
||||
Coordination::Requests ops;
|
||||
ops.emplace_back(zkutil::makeCheckRequest(entry.znode_path, entry.version));
|
||||
|
||||
/// Log entry.
|
||||
ReplicatedMergeTreeLogEntryData log_entry;
|
||||
log_entry.type = ReplicatedMergeTreeLogEntryData::CLONE_PART_FROM_SHARD;
|
||||
log_entry.create_time = std::time(nullptr);
|
||||
log_entry.new_part_name = entry.part_name;
|
||||
log_entry.source_replica = storage.replica_name;
|
||||
log_entry.source_shard = zookeeper_path;
|
||||
ops.emplace_back(zkutil::makeSetRequest(entry.to_shard + "/log", "", -1));
|
||||
ops.emplace_back(zkutil::makeCreateRequest(
|
||||
entry.to_shard + "/log/log-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
|
||||
|
||||
Coordination::Responses responses;
|
||||
Coordination::Error rc = zk->tryMulti(ops, responses);
|
||||
zkutil::KeeperMultiException::check(rc, ops, responses);
|
||||
|
||||
String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
|
||||
log_entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);
|
||||
|
||||
storage.waitForAllTableReplicasToProcessLogEntry(entry.to_shard, log_entry, true);
|
||||
}
|
||||
|
||||
{
|
||||
/// State transition.
|
||||
Entry entry_copy = entry;
|
||||
entry_copy.state = EntryState::DESTINATION_ATTACH;
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case EntryState::DESTINATION_ATTACH:
|
||||
{
|
||||
/// There is a chance that attach on destination will fail and this task will be left in the queue forever.
|
||||
{
|
||||
Coordination::Requests ops;
|
||||
ops.emplace_back(zkutil::makeCheckRequest(entry.znode_path, entry.version));
|
||||
|
||||
auto part = storage.getActiveContainingPart(entry.part_name);
|
||||
/// Allocating block number in other replicas zookeeper path
|
||||
/// TODO Maybe we can do better.
|
||||
auto block_number_lock = storage.allocateBlockNumber(part->info.partition_id, zk, "", entry.to_shard);
|
||||
auto block_number = block_number_lock->getNumber();
|
||||
|
||||
auto part_info = part->info;
|
||||
part_info.min_block = block_number;
|
||||
part_info.max_block = block_number;
|
||||
part_info.level = 0;
|
||||
part_info.mutation = 0;
|
||||
|
||||
/// Attach log entry (all replicas already fetched part)
|
||||
ReplicatedMergeTreeLogEntryData log_entry;
|
||||
log_entry.type = ReplicatedMergeTreeLogEntryData::ATTACH_PART;
|
||||
log_entry.part_checksum = part->checksums.getTotalChecksumHex();
|
||||
log_entry.create_time = std::time(nullptr);
|
||||
log_entry.new_part_name = part_info.getPartName();
|
||||
ops.emplace_back(zkutil::makeSetRequest(entry.to_shard + "/log", "", -1));
|
||||
ops.emplace_back(zkutil::makeCreateRequest(
|
||||
entry.to_shard + "/log/log-", log_entry.toString(), zkutil::CreateMode::PersistentSequential));
|
||||
|
||||
Coordination::Responses responses;
|
||||
Coordination::Error rc = zk->tryMulti(ops, responses);
|
||||
zkutil::KeeperMultiException::check(rc, ops, responses);
|
||||
|
||||
String log_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
|
||||
log_entry.znode_name = log_znode_path.substr(log_znode_path.find_last_of('/') + 1);
|
||||
|
||||
storage.waitForAllTableReplicasToProcessLogEntry(entry.to_shard, log_entry, true);
|
||||
}
|
||||
|
||||
{
|
||||
/// State transition.
|
||||
Entry entry_copy = entry;
|
||||
entry_copy.state = EntryState::SOURCE_DROP_PRE_DELAY;
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case EntryState::SOURCE_DROP_PRE_DELAY:
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::seconds(storage.getSettings()->part_moves_between_shards_delay_seconds));
|
||||
|
||||
/// State transition.
|
||||
Entry entry_copy = entry;
|
||||
entry_copy.state = EntryState::SOURCE_DROP;
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
}
|
||||
break;
|
||||
|
||||
case EntryState::SOURCE_DROP:
|
||||
{
|
||||
{
|
||||
ReplicatedMergeTreeLogEntry log_entry;
|
||||
if (storage.dropPart(zk, entry.part_name, log_entry, false, false))
|
||||
storage.waitForAllReplicasToProcessLogEntry(log_entry, true);
|
||||
}
|
||||
|
||||
{
|
||||
/// State transition.
|
||||
Entry entry_copy = entry;
|
||||
entry_copy.state = EntryState::SOURCE_DROP_POST_DELAY;
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
}
|
||||
}
|
||||
break;
|
||||
|
||||
case EntryState::SOURCE_DROP_POST_DELAY:
|
||||
{
|
||||
std::this_thread::sleep_for(std::chrono::seconds(storage.getSettings()->part_moves_between_shards_delay_seconds));
|
||||
|
||||
/// State transition.
|
||||
Entry entry_copy = entry;
|
||||
entry_copy.state = EntryState::REMOVE_UUID_PIN;
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
}
|
||||
break;
|
||||
|
||||
case EntryState::REMOVE_UUID_PIN:
|
||||
{
|
||||
{
|
||||
PinnedPartUUIDs src_pins;
|
||||
PinnedPartUUIDs dst_pins;
|
||||
|
||||
{
|
||||
String s = zk->get(zookeeper_path + "/pinned_part_uuids", &src_pins.stat);
|
||||
src_pins.fromString(s);
|
||||
}
|
||||
|
||||
{
|
||||
String s = zk->get(entry.to_shard + "/pinned_part_uuids", &dst_pins.stat);
|
||||
dst_pins.fromString(s);
|
||||
}
|
||||
|
||||
src_pins.part_uuids.erase(entry.part_uuid);
|
||||
dst_pins.part_uuids.erase(entry.part_uuid);
|
||||
|
||||
Coordination::Requests ops;
|
||||
ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/pinned_part_uuids", src_pins.toString(), src_pins.stat.version));
|
||||
ops.emplace_back(zkutil::makeSetRequest(entry.to_shard + "/pinned_part_uuids", dst_pins.toString(), dst_pins.stat.version));
|
||||
|
||||
zk->multi(ops);
|
||||
}
|
||||
|
||||
/// State transition.
|
||||
Entry entry_copy = entry;
|
||||
entry_copy.state = EntryState::DONE;
|
||||
entry_copy.update_time = std::time(nullptr);
|
||||
zk->set(entry_copy.znode_path, entry_copy.toString(), entry_copy.version);
|
||||
}
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
std::vector<PartMovesBetweenShardsOrchestrator::Entry> PartMovesBetweenShardsOrchestrator::getEntries() const
|
||||
{
|
||||
std::lock_guard lock(state_mutex);
|
||||
|
||||
std::vector<Entry> res;
|
||||
|
||||
for (const auto & e : entries)
|
||||
res.push_back(e.second);
|
||||
|
||||
return res;
|
||||
}
|
||||
|
||||
String PartMovesBetweenShardsOrchestrator::Entry::toString() const
|
||||
{
|
||||
Poco::JSON::Object json;
|
||||
|
||||
json.set(JSON_KEY_CREATE_TIME, DB::toString(create_time));
|
||||
json.set(JSON_KEY_UPDATE_TIME, DB::toString(update_time));
|
||||
json.set(JSON_KEY_TASK_UUID, DB::toString(task_uuid));
|
||||
json.set(JSON_KEY_PART_NAME, part_name);
|
||||
json.set(JSON_KEY_PART_UUID, DB::toString(part_uuid));
|
||||
json.set(JSON_KEY_TO_SHARD, to_shard);
|
||||
json.set(JSON_KEY_STATE, state.toString());
|
||||
json.set(JSON_KEY_LAST_EX_MSG, last_exception_msg);
|
||||
|
||||
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
|
||||
oss.exceptions(std::ios::failbit);
|
||||
json.stringify(oss);
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
void PartMovesBetweenShardsOrchestrator::Entry::fromString(const String & buf)
|
||||
{
|
||||
Poco::JSON::Parser parser;
|
||||
auto json = parser.parse(buf).extract<Poco::JSON::Object::Ptr>();
|
||||
|
||||
create_time = parseFromString<time_t>(json->getValue<std::string>(JSON_KEY_CREATE_TIME));
|
||||
update_time = parseFromString<time_t>(json->getValue<std::string>(JSON_KEY_UPDATE_TIME));
|
||||
task_uuid = parseFromString<UUID>(json->getValue<std::string>(JSON_KEY_TASK_UUID));
|
||||
part_name = json->getValue<std::string>(JSON_KEY_PART_NAME);
|
||||
part_uuid = parseFromString<UUID>(json->getValue<std::string>(JSON_KEY_PART_UUID));
|
||||
to_shard = json->getValue<std::string>(JSON_KEY_TO_SHARD);
|
||||
state.value = EntryState::fromString(json->getValue<std::string>(JSON_KEY_STATE));
|
||||
last_exception_msg = json->getValue<std::string>(JSON_KEY_LAST_EX_MSG);
|
||||
}
|
||||
|
||||
}
|
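Every '/// State transition' block above follows the same optimistic-concurrency pattern: the entry is read together with its znode version, modified locally, and written back with set(path, data, version), so a concurrent writer makes the update fail instead of being silently overwritten. A minimal sketch of that pattern (updateZNodeCAS is a hypothetical helper; the zkutil calls mirror the ones used in this file):

#include <Common/ZooKeeper/ZooKeeper.h>
#include <string>

/// Compare-and-set update of a znode: read the data with its version, transform it,
/// and write it back with that version; if another writer won the race, re-read and retry.
template <typename Transform>
void updateZNodeCAS(zkutil::ZooKeeperPtr zk, const std::string & path, Transform && transform)
{
    while (true)
    {
        Coordination::Stat stat;
        std::string data = zk->get(path, &stat);

        try
        {
            zk->set(path, transform(data), stat.version);
            return;
        }
        catch (const Coordination::Exception & e)
        {
            if (e.code != Coordination::Error::ZBADVERSION)
                throw;
            /// Someone else updated the node between get() and set(); retry with fresh data.
        }
    }
}

In the orchestrator the same effect is achieved by passing entry.version to zk->set(); a stale version means another replica has already advanced that entry.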
162
src/Storages/MergeTree/PartMovesBetweenShardsOrchestrator.h
Normal file
@ -0,0 +1,162 @@
|
||||
#pragma once
|
||||
|
||||
#include <vector>
|
||||
#include <common/logger_useful.h>
|
||||
#include <common/types.h>
|
||||
#include <Common/ZooKeeper/ZooKeeper.h>
|
||||
#include <Core/UUID.h>
|
||||
#include <Core/BackgroundSchedulePool.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
namespace ErrorCodes
|
||||
{
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
class StorageReplicatedMergeTree;
|
||||
|
||||
/// Cross shard part movement workflow orchestration.
|
||||
class PartMovesBetweenShardsOrchestrator
|
||||
{
|
||||
public:
|
||||
struct EntryState
|
||||
{
|
||||
enum Value
|
||||
{
|
||||
TODO,
|
||||
SYNC_SOURCE,
|
||||
SYNC_DESTINATION,
|
||||
DESTINATION_FETCH,
|
||||
DESTINATION_ATTACH,
|
||||
SOURCE_DROP_PRE_DELAY,
|
||||
SOURCE_DROP,
|
||||
SOURCE_DROP_POST_DELAY,
|
||||
REMOVE_UUID_PIN,
|
||||
DONE,
|
||||
CANCELLED,
|
||||
};
|
||||
|
||||
EntryState(): value(TODO) {}
|
||||
EntryState(Value value_): value(value_) {}
|
||||
|
||||
Value value;
|
||||
|
||||
String toString() const
|
||||
{
|
||||
switch (value)
|
||||
{
|
||||
case TODO: return "TODO";
|
||||
case SYNC_SOURCE: return "SYNC_SOURCE";
|
||||
case SYNC_DESTINATION: return "SYNC_DESTINATION";
|
||||
case DESTINATION_FETCH: return "DESTINATION_FETCH";
|
||||
case DESTINATION_ATTACH: return "DESTINATION_ATTACH";
|
||||
case SOURCE_DROP_PRE_DELAY: return "SOURCE_DROP_PRE_DELAY";
|
||||
case SOURCE_DROP: return "SOURCE_DROP";
|
||||
case SOURCE_DROP_POST_DELAY: return "SOURCE_DROP_POST_DELAY";
|
||||
case REMOVE_UUID_PIN: return "REMOVE_UUID_PIN";
|
||||
case DONE: return "DONE";
|
||||
case CANCELLED: return "CANCELLED";
|
||||
}
|
||||
|
||||
throw Exception("Unknown EntryState: " + DB::toString<int>(value), ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
|
||||
static EntryState::Value fromString(String in)
|
||||
{
|
||||
if (in == "TODO") return TODO;
|
||||
else if (in == "SYNC_SOURCE") return SYNC_SOURCE;
|
||||
else if (in == "SYNC_DESTINATION") return SYNC_DESTINATION;
|
||||
else if (in == "DESTINATION_FETCH") return DESTINATION_FETCH;
|
||||
else if (in == "DESTINATION_ATTACH") return DESTINATION_ATTACH;
|
||||
else if (in == "SOURCE_DROP_PRE_DELAY") return SOURCE_DROP_PRE_DELAY;
|
||||
else if (in == "SOURCE_DROP") return SOURCE_DROP;
|
||||
else if (in == "SOURCE_DROP_POST_DELAY") return SOURCE_DROP_POST_DELAY;
|
||||
else if (in == "REMOVE_UUID_PIN") return REMOVE_UUID_PIN;
|
||||
else if (in == "DONE") return DONE;
|
||||
else if (in == "CANCELLED") return CANCELLED;
|
||||
else throw Exception("Unknown state: " + in, ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
};
|
||||
|
||||
struct Entry
|
||||
{
|
||||
friend class PartMovesBetweenShardsOrchestrator;
|
||||
|
||||
time_t create_time = 0;
|
||||
time_t update_time = 0;
|
||||
|
||||
/// Globally unique identifier used for attaching parts on destination.
|
||||
/// Using `part_uuid` results in part names being reused when moving parts back and forth.
|
||||
UUID task_uuid;
|
||||
|
||||
String part_name;
|
||||
UUID part_uuid;
|
||||
String to_shard;
|
||||
|
||||
EntryState state;
|
||||
|
||||
String last_exception_msg;
|
||||
|
||||
String znode_name;
|
||||
|
||||
private:
|
||||
/// Transient value for CAS.
|
||||
uint32_t version = 0;
|
||||
|
||||
String znode_path;
|
||||
|
||||
public:
|
||||
String toString() const;
|
||||
void fromString(const String & buf);
|
||||
};
|
||||
|
||||
private:
|
||||
static constexpr auto JSON_KEY_CREATE_TIME = "create_time";
|
||||
static constexpr auto JSON_KEY_UPDATE_TIME = "update_time";
|
||||
static constexpr auto JSON_KEY_TASK_UUID = "task_uuid";
|
||||
static constexpr auto JSON_KEY_PART_NAME = "part_name";
|
||||
static constexpr auto JSON_KEY_PART_UUID = "part_uuid";
|
||||
static constexpr auto JSON_KEY_TO_SHARD = "to_shard";
|
||||
static constexpr auto JSON_KEY_STATE = "state";
|
||||
static constexpr auto JSON_KEY_LAST_EX_MSG = "last_exception";
|
||||
|
||||
public:
|
||||
PartMovesBetweenShardsOrchestrator(StorageReplicatedMergeTree & storage_);
|
||||
|
||||
void start() { task->activateAndSchedule(); }
|
||||
void wakeup() { task->schedule(); }
|
||||
void shutdown();
|
||||
|
||||
void fetchStateFromZK();
|
||||
|
||||
/// We could have one thread per Entry and worry about concurrency issues.
|
||||
/// Or we could have a single thread trying to run one step at a time.
|
||||
bool step();
|
||||
|
||||
std::vector<Entry> getEntries() const;
|
||||
|
||||
private:
|
||||
void run();
|
||||
void stepEntry(const Entry & entry, zkutil::ZooKeeperPtr zk);
|
||||
|
||||
private:
|
||||
StorageReplicatedMergeTree & storage;
|
||||
|
||||
String zookeeper_path;
|
||||
String logger_name;
|
||||
Poco::Logger * log = nullptr;
|
||||
std::atomic<bool> need_stop{false};
|
||||
|
||||
BackgroundSchedulePool::TaskHolder task;
|
||||
|
||||
mutable std::mutex state_mutex;
|
||||
std::map<String, Entry> entries;
|
||||
|
||||
public:
|
||||
String entries_znode_path;
|
||||
};
|
||||
|
||||
}
|
36
src/Storages/MergeTree/PinnedPartUUIDs.cpp
Normal file
@ -0,0 +1,36 @@
|
||||
#include "PinnedPartUUIDs.h"
|
||||
#include <IO/ReadHelpers.h>
|
||||
#include <IO/WriteHelpers.h>
|
||||
#include <Poco/JSON/JSON.h>
|
||||
#include <Poco/JSON/Object.h>
|
||||
#include <Poco/JSON/Parser.h>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
String PinnedPartUUIDs::toString() const
|
||||
{
|
||||
std::vector<UUID> vec(part_uuids.begin(), part_uuids.end());
|
||||
|
||||
Poco::JSON::Object json;
|
||||
json.set(JSON_KEY_UUIDS, DB::toString(vec));
|
||||
|
||||
std::ostringstream oss; // STYLE_CHECK_ALLOW_STD_STRING_STREAM
|
||||
oss.exceptions(std::ios::failbit);
|
||||
json.stringify(oss);
|
||||
|
||||
return oss.str();
|
||||
}
|
||||
|
||||
void PinnedPartUUIDs::fromString(const String & buf)
|
||||
{
|
||||
Poco::JSON::Parser parser;
|
||||
auto json = parser.parse(buf).extract<Poco::JSON::Object::Ptr>();
|
||||
|
||||
std::vector<UUID> vec = parseFromString<std::vector<UUID>>(json->getValue<std::string>(PinnedPartUUIDs::JSON_KEY_UUIDS));
|
||||
|
||||
part_uuids.clear();
|
||||
std::copy(vec.begin(), vec.end(), std::inserter(part_uuids, part_uuids.begin()));
|
||||
}
|
||||
|
||||
}
|
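The pinned set is stored in ZooKeeper as a small JSON document (roughly {"part_uuids": "[...]"}), and toString()/fromString() above are exact inverses. A short usage sketch of the round trip (illustrative only; it assumes UUIDHelpers::generateV4() from Core/UUID.h and the ClickHouse build environment):

#include <Storages/MergeTree/PinnedPartUUIDs.h>
#include <Core/UUID.h>
#include <cassert>

void pinnedPartUUIDsRoundTrip()
{
    DB::PinnedPartUUIDs pins;
    pins.part_uuids.insert(DB::UUIDHelpers::generateV4());
    pins.part_uuids.insert(DB::UUIDHelpers::generateV4());

    /// Serialize to the JSON form kept under <zookeeper_path>/pinned_part_uuids ...
    const auto serialized = pins.toString();

    /// ... and parse it back; the set of pinned UUIDs must be unchanged.
    DB::PinnedPartUUIDs restored;
    restored.fromString(serialized);
    assert(restored.part_uuids == pins.part_uuids);
}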
27
src/Storages/MergeTree/PinnedPartUUIDs.h
Normal file
@ -0,0 +1,27 @@
|
||||
#pragma once
|
||||
|
||||
#include <Common/ZooKeeper/IKeeper.h>
|
||||
#include <Core/UUID.h>
|
||||
#include <set>
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
struct PinnedPartUUIDs
|
||||
{
|
||||
std::set<UUID> part_uuids;
|
||||
Coordination::Stat stat{};
|
||||
|
||||
bool contains(const UUID & part_uuid) const
|
||||
{
|
||||
return part_uuids.contains(part_uuid);
|
||||
}
|
||||
|
||||
String toString() const;
|
||||
void fromString(const String & buf);
|
||||
|
||||
private:
|
||||
static constexpr auto JSON_KEY_UUIDS = "part_uuids";
|
||||
};
|
||||
|
||||
}
|
@ -54,6 +54,12 @@ void ReplicatedMergeTreeLogEntryData::writeText(WriteBuffer & out) const
|
||||
out << "get\n" << new_part_name;
|
||||
break;
|
||||
|
||||
case CLONE_PART_FROM_SHARD:
|
||||
out << "clone_part_from_shard\n"
|
||||
<< new_part_name << "\n"
|
||||
<< "source_shard: " << source_shard;
|
||||
break;
|
||||
|
||||
case ATTACH_PART:
|
||||
out << "attach\n" << new_part_name << "\n"
|
||||
<< "part_checksum: " << part_checksum;
|
||||
@ -142,6 +148,10 @@ void ReplicatedMergeTreeLogEntryData::writeText(WriteBuffer & out) const
|
||||
out << metadata_str;
|
||||
break;
|
||||
|
||||
case SYNC_PINNED_PART_UUIDS:
|
||||
out << "sync_pinned_part_uuids\n";
|
||||
break;
|
||||
|
||||
default:
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unknown log entry type: {}", static_cast<int>(type));
|
||||
}
|
||||
@ -306,6 +316,16 @@ void ReplicatedMergeTreeLogEntryData::readText(ReadBuffer & in)
|
||||
metadata_str.resize(metadata_size);
|
||||
in.readStrict(&metadata_str[0], metadata_size);
|
||||
}
|
||||
else if (type_str == "sync_pinned_part_uuids")
|
||||
{
|
||||
type = SYNC_PINNED_PART_UUIDS;
|
||||
}
|
||||
else if (type_str == "clone_part_from_shard")
|
||||
{
|
||||
type = CLONE_PART_FROM_SHARD;
|
||||
in >> new_part_name;
|
||||
in >> "\nsource_shard: " >> source_shard;
|
||||
}
|
||||
|
||||
if (!trailing_newline_found)
|
||||
in >> "\n";
|
||||
@ -419,6 +439,14 @@ Strings ReplicatedMergeTreeLogEntryData::getVirtualPartNames(MergeTreeDataFormat
|
||||
return res;
|
||||
}
|
||||
|
||||
/// Doesn't produce any part.
|
||||
if (type == SYNC_PINNED_PART_UUIDS)
|
||||
return {};
|
||||
|
||||
/// Doesn't produce any part by itself.
|
||||
if (type == CLONE_PART_FROM_SHARD)
|
||||
return {};
|
||||
|
||||
return {new_part_name};
|
||||
}
|
||||
|
||||
|
@ -43,6 +43,8 @@ struct ReplicatedMergeTreeLogEntryData
|
||||
REPLACE_RANGE, /// Drop certain range of partitions and replace them by new ones
|
||||
MUTATE_PART, /// Apply one or several mutations to the part.
|
||||
ALTER_METADATA, /// Apply alter modification according to global /metadata and /columns paths
|
||||
SYNC_PINNED_PART_UUIDS, /// Synchronization point for ensuring that all replicas have an up-to-date in-memory state.
|
||||
CLONE_PART_FROM_SHARD, /// Clone part from another shard.
|
||||
};
|
||||
|
||||
static String typeToString(Type type)
|
||||
@ -58,6 +60,8 @@ struct ReplicatedMergeTreeLogEntryData
|
||||
case ReplicatedMergeTreeLogEntryData::REPLACE_RANGE: return "REPLACE_RANGE";
|
||||
case ReplicatedMergeTreeLogEntryData::MUTATE_PART: return "MUTATE_PART";
|
||||
case ReplicatedMergeTreeLogEntryData::ALTER_METADATA: return "ALTER_METADATA";
|
||||
case ReplicatedMergeTreeLogEntryData::SYNC_PINNED_PART_UUIDS: return "SYNC_PINNED_PART_UUIDS";
|
||||
case ReplicatedMergeTreeLogEntryData::CLONE_PART_FROM_SHARD: return "CLONE_PART_FROM_SHARD";
|
||||
default:
|
||||
throw Exception("Unknown log entry type: " + DB::toString<int>(type), ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
@ -76,6 +80,7 @@ struct ReplicatedMergeTreeLogEntryData
|
||||
|
||||
Type type = EMPTY;
|
||||
String source_replica; /// Empty string means that this entry was added to the queue immediately, and not copied from the log.
|
||||
String source_shard;
|
||||
|
||||
String part_checksum; /// Part checksum for ATTACH_PART, empty otherwise.
|
||||
|
||||
|
@ -1804,6 +1804,17 @@ ReplicatedMergeTreeMergePredicate::ReplicatedMergeTreeMergePredicate(
|
||||
|
||||
merges_version = queue_.pullLogsToQueue(zookeeper);
|
||||
|
||||
{
|
||||
/// We avoid returning here a version to be used in a lightweight transaction.
|
||||
///
|
||||
/// When pinned parts set is changed a log entry is added to the queue in the same transaction.
|
||||
/// The log entry serves as a synchronization point, and it also increments `merges_version`.
|
||||
///
|
||||
/// If pinned parts are fetched after logs are pulled then we can safely say that it contains all locks up to `merges_version`.
|
||||
String s = zookeeper->get(queue.zookeeper_path + "/pinned_part_uuids");
|
||||
pinned_part_uuids.fromString(s);
|
||||
}
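The comment block above encodes an ordering invariant: the merge predicate pulls the replication log first (fixing merges_version) and only then reads the pinned set, so the snapshot is guaranteed to contain every pin recorded up to that version. Below is a tiny self-contained sketch of that ordering; Coordinator and its two calls are placeholders, not the real ZooKeeper/queue APIs.

// Sketch of the ordering invariant: fix the log position first, then read
// the pinned set. Coordinator and its calls are stand-ins, not real APIs.
#include <string>

struct Coordinator
{
    int log_version = 0;
    std::string pinned_payload;

    int pullLogs() { return ++log_version; }                     // advances merges_version
    std::string readPinnedSet() const { return pinned_payload; }
};

struct PredicateSnapshot
{
    int merges_version = 0;
    std::string pinned;
};

PredicateSnapshot takeSnapshot(Coordinator & zk)
{
    PredicateSnapshot snap;
    snap.merges_version = zk.pullLogs();    // 1) pull logs first
    snap.pinned = zk.readPinnedSet();       // 2) then read pins: covers every
                                            //    pin recorded up to that version
    return snap;
}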
|
||||
|
||||
Coordination::GetResponse quorum_status_response = quorum_status_future.get();
|
||||
if (quorum_status_response.error == Coordination::Error::ZOK)
|
||||
{
|
||||
@ -1872,6 +1883,13 @@ bool ReplicatedMergeTreeMergePredicate::canMergeTwoParts(
|
||||
|
||||
for (const MergeTreeData::DataPartPtr & part : {left, right})
|
||||
{
|
||||
if (pinned_part_uuids.part_uuids.contains(part->uuid))
|
||||
{
|
||||
if (out_reason)
|
||||
*out_reason = "Part " + part->name + " has uuid " + toString(part->uuid) + " which is currently pinned";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (part->name == inprogress_quorum_part)
|
||||
{
|
||||
if (out_reason)
|
||||
@ -1967,6 +1985,13 @@ bool ReplicatedMergeTreeMergePredicate::canMergeSinglePart(
|
||||
const MergeTreeData::DataPartPtr & part,
|
||||
String * out_reason) const
|
||||
{
|
||||
if (pinned_part_uuids.part_uuids.contains(part->uuid))
|
||||
{
|
||||
if (out_reason)
|
||||
*out_reason = "Part " + part->name + " has uuid " + toString(part->uuid) + " which is currently pinned";
|
||||
return false;
|
||||
}
|
||||
|
||||
if (part->name == inprogress_quorum_part)
|
||||
{
|
||||
if (out_reason)
|
||||
|
@ -8,6 +8,7 @@
|
||||
#include <Storages/MergeTree/ActiveDataPartSet.h>
|
||||
#include <Storages/MergeTree/MergeTreeData.h>
|
||||
#include <Storages/MergeTree/MergeTreeMutationStatus.h>
|
||||
#include <Storages/MergeTree/PinnedPartUUIDs.h>
|
||||
#include <Storages/MergeTree/ReplicatedMergeTreeQuorumAddedParts.h>
|
||||
#include <Storages/MergeTree/ReplicatedMergeTreeAltersSequence.h>
|
||||
|
||||
@ -486,6 +487,9 @@ private:
|
||||
/// (loaded at some later time than prev_virtual_parts).
|
||||
std::unordered_map<String, std::set<Int64>> committing_blocks;
|
||||
|
||||
/// List of UUIDs for parts that have their identity "pinned".
|
||||
PinnedPartUUIDs pinned_part_uuids;
|
||||
|
||||
/// Quorum state taken at some later time than prev_virtual_parts.
|
||||
String inprogress_quorum_part;
|
||||
|
||||
|
@ -18,7 +18,6 @@ namespace ErrorCodes
|
||||
extern const int LOGICAL_ERROR;
|
||||
}
|
||||
|
||||
|
||||
std::optional<PartitionCommand> PartitionCommand::parse(const ASTAlterCommand * command_ast)
|
||||
{
|
||||
if (command_ast->type == ASTAlterCommand::DROP_PARTITION)
|
||||
@ -65,8 +64,11 @@ std::optional<PartitionCommand> PartitionCommand::parse(const ASTAlterCommand *
|
||||
res.to_database = command_ast->to_database;
|
||||
res.to_table = command_ast->to_table;
|
||||
break;
|
||||
default:
|
||||
case DataDestinationType::SHARD:
|
||||
res.move_destination_type = PartitionCommand::MoveDestinationType::SHARD;
|
||||
break;
|
||||
case DataDestinationType::DELETE:
|
||||
throw Exception("ALTER with this destination type is not handled. This is a bug.", ErrorCodes::LOGICAL_ERROR);
|
||||
}
|
||||
if (res.move_destination_type != PartitionCommand::MoveDestinationType::TABLE)
|
||||
res.move_destination_name = command_ast->move_destination_name;
|
||||
|
@ -64,6 +64,7 @@ struct PartitionCommand
|
||||
DISK,
|
||||
VOLUME,
|
||||
TABLE,
|
||||
SHARD,
|
||||
};
|
||||
|
||||
std::optional<MoveDestinationType> move_destination_type;
|
||||
|
@ -43,6 +43,7 @@ static const auto RETRIES_MAX = 20;
|
||||
static const uint32_t QUEUE_SIZE = 100000;
|
||||
static const auto MAX_FAILED_READ_ATTEMPTS = 10;
|
||||
static const auto RESCHEDULE_MS = 500;
|
||||
static const auto BACKOFF_TRESHOLD = 32000;
|
||||
static const auto MAX_THREAD_WORK_DURATION_MS = 60000;
|
||||
|
||||
namespace ErrorCodes
|
||||
@ -100,6 +101,7 @@ StorageRabbitMQ::StorageRabbitMQ(
|
||||
, semaphore(0, num_consumers)
|
||||
, unique_strbase(getRandomName())
|
||||
, queue_size(std::max(QUEUE_SIZE, static_cast<uint32_t>(getMaxBlockSize())))
|
||||
, milliseconds_to_wait(RESCHEDULE_MS)
|
||||
{
|
||||
event_handler = std::make_shared<RabbitMQHandler>(loop.getLoop(), log);
|
||||
restoreConnection(false);
|
||||
@ -852,7 +854,17 @@ void StorageRabbitMQ::streamingToViewsFunc()
|
||||
LOG_DEBUG(log, "Started streaming to {} attached views", dependencies_count);
|
||||
|
||||
if (streamToViews())
|
||||
{
|
||||
/// Reschedule with backoff.
|
||||
if (milliseconds_to_wait < BACKOFF_TRESHOLD)
|
||||
milliseconds_to_wait *= 2;
|
||||
event_handler->updateLoopState(Loop::STOP);
|
||||
break;
|
||||
}
|
||||
else
|
||||
{
|
||||
milliseconds_to_wait = RESCHEDULE_MS;
|
||||
}
|
||||
|
||||
auto end_time = std::chrono::steady_clock::now();
|
||||
auto duration = std::chrono::duration_cast<std::chrono::milliseconds>(end_time - start_time);
|
||||
@ -871,9 +883,8 @@ void StorageRabbitMQ::streamingToViewsFunc()
|
||||
}
|
||||
}
|
||||
|
||||
/// Wait for attached views
|
||||
if (!stream_cancelled)
|
||||
streaming_task->scheduleAfter(RESCHEDULE_MS);
|
||||
streaming_task->scheduleAfter(milliseconds_to_wait);
|
||||
}
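The reschedule change above introduces a doubling backoff for the streaming task. The sketch below shows only the doubling-with-cap and reset mechanics; the constants mirror RESCHEDULE_MS and the 32000 ms threshold from the diff, but the branch condition is simplified and should not be read as the exact StorageRabbitMQ logic.

// Sketch of the backoff mechanics only: double the wait up to a cap, reset
// to the base delay after a round that did useful work.
#include <algorithm>
#include <cstdint>
#include <iostream>

static constexpr uint64_t RESCHEDULE_MS = 500;
static constexpr uint64_t BACKOFF_THRESHOLD_MS = 32000;

uint64_t nextDelay(uint64_t current_ms, bool had_work)
{
    if (had_work)
        return RESCHEDULE_MS;                                          // reset after useful work
    return std::min<uint64_t>(current_ms * 2, BACKOFF_THRESHOLD_MS);   // exponential backoff, capped
}

int main()
{
    uint64_t delay = RESCHEDULE_MS;
    for (int i = 0; i < 8; ++i)
    {
        delay = nextDelay(delay, /*had_work=*/false);
        std::cout << delay << " ";   // 1000 2000 4000 8000 16000 32000 32000 32000
    }
    std::cout << "\n";
}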
|
||||
|
||||
|
||||
@ -1019,6 +1030,7 @@ bool StorageRabbitMQ::streamToViews()
|
||||
looping_task->activateAndSchedule();
|
||||
}
|
||||
|
||||
/// Do not reschedule, do not stop event loop.
|
||||
return false;
|
||||
}
|
||||
|
||||
|
@ -136,6 +136,8 @@ private:
|
||||
BackgroundSchedulePool::TaskHolder looping_task;
|
||||
BackgroundSchedulePool::TaskHolder connection_task;
|
||||
|
||||
uint64_t milliseconds_to_wait;
|
||||
|
||||
std::atomic<bool> stream_cancelled{false};
|
||||
size_t read_attempts = 0;
|
||||
mutable bool drop_table = false;
|
||||
|
@ -16,6 +16,7 @@
|
||||
#include <Storages/StorageReplicatedMergeTree.h>
|
||||
#include <Storages/MergeTree/IMergeTreeDataPart.h>
|
||||
#include <Storages/MergeTree/MergeList.h>
|
||||
#include <Storages/MergeTree/PinnedPartUUIDs.h>
|
||||
#include <Storages/MergeTree/PartitionPruner.h>
|
||||
#include <Storages/MergeTree/ReplicatedMergeTreeTableMetadata.h>
|
||||
#include <Storages/MergeTree/ReplicatedMergeTreeBlockOutputStream.h>
|
||||
@ -130,6 +131,7 @@ namespace ErrorCodes
|
||||
extern const int NO_SUCH_DATA_PART;
|
||||
extern const int INTERSERVER_SCHEME_DOESNT_MATCH;
|
||||
extern const int DUPLICATE_DATA_PART;
|
||||
extern const int BAD_ARGUMENTS;
|
||||
}
|
||||
|
||||
namespace ActionLocks
|
||||
@ -282,6 +284,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
|
||||
, cleanup_thread(*this)
|
||||
, part_check_thread(*this)
|
||||
, restarting_thread(*this)
|
||||
, part_moves_between_shards_orchestrator(*this)
|
||||
, allow_renaming(allow_renaming_)
|
||||
, replicated_fetches_pool_size(getContext()->getSettingsRef().background_fetches_pool_size)
|
||||
{
|
||||
@ -459,6 +462,7 @@ StorageReplicatedMergeTree::StorageReplicatedMergeTree(
|
||||
}
|
||||
|
||||
createNewZooKeeperNodes();
|
||||
syncPinnedPartUUIDs();
|
||||
}
|
||||
|
||||
|
||||
@ -580,6 +584,9 @@ void StorageReplicatedMergeTree::createNewZooKeeperNodes()
|
||||
zookeeper->createIfNotExists(zookeeper_path + "/zero_copy_s3/shared", String());
|
||||
}
|
||||
|
||||
/// Part movement.
|
||||
zookeeper->createIfNotExists(zookeeper_path + "/part_moves_shard", String());
|
||||
zookeeper->createIfNotExists(zookeeper_path + "/pinned_part_uuids", getPinnedPartUUIDs()->toString());
|
||||
/// For ALTER PARTITION with multi-leaders
|
||||
zookeeper->createIfNotExists(zookeeper_path + "/alter_partition_version", String());
|
||||
}
|
||||
@ -1224,6 +1231,27 @@ void StorageReplicatedMergeTree::checkParts(bool skip_sanity_checks)
|
||||
}
|
||||
|
||||
|
||||
void StorageReplicatedMergeTree::syncPinnedPartUUIDs()
|
||||
{
|
||||
auto zookeeper = getZooKeeper();
|
||||
|
||||
Coordination::Stat stat;
|
||||
String s = zookeeper->get(zookeeper_path + "/pinned_part_uuids", &stat);
|
||||
|
||||
std::lock_guard lock(pinned_part_uuids_mutex);
|
||||
|
||||
/// Unsure whether or not this can be called concurrently.
|
||||
if (pinned_part_uuids->stat.version < stat.version)
|
||||
{
|
||||
auto new_pinned_part_uuids = std::make_shared<PinnedPartUUIDs>();
|
||||
new_pinned_part_uuids->fromString(s);
|
||||
new_pinned_part_uuids->stat = stat;
|
||||
|
||||
pinned_part_uuids = new_pinned_part_uuids;
|
||||
}
|
||||
}
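syncPinnedPartUUIDs above follows a common pattern: re-read a versioned snapshot and swap the cached shared pointer only when the stored version is newer, under a mutex, so concurrent readers keep a consistent old copy. A generic, self-contained sketch of that pattern with invented names (not the ClickHouse class):

// Generic sketch of the version-guarded refresh: replace the cached snapshot
// only if the incoming version is newer, under a mutex.
#include <memory>
#include <mutex>
#include <string>
#include <utility>

struct Snapshot
{
    int version = -1;
    std::string payload;
};

class VersionedCache
{
public:
    void refresh(int new_version, std::string new_payload)
    {
        std::lock_guard lock(mutex);
        if (!current || current->version < new_version)
        {
            auto fresh = std::make_shared<Snapshot>();
            fresh->version = new_version;
            fresh->payload = std::move(new_payload);
            current = fresh;    // readers holding the old shared_ptr stay valid
        }
    }

    std::shared_ptr<Snapshot> get() const
    {
        std::lock_guard lock(mutex);
        return current;
    }

private:
    mutable std::mutex mutex;
    std::shared_ptr<Snapshot> current;
};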
|
||||
|
||||
|
||||
void StorageReplicatedMergeTree::checkPartChecksumsAndAddCommitOps(const zkutil::ZooKeeperPtr & zookeeper,
|
||||
const DataPartPtr & part, Coordination::Requests & ops, String part_name, NameSet * absent_replicas_paths)
|
||||
{
|
||||
@ -1512,6 +1540,12 @@ bool StorageReplicatedMergeTree::executeLogEntry(LogEntry & entry)
|
||||
break;
|
||||
case LogEntry::ALTER_METADATA:
|
||||
return executeMetadataAlter(entry);
|
||||
case LogEntry::SYNC_PINNED_PART_UUIDS:
|
||||
syncPinnedPartUUIDs();
|
||||
return true;
|
||||
case LogEntry::CLONE_PART_FROM_SHARD:
|
||||
executeClonePartFromShard(entry);
|
||||
return true;
|
||||
default:
|
||||
throw Exception(ErrorCodes::LOGICAL_ERROR, "Unexpected log entry type: {}", static_cast<int>(entry.type));
|
||||
}
|
||||
@ -2543,6 +2577,59 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry)
|
||||
}
|
||||
|
||||
|
||||
void StorageReplicatedMergeTree::executeClonePartFromShard(const LogEntry & entry)
|
||||
{
|
||||
auto zookeeper = getZooKeeper();
|
||||
|
||||
Strings replicas = zookeeper->getChildren(entry.source_shard + "/replicas");
|
||||
std::shuffle(replicas.begin(), replicas.end(), thread_local_rng);
|
||||
String replica;
|
||||
for (const String & candidate : replicas)
|
||||
{
|
||||
if (zookeeper->exists(entry.source_shard + "/replicas/" + candidate + "/is_active"))
|
||||
{
|
||||
replica = candidate;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (replica.empty())
|
||||
throw Exception(ErrorCodes::NO_REPLICA_HAS_PART, "No active replica found on shard {} to clone part {}", entry.source_shard, entry.new_part_name);
|
||||
|
||||
LOG_INFO(log, "Will clone part from shard " + entry.source_shard + " and replica " + replica);
|
||||
|
||||
MutableDataPartPtr part;
|
||||
|
||||
{
|
||||
auto metadata_snapshot = getInMemoryMetadataPtr();
|
||||
String source_replica_path = entry.source_shard + "/replicas/" + replica;
|
||||
ReplicatedMergeTreeAddress address(getZooKeeper()->get(source_replica_path + "/host"));
|
||||
auto timeouts = ConnectionTimeouts::getHTTPTimeouts(getContext());
|
||||
auto credentials = getContext()->getInterserverCredentials();
|
||||
String interserver_scheme = getContext()->getInterserverScheme();
|
||||
|
||||
auto get_part = [&, address, timeouts, credentials, interserver_scheme]()
|
||||
{
|
||||
if (interserver_scheme != address.scheme)
|
||||
throw Exception("Interserver schemes are different: '" + interserver_scheme
|
||||
+ "' != '" + address.scheme + "', can't fetch part from " + address.host,
|
||||
ErrorCodes::LOGICAL_ERROR);
|
||||
|
||||
return fetcher.fetchPart(
|
||||
metadata_snapshot, entry.new_part_name, source_replica_path,
|
||||
address.host, address.replication_port,
|
||||
timeouts, credentials->getUser(), credentials->getPassword(), interserver_scheme, true);
|
||||
};
|
||||
|
||||
part = get_part();
|
||||
// The fetched part is valuable and should not be cleaned like a temp part.
|
||||
part->is_temp = false;
|
||||
part->renameTo("detached/" + entry.new_part_name, true);
|
||||
LOG_INFO(log, "Cloned part {} to detached directory", part->name);
|
||||
}
|
||||
}
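The replica selection at the top of executeClonePartFromShard shuffles the candidates and takes the first active one. A minimal sketch of that selection, where the is_active predicate stands in for the ZooKeeper .../is_active existence check:

// Minimal sketch of the replica selection above: shuffle the candidates and
// take the first one that passes an "is active" check.
#include <algorithm>
#include <functional>
#include <random>
#include <string>
#include <vector>

std::string pickActiveReplica(std::vector<std::string> replicas,
                              const std::function<bool(const std::string &)> & is_active)
{
    static std::mt19937 rng{std::random_device{}()};
    std::shuffle(replicas.begin(), replicas.end(), rng);

    for (const auto & candidate : replicas)
        if (is_active(candidate))
            return candidate;

    return {};   // empty string means "no active replica found"
}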
|
||||
|
||||
|
||||
void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coordination::Stat source_is_lost_stat, zkutil::ZooKeeperPtr & zookeeper)
|
||||
{
|
||||
String source_path = zookeeper_path + "/replicas/" + source_replica;
|
||||
@ -4146,6 +4233,8 @@ void StorageReplicatedMergeTree::startup()
|
||||
/// between the assignment of queue_task_handle and queueTask that use the queue_task_handle.
|
||||
background_executor.start();
|
||||
startBackgroundMovesIfNeeded();
|
||||
|
||||
part_moves_between_shards_orchestrator.start();
|
||||
}
|
||||
catch (...)
|
||||
{
|
||||
@ -4176,6 +4265,7 @@ void StorageReplicatedMergeTree::shutdown()
|
||||
|
||||
restarting_thread.shutdown();
|
||||
background_executor.finish();
|
||||
part_moves_between_shards_orchestrator.shutdown();
|
||||
|
||||
{
|
||||
auto lock = queue.lockQueue();
|
||||
@ -5097,8 +5187,14 @@ bool StorageReplicatedMergeTree::existsNodeCached(const std::string & path) cons
|
||||
|
||||
std::optional<EphemeralLockInZooKeeper>
|
||||
StorageReplicatedMergeTree::allocateBlockNumber(
|
||||
const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_block_id_path) const
|
||||
const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper, const String & zookeeper_block_id_path, const String & zookeeper_path_prefix) const
|
||||
{
|
||||
String zookeeper_table_path;
|
||||
if (zookeeper_path_prefix.empty())
|
||||
zookeeper_table_path = zookeeper_path;
|
||||
else
|
||||
zookeeper_table_path = zookeeper_path_prefix;
|
||||
|
||||
/// Let's check for duplicates in advance, to avoid superfluous block number allocation
|
||||
Coordination::Requests deduplication_check_ops;
|
||||
if (!zookeeper_block_id_path.empty())
|
||||
@ -5107,7 +5203,7 @@ StorageReplicatedMergeTree::allocateBlockNumber(
|
||||
deduplication_check_ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_block_id_path, -1));
|
||||
}
|
||||
|
||||
String block_numbers_path = zookeeper_path + "/block_numbers";
|
||||
String block_numbers_path = zookeeper_table_path + "/block_numbers";
|
||||
String partition_path = block_numbers_path + "/" + partition_id;
|
||||
|
||||
if (!existsNodeCached(partition_path))
|
||||
@ -5130,7 +5226,7 @@ StorageReplicatedMergeTree::allocateBlockNumber(
|
||||
try
|
||||
{
|
||||
lock = EphemeralLockInZooKeeper(
|
||||
partition_path + "/block-", zookeeper_path + "/temp", *zookeeper, &deduplication_check_ops);
|
||||
partition_path + "/block-", zookeeper_table_path + "/temp", *zookeeper, &deduplication_check_ops);
|
||||
}
|
||||
catch (const zkutil::KeeperMultiException & e)
|
||||
{
|
||||
@ -5402,6 +5498,11 @@ void StorageReplicatedMergeTree::getQueue(LogEntriesData & res, String & replica
|
||||
queue.getEntries(res);
|
||||
}
|
||||
|
||||
std::vector<PartMovesBetweenShardsOrchestrator::Entry> StorageReplicatedMergeTree::getPartMovesBetweenShardsEntries()
|
||||
{
|
||||
return part_moves_between_shards_orchestrator.getEntries();
|
||||
}
|
||||
|
||||
time_t StorageReplicatedMergeTree::getAbsoluteDelay() const
|
||||
{
|
||||
time_t min_unprocessed_insert_time = 0;
|
||||
@ -6613,6 +6714,100 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta
|
||||
cleanLastPartNode(partition_id);
|
||||
}
|
||||
|
||||
void StorageReplicatedMergeTree::movePartitionToShard(
|
||||
const ASTPtr & partition, bool move_part, const String & to, ContextPtr /*query_context*/)
|
||||
{
|
||||
/// This is a lightweight operation that only optimistically checks if it could succeed and queues tasks.
|
||||
|
||||
if (!move_part)
|
||||
throw Exception("MOVE PARTITION TO SHARD is not supported, use MOVE PART instead", ErrorCodes::NOT_IMPLEMENTED);
|
||||
|
||||
if (normalizeZooKeeperPath(zookeeper_path) == normalizeZooKeeperPath(to))
|
||||
throw Exception("Source and destination are the same", ErrorCodes::BAD_ARGUMENTS);
|
||||
|
||||
auto zookeeper = getZooKeeper();
|
||||
|
||||
String part_name = partition->as<ASTLiteral &>().value.safeGet<String>();
|
||||
auto part_info = MergeTreePartInfo::fromPartName(part_name, format_version);
|
||||
|
||||
auto part = getPartIfExists(part_info, {MergeTreeDataPartState::Committed});
|
||||
if (!part)
|
||||
throw Exception(ErrorCodes::NO_SUCH_DATA_PART, "Part {} not found locally", part_name);
|
||||
|
||||
if (part->uuid == UUIDHelpers::Nil)
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part {} does not have an uuid assigned and it can't be moved between shards", part_name);
|
||||
|
||||
|
||||
ReplicatedMergeTreeMergePredicate merge_pred = queue.getMergePredicate(zookeeper);
|
||||
|
||||
/// The following block is pretty much copy & paste from StorageReplicatedMergeTree::dropPart to avoid conflicts while this is WIP.
|
||||
/// Extract it to a common method and re-use it before merging.
|
||||
{
|
||||
if (partIsLastQuorumPart(part->info))
|
||||
{
|
||||
throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Part {} is last inserted part with quorum in partition. Would not be able to drop", part_name);
|
||||
}
|
||||
|
||||
/// canMergeSinglePart is overlapping with dropPart, let's try to use the same code.
|
||||
String out_reason;
|
||||
if (!merge_pred.canMergeSinglePart(part, &out_reason))
|
||||
throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, "Part is busy, reason: " + out_reason);
|
||||
}
|
||||
|
||||
{
|
||||
/// Optimistic check that the destination table structure is compatible.
|
||||
checkTableStructure(to, getInMemoryMetadataPtr());
|
||||
}
|
||||
|
||||
PinnedPartUUIDs src_pins;
|
||||
PinnedPartUUIDs dst_pins;
|
||||
|
||||
{
|
||||
String s = zookeeper->get(zookeeper_path + "/pinned_part_uuids", &src_pins.stat);
|
||||
src_pins.fromString(s);
|
||||
}
|
||||
|
||||
{
|
||||
String s = zookeeper->get(to + "/pinned_part_uuids", &dst_pins.stat);
|
||||
dst_pins.fromString(s);
|
||||
}
|
||||
|
||||
if (src_pins.part_uuids.contains(part->uuid) || dst_pins.part_uuids.contains(part->uuid))
|
||||
throw Exception(ErrorCodes::PART_IS_TEMPORARILY_LOCKED, "Part {} has its uuid ({}) already pinned.", part_name, toString(part->uuid));
|
||||
|
||||
src_pins.part_uuids.insert(part->uuid);
|
||||
dst_pins.part_uuids.insert(part->uuid);
|
||||
|
||||
PartMovesBetweenShardsOrchestrator::Entry part_move_entry;
|
||||
part_move_entry.create_time = std::time(nullptr);
|
||||
part_move_entry.update_time = part_move_entry.create_time;
|
||||
part_move_entry.task_uuid = UUIDHelpers::generateV4();
|
||||
part_move_entry.part_name = part->name;
|
||||
part_move_entry.part_uuid = part->uuid;
|
||||
part_move_entry.to_shard = to;
|
||||
|
||||
Coordination::Requests ops;
|
||||
ops.emplace_back(zkutil::makeCheckRequest(zookeeper_path + "/log", merge_pred.getVersion())); /// Make sure no new events were added to the log.
|
||||
ops.emplace_back(zkutil::makeSetRequest(zookeeper_path + "/pinned_part_uuids", src_pins.toString(), src_pins.stat.version));
|
||||
ops.emplace_back(zkutil::makeSetRequest(to + "/pinned_part_uuids", dst_pins.toString(), dst_pins.stat.version));
|
||||
ops.emplace_back(zkutil::makeCreateRequest(
|
||||
part_moves_between_shards_orchestrator.entries_znode_path + "/task-",
|
||||
part_move_entry.toString(),
|
||||
zkutil::CreateMode::PersistentSequential));
|
||||
|
||||
Coordination::Responses responses;
|
||||
Coordination::Error rc = zookeeper->tryMulti(ops, responses);
|
||||
zkutil::KeeperMultiException::check(rc, ops, responses);
|
||||
|
||||
String task_znode_path = dynamic_cast<const Coordination::CreateResponse &>(*responses.back()).path_created;
|
||||
LOG_DEBUG(log, "Created task for part movement between shards at " + task_znode_path);
|
||||
|
||||
/// Force refresh local state. This will make the task immediately visible in `system.part_moves_between_shards` table.
|
||||
part_moves_between_shards_orchestrator.fetchStateFromZK();
|
||||
|
||||
// TODO: Add support for `replication_alter_partitions_sync`.
|
||||
}
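movePartitionToShard pins the part's UUID on both the source and destination shards before creating the move task, and rejects the move if either side already pins it; in the real code both updates and the task creation happen inside a single ZooKeeper multi-op. A simplified sketch of that pinning step, with plain std::set standing in for the ZooKeeper-backed PinnedPartUUIDs:

// Simplified sketch of the pinning step: refuse the move if the part's UUID
// is already pinned on either shard, otherwise pin it on both sides.
#include <set>
#include <stdexcept>
#include <string>

void pinOnBothShards(std::set<std::string> & src_pins,
                     std::set<std::string> & dst_pins,
                     const std::string & part_uuid)
{
    if (src_pins.count(part_uuid) || dst_pins.count(part_uuid))
        throw std::runtime_error("part uuid is already pinned");

    src_pins.insert(part_uuid);
    dst_pins.insert(part_uuid);
}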
|
||||
|
||||
void StorageReplicatedMergeTree::getCommitPartOps(
|
||||
Coordination::Requests & ops,
|
||||
MutableDataPartPtr & part,
|
||||
|
@ -20,6 +20,7 @@
|
||||
#include <Storages/MergeTree/DataPartsExchange.h>
|
||||
#include <Storages/MergeTree/ReplicatedMergeTreeAddress.h>
|
||||
#include <Storages/MergeTree/LeaderElection.h>
|
||||
#include <Storages/MergeTree/PartMovesBetweenShardsOrchestrator.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Interpreters/Cluster.h>
|
||||
#include <Interpreters/PartLog.h>
|
||||
@ -183,6 +184,8 @@ public:
|
||||
using LogEntriesData = std::vector<ReplicatedMergeTreeLogEntryData>;
|
||||
void getQueue(LogEntriesData & res, String & replica_name);
|
||||
|
||||
std::vector<PartMovesBetweenShardsOrchestrator::Entry> getPartMovesBetweenShardsEntries();
|
||||
|
||||
/// Get replica delay relative to current time.
|
||||
time_t getAbsoluteDelay() const;
|
||||
|
||||
@ -252,6 +255,7 @@ private:
|
||||
friend struct ReplicatedMergeTreeLogEntry;
|
||||
friend class ScopedPartitionMergeLock;
|
||||
friend class ReplicatedMergeTreeQueue;
|
||||
friend class PartMovesBetweenShardsOrchestrator;
|
||||
friend class MergeTreeData;
|
||||
|
||||
using MergeStrategyPicker = ReplicatedMergeTreeMergeStrategyPicker;
|
||||
@ -305,7 +309,6 @@ private:
|
||||
|
||||
DataPartsExchange::Fetcher fetcher;
|
||||
|
||||
|
||||
/// When activated, replica is initialized and startup() method could exit
|
||||
Poco::Event startup_event;
|
||||
|
||||
@ -350,6 +353,8 @@ private:
|
||||
/// A thread that processes reconnection to ZooKeeper when the session expires.
|
||||
ReplicatedMergeTreeRestartingThread restarting_thread;
|
||||
|
||||
PartMovesBetweenShardsOrchestrator part_moves_between_shards_orchestrator;
|
||||
|
||||
/// True if replica was created for existing table with fixed granularity
|
||||
bool other_replicas_fixed_granularity = false;
|
||||
|
||||
@ -387,6 +392,10 @@ private:
|
||||
*/
|
||||
void checkParts(bool skip_sanity_checks);
|
||||
|
||||
/// Synchronize the list of part UUIDs which are currently pinned. These should be sent to the root query executor
/// to be used for deduplication.
|
||||
void syncPinnedPartUUIDs();
|
||||
|
||||
/** Check that the part's checksum is the same as the checksum of the same part on some other replica.
|
||||
* If no replica has such a part, nothing is checked.
|
||||
* Not very reliable: if two replicas add a part almost at the same time, no checks will occur.
|
||||
@ -457,6 +466,7 @@ private:
|
||||
bool executeFetch(LogEntry & entry);
|
||||
|
||||
bool executeReplaceRange(const LogEntry & entry);
|
||||
void executeClonePartFromShard(const LogEntry & entry);
|
||||
|
||||
/** Updates the queue.
|
||||
*/
|
||||
@ -585,9 +595,11 @@ private:
|
||||
bool partIsInsertingWithParallelQuorum(const MergeTreePartInfo & part_info) const;
|
||||
|
||||
/// Creates new block number if block with such block_id does not exist
|
||||
/// If zookeeper_path_prefix is specified then allocate the block number under that path
/// (can be used if we want to allocate blocks on other replicas)
|
||||
std::optional<EphemeralLockInZooKeeper> allocateBlockNumber(
|
||||
const String & partition_id, const zkutil::ZooKeeperPtr & zookeeper,
|
||||
const String & zookeeper_block_id_path = "") const;
|
||||
const String & zookeeper_block_id_path = "", const String & zookeeper_path_prefix = "") const;
|
||||
|
||||
/** Wait until all replicas, including this, execute the specified action from the log.
|
||||
* If replicas are added at the same time, it may not wait for the added replica.
|
||||
@ -639,6 +651,7 @@ private:
|
||||
PartitionCommandsResultInfo attachPartition(const ASTPtr & partition, const StorageMetadataPtr & metadata_snapshot, bool part, ContextPtr query_context) override;
|
||||
void replacePartitionFrom(const StoragePtr & source_table, const ASTPtr & partition, bool replace, ContextPtr query_context) override;
|
||||
void movePartitionToTable(const StoragePtr & dest_table, const ASTPtr & partition, ContextPtr query_context) override;
|
||||
void movePartitionToShard(const ASTPtr & partition, bool move_part, const String & to, ContextPtr query_context) override;
|
||||
void fetchPartition(
|
||||
const ASTPtr & partition,
|
||||
const StorageMetadataPtr & metadata_snapshot,
|
||||
|
135
src/Storages/System/StorageSystemPartMovesBetweenShards.cpp
Normal file
@ -0,0 +1,135 @@
|
||||
#include <Access/ContextAccess.h>
|
||||
#include <Columns/ColumnString.h>
|
||||
#include <DataTypes/DataTypeArray.h>
|
||||
#include <DataTypes/DataTypeDateTime.h>
|
||||
#include <DataTypes/DataTypeString.h>
|
||||
#include <DataTypes/DataTypeUUID.h>
|
||||
#include <DataTypes/DataTypesNumber.h>
|
||||
#include <Databases/IDatabase.h>
|
||||
#include <Storages/StorageReplicatedMergeTree.h>
|
||||
#include <Storages/System/StorageSystemPartMovesBetweenShards.h>
|
||||
#include <Storages/VirtualColumnUtils.h>
|
||||
#include <Common/typeid_cast.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
|
||||
NamesAndTypesList StorageSystemPartMovesBetweenShards::getNamesAndTypes()
|
||||
{
|
||||
return {
|
||||
/// Table properties.
|
||||
{ "database", std::make_shared<DataTypeString>() },
|
||||
{ "table", std::make_shared<DataTypeString>() },
|
||||
|
||||
/// Constant element properties.
|
||||
{ "task_name", std::make_shared<DataTypeString>() },
|
||||
{ "task_uuid", std::make_shared<DataTypeUUID>() },
|
||||
{ "create_time", std::make_shared<DataTypeDateTime>() },
|
||||
{ "part_name", std::make_shared<DataTypeString>() },
|
||||
{ "part_uuid", std::make_shared<DataTypeUUID>() },
|
||||
{ "to_shard", std::make_shared<DataTypeString>() },
|
||||
|
||||
/// Processing status of item.
|
||||
{ "update_time", std::make_shared<DataTypeDateTime>() },
|
||||
{ "state", std::make_shared<DataTypeString>() },
|
||||
{ "num_tries", std::make_shared<DataTypeUInt32>() },
|
||||
{ "last_exception", std::make_shared<DataTypeString>() },
|
||||
};
|
||||
}
|
||||
|
||||
|
||||
void StorageSystemPartMovesBetweenShards::fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const
|
||||
{
|
||||
const auto access = context->getAccess();
|
||||
const bool check_access_for_databases = !access->isGranted(AccessType::SHOW_TABLES);
|
||||
|
||||
std::map<String, std::map<String, StoragePtr>> replicated_tables;
|
||||
for (const auto & db : DatabaseCatalog::instance().getDatabases())
|
||||
{
|
||||
/// Check if database can contain replicated tables
|
||||
if (!db.second->canContainMergeTreeTables())
|
||||
continue;
|
||||
|
||||
const bool check_access_for_tables = check_access_for_databases && !access->isGranted(AccessType::SHOW_TABLES, db.first);
|
||||
|
||||
for (auto iterator = db.second->getTablesIterator(context); iterator->isValid(); iterator->next())
|
||||
{
|
||||
const auto & table = iterator->table();
|
||||
if (!table)
|
||||
continue;
|
||||
if (!dynamic_cast<const StorageReplicatedMergeTree *>(table.get()))
|
||||
continue;
|
||||
if (check_access_for_tables && !access->isGranted(AccessType::SHOW_TABLES, db.first, iterator->name()))
|
||||
continue;
|
||||
replicated_tables[db.first][iterator->name()] = table;
|
||||
}
|
||||
}
|
||||
|
||||
|
||||
MutableColumnPtr col_database_mut = ColumnString::create();
|
||||
MutableColumnPtr col_table_mut = ColumnString::create();
|
||||
|
||||
for (auto & db : replicated_tables)
|
||||
{
|
||||
for (auto & table : db.second)
|
||||
{
|
||||
col_database_mut->insert(db.first);
|
||||
col_table_mut->insert(table.first);
|
||||
}
|
||||
}
|
||||
|
||||
ColumnPtr col_database_to_filter = std::move(col_database_mut);
|
||||
ColumnPtr col_table_to_filter = std::move(col_table_mut);
|
||||
|
||||
/// Determine what tables are needed by the conditions in the query.
|
||||
{
|
||||
Block filtered_block
|
||||
{
|
||||
{ col_database_to_filter, std::make_shared<DataTypeString>(), "database" },
|
||||
{ col_table_to_filter, std::make_shared<DataTypeString>(), "table" },
|
||||
};
|
||||
|
||||
VirtualColumnUtils::filterBlockWithQuery(query_info.query, filtered_block, context);
|
||||
|
||||
if (!filtered_block.rows())
|
||||
return;
|
||||
|
||||
col_database_to_filter = filtered_block.getByName("database").column;
|
||||
col_table_to_filter = filtered_block.getByName("table").column;
|
||||
}
|
||||
|
||||
for (size_t i = 0, tables_size = col_database_to_filter->size(); i < tables_size; ++i)
|
||||
{
|
||||
String database = (*col_database_to_filter)[i].safeGet<const String &>();
|
||||
String table = (*col_table_to_filter)[i].safeGet<const String &>();
|
||||
|
||||
auto moves = dynamic_cast<StorageReplicatedMergeTree &>(*replicated_tables[database][table]).getPartMovesBetweenShardsEntries();
|
||||
|
||||
for (auto & entry : moves)
|
||||
{
|
||||
size_t col_num = 0;
|
||||
|
||||
/// Table properties.
|
||||
res_columns[col_num++]->insert(database);
|
||||
res_columns[col_num++]->insert(table);
|
||||
|
||||
/// Constant element properties.
|
||||
res_columns[col_num++]->insert(entry.znode_name);
|
||||
res_columns[col_num++]->insert(entry.task_uuid);
|
||||
res_columns[col_num++]->insert(entry.create_time);
|
||||
res_columns[col_num++]->insert(entry.part_name);
|
||||
res_columns[col_num++]->insert(entry.part_uuid);
|
||||
res_columns[col_num++]->insert(entry.to_shard);
|
||||
|
||||
/// Processing status of item.
|
||||
res_columns[col_num++]->insert(entry.update_time);
|
||||
res_columns[col_num++]->insert(entry.state.toString());
|
||||
res_columns[col_num++]->insert(0);
|
||||
res_columns[col_num++]->insert(entry.last_exception_msg);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
}
|
27
src/Storages/System/StorageSystemPartMovesBetweenShards.h
Normal file
@ -0,0 +1,27 @@
|
||||
#pragma once
|
||||
|
||||
#include <ext/shared_ptr_helper.h>
|
||||
#include <Storages/System/IStorageSystemOneBlock.h>
|
||||
|
||||
|
||||
namespace DB
|
||||
{
|
||||
|
||||
class Context;
|
||||
|
||||
|
||||
class StorageSystemPartMovesBetweenShards final : public ext::shared_ptr_helper<StorageSystemPartMovesBetweenShards>, public IStorageSystemOneBlock<StorageSystemPartMovesBetweenShards>
|
||||
{
|
||||
friend struct ext::shared_ptr_helper<StorageSystemPartMovesBetweenShards>;
|
||||
public:
|
||||
std::string getName() const override { return "SystemShardMoves"; }
|
||||
|
||||
static NamesAndTypesList getNamesAndTypes();
|
||||
|
||||
protected:
|
||||
using IStorageSystemOneBlock::IStorageSystemOneBlock;
|
||||
|
||||
void fillData(MutableColumns & res_columns, ContextPtr context, const SelectQueryInfo & query_info) const override;
|
||||
};
|
||||
|
||||
}
|
@ -25,6 +25,7 @@
|
||||
#include <Storages/System/StorageSystemMutations.h>
|
||||
#include <Storages/System/StorageSystemNumbers.h>
|
||||
#include <Storages/System/StorageSystemOne.h>
|
||||
#include <Storages/System/StorageSystemPartMovesBetweenShards.h>
|
||||
#include <Storages/System/StorageSystemParts.h>
|
||||
#include <Storages/System/StorageSystemProjectionParts.h>
|
||||
#include <Storages/System/StorageSystemPartsColumns.h>
|
||||
@ -148,6 +149,7 @@ void attachSystemTablesServer(IDatabase & system_database, bool has_zookeeper)
|
||||
attach<StorageSystemGraphite>(system_database, "graphite_retentions");
|
||||
attach<StorageSystemMacros>(system_database, "macros");
|
||||
attach<StorageSystemReplicatedFetches>(system_database, "replicated_fetches");
|
||||
attach<StorageSystemPartMovesBetweenShards>(system_database, "part_moves_between_shards");
|
||||
|
||||
if (has_zookeeper)
|
||||
attach<StorageSystemZooKeeper>(system_database, "zookeeper");
|
||||
|
@ -92,7 +92,9 @@ SRCS(
|
||||
MergeTree/MergeType.cpp
|
||||
MergeTree/MergedBlockOutputStream.cpp
|
||||
MergeTree/MergedColumnOnlyOutputStream.cpp
|
||||
MergeTree/PartMovesBetweenShardsOrchestrator.cpp
|
||||
MergeTree/PartitionPruner.cpp
|
||||
MergeTree/PinnedPartUUIDs.cpp
|
||||
MergeTree/ReplicatedFetchList.cpp
|
||||
MergeTree/ReplicatedMergeTreeAddress.cpp
|
||||
MergeTree/ReplicatedMergeTreeAltersSequence.cpp
|
||||
@ -177,6 +179,7 @@ SRCS(
|
||||
System/StorageSystemMutations.cpp
|
||||
System/StorageSystemNumbers.cpp
|
||||
System/StorageSystemOne.cpp
|
||||
System/StorageSystemPartMovesBetweenShards.cpp
|
||||
System/StorageSystemParts.cpp
|
||||
System/StorageSystemPartsBase.cpp
|
||||
System/StorageSystemPartsColumns.cpp
|
||||
|
@ -185,9 +185,10 @@ def run_single_test(args, ext, server_logs_level, client_options, case_file, std
|
||||
'stderr': stderr_file,
|
||||
}
|
||||
|
||||
# >> append to stdout and stderr, because there are also output of per test database creation
|
||||
# >> append to stderr (but not stdout since it is not used there),
|
||||
# because there is also output of per-test database creation
|
||||
if not args.database:
|
||||
pattern = '{test} >> {stdout} 2>> {stderr}'
|
||||
pattern = '{test} > {stdout} 2>> {stderr}'
|
||||
else:
|
||||
pattern = '{test} > {stdout} 2> {stderr}'
|
||||
|
||||
|
@ -0,0 +1,7 @@
|
||||
<yandex>
|
||||
<merge_tree>
|
||||
<assign_part_uuids>1</assign_part_uuids>
|
||||
<part_moves_between_shards_enable>1</part_moves_between_shards_enable>
|
||||
<part_moves_between_shards_delay_seconds>3</part_moves_between_shards_delay_seconds>
|
||||
</merge_tree>
|
||||
</yandex>
|
@ -0,0 +1,26 @@
|
||||
<yandex>
|
||||
<remote_servers>
|
||||
<test_cluster>
|
||||
<shard>
|
||||
<replica>
|
||||
<host>s0r0</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
<replica>
|
||||
<host>s0r1</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
</shard>
|
||||
<shard>
|
||||
<replica>
|
||||
<host>s1r0</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
<replica>
|
||||
<host>s1r1</host>
|
||||
<port>9000</port>
|
||||
</replica>
|
||||
</shard>
|
||||
</test_cluster>
|
||||
</remote_servers>
|
||||
</yandex>
|
159
tests/integration/test_part_moves_between_shards/test.py
Normal file
@ -0,0 +1,159 @@
|
||||
import random
|
||||
import time
|
||||
|
||||
import pytest
|
||||
|
||||
from helpers.client import QueryRuntimeException
|
||||
from helpers.cluster import ClickHouseCluster
|
||||
from helpers.test_tools import TSV
|
||||
|
||||
cluster = ClickHouseCluster(__file__)
|
||||
|
||||
s0r0 = cluster.add_instance(
|
||||
's0r0',
|
||||
main_configs=['configs/remote_servers.xml', 'configs/merge_tree.xml'],
|
||||
with_zookeeper=True)
|
||||
|
||||
s0r1 = cluster.add_instance(
|
||||
's0r1',
|
||||
main_configs=['configs/remote_servers.xml', 'configs/merge_tree.xml'],
|
||||
with_zookeeper=True)
|
||||
|
||||
s1r0 = cluster.add_instance(
|
||||
's1r0',
|
||||
main_configs=['configs/remote_servers.xml', 'configs/merge_tree.xml'],
|
||||
with_zookeeper=True)
|
||||
|
||||
s1r1 = cluster.add_instance(
|
||||
's1r1',
|
||||
main_configs=['configs/remote_servers.xml', 'configs/merge_tree.xml'],
|
||||
with_zookeeper=True)
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def started_cluster():
|
||||
try:
|
||||
cluster.start()
|
||||
yield cluster
|
||||
finally:
|
||||
cluster.shutdown()
|
||||
|
||||
|
||||
def test_move(started_cluster):
|
||||
for shard_ix, rs in enumerate([[s0r0, s0r1], [s1r0, s1r1]]):
|
||||
for replica_ix, r in enumerate(rs):
|
||||
r.query("""
|
||||
CREATE TABLE test_move(v UInt64)
|
||||
ENGINE ReplicatedMergeTree('/clickhouse/shard_{}/tables/test_move', '{}')
|
||||
ORDER BY tuple()
|
||||
""".format(shard_ix, replica_ix))
|
||||
|
||||
s0r0.query("SYSTEM STOP MERGES test_move")
|
||||
|
||||
s0r0.query("INSERT INTO test_move VALUES (1)")
|
||||
s0r0.query("INSERT INTO test_move VALUES (2)")
|
||||
|
||||
assert "2" == s0r0.query("SELECT count() FROM test_move").strip()
|
||||
assert "0" == s1r0.query("SELECT count() FROM test_move").strip()
|
||||
|
||||
s0r0.query("ALTER TABLE test_move MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_1/tables/test_move'")
|
||||
|
||||
print(s0r0.query("SELECT * FROM system.part_moves_between_shards"))
|
||||
|
||||
s0r0.query("SYSTEM START MERGES test_move")
|
||||
s0r0.query("OPTIMIZE TABLE test_move FINAL")
|
||||
|
||||
while True:
|
||||
time.sleep(3)
|
||||
|
||||
print(s0r0.query("SELECT * FROM system.part_moves_between_shards"))
|
||||
|
||||
# Eventually.
|
||||
if "DONE" == s0r0.query("SELECT state FROM system.part_moves_between_shards WHERE table = 'test_move'").strip():
|
||||
break
|
||||
|
||||
for n in [s0r0, s0r1]:
|
||||
assert "1" == n.query("SELECT count() FROM test_move").strip()
|
||||
|
||||
for n in [s1r0, s1r1]:
|
||||
assert "1" == n.query("SELECT count() FROM test_move").strip()
|
||||
|
||||
# Move part back
|
||||
s1r0.query("ALTER TABLE test_move MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_0/tables/test_move'")
|
||||
|
||||
while True:
|
||||
time.sleep(3)
|
||||
|
||||
print(s1r0.query("SELECT * FROM system.part_moves_between_shards"))
|
||||
|
||||
# Eventually.
|
||||
if "DONE" == s1r0.query("SELECT state FROM system.part_moves_between_shards WHERE table = 'test_move'").strip():
|
||||
break
|
||||
|
||||
for n in [s0r0, s0r1]:
|
||||
assert "2" == n.query("SELECT count() FROM test_move").strip()
|
||||
|
||||
for n in [s1r0, s1r1]:
|
||||
assert "0" == n.query("SELECT count() FROM test_move").strip()
|
||||
|
||||
|
||||
def test_deduplication_while_move(started_cluster):
|
||||
for shard_ix, rs in enumerate([[s0r0, s0r1], [s1r0, s1r1]]):
|
||||
for replica_ix, r in enumerate(rs):
|
||||
r.query("""
|
||||
CREATE TABLE test_deduplication(v UInt64)
|
||||
ENGINE ReplicatedMergeTree('/clickhouse/shard_{}/tables/test_deduplication', '{}')
|
||||
ORDER BY tuple()
|
||||
""".format(shard_ix, replica_ix))
|
||||
|
||||
r.query("""
|
||||
CREATE TABLE t_d AS test_deduplication
|
||||
ENGINE Distributed('test_cluster', '', test_deduplication)
|
||||
""")
|
||||
|
||||
s0r0.query("SYSTEM STOP MERGES test_deduplication")
|
||||
|
||||
s0r0.query("INSERT INTO test_deduplication VALUES (1)")
|
||||
s0r0.query("INSERT INTO test_deduplication VALUES (2)")
|
||||
s0r1.query("SYSTEM SYNC REPLICA test_deduplication", timeout=20)
|
||||
|
||||
assert "2" == s0r0.query("SELECT count() FROM test_deduplication").strip()
|
||||
assert "0" == s1r0.query("SELECT count() FROM test_deduplication").strip()
|
||||
|
||||
s0r0.query("ALTER TABLE test_deduplication MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_1/tables/test_deduplication'")
|
||||
s0r0.query("SYSTEM START MERGES test_deduplication")
|
||||
|
||||
expected = """
|
||||
1
|
||||
2
|
||||
"""
|
||||
|
||||
# Verify that we get a consistent result at all times while the part is moving from one shard to another.
|
||||
while "DONE" != s0r0.query("SELECT state FROM system.part_moves_between_shards WHERE table = 'test_deduplication' ORDER BY create_time DESC LIMIT 1").strip():
|
||||
n = random.choice(list(started_cluster.instances.values()))
|
||||
|
||||
assert TSV(n.query("SELECT * FROM t_d ORDER BY v", settings={
|
||||
"allow_experimental_query_deduplication": 1
|
||||
})) == TSV(expected)
|
||||
|
||||
|
||||
def test_move_not_permitted(started_cluster):
|
||||
for ix, n in enumerate([s0r0, s1r0]):
|
||||
n.query("DROP TABLE IF EXISTS not_permitted")
|
||||
n.query("""
|
||||
CREATE TABLE not_permitted(v_{} UInt64)
|
||||
ENGINE ReplicatedMergeTree('/clickhouse/shard_{}/tables/not_permitted', 'r')
|
||||
ORDER BY tuple()
|
||||
""".format(ix, ix))
|
||||
|
||||
s0r0.query("INSERT INTO not_permitted VALUES (1)")
|
||||
|
||||
with pytest.raises(QueryRuntimeException) as exc:
|
||||
s0r0.query("ALTER TABLE not_permitted MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_1/tables/not_permitted'")
|
||||
|
||||
assert "DB::Exception: Table columns structure in ZooKeeper is different from local table structure." in str(exc.value)
|
||||
|
||||
with pytest.raises(QueryRuntimeException) as exc:
|
||||
s0r0.query("ALTER TABLE not_permitted MOVE PART 'all_0_0_0' TO SHARD '/clickhouse/shard_0/tables/not_permitted'")
|
||||
|
||||
assert "DB::Exception: Source and destination are the same" in str(exc.value)
|
@ -0,0 +1,7 @@
|
||||
<yandex>
|
||||
<profiles>
|
||||
<default>
|
||||
<experimental_query_deduplication_send_all_part_uuids>1</experimental_query_deduplication_send_all_part_uuids>
|
||||
</default>
|
||||
</profiles>
|
||||
</yandex>
|
@ -11,15 +11,18 @@ cluster = ClickHouseCluster(__file__)
|
||||
|
||||
node1 = cluster.add_instance(
|
||||
'node1',
|
||||
main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml'])
|
||||
main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml'],
|
||||
user_configs=['configs/profiles.xml'])
|
||||
|
||||
node2 = cluster.add_instance(
|
||||
'node2',
|
||||
main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml'])
|
||||
main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml'],
|
||||
user_configs=['configs/profiles.xml'])
|
||||
|
||||
node3 = cluster.add_instance(
|
||||
'node3',
|
||||
main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml'])
|
||||
main_configs=['configs/remote_servers.xml', 'configs/deduplication_settings.xml'],
|
||||
user_configs=['configs/profiles.xml'])
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
|
@ -25,6 +25,6 @@ DROP DATABASE IF EXISTS test_DatabaseMemory;
|
||||
CREATE DATABASE test_DatabaseMemory ENGINE = Memory;
|
||||
CREATE TABLE test_DatabaseMemory.A (A UInt8) ENGINE = Null;
|
||||
|
||||
SELECT sum(ignore(*, metadata_modification_time, engine_full, create_table_query)) FROM system.tables;
|
||||
SELECT sum(ignore(*, metadata_modification_time, engine_full, create_table_query)) FROM system.tables WHERE database = 'test_DatabaseMemory';
|
||||
|
||||
DROP DATABASE test_DatabaseMemory;
|
||||
|
@ -57,7 +57,8 @@ ${CLICKHOUSE_LOCAL} -q "CREATE TABLE sophisticated_default
|
||||
a UInt8 DEFAULT 3,
|
||||
b UInt8 ALIAS a + 5,
|
||||
c UInt8
|
||||
) ENGINE = Memory; SELECT count() FROM system.tables WHERE name='sophisticated_default';"
|
||||
) ENGINE = Memory;
|
||||
SELECT count() FROM system.tables WHERE name='sophisticated_default' AND database = currentDatabase();"
|
||||
|
||||
# Help is not skipped
|
||||
[[ $(${CLICKHOUSE_LOCAL} --help | wc -l) -gt 100 ]]
|
@ -1,3 +1,5 @@
|
||||
-- NOTE: database = currentDatabase() is not mandatory
|
||||
|
||||
SELECT avg(blockSize()) <= 10 FROM system.tables SETTINGS max_block_size = 10;
|
||||
SELECT avg(blockSize()) <= 10 FROM system.tables LIMIT 10 SETTINGS max_block_size = 10;
|
||||
SELECT (SELECT count() FROM system.tables SETTINGS max_block_size = 10) = (SELECT count() FROM system.tables SETTINGS max_block_size = 9);
|
||||
|
@ -1,3 +1,5 @@
|
||||
-- NOTE: database = currentDatabase() is not mandatory
|
||||
|
||||
SELECT count() > 0 FROM (SELECT * FROM system.columns LIMIT 0);
|
||||
SELECT count() > 0 FROM (SELECT * FROM system.columns LIMIT 1);
|
||||
SELECT count() > 0 FROM (SELECT * FROM system.columns LIMIT 2);
|
||||
|
@ -13,17 +13,15 @@ CREATE TABLE check_system_tables
|
||||
SETTINGS min_bytes_for_wide_part = 0;
|
||||
|
||||
SELECT name, partition_key, sorting_key, primary_key, sampling_key, storage_policy, total_rows
|
||||
FROM system.tables
|
||||
WHERE name = 'check_system_tables'
|
||||
FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase()
|
||||
FORMAT PrettyCompactNoEscapes;
|
||||
|
||||
SELECT name, is_in_partition_key, is_in_sorting_key, is_in_primary_key, is_in_sampling_key
|
||||
FROM system.columns
|
||||
WHERE table = 'check_system_tables'
|
||||
FROM system.columns WHERE table = 'check_system_tables' AND database = currentDatabase()
|
||||
FORMAT PrettyCompactNoEscapes;
|
||||
|
||||
INSERT INTO check_system_tables VALUES (1, 1, 1);
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
|
||||
DROP TABLE IF EXISTS check_system_tables;
|
||||
|
||||
@ -39,13 +37,11 @@ CREATE TABLE check_system_tables
|
||||
ORDER BY date;
|
||||
|
||||
SELECT name, partition_key, sorting_key, primary_key, sampling_key
|
||||
FROM system.tables
|
||||
WHERE name = 'check_system_tables'
|
||||
FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase()
|
||||
FORMAT PrettyCompactNoEscapes;
|
||||
|
||||
SELECT name, is_in_partition_key, is_in_sorting_key, is_in_primary_key, is_in_sampling_key
|
||||
FROM system.columns
|
||||
WHERE table = 'check_system_tables'
|
||||
FROM system.columns WHERE table = 'check_system_tables' AND database = currentDatabase()
|
||||
FORMAT PrettyCompactNoEscapes;
|
||||
|
||||
DROP TABLE IF EXISTS check_system_tables;
|
||||
@ -59,29 +55,27 @@ CREATE TABLE check_system_tables
|
||||
) ENGINE = MergeTree(Event, intHash32(UserId), (Counter, Event, intHash32(UserId)), 8192);
|
||||
|
||||
SELECT name, partition_key, sorting_key, primary_key, sampling_key
|
||||
FROM system.tables
|
||||
WHERE name = 'check_system_tables'
|
||||
FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase()
|
||||
FORMAT PrettyCompactNoEscapes;
|
||||
|
||||
SELECT name, is_in_partition_key, is_in_sorting_key, is_in_primary_key, is_in_sampling_key
|
||||
FROM system.columns
|
||||
WHERE table = 'check_system_tables'
|
||||
FROM system.columns WHERE table = 'check_system_tables' AND database = currentDatabase()
|
||||
FORMAT PrettyCompactNoEscapes;
|
||||
|
||||
DROP TABLE IF EXISTS check_system_tables;
|
||||
|
||||
SELECT 'Check total_bytes/total_rows for TinyLog';
|
||||
CREATE TABLE check_system_tables (key UInt8) ENGINE = TinyLog();
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
INSERT INTO check_system_tables VALUES (1);
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
DROP TABLE check_system_tables;
|
||||
|
||||
SELECT 'Check total_bytes/total_rows for Memory';
|
||||
CREATE TABLE check_system_tables (key UInt16) ENGINE = Memory();
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
INSERT INTO check_system_tables VALUES (1);
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
DROP TABLE check_system_tables;
|
||||
|
||||
SELECT 'Check total_bytes/total_rows for Buffer';
|
||||
@ -96,33 +90,33 @@ CREATE TABLE check_system_tables (key UInt16) ENGINE = Buffer(
|
||||
100, 100, /* min_rows /max_rows */
|
||||
0, 1e6 /* min_bytes/max_bytes */
|
||||
);
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
INSERT INTO check_system_tables SELECT * FROM numbers_mt(50);
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
|
||||
SELECT 'Check lifetime_bytes/lifetime_rows for Buffer';
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
OPTIMIZE TABLE check_system_tables; -- flush
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
INSERT INTO check_system_tables SELECT * FROM numbers_mt(50);
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
OPTIMIZE TABLE check_system_tables; -- flush
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
INSERT INTO check_system_tables SELECT * FROM numbers_mt(101); -- direct block write (due to min_rows exceeded)
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT lifetime_bytes, lifetime_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
DROP TABLE check_system_tables;
|
||||
DROP TABLE check_system_tables_null;
|
||||
|
||||
SELECT 'Check total_bytes/total_rows for Set';
|
||||
CREATE TABLE check_system_tables Engine=Set() AS SELECT * FROM numbers(50);
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
INSERT INTO check_system_tables SELECT number+50 FROM numbers(50);
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
DROP TABLE check_system_tables;
|
||||
|
||||
SELECT 'Check total_bytes/total_rows for Join';
|
||||
CREATE TABLE check_system_tables Engine=Join(ANY, LEFT, number) AS SELECT * FROM numbers(50);
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
INSERT INTO check_system_tables SELECT number+50 FROM numbers(50);
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables';
|
||||
SELECT total_bytes, total_rows FROM system.tables WHERE name = 'check_system_tables' AND database = currentDatabase();
|
||||
DROP TABLE check_system_tables;
|
@ -1,3 +1,5 @@
|
||||
-- NOTE: database = currentDatabase() is not mandatory
|
||||
|
||||
DROP TABLE IF EXISTS table1;
|
||||
DROP TABLE IF EXISTS table2;
|
||||
|
||||
|
@ -1,5 +1,7 @@
|
||||
#!/usr/bin/env bash
|
||||
|
||||
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
|
||||
|
||||
set -e
|
||||
|
||||
CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
|
||||
|
@ -7,12 +7,12 @@ DROP TABLE IF EXISTS alter_compression_codec2;
|
||||
CREATE TABLE alter_compression_codec1 (
|
||||
somedate Date CODEC(LZ4),
|
||||
id UInt64 CODEC(NONE)
|
||||
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00910/alter_compression_codecs', '1') PARTITION BY somedate ORDER BY id;
|
||||
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00910/'||currentDatabase()||'alter_compression_codecs', '1') PARTITION BY somedate ORDER BY id;
|
||||
|
||||
CREATE TABLE alter_compression_codec2 (
|
||||
somedate Date CODEC(LZ4),
|
||||
id UInt64 CODEC(NONE)
|
||||
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00910/alter_compression_codecs', '2') PARTITION BY somedate ORDER BY id;
|
||||
) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test_00910/'||currentDatabase()||'alter_compression_codecs', '2') PARTITION BY somedate ORDER BY id;
|
||||
|
||||
INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 1);
INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 2);
@ -25,8 +25,8 @@ ALTER TABLE alter_compression_codec1 ADD COLUMN alter_column String DEFAULT 'def
SYSTEM SYNC REPLICA alter_compression_codec1;
SYSTEM SYNC REPLICA alter_compression_codec2;

SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column';
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column';
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column' AND database = currentDatabase();
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column' AND database = currentDatabase();

INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 3, '3');
INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 4, '4');
@ -37,8 +37,8 @@ SELECT * FROM alter_compression_codec1 ORDER BY id;
SELECT * FROM alter_compression_codec2 ORDER BY id;

ALTER TABLE alter_compression_codec1 MODIFY COLUMN alter_column CODEC(NONE);
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column';
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column';
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column' AND database = currentDatabase();
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column' AND database = currentDatabase();

INSERT INTO alter_compression_codec2 VALUES('2018-01-01', 5, '5');
INSERT INTO alter_compression_codec2 VALUES('2018-01-01', 6, '6');
@ -50,8 +50,8 @@ SET allow_suspicious_codecs = 1;
ALTER TABLE alter_compression_codec1 MODIFY COLUMN alter_column CODEC(ZSTD, LZ4HC, LZ4, LZ4, NONE);
SYSTEM SYNC REPLICA alter_compression_codec1;
SYSTEM SYNC REPLICA alter_compression_codec2;
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column';
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column';
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column' AND database = currentDatabase();
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column' AND database = currentDatabase();

INSERT INTO alter_compression_codec1 VALUES('2018-01-01', 7, '7');
INSERT INTO alter_compression_codec2 VALUES('2018-01-01', 8, '8');
@ -62,8 +62,8 @@ SELECT * FROM alter_compression_codec2 ORDER BY id;

ALTER TABLE alter_compression_codec1 MODIFY COLUMN alter_column FixedString(100);
SYSTEM SYNC REPLICA alter_compression_codec2;
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column';
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column';
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec1' AND name = 'alter_column' AND database = currentDatabase();
SELECT compression_codec FROM system.columns WHERE table = 'alter_compression_codec2' AND name = 'alter_column' AND database = currentDatabase();

DROP TABLE IF EXISTS alter_compression_codec1;
DROP TABLE IF EXISTS alter_compression_codec2;
@ -8,6 +8,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)

set -e

# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
for _ in {1..100}; do \
$CLICKHOUSE_CLIENT -q "SELECT name FROM system.tables UNION ALL SELECT name FROM system.columns format Null";
done
@ -13,6 +13,7 @@ $CLICKHOUSE_CLIENT -q "CREATE TABLE alter_table (a UInt8, b Int16, c Float32, d

function thread1()
{
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
while true; do $CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns UNION ALL SELECT name FROM system.columns FORMAT Null"; done
}

@ -15,6 +15,7 @@ $CLICKHOUSE_CLIENT -q "CREATE TABLE alter_table (a UInt8, b Int16, c Float32, d

function thread1()
{
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
while true; do $CLICKHOUSE_CLIENT --query "SELECT * FROM system.parts FORMAT Null"; done
}
@ -16,6 +16,7 @@ $CLICKHOUSE_CLIENT -n -q "

function thread1()
{
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
while true; do $CLICKHOUSE_CLIENT --query "SELECT * FROM system.parts FORMAT Null"; done
}

@ -8,7 +8,8 @@ set -e

function thread1()
{
while true; do
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
while true; do
$CLICKHOUSE_CLIENT --query "SELECT * FROM system.parts FORMAT Null";
done
}
@ -28,6 +28,7 @@ function thread2()
function thread3()
{
while true; do
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
$CLICKHOUSE_CLIENT --query "SELECT * FROM system.tables" --format Null
done
}
@ -15,14 +15,16 @@ $CLICKHOUSE_CLIENT --query "CREATE TABLE b (x UInt8) ENGINE = MergeTree ORDER BY

function thread1()
{
while true; do
while true; do
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
seq 1 100 | awk '{ print "SELECT x FROM a WHERE x IN (SELECT toUInt8(count()) FROM system.tables);" }' | $CLICKHOUSE_CLIENT -n
done
}

function thread2()
{
while true; do
while true; do
# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
seq 1 100 | awk '{ print "SELECT x FROM b WHERE x IN (SELECT toUInt8(count()) FROM system.tables);" }' | $CLICKHOUSE_CLIENT -n
done
}
@ -1,2 +1 @@
default
CREATE QUOTA default KEYED BY user_name FOR INTERVAL 1 hour TRACKING ONLY TO default, readonly
@ -1,2 +1 @@
SHOW QUOTAS;
SHOW CREATE QUOTA default;
@ -82,14 +82,14 @@ wait

echo "Finishing alters"

columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_1'" 2> /dev/null)
columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_2'" 2> /dev/null)
columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_3'" 2> /dev/null)
columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_1' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_2' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_3' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)

while [ "$columns1" != "$columns2" ] || [ "$columns2" != "$columns3" ]; do
columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_1'" 2> /dev/null)
columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_2'" 2> /dev/null)
columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_3'" 2> /dev/null)
columns1=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_1' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
columns2=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_2' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)
columns3=$($CLICKHOUSE_CLIENT --query "select count() from system.columns where table='concurrent_alter_add_drop_3' and database='$CLICKHOUSE_DATABASE'" 2> /dev/null)

sleep 1
done
@ -10,6 +10,7 @@ url="${url_without_session}session_id=test_01098"
${CLICKHOUSE_CURL} -m 30 -sSk "$url" --data "DROP TEMPORARY TABLE IF EXISTS tmp_table"
${CLICKHOUSE_CURL} -m 30 -sSk "$url" --data "CREATE TEMPORARY TABLE tmp_table AS SELECT number AS n FROM numbers(42)"

# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
id=$(echo "SELECT uuid FROM system.tables WHERE name='tmp_table' AND is_temporary" | ${CLICKHOUSE_CURL} -m 31 -sSgk "$url" -d @-)
internal_table_name="_temporary_and_external_tables.\`_tmp_$id\`"

@ -4,6 +4,7 @@ CREATE DATABASE test_01109 ENGINE=Atomic;
USE test_01109;

CREATE TABLE t0 ENGINE=MergeTree() ORDER BY tuple() AS SELECT rowNumberInAllBlocks(), * FROM (SELECT toLowCardinality(arrayJoin(['exchange', 'tables'])));
-- NOTE: database = currentDatabase() is not mandatory
CREATE TABLE t1 ENGINE=Log() AS SELECT * FROM system.tables AS t JOIN system.databases AS d ON t.database=d.name;
CREATE TABLE t2 ENGINE=MergeTree() ORDER BY tuple() AS SELECT rowNumberInAllBlocks() + (SELECT count() FROM t0), * FROM (SELECT arrayJoin(['hello', 'world']));
@ -1,7 +1,7 @@
1
CREATE TABLE default.table_for_rename_replicated\n(\n    `date` Date,\n    `key` UInt64,\n    `value1` String,\n    `value2` String,\n    `value3` String\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/01213_alter_rename_column_zookeeper_default/table_for_rename_replicated\', \'1\')\nPARTITION BY date\nORDER BY key\nSETTINGS index_granularity = 8192
CREATE TABLE default.table_for_rename_replicated\n(\n    `date` Date,\n    `key` UInt64,\n    `value1` String,\n    `value2` String,\n    `value3` String\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/01213_alter_rename_column_zookeeper_long_default/table_for_rename_replicated\', \'1\')\nPARTITION BY date\nORDER BY key\nSETTINGS index_granularity = 8192
renamed_value1
CREATE TABLE default.table_for_rename_replicated\n(\n    `date` Date,\n    `key` UInt64,\n    `renamed_value1` String,\n    `value2` String,\n    `value3` String\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/01213_alter_rename_column_zookeeper_default/table_for_rename_replicated\', \'1\')\nPARTITION BY date\nORDER BY key\nSETTINGS index_granularity = 8192
CREATE TABLE default.table_for_rename_replicated\n(\n    `date` Date,\n    `key` UInt64,\n    `renamed_value1` String,\n    `value2` String,\n    `value3` String\n)\nENGINE = ReplicatedMergeTree(\'/clickhouse/tables/01213_alter_rename_column_zookeeper_long_default/table_for_rename_replicated\', \'1\')\nPARTITION BY date\nORDER BY key\nSETTINGS index_granularity = 8192
1
date key renamed_value1 value2 value3
2019-10-02 1 1 1 1
@ -32,11 +32,11 @@ $CLICKHOUSE_CLIENT --query "SHOW CREATE TABLE table_for_rename_replicated;"
$CLICKHOUSE_CLIENT --query "ALTER TABLE table_for_rename_replicated RENAME COLUMN value1 to renamed_value1" --replication_alter_partitions_sync=0

while [[ -z $($CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns WHERE name = 'renamed_value1' and table = 'table_for_rename_replicated'" 2>/dev/null) ]]; do
while [[ -z $($CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns WHERE name = 'renamed_value1' and table = 'table_for_rename_replicated' AND database = '$CLICKHOUSE_DATABASE'" 2>/dev/null) ]]; do
sleep 0.5
done

$CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns WHERE name = 'renamed_value1' and table = 'table_for_rename_replicated'"
$CLICKHOUSE_CLIENT --query "SELECT name FROM system.columns WHERE name = 'renamed_value1' and table = 'table_for_rename_replicated' AND database = '$CLICKHOUSE_DATABASE'"

# SHOW CREATE TABLE takes query from .sql file on disk.
# The previous SELECT takes metadata from memory, so even when it reports that renamed_value1 already exists in the table, the on-disk version can still be the old one.
@ -6,6 +6,7 @@ CUR_DIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CUR_DIR"/../shell_config.sh

# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
${CLICKHOUSE_CLIENT} --query "CREATE TABLE system.columns AS numbers(10);" 2>&1 | grep -F "Code: 57" > /dev/null && echo 'OK' || echo 'FAIL'
${CLICKHOUSE_CLIENT} --query "CREATE TABLE system.columns engine=Memory AS numbers(10);" 2>&1 | grep -F "Code: 62" > /dev/null && echo 'OK' || echo 'FAIL'
${CLICKHOUSE_CLIENT} --query "CREATE TABLE system.columns AS numbers(10) engine=Memory;" 2>&1 | grep -F "Code: 62" > /dev/null && echo 'OK' || echo 'FAIL'
@ -15,13 +15,13 @@ ALTER TABLE mt_01451 DETACH PART 'all_2_2_0';

SELECT v FROM mt_01451 ORDER BY v;

SELECT name FROM system.detached_parts WHERE table = 'mt_01451';
SELECT name FROM system.detached_parts WHERE table = 'mt_01451' AND database = currentDatabase();

ALTER TABLE mt_01451 ATTACH PART 'all_2_2_0';

SELECT v FROM mt_01451 ORDER BY v;

SELECT name FROM system.detached_parts WHERE table = 'mt_01451';
SELECT name FROM system.detached_parts WHERE table = 'mt_01451' AND database = currentDatabase();

SELECT '-- drop part --';

@ -37,6 +37,6 @@ OPTIMIZE TABLE mt_01451 FINAL;

SELECT v FROM mt_01451 ORDER BY v;

SELECT name FROM system.parts WHERE table = 'mt_01451' AND active;
SELECT name FROM system.parts WHERE table = 'mt_01451' AND active AND database = currentDatabase();

DROP TABLE mt_01451;
@ -3,8 +3,8 @@ SET replication_alter_partitions_sync = 2;
DROP TABLE IF EXISTS replica1;
DROP TABLE IF EXISTS replica2;

CREATE TABLE replica1 (v UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/01451/attach', 'r1') order by tuple() settings max_replicated_merges_in_queue = 0;
CREATE TABLE replica2 (v UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/01451/attach', 'r2') order by tuple() settings max_replicated_merges_in_queue = 0;
CREATE TABLE replica1 (v UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/'||currentDatabase()||'test/01451/attach', 'r1') order by tuple() settings max_replicated_merges_in_queue = 0;
CREATE TABLE replica2 (v UInt8) ENGINE = ReplicatedMergeTree('/clickhouse/tables/'||currentDatabase()||'test/01451/attach', 'r2') order by tuple() settings max_replicated_merges_in_queue = 0;

INSERT INTO replica1 VALUES (0);
INSERT INTO replica1 VALUES (1);
@ -19,14 +19,14 @@ ALTER TABLE replica2 DETACH PART 'all_1_1_0';

SELECT v FROM replica1 ORDER BY v;

SELECT name FROM system.detached_parts WHERE table = 'replica2';
SELECT name FROM system.detached_parts WHERE table = 'replica2' AND database = currentDatabase();

ALTER TABLE replica2 ATTACH PART 'all_1_1_0';

SYSTEM SYNC REPLICA replica1;
SELECT v FROM replica1 ORDER BY v;

SELECT name FROM system.detached_parts WHERE table = 'replica2';
SELECT name FROM system.detached_parts WHERE table = 'replica2' AND database = currentDatabase();

SELECT '-- drop part --';

@ -43,7 +43,7 @@ OPTIMIZE TABLE replica1 FINAL;

SELECT v FROM replica1 ORDER BY v;

SELECT name FROM system.parts WHERE table = 'replica2' AND active;
SELECT name FROM system.parts WHERE table = 'replica2' AND active AND database = currentDatabase();

DROP TABLE replica1;
DROP TABLE replica2;
@ -1,7 +1,8 @@
-- NOTE: database = currentDatabase() is not mandatory

SELECT database FROM system.tables WHERE database LIKE '%' format Null;
SELECT database AS db FROM system.tables WHERE db LIKE '%' format Null;
SELECT CAST(database, 'String') AS db FROM system.tables WHERE db LIKE '%' format Null;
SELECT CAST('a string', 'Nullable(String)') AS str WHERE str LIKE '%' format Null;
SELECT CAST(database, 'Nullable(String)') AS ndb FROM system.tables WHERE ndb LIKE '%' format Null;
SELECT 'all tests passed';
@ -14,7 +14,7 @@ INSERT INTO optimize_final SELECT toDate('2000-01-01'), number + 5 FROM numbers(

OPTIMIZE TABLE optimize_final FINAL;

SELECT table, partition, active, level from system.parts where table = 'optimize_final' and active = 1;
SELECT table, partition, active, level from system.parts where table = 'optimize_final' and database = currentDatabase() and active = 1;

DROP TABLE optimize_final;

@ -38,5 +38,5 @@ DROP TABLE IF EXISTS empty;
CREATE TABLE empty (key UInt32, val UInt32, date Datetime) ENGINE=SummingMergeTree(val) PARTITION BY date ORDER BY key;
INSERT INTO empty VALUES (1, 1, '2020-01-01'), (1, 1, '2020-01-01'), (1, -2, '2020-01-01');
SELECT * FROM empty ORDER BY key;
SELECT table, partition, active FROM system.parts where table = 'empty' and active = 1;
SELECT table, partition, active FROM system.parts where table = 'empty' and active = 1 and database = currentDatabase();
DROP TABLE empty;
@ -4,6 +4,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
verify_sql="SELECT
(SELECT sumIf(value, metric = 'PartsCommitted'), sumIf(value, metric = 'PartsOutdated') FROM system.metrics)
= (SELECT sum(active), sum(NOT active) FROM system.parts)"
@ -7,6 +7,7 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
set -e
set -o pipefail

# NOTE: database = $CLICKHOUSE_DATABASE is unwanted
verify_sql="SELECT
(SELECT sumIf(value, metric = 'PartsInMemory'), sumIf(value, metric = 'PartsCompact'), sumIf(value, metric = 'PartsWide') FROM system.metrics) =
(SELECT countIf(part_type == 'InMemory'), countIf(part_type == 'Compact'), countIf(part_type == 'Wide') FROM system.parts)"
@ -4,6 +4,8 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
# shellcheck source=../shell_config.sh
. "$CURDIR"/../shell_config.sh

# NOTE: database = $CLICKHOUSE_DATABASE is superfluous

function test_completion_word()
{
local w=$1 && shift
@ -13,7 +15,7 @@ function test_completion_word()
local compword_end=${w:$((w_len-3))}

# NOTE: here and below you should escape variables of the expect.
timeout 22s expect << EOF
timeout 60s expect << EOF
log_user 0
set timeout 3
match_max 100000
@ -1,11 +1,11 @@
-- TinyTinyLog
-- TinyLog
DROP TABLE IF EXISTS nested_01800_tiny_log;
CREATE TABLE nested_01800_tiny_log (`column` Nested(name String, names Array(String), types Array(Enum8('PU' = 1, 'US' = 2, 'OTHER' = 3)))) ENGINE = TinyLog;
INSERT INTO nested_01800_tiny_log VALUES (['Hello', 'World'], [['a'], ['b', 'c']], [['PU', 'US'], ['OTHER']]);
SELECT 10 FROM nested_01800_tiny_log FORMAT Null;
DROP TABLE nested_01800_tiny_log;

-- StripeStripeLog
-- StripeLog
DROP TABLE IF EXISTS nested_01800_stripe_log;
CREATE TABLE nested_01800_stripe_log (`column` Nested(name String, names Array(String), types Array(Enum8('PU' = 1, 'US' = 2, 'OTHER' = 3)))) ENGINE = StripeLog;
INSERT INTO nested_01800_stripe_log VALUES (['Hello', 'World'], [['a'], ['b', 'c']], [['PU', 'US'], ['OTHER']]);
@ -28,7 +28,7 @@ SELECT
name,
comment
FROM system.tables
WHERE name IN ('t1', 't2', 't3') order by name;
WHERE name IN ('t1', 't2', 't3') AND database = currentDatabase() order by name;

SHOW CREATE TABLE t1;
@ -1 +1,2 @@
-- NOTE: database = currentDatabase() is not mandatory
SELECT sum(data_compressed_bytes) > 0, sum(data_uncompressed_bytes) > 0, sum(marks_bytes) > 0 FROM system.columns;
|
@ -94,7 +94,28 @@ tests_with_query_log=( $(
|
||||
xargs grep --with-filename -e system.query_log -e system.query_thread_log | cut -d: -f1 | sort -u
|
||||
) )
|
||||
for test_case in "${tests_with_query_log[@]}"; do
|
||||
grep -qE current_database.*currentDatabase "$test_case" || echo "Queries to system.query_log/system.query_thread_log does not have current_database = currentDatabase() condition in $test_case"
|
||||
grep -qE current_database.*currentDatabase "$test_case" || {
|
||||
grep -qE 'current_database.*\$CLICKHOUSE_DATABASE' "$test_case"
|
||||
} || echo "Queries to system.query_log/system.query_thread_log does not have current_database = currentDatabase() condition in $test_case"
|
||||
done
|
||||
|
||||
# Queries to system.tables/system.parts/system.detached_parts/system.parts_columns/system.columns should have database = currentDatabase() condition
|
||||
# NOTE: it is not that accurate, but at least something.
tests_with_database_column=( $(
find $ROOT_PATH/tests/queries -iname '*.sql' -or -iname '*.sh' -or -iname '*.py' |
grep -vP $EXCLUDE_DIRS |
grep -v -x -e $ROOT_PATH/tests/queries/query_test.py |
xargs grep --with-filename -e system.tables -e system.parts -e system.detached_parts -e system.parts_columns -e system.columns | cut -d: -f1 | sort -u
) )
for test_case in "${tests_with_database_column[@]}"; do
grep -qE database.*currentDatabase "$test_case" || {
grep -qE 'database.*\$CLICKHOUSE_DATABASE' "$test_case"
} || {
# explicit database
grep -qE "database[ ]*=[ ]*'" "$test_case"
} || {
echo "Queries to system.tables/system.parts/system.detached_parts/system.parts_columns/system.columns does not have database = currentDatabase()/\$CLICKHOUSE_DATABASE condition in $test_case"
}
done

# Queries with ReplicatedMergeTree
@ -8,11 +8,6 @@ source default-config
mkdir -p "${WORKSPACE}/build"
pushd "${WORKSPACE}/build"

if [[ "${ENABLE_EMBEDDED_COMPILER}" == 1 ]]; then
[[ "$USE_LLVM_LIBRARIES_FROM_SYSTEM" == 0 ]] && CMAKE_FLAGS="$CMAKE_FLAGS -DUSE_INTERNAL_LLVM_LIBRARY=1"
[[ "$USE_LLVM_LIBRARIES_FROM_SYSTEM" != 0 ]] && CMAKE_FLAGS="$CMAKE_FLAGS -DUSE_INTERNAL_LLVM_LIBRARY=0"
fi

cmake -DCMAKE_BUILD_TYPE=${BUILD_TYPE} -DENABLE_EMBEDDED_COMPILER=${ENABLE_EMBEDDED_COMPILER} $CMAKE_FLAGS ../sources

[[ "$BUILD_TARGETS" != 'all' ]] && BUILD_TARGETS_STRING="--target $BUILD_TARGETS"
@ -27,7 +27,6 @@ CLANG_SOURCES_BRANCH=trunk # or tags/RELEASE_600/final
GCC_SOURCES_VERSION=latest # or gcc-7.1.0

# install-libraries
USE_LLVM_LIBRARIES_FROM_SYSTEM=0 # 0 or 1
ENABLE_EMBEDDED_COMPILER=1

# build
@ -5,7 +5,3 @@ source default-config

./install-os-packages.sh libicu-dev
./install-os-packages.sh libreadline-dev

if [[ "$ENABLE_EMBEDDED_COMPILER" == 1 && "$USE_LLVM_LIBRARIES_FROM_SYSTEM" == 1 ]]; then
./install-os-packages.sh llvm-libs-5.0
fi
Some files were not shown because too many files have changed in this diff.