Merge branch 'master' into stress-thread-fuzzer

2024-09-20 00:30:49 +00:00 · 2021-08-21 11:12:52 +00:00 · 2021-08-21 11:12:52 +00:00 · d75fbdf5f1
commit d75fbdf5f1
parent 91995ca3a6 894e56fd99
697 changed files with 14201 additions and 6295 deletions
--- a/.github/PULL_REQUEST_TEMPLATE.md
+++ b/.github/PULL_REQUEST_TEMPLATE.md
@ -3,7 +3,7 @@ I hereby agree to the terms of the CLA available at: https://yandex.ru/legal/cla
 Changelog category (leave one):
 - New Feature
 - Improvement
- Bug Fix
+- Bug Fix (user-visible misbehaviour in official stable or prestable release)
 - Performance Improvement
 - Backward Incompatible Change
 - Build/Testing/Packaging Improvement
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@ -1,5 +1,8 @@
 ### ClickHouse release v21.8, 2021-08-12

+#### Upgrade Notes
+* New version is using `Map` data type for system logs tables (`system.query_log`, `system.query_thread_log`, `system.processes`, `system.opentelemetry_span_log`). These tables will be auto-created with new data types. Virtual columns are created to support old queries. Closes [#18698](https://github.com/ClickHouse/ClickHouse/issues/18698). [#23934](https://github.com/ClickHouse/ClickHouse/pull/23934), [#25773](https://github.com/ClickHouse/ClickHouse/pull/25773) ([hexiaoting](https://github.com/hexiaoting), [sundy-li](https://github.com/sundy-li), [Maksim Kita](https://github.com/kitaisreal)). If you want to *downgrade* from version 21.8 to older versions, you will need to clean up system tables with logs manually. Look at `/var/lib/clickhouse/data/system/*_log`.
+
 #### New Features

 * Add support for a part of SQL/JSON standard. [#24148](https://github.com/ClickHouse/ClickHouse/pull/24148) ([l1tsolaiki](https://github.com/l1tsolaiki), [Kseniia Sumarokova](https://github.com/kssenii)).
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -593,7 +593,23 @@ macro (add_executable target)
        # disabled for TSAN and gcc since libtsan.a provides overrides too
        if (TARGET clickhouse_new_delete)
            # operator::new/delete for executables (MemoryTracker stuff)
-            target_link_libraries (${target} PRIVATE clickhouse_new_delete ${MALLOC_LIBRARIES})
+            target_link_libraries (${target} PRIVATE clickhouse_new_delete)
+        endif()
+
+        # In case of static jemalloc, because zone_register() is located in zone.c and
+        # is never used outside (it is declared as constructor) it is omitted
+        # by the linker, and so jemalloc will not be registered as system
+        # allocator under osx [1], and clickhouse will SIGSEGV.
+        #
+        #   [1]: https://github.com/jemalloc/jemalloc/issues/708
+        #
+        # About symbol name:
+        # - _zone_register not zone_register due to Mach-O binary format,
+        # - _je_zone_register due to JEMALLOC_PRIVATE_NAMESPACE=je_ under OS X.
+        # - but jemalloc-cmake does not run private_namespace.sh
+        #   so symbol name should be _zone_register
+        if (ENABLE_JEMALLOC AND MAKE_STATIC_LIBRARIES AND OS_DARWIN)
+            set_property(TARGET ${target} APPEND PROPERTY LINK_OPTIONS -u_zone_register)
        endif()
    endif()
 endmacro()
--- a/base/common/wide_integer_impl.h
+++ b/base/common/wide_integer_impl.h
@ -152,7 +152,7 @@ namespace wide
 template <size_t Bits, typename Signed>
 struct integer<Bits, Signed>::_impl
 {
-    static constexpr size_t _Bits = Bits;
+    static constexpr size_t _bits = Bits;
    static constexpr const unsigned byte_count = Bits / 8;
    static constexpr const unsigned item_count = byte_count / sizeof(base_type);
    static constexpr const unsigned base_bits = sizeof(base_type) * 8;
@ -614,8 +614,8 @@ public:
        else
        {
            static_assert(IsWideInteger<T>::value);
-            return std::common_type_t<integer<Bits, Signed>, integer<T::_impl::_Bits, Signed>>::_impl::operator_plus(
-                integer<T::_impl::_Bits, Signed>(lhs), rhs);
+            return std::common_type_t<integer<Bits, Signed>, integer<T::_impl::_bits, Signed>>::_impl::operator_plus(
+                integer<T::_impl::_bits, Signed>(lhs), rhs);
        }
    }

@ -632,8 +632,8 @@ public:
        else
        {
            static_assert(IsWideInteger<T>::value);
-            return std::common_type_t<integer<Bits, Signed>, integer<T::_impl::_Bits, Signed>>::_impl::operator_minus(
-                integer<T::_impl::_Bits, Signed>(lhs), rhs);
+            return std::common_type_t<integer<Bits, Signed>, integer<T::_impl::_bits, Signed>>::_impl::operator_minus(
+                integer<T::_impl::_bits, Signed>(lhs), rhs);
        }
    }

@ -857,7 +857,7 @@ public:
        else
        {
            static_assert(IsWideInteger<T>::value);
-            return std::common_type_t<integer<Bits, Signed>, integer<T::_impl::_Bits, Signed>>::operator_slash(T(lhs), rhs);
+            return std::common_type_t<integer<Bits, Signed>, integer<T::_impl::_bits, Signed>>::operator_slash(T(lhs), rhs);
        }
    }

@ -877,7 +877,7 @@ public:
        else
        {
            static_assert(IsWideInteger<T>::value);
-            return std::common_type_t<integer<Bits, Signed>, integer<T::_impl::_Bits, Signed>>::operator_percent(T(lhs), rhs);
+            return std::common_type_t<integer<Bits, Signed>, integer<T::_impl::_bits, Signed>>::operator_percent(T(lhs), rhs);
        }
    }

--- a/base/daemon/SentryWriter.cpp
+++ b/base/daemon/SentryWriter.cpp
@ -12,6 +12,7 @@
 #include <Common/SymbolIndex.h>
 #include <Common/StackTrace.h>
 #include <Common/getNumberOfPhysicalCPUCores.h>
+#include <Core/ServerUUID.h>

 #if !defined(ARCADIA_BUILD)
 #    include "Common/config_version.h"
@ -38,6 +39,13 @@ void setExtras()
    if (!anonymize)
        sentry_set_extra("server_name", sentry_value_new_string(getFQDNOrHostName().c_str()));

+    DB::UUID server_uuid = DB::ServerUUID::get();
+    if (server_uuid != DB::UUIDHelpers::Nil)
+    {
+        std::string server_uuid_str = DB::toString(server_uuid);
+        sentry_set_extra("server_uuid", sentry_value_new_string(server_uuid_str.c_str()));
+    }
+
    sentry_set_tag("version", VERSION_STRING);
    sentry_set_extra("version_githash", sentry_value_new_string(VERSION_GITHASH));
    sentry_set_extra("version_describe", sentry_value_new_string(VERSION_DESCRIBE));
--- a/base/glibc-compatibility/musl/getauxval.c
+++ b/base/glibc-compatibility/musl/getauxval.c
@ -1,5 +1,4 @@
 #include <sys/auxv.h>
-#include "atomic.h"
 #include <unistd.h> // __environ
 #include <errno.h>

@ -18,7 +17,18 @@ static size_t __find_auxv(unsigned long type)
    return (size_t) -1;
 }

-unsigned long __getauxval(unsigned long type)
+__attribute__((constructor)) static void __auxv_init()
+{
+    size_t i;
+    for (i = 0; __environ[i]; i++);
+    __auxv = (unsigned long *) (__environ + i + 1);
+
+    size_t secure_idx = __find_auxv(AT_SECURE);
+    if (secure_idx != ((size_t) -1))
+        __auxv_secure = __auxv[secure_idx];
+}
+
+unsigned long getauxval(unsigned long type)
 {
    if (type == AT_SECURE)
        return __auxv_secure;
@ -33,38 +43,3 @@ unsigned long __getauxval(unsigned long type)
    errno = ENOENT;
    return 0;
 }
-
-static void * volatile getauxval_func;
-
-static unsigned long  __auxv_init(unsigned long type)
-{
-    if (!__environ)
-    {
-        // __environ is not initialized yet so we can't initialize __auxv right now.
-        // That's normally occurred only when getauxval() is called from some sanitizer's internal code.
-        errno = ENOENT;
-        return 0;
-    }
-
-    // Initialize __auxv and __auxv_secure.
-    size_t i;
-    for (i = 0; __environ[i]; i++);
-    __auxv = (unsigned long *) (__environ + i + 1);
-
-    size_t secure_idx = __find_auxv(AT_SECURE);
-    if (secure_idx != ((size_t) -1))
-        __auxv_secure = __auxv[secure_idx];
-
-    // Now we've initialized __auxv, next time getauxval() will only call __get_auxval().
-    a_cas_p(&getauxval_func, (void *)__auxv_init, (void *)__getauxval);
-
-    return __getauxval(type);
-}
-
-// First time getauxval() will call __auxv_init().
-static void * volatile getauxval_func = (void *)__auxv_init;
-
-unsigned long getauxval(unsigned long type)
-{
-    return ((unsigned long (*)(unsigned long))getauxval_func)(type);
-}
--- a/cmake/autogenerated_versions.txt
+++ b/cmake/autogenerated_versions.txt
@ -2,11 +2,11 @@

 # NOTE: has nothing common with DBMS_TCP_PROTOCOL_VERSION,
 # only DBMS_TCP_PROTOCOL_VERSION should be incremented on protocol changes.
-SET(VERSION_REVISION 54454)
+SET(VERSION_REVISION 54455)
 SET(VERSION_MAJOR 21)
-SET(VERSION_MINOR 9)
+SET(VERSION_MINOR 10)
 SET(VERSION_PATCH 1)
-SET(VERSION_GITHASH f063e44131a048ba2d9af8075f03700fd5ec3e69)
-SET(VERSION_DESCRIBE v21.9.1.7770-prestable)
-SET(VERSION_STRING 21.9.1.7770)
+SET(VERSION_GITHASH 09df5018f95edcd0f759d4689ac5d029dd400c2a)
+SET(VERSION_DESCRIBE v21.10.1.1-testing)
+SET(VERSION_STRING 21.10.1.1)
 # end of autochange
--- a/contrib/jemalloc-cmake/CMakeLists.txt
+++ b/contrib/jemalloc-cmake/CMakeLists.txt
@ -1,9 +1,10 @@
-# Disabled under OSX until https://github.com/ClickHouse/ClickHouse/issues/27568 is fixed
 if (SANITIZE OR NOT (
-    ((OS_LINUX OR OS_FREEBSD) AND (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE))))
+    ((OS_LINUX OR OS_FREEBSD) AND (ARCH_AMD64 OR ARCH_ARM OR ARCH_PPC64LE)) OR
+    (OS_DARWIN AND (CMAKE_BUILD_TYPE STREQUAL "RelWithDebInfo" OR CMAKE_BUILD_TYPE STREQUAL "Debug"))
+))
    if (ENABLE_JEMALLOC)
        message (${RECONFIGURE_MESSAGE_LEVEL}
-                 "jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64, or ppc64le Linux or FreeBSD builds")
+                 "jemalloc is disabled implicitly: it doesn't work with sanitizers and can only be used with x86_64, aarch64, or ppc64le Linux or FreeBSD builds and RelWithDebInfo macOS builds.")
    endif ()
    set (ENABLE_JEMALLOC OFF)
 else ()
@ -138,9 +139,5 @@ target_compile_options(jemalloc PRIVATE -Wno-redundant-decls)
 target_compile_options(jemalloc PRIVATE -D_GNU_SOURCE)

 set_property(TARGET jemalloc APPEND PROPERTY INTERFACE_COMPILE_DEFINITIONS USE_JEMALLOC=1)
-if (MAKE_STATIC_LIBRARIES)
-    # To detect whether we need to register jemalloc for osx as default zone.
-    set_property(TARGET jemalloc APPEND PROPERTY INTERFACE_COMPILE_DEFINITIONS BUNDLED_STATIC_JEMALLOC=1)
-endif()

 message (STATUS "Using jemalloc")
--- a/contrib/librdkafka
+++ b/contrib/librdkafka
@ -1 +1 @@
-Subproject commit 43491d33ca2826531d1e3cae70d4bf1e5249e3c9
+Subproject commit b8554f1682062c85ba519eb54ef2f90e02b812cb
--- a/debian/changelog
+++ b/debian/changelog
@ -1,5 +1,5 @@
-clickhouse (21.9.1.1) unstable; urgency=low
+clickhouse (21.10.1.1) unstable; urgency=low

  * Modified source code

- -- clickhouse-release <clickhouse-release@yandex-team.ru>  Sat, 10 Jul 2021 08:22:49 +0300
+ -- clickhouse-release <clickhouse-release@yandex-team.ru>  Sat, 17 Jul 2021 08:45:03 +0300
--- a/docker/client/Dockerfile
+++ b/docker/client/Dockerfile
@ -1,7 +1,7 @@
 FROM ubuntu:18.04

 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
-ARG version=21.9.1.*
+ARG version=21.10.1.*

 RUN apt-get update \
    && apt-get install --yes --no-install-recommends \
--- a/docker/server/Dockerfile
+++ b/docker/server/Dockerfile
@ -1,7 +1,7 @@
 FROM ubuntu:20.04

 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
-ARG version=21.9.1.*
+ARG version=21.10.1.*
 ARG gosu_ver=1.10

 # set non-empty deb_location_url url to create a docker image
--- a/docker/test/Dockerfile
+++ b/docker/test/Dockerfile
@ -1,7 +1,7 @@
 FROM ubuntu:18.04

 ARG repository="deb https://repo.clickhouse.tech/deb/stable/ main/"
-ARG version=21.9.1.*
+ARG version=21.10.1.*

 RUN apt-get update && \
    apt-get install -y apt-transport-https dirmngr && \
--- a/docker/test/performance-comparison/compare.sh
+++ b/docker/test/performance-comparison/compare.sh
@ -641,6 +641,7 @@ create view partial_query_times as select * from
 -- Report for partial queries that we could only run on the new server (e.g.
 -- queries with new functions added in the tested PR).
 create table partial_queries_report engine File(TSV, 'report/partial-queries-report.tsv')
+    settings output_format_decimal_trailing_zeros = 1
    as select toDecimal64(time_median, 3) time,
        toDecimal64(time_stddev / time_median, 3) relative_time_stddev,
        test, query_index, query_display_name
@ -713,8 +714,9 @@ create table queries engine File(TSVWithNamesAndTypes, 'report/queries.tsv')
    order by test, query_index, metric_name
    ;

-create table changed_perf_report engine File(TSV, 'report/changed-perf.tsv') as
-    with
+create table changed_perf_report engine File(TSV, 'report/changed-perf.tsv')
+    settings output_format_decimal_trailing_zeros = 1
+    as with
        -- server_time is sometimes reported as zero (if it's less than 1 ms),
        -- so we have to work around this to not get an error about conversion
        -- of NaN to decimal.
@ -730,8 +732,9 @@ create table changed_perf_report engine File(TSV, 'report/changed-perf.tsv') as
        changed_fail, test, query_index, query_display_name
    from queries where changed_show order by abs(diff) desc;

-create table unstable_queries_report engine File(TSV, 'report/unstable-queries.tsv') as
-    select
+create table unstable_queries_report engine File(TSV, 'report/unstable-queries.tsv')
+    settings output_format_decimal_trailing_zeros = 1
+    as select
        toDecimal64(left, 3), toDecimal64(right, 3), toDecimal64(diff, 3),
        toDecimal64(stat_threshold, 3), unstable_fail, test, query_index, query_display_name
    from queries where unstable_show order by stat_threshold desc;
@ -761,8 +764,9 @@ create view total_speedup as
    from test_speedup
    ;

-create table test_perf_changes_report engine File(TSV, 'report/test-perf-changes.tsv') as
-    with
+create table test_perf_changes_report engine File(TSV, 'report/test-perf-changes.tsv')
+    settings output_format_decimal_trailing_zeros = 1
+    as with
        (times_speedup >= 1
            ? '-' || toString(toDecimal64(times_speedup, 3)) || 'x'
            : '+' || toString(toDecimal64(1 / times_speedup, 3)) || 'x')
@ -788,8 +792,9 @@ create view total_client_time_per_query as select *
    from file('analyze/client-times.tsv', TSV,
        'test text, query_index int, client float, server float');

-create table slow_on_client_report engine File(TSV, 'report/slow-on-client.tsv') as
-    select client, server, toDecimal64(client/server, 3) p,
+create table slow_on_client_report engine File(TSV, 'report/slow-on-client.tsv')
+    settings output_format_decimal_trailing_zeros = 1
+    as select client, server, toDecimal64(client/server, 3) p,
        test, query_display_name
    from total_client_time_per_query left join query_display_names using (test, query_index)
    where p > toDecimal64(1.02, 3) order by p desc;
@ -874,8 +879,9 @@ create view test_times_view_total as
    from test_times_view
    ;

-create table test_times_report engine File(TSV, 'report/test-times.tsv') as
-    select
+create table test_times_report engine File(TSV, 'report/test-times.tsv')
+    settings output_format_decimal_trailing_zeros = 1
+    as select
        test,
        toDecimal64(real, 3),
        toDecimal64(total_client_time, 3),
@ -893,8 +899,9 @@ create table test_times_report engine File(TSV, 'report/test-times.tsv') as
    ;

 -- report for all queries page, only main metric
-create table all_tests_report engine File(TSV, 'report/all-queries.tsv') as
-    with
+create table all_tests_report engine File(TSV, 'report/all-queries.tsv')
+    settings output_format_decimal_trailing_zeros = 1
+    as with
        -- server_time is sometimes reported as zero (if it's less than 1 ms),
        -- so we have to work around this to not get an error about conversion
        -- of NaN to decimal.
@ -1057,9 +1064,10 @@ create table unstable_run_traces engine File(TSVWithNamesAndTypes,
    ;

 create table metric_devation engine File(TSVWithNamesAndTypes,
-        'report/metric-deviation.$version.tsv') as
+        'report/metric-deviation.$version.tsv')
+    settings output_format_decimal_trailing_zeros = 1
    -- first goes the key used to split the file with grep
-    select test, query_index, query_display_name,
+    as select test, query_index, query_display_name,
        toDecimal64(d, 3) d, q, metric
    from (
        select
@ -1187,8 +1195,9 @@ create table metrics engine File(TSV, 'metrics/metrics.tsv') as
    ;

 -- Show metrics that have changed
-create table changes engine File(TSV, 'metrics/changes.tsv') as
-    select metric, left, right,
+create table changes engine File(TSV, 'metrics/changes.tsv')
+    settings output_format_decimal_trailing_zeros = 1
+    as select metric, left, right,
        toDecimal64(diff, 3), toDecimal64(times_diff, 3)
    from (
        select metric, median(left) as left, median(right) as right,
--- a/docker/test/performance-comparison/download.sh
+++ b/docker/test/performance-comparison/download.sh
@ -13,7 +13,7 @@ left_sha=$2
 # right_pr=$3 not used for now
 right_sha=$4

-datasets=${CHPC_DATASETS:-"hits1 hits10 hits100 values"}
+datasets=${CHPC_DATASETS-"hits1 hits10 hits100 values"}

 declare -A dataset_paths
 dataset_paths["hits10"]="https://s3.mds.yandex.net/clickhouse-private-datasets/hits_10m_single/partitions/hits_10m_single.tar"
--- a/docker/test/performance-comparison/entrypoint.sh
+++ b/docker/test/performance-comparison/entrypoint.sh
@ -127,6 +127,15 @@ export PATH
 export REF_PR
 export REF_SHA

+# Try to collect some core dumps. I've seen two patterns in Sandbox:
+# 1) |/home/zomb-sandbox/venv/bin/python /home/zomb-sandbox/client/sandbox/bin/coredumper.py %e %p %g %u %s %P %c
+#    Not sure what this script does (puts them to sandbox resources, logs some messages?),
+#    and it's not accessible from inside docker anyway.
+# 2) something like %e.%p.core.dmp. The dump should end up in the workspace directory.
+# At least we remove the ulimit and then try to pack some common file names into output.
+ulimit -c unlimited
+cat /proc/sys/kernel/core_pattern
+
 # Start the main comparison script.
 { \
    time ../download.sh "$REF_PR" "$REF_SHA" "$PR_TO_TEST" "$SHA_TO_TEST" && \
@ -144,8 +153,11 @@ done

 dmesg -T > dmesg.log

+ls -lath
+
 7z a '-x!*/tmp' /output/output.7z ./*.{log,tsv,html,txt,rep,svg,columns} \
    {right,left}/{performance,scripts} {{right,left}/db,db0}/preprocessed_configs \
-    report analyze benchmark metrics
+    report analyze benchmark metrics \
+    ./*.core.dmp ./*.core

 cp compare.log /output
--- a/docker/test/stateless/process_functional_tests_result.py
+++ b/docker/test/stateless/process_functional_tests_result.py
@ -105,10 +105,6 @@ def process_result(result_path):
            description += ", skipped: {}".format(skipped)
        if unknown != 0:
            description += ", unknown: {}".format(unknown)
-
-        # Temporary green for tests with DatabaseReplicated:
-        if 1 == int(os.environ.get('USE_DATABASE_REPLICATED', 0)):
-            state = "success"
    else:
        state = "failure"
        description = "Output log doesn't exist"
--- a/docs/en/engines/database-engines/replicated.md
+++ b/docs/en/engines/database-engines/replicated.md
@ -1,3 +1,8 @@
+---
+toc_priority: 36
+toc_title: Replicated
+---
+
 # [experimental] Replicated {#replicated}

 The engine is based on the [Atomic](../../engines/database-engines/atomic.md) engine. It supports replication of metadata via DDL log being written to ZooKeeper and executed on all of the replicas for a given database.
--- a/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/graphitemergetree.md
@ -38,9 +38,7 @@ A table for the Graphite data should have the following columns for the followin

 -   Value of the metric. Data type: any numeric.

-   Version of the metric. Data type: any numeric.
-
-    ClickHouse saves the rows with the highest version or the last written if versions are the same. Other rows are deleted during the merge of data parts.
+-   Version of the metric. Data type: any numeric (ClickHouse saves the rows with the highest version or the last written if versions are the same. Other rows are deleted during the merge of data parts).

 The names of these columns should be set in the rollup configuration.

@ -132,7 +130,7 @@ Fields for `pattern` and `default` sections:
 -   `regexp`– A pattern for the metric name.
 -   `age` – The minimum age of the data in seconds.
 -   `precision`– How precisely to define the age of the data in seconds. Should be a divisor for 86400 (seconds in a day).
-   `function` – The name of the aggregating function to apply to data whose age falls within the range `[age, age + precision]`.
+-   `function` – The name of the aggregating function to apply to data whose age falls within the range `[age, age + precision]`. Accepted functions: min / max / any / avg. The average is calculated imprecisely, like the average of the averages. 

 ### Configuration Example {#configuration-example}

@ -169,4 +167,7 @@ Fields for `pattern` and `default` sections:
 </graphite_rollup>
 ```

+!!! warning "Warning"
+    Data rollup is performed during merges. Usually, for old partitions, merges are not started, so for rollup it is necessary to trigger an unscheduled merge using [optimize](../../../sql-reference/statements/optimize.md), or to use additional tools, for example [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer).
+
 [Original article](https://clickhouse.tech/docs/en/operations/table_engines/graphitemergetree/) <!--hide-->
--- a/docs/en/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/en/engines/table-engines/mergetree-family/mergetree.md
@ -844,44 +844,3 @@ S3 disk can be configured as `main` or `cold` storage:
 ```

 In case of `cold` option a data can be moved to S3 if local disk free size will be smaller than `move_factor * disk_size` or by TTL move rule.
-
-## Using HDFS for Data Storage {#table_engine-mergetree-hdfs}
-
-[HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html) is a distributed file system for remote data storage.
-
-`MergeTree` family table engines can store data to HDFS using a disk with type `HDFS`.
-
-Configuration markup:
-``` xml
-<yandex>
-    <storage_configuration>
-        <disks>
-            <hdfs>
-                <type>hdfs</type>
-                <endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
-            </hdfs>
-        </disks>
-        <policies>
-            <hdfs>
-                <volumes>
-                    <main>
-                        <disk>hdfs</disk>
-                    </main>
-                </volumes>
-            </hdfs>
-        </policies>
-    </storage_configuration>
-
-    <merge_tree>
-        <min_bytes_for_wide_part>0</min_bytes_for_wide_part>
-    </merge_tree>
-</yandex>
-```
-
-Required parameters:
-
-   `endpoint` — HDFS endpoint URL in `path` format. Endpoint URL should contain a root path to store data.
-
-Optional parameters:
-
-   `min_bytes_for_seek` — The minimal number of bytes to use seek operation instead of sequential read. Default value: `1 Mb`.
--- a/docs/en/engines/table-engines/mergetree-family/replication.md
+++ b/docs/en/engines/table-engines/mergetree-family/replication.md
@ -137,7 +137,7 @@ CREATE TABLE table_name
 ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', ver)
 PARTITION BY toYYYYMM(EventDate)
 ORDER BY (CounterID, EventDate, intHash32(UserID))
-SAMPLE BY intHash32(UserID)
+SAMPLE BY intHash32(UserID);
 ```

 <details markdown="1">
@ -150,12 +150,12 @@ CREATE TABLE table_name
    EventDate DateTime,
    CounterID UInt32,
    UserID UInt32
-) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192)
+) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192);
 ```

 </details>

-As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the «[macros](../../../operations/server-configuration-parameters/settings/#macros) section of the configuration file.
+As the example shows, these parameters can contain substitutions in curly brackets. The substituted values are taken from the [macros](../../../operations/server-configuration-parameters/settings.md#macros) section of the configuration file.

 Example:

--- a/docs/en/engines/table-engines/special/buffer.md
+++ b/docs/en/engines/table-engines/special/buffer.md
@ -56,6 +56,9 @@ The same thing happens if the subordinate table does not exist when the buffer i

 If you need to run ALTER for a subordinate table, and the Buffer table, we recommend first deleting the Buffer table, running ALTER for the subordinate table, then creating the Buffer table again.

+!!! attention "Attention"
+    Running ALTER on the Buffer table in releases made before 28 Sep 2020 will cause a `Block structure mismatch` error (see [#15117](https://github.com/ClickHouse/ClickHouse/issues/15117)), so deleting the Buffer table and then recreating it is the only option. It is advisable to check that this error is fixed in your release before trying to run ALTER on the Buffer table.
+
 If the server is restarted abnormally, the data in the buffer is lost.

 `FINAL` and `SAMPLE` do not work correctly for Buffer tables. These conditions are passed to the destination table, but are not used for processing data in the buffer. If these features are required we recommend only using the Buffer table for writing, while reading from the destination table.
--- a/docs/en/getting-started/example-datasets/menus.md
+++ b/docs/en/getting-started/example-datasets/menus.md
@ -105,7 +105,7 @@ We use `Decimal` data type to store prices. Everything else is quite straightfor

 ## Import Data

-Upload data into ClickHouse in parallel:
+Upload data into ClickHouse:

 ```
 clickhouse-client --format_csv_allow_single_quotes 0 --input_format_null_as_default 0 --query "INSERT INTO dish FORMAT CSVWithNames" < Dish.csv
--- a/docs/en/operations/clickhouse-keeper.md
+++ b/docs/en/operations/clickhouse-keeper.md
@ -114,5 +114,5 @@ Seamlessly migration from ZooKeeper to `clickhouse-keeper` is impossible you hav
 clickhouse-keeper-converter --zookeeper-logs-dir /var/lib/zookeeper/version-2 --zookeeper-snapshots-dir /var/lib/zookeeper/version-2 --output-dir /path/to/clickhouse/keeper/snapshots
 ```

-4. Copy snapshot to `clickhouse-server` nodes with configured `keeper` or start `clickhouse-keeper` instead of ZooKeeper. Snapshot must persist only on leader node, leader will sync it automatically to other nodes.
+4. Copy snapshot to `clickhouse-server` nodes with configured `keeper` or start `clickhouse-keeper` instead of ZooKeeper. Snapshot must persist on all nodes, otherwise empty nodes can be faster and one of them can become the leader.

--- a/docs/en/operations/configuration-files.md
+++ b/docs/en/operations/configuration-files.md
@ -18,6 +18,18 @@ Some settings specified in the main configuration file can be overridden in othe
 -   If `replace` is specified, it replaces the entire element with the specified one.
 -   If `remove` is specified, it deletes the element.

+You can also declare attributes as coming from environment variables by using `from_env="VARIABLE_NAME"`:
+
+```xml
+<yandex>
+    <macros>
+        <replica from_env="REPLICA" />
+        <layer from_env="LAYER" />
+        <shard from_env="SHARD" />
+    </macros>
+</yandex>
+```
+
 ## Substitution {#substitution}

 The config can also define “substitutions”. If an element has the `incl` attribute, the corresponding substitution from the file will be used as the value. By default, the path to the file with substitutions is `/etc/metrika.xml`. This can be changed in the [include_from](../operations/server-configuration-parameters/settings.md#server_configuration_parameters-include_from) element in the server config. The substitution values are specified in `/yandex/substitution_name` elements in this file. If a substitution specified in `incl` does not exist, it is recorded in the log. To prevent ClickHouse from logging missing substitutions, specify the `optional="true"` attribute (for example, settings for [macros](../operations/server-configuration-parameters/settings.md)).
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@ -486,7 +486,7 @@ Parameter substitutions for replicated tables.

 Can be omitted if replicated tables are not used.

-For more information, see the section “[Creating replicated tables](../../engines/table-engines/mergetree-family/replication.md)”.
+For more information, see the section [Creating replicated tables](../../engines/table-engines/mergetree-family/replication.md#creating-replicated-tables).

 **Example**

@ -1247,6 +1247,7 @@ Default value: `/var/lib/clickhouse/access/`.
 Section of the configuration file that contains settings:
 -   Path to configuration file with predefined users.
 -   Path to folder where users created by SQL commands are stored.
+-   ZooKeeper node path where users created by SQL commands are stored and replicated (experimental).

 If this section is specified, the path from [users_config](../../operations/server-configuration-parameters/settings.md#users-config) and [access_control_path](../../operations/server-configuration-parameters/settings.md#access_control_path) won't be used.

@ -1262,6 +1263,9 @@ The `user_directories` section can contain any number of items, the order of the
    <local_directory>
        <path>/var/lib/clickhouse/access/</path>
    </local_directory>
+    <replicated>
+        <zookeeper_path>/clickhouse/access/</zookeeper_path>
+    </replicated>
 </user_directories>
 ```

--- a/docs/en/operations/storing-data.md
+++ b/docs/en/operations/storing-data.md
@ -5,10 +5,111 @@ toc_title: External Disks for Storing Data

 # External Disks for Storing Data {#external-disks}

-Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be expensive enough. To avoid that you can store the data remotely — on [Amazon s3](https://aws.amazon.com/s3/) disks or in the Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)). 
+Data, processed in ClickHouse, is usually stored in the local file system — on the same machine with the ClickHouse server. That requires large-capacity disks, which can be expensive enough. To avoid that you can store the data remotely — on [Amazon S3](https://aws.amazon.com/s3/) disks or in the Hadoop Distributed File System ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)). 

-To work with data stored on `Amazon s3` disks use [s3](../engines/table-engines/integrations/s3.md) table engine, and to work with data in the Hadoop Distributed File System — [HDFS](../engines/table-engines/integrations/hdfs.md) table engine. 
+To work with data stored on `Amazon S3` disks use [S3](../engines/table-engines/integrations/s3.md) table engine, and to work with data in the Hadoop Distributed File System — [HDFS](../engines/table-engines/integrations/hdfs.md) table engine. 

 ## Zero-copy Replication {#zero-copy}

-ClickHouse supports zero-copy replication for `s3` and `HDFS` disks, which means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself. 
+ClickHouse supports zero-copy replication for `S3` and `HDFS` disks, which means that if the data is stored remotely on several machines and needs to be synchronized, then only the metadata is replicated (paths to the data parts), but not the data itself.
+
+## Configuring HDFS {#configuring-hdfs}
+
+[MergeTree](../engines/table-engines/mergetree-family/mergetree.md) and [Log](../engines/table-engines/log-family/log.md) family table engines can store data to HDFS using a disk with type `HDFS`.
+
+Configuration markup:
+
+``` xml
+<yandex>
+    <storage_configuration>
+        <disks>
+            <hdfs>
+                <type>hdfs</type>
+                <endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
+            </hdfs>
+        </disks>
+        <policies>
+            <hdfs>
+                <volumes>
+                    <main>
+                        <disk>hdfs</disk>
+                    </main>
+                </volumes>
+            </hdfs>
+        </policies>
+    </storage_configuration>
+
+    <merge_tree>
+        <min_bytes_for_wide_part>0</min_bytes_for_wide_part>
+    </merge_tree>
+</yandex>
+```
+
+Required parameters:
+
+-   `endpoint` — HDFS endpoint URL in `path` format. Endpoint URL should contain a root path to store data.
+
+Optional parameters:
+
+-   `min_bytes_for_seek` — The minimal number of bytes to use seek operation instead of sequential read. Default value: `1 MB`.
+
+## Using Virtual File System for Data Encryption {#encrypted-virtual-file-system}
+
+You can encrypt the data stored on [S3](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-s3), or [HDFS](#configuring-hdfs) external disks, or on a local disk. To turn on the encryption mode, in the configuration file you must define a disk with the type `encrypted` and choose a disk on which the data will be saved. An `encrypted` disk ciphers all written files on the fly, and when you read files from an `encrypted` disk it deciphers them automatically. So you can work with an `encrypted` disk like with a normal one.
+
+Example of disk configuration:
+
+``` xml
+<disks>
+  <disk1>
+    <type>local</type>
+    <path>/path1/</path>
+  </disk1>
+  <disk2>
+    <type>encrypted</type>
+    <disk>disk1</disk>
+    <path>path2/</path>
+    <key>_16_ascii_chars_</key>
+  </disk2>
+</disks>
+```
+
+For example, when ClickHouse writes data from some table to a file `store/all_1_1_0/data.bin` to `disk1`, then in fact this file will be written to the physical disk along the path `/path1/store/all_1_1_0/data.bin`.
+
+When writing the same file to `disk2`, it will actually be written to the physical disk at the path `/path1/path2/store/all_1_1_0/data.bin` in encrypted mode.
+
+Required parameters:
+
+-   `type` — `encrypted`. Otherwise the encrypted disk is not created.
+-   `disk` — Type of disk for data storage.
+-   `key` — The key for encryption and decryption. Type: [UInt64](../sql-reference/data-types/int-uint.md). You can use the `key_hex` parameter to specify the key in hexadecimal form.
+    You can specify multiple keys using the `id` attribute (see the example below).
+
+Optional parameters:
+
+-   `path` — Path to the location on the disk where the data will be saved. If not specified, the data will be saved in the root directory.
+-   `current_key_id` — The key used for encryption. All the specified keys can be used for decryption, and you can always switch to another key while maintaining access to previously encrypted data.
+-   `algorithm` — [Algorithm](../sql-reference/statements/create/table.md#create-query-encryption-codecs) for encryption. Possible values: `AES_128_CTR`, `AES_192_CTR` or `AES_256_CTR`. Default value: `AES_128_CTR`. The key length depends on the algorithm: `AES_128_CTR` — 16 bytes, `AES_192_CTR` — 24 bytes, `AES_256_CTR` — 32 bytes.
+
+Example of disk configuration:
+
+``` xml
+<yandex>
+    <storage_configuration>
+        <disks>
+            <disk_s3>
+                <type>s3</type>
+                <endpoint>...
+            </disk_s3>
+            <disk_s3_encrypted>
+                <type>encrypted</type>
+                <disk>disk_s3</disk>
+                <algorithm>AES_128_CTR</algorithm>
+                <key_hex id="0">00112233445566778899aabbccddeeff</key_hex>
+                <key_hex id="1">ffeeddccbbaa99887766554433221100</key_hex>
+                <current_key_id>1</current_key_id>
+            </disk_s3_encrypted>
+        </disks>
+    </storage_configuration>
+</yandex>
+```
--- a/docs/en/operations/system-tables/opentelemetry_span_log.md
+++ b/docs/en/operations/system-tables/opentelemetry_span_log.md
@ -4,7 +4,7 @@ Contains information about [trace spans](https://opentracing.io/docs/overview/sp

 Columns:

-   `trace_id` ([UUID](../../sql-reference/data-types/uuid.md) — ID of the trace for executed query.
+-   `trace_id` ([UUID](../../sql-reference/data-types/uuid.md)) — ID of the trace for executed query.

 -   `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — ID of the `trace span`.

--- a/docs/en/operations/system-tables/zookeeper_log.md
+++ b/docs/en/operations/system-tables/zookeeper_log.md
@ -0,0 +1,129 @@
+# system.zookeeper_log {#system-zookeeper_log}
+
+This table contains information about the parameters of the request to the ZooKeeper server and the response from it.
+
+For requests, only columns with request parameters are filled in, and the remaining columns are filled with default values (`0` or `NULL`). When the response arrives, the data from the response is added to the other columns.
+
+Columns with request parameters:
+
+-   `type` ([Enum](../../sql-reference/data-types/enum.md)) — Event type in the ZooKeeper client. Can have one of the following values:
+    -   `Request` — The request has been sent.
+    -   `Response` — The response was received.
+    -   `Finalize` — The connection is lost, no response was received.
+-   `event_date` ([Date](../../sql-reference/data-types/date.md)) — The date when the event happened.
+-   `event_time` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — The date and time when the event happened.
+-   `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP address of ZooKeeper server that was used to make the request.
+-   `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — The port of ZooKeeper server that was used to make the request.
+-   `session_id` ([Int64](../../sql-reference/data-types/int-uint.md)) — The session ID that the ZooKeeper server sets for each connection.
+-   `xid` ([Int32](../../sql-reference/data-types/int-uint.md)) — The ID of the request within the session. This is usually a sequential request number. It is the same for the request row and the paired `response`/`finalize` row.
+-   `has_watch` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Whether the request has set a [watch](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html#ch_zkWatches).
+-   `op_num` ([Enum](../../sql-reference/data-types/enum.md)) — The type of request or response.
+-   `path` ([String](../../sql-reference/data-types/string.md)) — The path to the ZooKeeper node specified in the request, or an empty string if the request does not require specifying a path.
+-   `data` ([String](../../sql-reference/data-types/string.md)) — The data written to the ZooKeeper node (for the `SET` and `CREATE` requests — what the request wanted to write, for the response to the `GET` request — what was read) or an empty string.
+-   `is_ephemeral` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Is the ZooKeeper node being created as an [ephemeral](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html#Ephemeral+Nodes).
+-   `is_sequential` ([UInt8](../../sql-reference/data-types/int-uint.md)) — Is the ZooKeeper node being created as a [sequential](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html#Sequence+Nodes+--+Unique+Naming).
+-   `version` ([Nullable(Int32)](../../sql-reference/data-types/nullable.md)) — The version of the ZooKeeper node that the request expects when executing. This is supported for `CHECK`, `SET`, `REMOVE` requests (`-1` if the request does not check the version, or `NULL` for other requests that do not support version checking).
+-   `requests_size` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of requests included in the multi request (this is a special request that consists of several consecutive ordinary requests and executes them atomically). All requests included in multi request will have the same `xid`.
+-   `request_idx` ([UInt32](../../sql-reference/data-types/int-uint.md)) — The number of the request included in multi request (for multi request — `0`, then in order from `1`).
+
+Columns with request response parameters:
+
+-   `zxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — ZooKeeper transaction ID. The serial number issued by the ZooKeeper server in response to a successfully executed request (`0` if the request was not executed/returned an error/the client does not know whether the request was executed).
+-   `error` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — Error code. Can have many values, here are just some of them:
+    -   `ZOK` — The request was executed successfully.
+    -   `ZCONNECTIONLOSS` — The connection was lost.
+    -   `ZOPERATIONTIMEOUT` — The request execution timeout has expired.
+    -   `ZSESSIONEXPIRED` — The session has expired.
+    -   `NULL` — The request is completed.
+-   `watch_type` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — The type of the `watch` event (for responses with `op_num` = `Watch`), for the remaining responses: `NULL`.
+-   `watch_state` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — The status of the `watch` event (for responses with `op_num` = `Watch`), for the remaining responses: `NULL`.
+-   `path_created` ([String](../../sql-reference/data-types/string.md)) — The path to the created ZooKeeper node (for responses to the `CREATE` request), may differ from the `path` if the node is created as a `sequential`.
+-   `stat_czxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The `zxid` of the change that caused this ZooKeeper node to be created.
+-   `stat_mzxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The `zxid` of the change that last modified this ZooKeeper node.
+-   `stat_pzxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — The transaction ID of the change that last modified children of this ZooKeeper node.
+-   `stat_version` ([Int32](../../sql-reference/data-types/int-uint.md)) — The number of changes to the data of this ZooKeeper node.
+-   `stat_cversion` ([Int32](../../sql-reference/data-types/int-uint.md)) — The number of changes to the children of this ZooKeeper node.
+-   `stat_dataLength` ([Int32](../../sql-reference/data-types/int-uint.md)) — The length of the data field of this ZooKeeper node.
+-   `stat_numChildren` ([Int32](../../sql-reference/data-types/int-uint.md)) — The number of children of this ZooKeeper node.
+-   `children` ([Array(String)](../../sql-reference/data-types/array.md)) — The list of child ZooKeeper nodes (for responses to `LIST` request).
+
+**Example**
+
+Query:
+
+``` sql
+SELECT * FROM system.zookeeper_log WHERE (session_id = '106662742089334927') AND (xid = '10858') FORMAT Vertical;
+```
+
+Result:
+
+``` text
+Row 1:
+──────
+type:             Request
+event_date:       2021-08-09
+event_time:       2021-08-09 21:38:30.291792
+address:          ::
+port:             2181
+session_id:       106662742089334927
+xid:              10858
+has_watch:        1
+op_num:           List
+path:             /clickhouse/task_queue/ddl
+data:             
+is_ephemeral:     0
+is_sequential:    0
+version:          ᴺᵁᴸᴸ
+requests_size:    0
+request_idx:      0
+zxid:             0
+error:            ᴺᵁᴸᴸ
+watch_type:       ᴺᵁᴸᴸ
+watch_state:      ᴺᵁᴸᴸ
+path_created:     
+stat_czxid:       0
+stat_mzxid:       0
+stat_pzxid:       0
+stat_version:     0
+stat_cversion:    0
+stat_dataLength:  0
+stat_numChildren: 0
+children:         []
+
+Row 2:
+──────
+type:             Response
+event_date:       2021-08-09
+event_time:       2021-08-09 21:38:30.292086
+address:          ::
+port:             2181
+session_id:       106662742089334927
+xid:              10858
+has_watch:        1
+op_num:           List
+path:             /clickhouse/task_queue/ddl
+data:             
+is_ephemeral:     0
+is_sequential:    0
+version:          ᴺᵁᴸᴸ
+requests_size:    0
+request_idx:      0
+zxid:             16926267
+error:            ZOK
+watch_type:       ᴺᵁᴸᴸ
+watch_state:      ᴺᵁᴸᴸ
+path_created:     
+stat_czxid:       16925469
+stat_mzxid:       16925469
+stat_pzxid:       16926179
+stat_version:     0
+stat_cversion:    7
+stat_dataLength:  0
+stat_numChildren: 7
+children:         ['query-0000000006','query-0000000005','query-0000000004','query-0000000003','query-0000000002','query-0000000001','query-0000000000']
+```
+
+**See Also**
+
+-   [ZooKeeper](../../operations/tips.md#zookeeper)
+-   [ZooKeeper guide](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html)
--- a/docs/en/sql-reference/functions/geo/h3.md
+++ b/docs/en/sql-reference/functions/geo/h3.md
@ -197,7 +197,7 @@ Result:

 ## h3ToGeo {#h3togeo}

-Returns `(lon, lat)` that corresponds to the provided H3 index.
+Returns the geographical coordinates of longitude and latitude corresponding to the provided [H3](#h3index) index.

 **Syntax**

@ -207,20 +207,18 @@ h3ToGeo(h3Index)

 **Arguments**

-   `h3Index` — H3 Index. Type: [UInt64](../../../sql-reference/data-types/int-uint.md).
+-   `h3Index` — H3 Index. [UInt64](../../../sql-reference/data-types/int-uint.md).

 **Returned values**

-   `lon` — Longitude. Type: [Float64](../../../sql-reference/data-types/float.md).
-   `lat` — Latitude. Type: [Float64](../../../sql-reference/data-types/float.md).
-
+-   A tuple consisting of two values: `tuple(lon,lat)`. `lon` — Longitude. [Float64](../../../sql-reference/data-types/float.md). `lat` — Latitude. [Float64](../../../sql-reference/data-types/float.md).

 **Example**

 Query:

 ``` sql
-SELECT h3ToGeo(644325524701193974) coordinates;
+SELECT h3ToGeo(644325524701193974) AS coordinates;
 ```

 Result:
@ -230,6 +228,7 @@ Result:
 │ (37.79506616830252,55.71290243145668) │
 └───────────────────────────────────────┘
 ```
+
 ## h3kRing {#h3kring}

 Lists all the [H3](#h3index) hexagons in the raduis of `k` from the given hexagon in random order.
--- a/docs/en/sql-reference/functions/type-conversion-functions.md
+++ b/docs/en/sql-reference/functions/type-conversion-functions.md
@ -1339,3 +1339,149 @@ Result:
 │ 2,"good"                                  │
 └───────────────────────────────────────────┘
 ```
+
+## snowflakeToDateTime {#snowflakeToDateTime}
+
+Extracts the time from a snowflake ID and returns it as a `DateTime` value.
+
+**Syntax**
+
+``` sql
+snowflakeToDateTime(value [, time_zone])
+```
+
+**Parameters**
+
+-   `value` — `snowflake id`, Int64 value.
+-   `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The returned value is represented in this timezone. Optional. [String](../../sql-reference/data-types/string.md).
+
+**Returned value**
+
+-  value converted to the `DateTime` data type.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC');
+```
+
+Result:
+
+``` text
+
+┌─snowflakeToDateTime(CAST('1426860702823350272', 'Int64'), 'UTC')─┐
+│                                              2021-08-15 10:57:56 │
+└──────────────────────────────────────────────────────────────────┘
+```
+
+## snowflakeToDateTime64 {#snowflakeToDateTime64}
+
+Extracts the time from a snowflake ID and returns it as a `DateTime64` value.
+
+**Syntax**
+
+``` sql
+snowflakeToDateTime64(value [, time_zone])
+```
+
+**Parameters**
+
+-   `value` — `snowflake id`, Int64 value.
+-   `time_zone` — [Timezone](../../operations/server-configuration-parameters/settings.md#server_configuration_parameters-timezone). The returned value is represented in this timezone. Optional. [String](../../sql-reference/data-types/string.md).
+
+**Returned value**
+
+-  value converted to the `DateTime64` data type.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC');
+```
+
+Result:
+
+``` text
+
+┌─snowflakeToDateTime64(CAST('1426860802823350272', 'Int64'), 'UTC')─┐
+│                                            2021-08-15 10:58:19.841 │
+└────────────────────────────────────────────────────────────────────┘
+```
+
+## dateTimeToSnowflake {#dateTimeToSnowflake}
+
+Converts a DateTime value to the first snowflake ID at the given time.
+
+**Syntax**
+
+``` sql
+dateTimeToSnowflake(value)
+```
+
+**Parameters**
+
+-   `value` — Date and time. [DateTime](../../sql-reference/data-types/datetime.md).
+
+
+**Returned value**
+
+-   `value` converted to the `Int64` data type as the first snowflake id at that time.
+
+**Example**
+
+Query:
+
+``` sql
+WITH toDateTime('2021-08-15 18:57:56', 'Asia/Shanghai') AS dt
+SELECT dateTimeToSnowflake(dt);
+```
+
+Result:
+
+``` text
+
+┌─dateTimeToSnowflake(dt)─┐
+│     1426860702823350272 │
+└─────────────────────────┘
+```
+
+
+## dateTime64ToSnowflake {#dateTime64ToSnowflake}
+
+Converts a DateTime64 value to the first snowflake ID at the given time.
+
+**Syntax**
+
+``` sql
+dateTime64ToSnowflake(value)
+```
+
+**Parameters**
+
+-   `value` — Date and time. [DateTime64](../../sql-reference/data-types/datetime64.md).
+
+
+**Returned value**
+
+-   `value` converted to the `Int64` data type as the first snowflake id at that time.
+
+**Example**
+
+Query:
+
+``` sql
+WITH toDateTime64('2021-08-15 18:57:56.492', 3, 'Asia/Shanghai') AS dt64
+SELECT dateTime64ToSnowflake(dt64);
+```
+
+Result:
+
+``` text
+┌─dateTime64ToSnowflake(dt64)─┐
+│         1426860704886947840 │
+└─────────────────────────────┘
+```
--- a/docs/en/sql-reference/statements/explain.md
+++ b/docs/en/sql-reference/statements/explain.md
@ -384,5 +384,32 @@ ExpressionTransform
            (ReadFromStorage)
            NumbersMt × 2 0 → 1
 ```
+### EXPLAIN ESTIMATE {#explain-estimate}
+
+Shows the estimated number of rows, marks and parts to be read from the tables while processing the query. Works with tables in the [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree) family. 
+
+**Example**
+
+Creating a table:
+
+```sql
+CREATE TABLE ttt (i Int64) ENGINE = MergeTree() ORDER BY i SETTINGS index_granularity = 16, write_final_mark = 0;
+INSERT INTO ttt SELECT number FROM numbers(128);
+OPTIMIZE TABLE ttt;
+```
+
+Query:
+
+```sql
+EXPLAIN ESTIMATE SELECT * FROM ttt;
+```
+
+Result:
+
+```text
+┌─database─┬─table─┬─parts─┬─rows─┬─marks─┐
+│ default  │ ttt   │     1 │  128 │     8 │
+└──────────┴───────┴───────┴──────┴───────┘
+```

 [Оriginal article](https://clickhouse.tech/docs/en/sql-reference/statements/explain/) <!--hide-->
--- a/docs/en/sql-reference/statements/system.md
+++ b/docs/en/sql-reference/statements/system.md
@ -311,12 +311,12 @@ One may execute query after:
  - Individual replica path `/replicas/replica_name/` loss.

 Replica attaches locally found parts and sends info about them to Zookeeper.
-Parts present on replica before metadata loss are not re-fetched from other replicas if not being outdated
-(so replica restoration does not mean re-downloading all data over the network).
+Parts present on a replica before metadata loss are not re-fetched from other ones if not being outdated (so replica restoration does not mean re-downloading all data over the network).

-Caveat: parts in all states are moved to `detached/` folder. Parts active before data loss (Committed) are attached.
+!!! warning "Warning"
+    Parts in all states are moved to `detached/` folder. Parts active before data loss (committed) are attached.

-#### Syntax
+**Syntax**

 ```sql
 SYSTEM RESTORE REPLICA [db.]replicated_merge_tree_family_table_name [ON CLUSTER cluster_name]
@ -328,11 +328,11 @@ Alternative syntax:
 SYSTEM RESTORE REPLICA [ON CLUSTER cluster_name] [db.]replicated_merge_tree_family_table_name
 ```

-#### Example
+**Example**
+
+Creating a table on multiple servers. After the replica's metadata in ZooKeeper is lost, the table will attach as read-only as metadata is missing. The last query needs to execute on every replica.

 ```sql
-- Creating table on multiple servers
-
 CREATE TABLE test(n UInt32)
 ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/', '{replica}')
 ORDER BY n PARTITION BY n % 10;
@ -341,8 +341,14 @@ INSERT INTO test SELECT * FROM numbers(1000);

 -- zookeeper_delete_path("/clickhouse/tables/test", recursive=True) <- root loss.

-SYSTEM RESTART REPLICA test; -- Table will attach as readonly as metadata is missing.
-SYSTEM RESTORE REPLICA test; -- Need to execute on every replica, another way: RESTORE REPLICA test ON CLUSTER cluster
+SYSTEM RESTART REPLICA test;
+SYSTEM RESTORE REPLICA test;
+```
+
+Another way:
+
+```sql
+SYSTEM RESTORE REPLICA test ON CLUSTER cluster;
 ```

 ### RESTART REPLICAS {#query_language-system-restart-replicas}
--- a/docs/en/sql-reference/table-functions/cluster.md
+++ b/docs/en/sql-reference/table-functions/cluster.md
@ -6,12 +6,13 @@ toc_title: cluster
 # cluster, clusterAllReplicas {#cluster-clusterallreplicas}

 Allows to access all shards in an existing cluster which configured in `remote_servers` section without creating a [Distributed](../../engines/table-engines/special/distributed.md) table. One replica of each shard is queried.
-`clusterAllReplicas` - same as `cluster` but all replicas are queried. Each replica in a cluster is used as separate shard/connection.
+
+`clusterAllReplicas` function — same as `cluster`, but all replicas are queried. Each replica in a cluster is used as a separate shard/connection.

 !!! note "Note"
-    All available clusters are listed in the `system.clusters` table.
+    All available clusters are listed in the [system.clusters](../../operations/system-tables/clusters.md) table.

-Signatures:
+**Syntax**

 ``` sql
 cluster('cluster_name', db.table[, sharding_key])
@ -19,10 +20,27 @@ cluster('cluster_name', db, table[, sharding_key])
 clusterAllReplicas('cluster_name', db.table[, sharding_key])
 clusterAllReplicas('cluster_name', db, table[, sharding_key])
 ```
+**Arguments**

-`cluster_name` – Name of a cluster that is used to build a set of addresses and connection parameters to remote and local servers.
+- `cluster_name` – Name of a cluster that is used to build a set of addresses and connection parameters to remote and local servers. 
+- `db.table` or `db`, `table` - Name of a database and a table.  
+- `sharding_key` -  A sharding key. Optional. Needs to be specified if the cluster has more than one shard. 

-`sharding_key` - When insert into cluster function with more than one shard, sharding_key need to be provided.
+**Returned value**
+
+The dataset from clusters.
+
+**Using Macros**
+
+`cluster_name` can contain macros — substitution in curly brackets. The substituted value is taken from the [macros](../../operations/server-configuration-parameters/settings.md#macros) section of the server configuration file.
+
+Example:
+
+```sql
+SELECT * FROM cluster('{cluster}', default.example_table);
+```
+
+**Usage and Recommendations**

 Using the `cluster` and `clusterAllReplicas` table functions are less efficient than creating a `Distributed` table because in this case, the server connection is re-established for every request. When processing a large number of queries, please always create the `Distributed` table ahead of time, and do not use the `cluster` and `clusterAllReplicas` table functions.

--- a/docs/ru/engines/database-engines/replicated.md
+++ b/docs/ru/engines/database-engines/replicated.md
@ -1,3 +1,7 @@
+---
+toc_priority: 36
+toc_title: Replicated
+---

 # [экспериментальный] Replicated {#replicated}

--- a/docs/ru/engines/table-engines/mergetree-family/graphitemergetree.md
+++ b/docs/ru/engines/table-engines/mergetree-family/graphitemergetree.md
@ -38,9 +38,7 @@ CREATE TABLE [IF NOT EXISTS] [db.]table_name [ON CLUSTER cluster]

 -   Значение метрики. Тип данных: любой числовой.

-   Версия метрики. Тип данных: любой числовой.
-
-        ClickHouse сохраняет строки с последней версией или последнюю записанную строку, если версии совпадают. Другие строки удаляются при слиянии кусков данных.
+-   Версия метрики. Тип данных: любой числовой (ClickHouse сохраняет строки с последней версией или последнюю записанную строку, если версии совпадают. Другие строки удаляются при слиянии кусков данных).

 Имена этих столбцов должны быть заданы в конфигурации rollup.

@ -173,4 +171,4 @@ default


 !!! warning "Внимание"
-    Прореживание данных производится во время слияний. Обычно для старых партций слияния не запускаются, поэтому для прореживания надо иницировать незапланированное слияние используя [optimize](../../../sql-reference/statements/optimize/). Или использовать дополнительные инструменты, например [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer).
+    Прореживание данных производится во время слияний. Обычно для старых партиций слияния не запускаются, поэтому для прореживания надо инициировать незапланированное слияние, используя [optimize](../../../sql-reference/statements/optimize.md). Или использовать дополнительные инструменты, например [graphite-ch-optimizer](https://github.com/innogames/graphite-ch-optimizer).
--- a/docs/ru/engines/table-engines/mergetree-family/mergetree.md
+++ b/docs/ru/engines/table-engines/mergetree-family/mergetree.md
@ -827,44 +827,3 @@ SETTINGS storage_policy = 'moving_from_ssd_to_hdd'
 ```

 Если диск сконфигурирован как `cold`, данные будут переноситься в S3 при срабатывании правил TTL или когда свободное место на локальном диске станет меньше порогового значения, которое определяется как `move_factor * disk_size`.
-
-## Использование сервиса HDFS для хранения данных {#table_engine-mergetree-hdfs}
-
-[HDFS](https://hadoop.apache.org/docs/r1.2.1/hdfs_design.html) — это распределенная файловая система для удаленного хранения данных.
-
-Таблицы семейства `MergeTree` могут хранить данные в сервисе HDFS при использовании диска типа `HDFS`.
-
-Пример конфигурации:
-``` xml
-<yandex>
-    <storage_configuration>
-        <disks>
-            <hdfs>
-                <type>hdfs</type>
-                <endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
-            </hdfs>
-        </disks>
-        <policies>
-            <hdfs>
-                <volumes>
-                    <main>
-                        <disk>hdfs</disk>
-                    </main>
-                </volumes>
-            </hdfs>
-        </policies>
-    </storage_configuration>
-
-    <merge_tree>
-        <min_bytes_for_wide_part>0</min_bytes_for_wide_part>
-    </merge_tree>
-</yandex>
-```
-
-Обязательные параметры:
-
-   `endpoint` — URL точки приема запроса на стороне HDFS в формате `path`. URL точки должен содержать путь к корневой директории на сервере, где хранятся данные.
-
-Необязательные параметры:
-
-   `min_bytes_for_seek` — минимальное количество байтов, которые используются для операций поиска вместо последовательного чтения. Значение по умолчанию: 1 МБайт.
--- a/docs/ru/engines/table-engines/mergetree-family/replication.md
+++ b/docs/ru/engines/table-engines/mergetree-family/replication.md
@ -102,7 +102,7 @@ CREATE TABLE table_name
 ) ENGINE = ReplicatedReplacingMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', ver)
 PARTITION BY toYYYYMM(EventDate)
 ORDER BY (CounterID, EventDate, intHash32(UserID))
-SAMPLE BY intHash32(UserID)
+SAMPLE BY intHash32(UserID);
 ```

 <details markdown="1">
@ -115,12 +115,12 @@ CREATE TABLE table_name
    EventDate DateTime,
    CounterID UInt32,
    UserID UInt32
-) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192)
+) ENGINE = ReplicatedMergeTree('/clickhouse/tables/{layer}-{shard}/table_name', '{replica}', EventDate, intHash32(UserID), (CounterID, EventDate, intHash32(UserID), EventTime), 8192);
 ```

 </details>

-Как видно в примере, эти параметры могут содержать подстановки в фигурных скобках. Подставляемые значения достаются из конфигурационного файла, из секции «[macros](../../../operations/server-configuration-parameters/settings/#macros)».
+Как видно в примере, эти параметры могут содержать подстановки в фигурных скобках. Эти подстановки заменяются на соответствующие значения из конфигурационного файла, из секции [macros](../../../operations/server-configuration-parameters/settings.md#macros).

 Пример:

--- a/docs/ru/engines/table-engines/special/buffer.md
+++ b/docs/ru/engines/table-engines/special/buffer.md
@ -48,7 +48,10 @@ CREATE TABLE merge.hits_buffer AS merge.hits ENGINE = Buffer(merge, hits, 16, 10
 Если у одного из столбцов таблицы Buffer и подчинённой таблицы не совпадает тип, то в лог сервера будет записано сообщение об ошибке и буфер будет очищен.
 То же самое происходит, если подчинённая таблица не существует в момент сброса буфера.

-Если есть необходимость выполнить ALTER для подчинённой таблицы и для таблицы Buffer, то рекомендуется удалить таблицу Buffer, затем выполнить ALTER подчинённой таблицы, а затем создать таблицу Buffer заново.
+Если есть необходимость выполнить ALTER для подчинённой таблицы и для таблицы Buffer, то рекомендуется удалить таблицу Buffer, затем выполнить ALTER подчинённой таблицы, а после создать таблицу Buffer заново. 
+
+!!! attention "Внимание"
+    В релизах до 28 сентября 2020 года выполнение ALTER на таблице Buffer ломает структуру блоков и вызывает ошибку (см. [#15117](https://github.com/ClickHouse/ClickHouse/issues/15117)), поэтому удаление буфера и его пересоздание — единственный вариант миграции для данного движка. Перед выполнением ALTER на таблице Buffer убедитесь, что в вашей версии эта ошибка устранена.

 При нештатном перезапуске сервера, данные, находящиеся в буфере, будут потеряны.

--- a/docs/ru/operations/server-configuration-parameters/settings.md
+++ b/docs/ru/operations/server-configuration-parameters/settings.md
@ -465,9 +465,9 @@ ClickHouse проверяет условия для `min_part_size` и `min_part

 Подстановки параметров реплицируемых таблиц.

-Можно не указывать, если реплицируемых таблицы не используются.
+Можно не указывать, если реплицируемые таблицы не используются.

-Подробнее смотрите в разделе «[Создание реплицируемых таблиц](../../engines/table-engines/mergetree-family/replication.md)».
+Подробнее смотрите в разделе [Создание реплицируемых таблиц](../../engines/table-engines/mergetree-family/replication.md#creating-replicated-tables).

 **Пример**

--- a/docs/ru/operations/storing-data.md
+++ b/docs/ru/operations/storing-data.md
@ -5,10 +5,110 @@ toc_title: "Хранение данных на внешних дисках"

 # Хранение данных на внешних дисках {#external-disks}

-Данные, которые обрабатываются в ClickHouse, обычно хранятся в файловой системе локально, где развернут сервер ClickHouse. При этом для хранения данных требуются диски большого объема, которые могут быть довольно дорогостоящими. Решением проблемы может стать хранение данных отдельно от сервера — в распределенных файловых системах — [Amazon s3](https://aws.amazon.com/s3/) или Hadoop ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)). 
+Данные, которые обрабатываются в ClickHouse, обычно хранятся в файловой системе локально, где развернут сервер ClickHouse. При этом для хранения данных требуются диски большого объема, которые могут быть довольно дорогостоящими. Решением проблемы может стать хранение данных отдельно от сервера — в распределенных файловых системах — [Amazon S3](https://aws.amazon.com/s3/) или Hadoop ([HDFS](https://hadoop.apache.org/docs/current/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html)). 

-Для работы с данными, хранящимися в файловой системе `Amazon s3`, используйте движок [s3](../engines/table-engines/integrations/s3.md), а для работы с данными в файловой системе Hadoop — движок [HDFS](../engines/table-engines/integrations/hdfs.md). 
+Для работы с данными, хранящимися в файловой системе `Amazon S3`, используйте движок [S3](../engines/table-engines/integrations/s3.md), а для работы с данными в файловой системе Hadoop — движок [HDFS](../engines/table-engines/integrations/hdfs.md). 

 ## Репликация без копирования данных {#zero-copy}

-Для дисков `s3` и `HDFS` в ClickHouse поддерживается репликация без копирования данных (zero-copy): если данные хранятся на нескольких репликах, то при синхронизации пересылаются только метаданные (пути к кускам данных), а сами данные не копируются.
+Для дисков `S3` и `HDFS` в ClickHouse поддерживается репликация без копирования данных (zero-copy): если данные хранятся на нескольких репликах, то при синхронизации пересылаются только метаданные (пути к кускам данных), а сами данные не копируются.
+
+## Использование сервиса HDFS для хранения данных {#table_engine-mergetree-hdfs}
+
+Таблицы семейств [MergeTree](../engines/table-engines/mergetree-family/mergetree.md) и [Log](../engines/table-engines/log-family/log.md) могут хранить данные в сервисе HDFS при использовании диска типа `HDFS`.
+
+Пример конфигурации:
+``` xml
+<yandex>
+    <storage_configuration>
+        <disks>
+            <hdfs>
+                <type>hdfs</type>
+                <endpoint>hdfs://hdfs1:9000/clickhouse/</endpoint>
+            </hdfs>
+        </disks>
+        <policies>
+            <hdfs>
+                <volumes>
+                    <main>
+                        <disk>hdfs</disk>
+                    </main>
+                </volumes>
+            </hdfs>
+        </policies>
+    </storage_configuration>
+
+    <merge_tree>
+        <min_bytes_for_wide_part>0</min_bytes_for_wide_part>
+    </merge_tree>
+</yandex>
+```
+
+Обязательные параметры:
+
+-   `endpoint` — URL точки приема запроса на стороне HDFS в формате `path`. URL точки должен содержать путь к корневой директории на сервере, где хранятся данные.
+
+Необязательные параметры:
+
+-   `min_bytes_for_seek` — минимальное количество байтов, которые используются для операций поиска вместо последовательного чтения. Значение по умолчанию: `1 МБайт`.
+
+## Использование виртуальной файловой системы для шифрования данных {#encrypted-virtual-file-system}
+
+Вы можете зашифровать данные, сохраненные на внешних дисках [S3](../engines/table-engines/mergetree-family/mergetree.md#table_engine-mergetree-s3) или [HDFS](#table_engine-mergetree-hdfs) или на локальном диске. Чтобы включить режим шифрования, в конфигурационном файле вы должны указать диск с типом `encrypted` и тип диска, на котором будут сохранены данные. Диск типа `encrypted` шифрует данные "на лету", то есть при чтении файлов с этого диска расшифровка происходит автоматически. Таким образом, вы можете работать с диском типа `encrypted` как с обычным.
+
+Пример конфигурации:
+
+``` xml
+<disks>
+  <disk1>
+    <type>local</type>
+    <path>/path1/</path>
+  </disk1>
+  <disk2>
+    <type>encrypted</type>
+    <disk>disk1</disk>
+    <path>path2/</path>
+    <key>_16_ascii_chars_</key>
+  </disk2>
+</disks>
+```
+
+Например, когда ClickHouse записывает данные из какой-либо таблицы в файл `store/all_1_1_0/data.bin` на `disk1`, то на самом деле этот файл будет записан на физический диск по пути `/path1/store/all_1_1_0/data.bin`.
+
+При записи того же файла на диск `disk2` он будет записан на физический диск в зашифрованном виде по пути `/path1/path2/store/all_1_1_0/data.bin`.
+
+Обязательные параметры:
+
+-   `type` — `encrypted`. Иначе зашифрованный диск создан не будет.
+-   `disk` — тип диска для хранения данных.
+-   `key` — ключ для шифрования и расшифровки. Тип: [UInt64](../sql-reference/data-types/int-uint.md). Вы можете использовать параметр `key_hex` для шифрования в шестнадцатеричной форме.
+    Вы можете указать несколько ключей, используя атрибут `id` (смотрите пример ниже).
+
+Необязательные параметры:
+
+-   `path` — путь к месту на диске, где будут сохранены данные. Если не указан, данные будут сохранены в корневом каталоге.
+-   `current_key_id` — ключ, используемый для шифрования. Все указанные ключи могут быть использованы для расшифровки, и вы всегда можете переключиться на другой ключ, сохраняя доступ к ранее зашифрованным данным.
+-   `algorithm` — [алгоритм](../sql-reference/statements/create/table.md#create-query-encryption-codecs) шифрования данных. Возможные значения: `AES_128_CTR`, `AES_192_CTR` или `AES_256_CTR`. Значение по умолчанию: `AES_128_CTR`. Длина ключа зависит от алгоритма: `AES_128_CTR` — 16 байт, `AES_192_CTR` — 24 байта, `AES_256_CTR` — 32 байта.
+
+Пример конфигурации:
+
+``` xml
+<yandex>
+    <storage_configuration>
+        <disks>
+            <disk_s3>
+                <type>s3</type>
+                <endpoint>...
+            </disk_s3>
+            <disk_s3_encrypted>
+                <type>encrypted</type>
+                <disk>disk_s3</disk>
+                <algorithm>AES_128_CTR</algorithm>
+                <key_hex id="0">00112233445566778899aabbccddeeff</key_hex>
+                <key_hex id="1">ffeeddccbbaa99887766554433221100</key_hex>
+                <current_key_id>1</current_key_id>
+            </disk_s3_encrypted>
+        </disks>
+    </storage_configuration>
+</yandex>
+```
--- a/docs/ru/operations/system-tables/opentelemetry_span_log.md
+++ b/docs/ru/operations/system-tables/opentelemetry_span_log.md
@ -4,7 +4,7 @@

 Столбцы:

-   `trace_id` ([UUID](../../sql-reference/data-types/uuid.md) — идентификатор трассировки для выполненного запроса.
+-   `trace_id` ([UUID](../../sql-reference/data-types/uuid.md)) — идентификатор трассировки для выполненного запроса.

 -   `span_id` ([UInt64](../../sql-reference/data-types/int-uint.md)) — идентификатор `trace span`.

--- a/docs/ru/operations/system-tables/zookeeper_log.md
+++ b/docs/ru/operations/system-tables/zookeeper_log.md
@ -0,0 +1,129 @@
+# system.zookeeper_log {#system-zookeeper_log}
+
+Эта таблица содержит информацию о параметрах запроса к серверу ZooKeeper и ответа от него.
+
+Для запросов заполняются только столбцы с параметрами запроса, а остальные столбцы заполняются значениями по умолчанию (`0` или `NULL`). Когда поступает ответ, данные добавляются в столбцы с параметрами ответа на запрос.
+
+Столбцы с параметрами запроса:
+
+-   `type` ([Enum](../../sql-reference/data-types/enum.md)) — тип события в клиенте ZooKeeper. Может иметь одно из следующих значений:
+    -   `Request` — запрос отправлен.
+    -   `Response` — ответ получен.
+    -   `Finalize` — соединение разорвано, ответ не получен.
+-   `event_date` ([Date](../../sql-reference/data-types/date.md)) — дата, когда произошло событие.
+-   `event_time` ([DateTime64](../../sql-reference/data-types/datetime64.md)) — дата и время, когда произошло событие.
+-   `address` ([IPv6](../../sql-reference/data-types/domains/ipv6.md)) — IP адрес сервера ZooKeeper, с которого был сделан запрос.
+-   `port` ([UInt16](../../sql-reference/data-types/int-uint.md)) — порт сервера ZooKeeper, с которого был сделан запрос.
+-   `session_id` ([Int64](../../sql-reference/data-types/int-uint.md)) — идентификатор сессии, который сервер ZooKeeper создает для каждого соединения.
+-   `xid` ([Int32](../../sql-reference/data-types/int-uint.md)) — идентификатор запроса внутри сессии. Обычно это последовательный номер запроса, одинаковый у строки запроса и у парной строки `response`/`finalize`.
+-   `has_watch` ([UInt8](../../sql-reference/data-types/int-uint.md)) — установлен ли запрос [watch](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html#ch_zkWatches).
+-   `op_num` ([Enum](../../sql-reference/data-types/enum.md)) — тип запроса или ответа на запрос.
+-   `path` ([String](../../sql-reference/data-types/string.md)) — путь к узлу ZooKeeper, указанный в запросе. Пустая строка, если запрос не требует указания пути.
+-   `data` ([String](../../sql-reference/data-types/string.md)) — данные, записанные на узле ZooKeeper (для запросов `SET` и `CREATE` — что запрос хотел записать, для ответа на запрос `GET` — что было прочитано), или пустая строка.
+-   `is_ephemeral` ([UInt8](../../sql-reference/data-types/int-uint.md)) — создается ли узел ZooKeeper как [ephemeral](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html#Ephemeral+Nodes).
+-   `is_sequential` ([UInt8](../../sql-reference/data-types/int-uint.md)) — создается ли узел ZooKeeper как [sequential](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html#Sequence+Nodes+--+Unique+Naming).
+-   `version` ([Nullable(Int32)](../../sql-reference/data-types/nullable.md)) — версия узла ZooKeeper, которую запрос ожидает увидеть при выполнении. Поддерживается для запросов `CHECK`, `SET`, `REMOVE` (`-1` — запрос не проверяет версию, `NULL` — для других запросов, которые не поддерживают проверку версии).
+-   `requests_size` ([UInt32](../../sql-reference/data-types/int-uint.md)) — количество запросов, включенных в мультизапрос (это специальный запрос, который состоит из нескольких последовательных обычных запросов, выполняющихся атомарно). Все запросы, включенные в мультизапрос, имеют одинаковый `xid`.
+-   `request_idx` ([UInt32](../../sql-reference/data-types/int-uint.md)) — номер запроса, включенного в мультизапрос (`0` — для мультизапроса, далее по порядку с `1`).
+
+Столбцы с параметрами ответа на запрос:
+
+-   `zxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — идентификатор транзакции в ZooKeeper. Последовательный номер, выданный сервером ZooKeeper в ответе на успешно выполненный запрос (`0` — запрос не был выполнен, возвращена ошибка или клиент ZooKeeper не знает, был ли выполнен запрос).
+-   `error` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — код ошибки. Может иметь много значений, здесь приведены только некоторые из них:
+    -   `ZOK` — запрос успешно выполнен.
+    -   `ZCONNECTIONLOSS` — соединение разорвано.
+    -   `ZOPERATIONTIMEOUT` — истекло время ожидания выполнения запроса.
+    -   `ZSESSIONEXPIRED` — истекло время сессии.
+    -   `NULL` — выполнен запрос.
+-   `watch_type` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — тип события `watch` (для ответов на запрос при `op_num` = `Watch`), для остальных ответов: `NULL`.
+-   `watch_state` ([Nullable(Enum)](../../sql-reference/data-types/nullable.md)) — статус события `watch` (для ответов на запрос при `op_num` = `Watch`), для остальных ответов: `NULL`.
+-   `path_created` ([String](../../sql-reference/data-types/string.md)) — путь к созданному узлу ZooKeeper (для ответов на запрос `CREATE`). Может отличаться от `path`, если узел создается как `sequential`.
+-   `stat_czxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — идентификатор транзакции, в результате которой был создан узел ZooKeeper.
+-   `stat_mzxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — идентификатор транзакции, которая последней модифицировала узел ZooKeeper.
+-   `stat_pzxid` ([Int64](../../sql-reference/data-types/int-uint.md)) — идентификатор транзакции, которая последней модифицировала дочерние узлы ZooKeeper.
+-   `stat_version` ([Int32](../../sql-reference/data-types/int-uint.md)) — количество изменений в данных узла ZooKeeper.
+-   `stat_cversion` ([Int32](../../sql-reference/data-types/int-uint.md)) — количество изменений в дочерних узлах ZooKeeper.
+-   `stat_dataLength` ([Int32](../../sql-reference/data-types/int-uint.md)) — длина поля данных узла ZooKeeper.
+-   `stat_numChildren` ([Int32](../../sql-reference/data-types/int-uint.md)) — количество дочерних узлов ZooKeeper.
+-   `children` ([Array(String)](../../sql-reference/data-types/array.md)) — список дочерних узлов ZooKeeper (для ответов на запрос `LIST`).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT * FROM system.zookeeper_log WHERE (session_id = '106662742089334927') AND (xid = '10858') FORMAT Vertical;
+```
+
+Результат:
+
+``` text
+Row 1:
+──────
+type:             Request
+event_date:       2021-08-09
+event_time:       2021-08-09 21:38:30.291792
+address:          ::
+port:             2181
+session_id:       106662742089334927
+xid:              10858
+has_watch:        1
+op_num:           List
+path:             /clickhouse/task_queue/ddl
+data:             
+is_ephemeral:     0
+is_sequential:    0
+version:          ᴺᵁᴸᴸ
+requests_size:    0
+request_idx:      0
+zxid:             0
+error:            ᴺᵁᴸᴸ
+watch_type:       ᴺᵁᴸᴸ
+watch_state:      ᴺᵁᴸᴸ
+path_created:     
+stat_czxid:       0
+stat_mzxid:       0
+stat_pzxid:       0
+stat_version:     0
+stat_cversion:    0
+stat_dataLength:  0
+stat_numChildren: 0
+children:         []
+
+Row 2:
+──────
+type:             Response
+event_date:       2021-08-09
+event_time:       2021-08-09 21:38:30.292086
+address:          ::
+port:             2181
+session_id:       106662742089334927
+xid:              10858
+has_watch:        1
+op_num:           List
+path:             /clickhouse/task_queue/ddl
+data:             
+is_ephemeral:     0
+is_sequential:    0
+version:          ᴺᵁᴸᴸ
+requests_size:    0
+request_idx:      0
+zxid:             16926267
+error:            ZOK
+watch_type:       ᴺᵁᴸᴸ
+watch_state:      ᴺᵁᴸᴸ
+path_created:     
+stat_czxid:       16925469
+stat_mzxid:       16925469
+stat_pzxid:       16926179
+stat_version:     0
+stat_cversion:    7
+stat_dataLength:  0
+stat_numChildren: 7
+children:         ['query-0000000006','query-0000000005','query-0000000004','query-0000000003','query-0000000002','query-0000000001','query-0000000000']
+```
+
+**См. также**
+
+-   [ZooKeeper](../../operations/tips.md#zookeeper)
+-   [Руководство по ZooKeeper](https://zookeeper.apache.org/doc/r3.3.3/zookeeperProgrammers.html)
--- a/docs/ru/sql-reference/functions/geo/h3.md
+++ b/docs/ru/sql-reference/functions/geo/h3.md
@ -193,6 +193,40 @@ SELECT geoToH3(37.79506683, 55.71290588, 15) as h3Index;
 └────────────────────┘
 ```

+## h3ToGeo {#h3togeo}
+
+Возвращает географические координаты долготы и широты, соответствующие указанному [H3](#h3index)-индексу.
+
+**Синтаксис**
+
+``` sql
+h3ToGeo(h3Index)
+```
+
+**Аргументы**
+
+-   `h3Index` — [H3](#h3index)-индекс. [UInt64](../../../sql-reference/data-types/int-uint.md).
+
+**Возвращаемые значения**
+
+-   кортеж из двух значений: `tuple(lon,lat)`, где `lon` — долгота [Float64](../../../sql-reference/data-types/float.md), `lat` — широта [Float64](../../../sql-reference/data-types/float.md).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT h3ToGeo(644325524701193974) coordinates;
+```
+
+Результат:
+
+``` text
+┌─coordinates───────────────────────────┐
+│ (37.79506616830252,55.71290243145668) │
+└───────────────────────────────────────┘
+```
+
 ## h3kRing {#h3kring}

 Возвращает [H3](#h3index)-индексы шестигранников в радиусе `k` от данного в произвольном порядке.
--- a/docs/ru/sql-reference/statements/explain.md
+++ b/docs/ru/sql-reference/statements/explain.md
@ -385,4 +385,32 @@ ExpressionTransform
            NumbersMt × 2 0 → 1
 ```

+### EXPLAIN ESTIMATE {#explain-estimate}
+
+ Отображает оценки числа строк, засечек и кусков, которые будут прочитаны при выполнении запроса. Применяется для таблиц семейства [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md#table_engines-mergetree). 
+
+**Пример**
+
+Создадим таблицу:
+
+```sql
+CREATE TABLE ttt (i Int64) ENGINE = MergeTree() ORDER BY i SETTINGS index_granularity = 16, write_final_mark = 0;
+INSERT INTO ttt SELECT number FROM numbers(128);
+OPTIMIZE TABLE ttt;
+```
+
+Запрос:
+
+```sql
+EXPLAIN ESTIMATE SELECT * FROM ttt;
+```
+
+Результат:
+
+```text
+┌─database─┬─table─┬─parts─┬─rows─┬─marks─┐
+│ default  │ ttt   │     1 │  128 │     8 │
+└──────────┴───────┴───────┴──────┴───────┘
+```
+
 [Оригинальная статья](https://clickhouse.tech/docs/ru/sql-reference/statements/explain/) <!--hide-->
--- a/docs/ru/sql-reference/statements/system.md
+++ b/docs/ru/sql-reference/statements/system.md
@ -36,6 +36,7 @@ toc_title: SYSTEM
 -   [START REPLICATION QUEUES](#query_language-system-start-replication-queues)
 -   [SYNC REPLICA](#query_language-system-sync-replica)
 -   [RESTART REPLICA](#query_language-system-restart-replica)
+-   [RESTORE REPLICA](#query_language-system-restore-replica)
 -   [RESTART REPLICAS](#query_language-system-restart-replicas)

 ## RELOAD EMBEDDED DICTIONARIES] {#query_language-system-reload-emdedded-dictionaries}
@ -287,13 +288,66 @@ SYSTEM SYNC REPLICA [db.]replicated_merge_tree_family_table_name

 ### RESTART REPLICA {#query_language-system-restart-replica}

-Реинициализация состояния Zookeeper-сессий для таблицы семейства `ReplicatedMergeTree`. Сравнивает текущее состояние с тем, что хранится в Zookeeper, как источник правды, и добавляет задачи в очередь репликации в Zookeeper, если необходимо.
-Инициализация очереди репликации на основе данных ZooKeeper происходит так же, как при attach table. На короткое время таблица станет недоступной для любых операций.
+Реинициализирует состояние сессий ZooKeeper для таблицы семейства `ReplicatedMergeTree`. Сравнивает текущее состояние с состоянием в ZooKeeper (как с эталоном) и при необходимости добавляет задачи в очередь репликации в ZooKeeper.
+Инициализация очереди репликации на основе данных ZooKeeper происходит так же, как при `ATTACH TABLE`. Некоторое время таблица будет недоступна для любых операций.

 ``` sql
 SYSTEM RESTART REPLICA [db.]replicated_merge_tree_family_table_name
 ```

+### RESTORE REPLICA {#query_language-system-restore-replica}
+
+Восстанавливает реплику, если метаданные в ZooKeeper потеряны, но сами данные, возможно, существуют.
+
+Работает только с таблицами семейства `ReplicatedMergeTree` и только если таблица находится в readonly-режиме.
+
+Запрос можно выполнить, если:
+
+  - потерян корневой путь ZooKeeper `/`;
+  - потерян путь реплик `/replicas`;
+  - потерян путь конкретной реплики `/replicas/replica_name/`.
+
+К реплике прикрепляются локально найденные куски, информация о них отправляется в ZooKeeper.
+Если присутствующие в реплике до потери метаданных данные не устарели, они не скачиваются повторно с других реплик. Поэтому восстановление реплики не означает повторную загрузку всех данных по сети.
+
+!!! warning "Предупреждение"
+    Потерянные данные в любых состояниях перемещаются в папку `detached/`. Куски, активные до потери данных (находившиеся в состоянии Committed), прикрепляются.
+
+**Синтаксис**
+
+```sql
+SYSTEM RESTORE REPLICA [db.]replicated_merge_tree_family_table_name [ON CLUSTER cluster_name]
+```
+
+Альтернативный синтаксис:
+
+```sql
+SYSTEM RESTORE REPLICA [ON CLUSTER cluster_name] [db.]replicated_merge_tree_family_table_name
+```
+
+**Пример**
+
+Создание таблицы на нескольких серверах. После потери корневого пути реплики таблица будет прикреплена только для чтения, так как метаданные отсутствуют. Последний запрос необходимо выполнить на каждой реплике.
+
+```sql
+CREATE TABLE test(n UInt32)
+ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/', '{replica}')
+ORDER BY n PARTITION BY n % 10;
+
+INSERT INTO test SELECT * FROM numbers(1000);
+
+-- zookeeper_delete_path("/clickhouse/tables/test", recursive=True) <- root loss.
+
+SYSTEM RESTART REPLICA test;
+SYSTEM RESTORE REPLICA test;
+```
+
+Альтернативный способ:
+
+```sql
+SYSTEM RESTORE REPLICA test ON CLUSTER cluster;
+```
+
 ### RESTART REPLICAS {#query_language-system-restart-replicas}

 Реинициализация состояния ZooKeeper-сессий для всех `ReplicatedMergeTree` таблиц. Сравнивает текущее состояние реплики с тем, что хранится в ZooKeeper, как c источником правды, и добавляет задачи в очередь репликации в ZooKeeper, если необходимо.
--- a/docs/ru/sql-reference/table-functions/cluster.md
+++ b/docs/ru/sql-reference/table-functions/cluster.md
@ -5,22 +5,44 @@ toc_title: cluster

 # cluster, clusterAllReplicas {#cluster-clusterallreplicas}

-Позволяет обратиться ко всем серверам существующего кластера, который присутствует в таблице `system.clusters` и сконфигурирован в секцци `remote_servers` без создания таблицы типа `Distributed`.
-`clusterAllReplicas` - работает также как `cluster` но каждая реплика в кластере будет использована как отдельный шард/отдельное соединение.
+Позволяет обратиться ко всем шардам существующего кластера, который сконфигурирован в секции `remote_servers`, без создания таблицы типа [Distributed](../../engines/table-engines/special/distributed.md). В запросе используется одна реплика каждого шарда.

+Функция `clusterAllReplicas` работает так же, как `cluster`, но каждая реплика в кластере используется как отдельный шард/отдельное соединение.

-Сигнатуры:
+!!! note "Примечание"
+    Все доступные кластеры перечислены в таблице [system.clusters](../../operations/system-tables/clusters.md).
+
+**Синтаксис**

 ``` sql
-cluster('cluster_name', db.table)
-cluster('cluster_name', db, table)
-clusterAllReplicas('cluster_name', db.table)
-clusterAllReplicas('cluster_name', db, table)
+cluster('cluster_name', db.table[, sharding_key])
+cluster('cluster_name', db, table[, sharding_key])
+clusterAllReplicas('cluster_name', db.table[, sharding_key])
+clusterAllReplicas('cluster_name', db, table[, sharding_key])
+```
+**Аргументы**
+
+- `cluster_name` — имя кластера, который обозначает подмножество адресов и параметров подключения к удаленным и локальным серверам, входящим в кластер.
+- `db.table` или `db`, `table` — имя базы данных и таблицы.
+- `sharding_key` — ключ шардирования. Необязательный аргумент. Указывается, если данные добавляются более чем в один шард кластера.
+
+**Возвращаемое значение**
+
+Набор данных из кластеров.
+
+**Использование макросов**
+
+`cluster_name` может содержать макрос — подстановку в фигурных скобках. Эта подстановка заменяется на соответствующее значение из секции [macros](../../operations/server-configuration-parameters/settings.md#macros) конфигурационного файла.
+
+Пример:
+
+```sql
+SELECT * FROM cluster('{cluster}', default.example_table);
 ```

-`cluster_name` – имя кластера, который обязан присутствовать в таблице `system.clusters`  и обозначает подмножество адресов и параметров подключения к удаленным и локальным серверам, входящим в кластер.
+**Использование и рекомендации** 

-Использование табличных функций `cluster` и `clusterAllReplicas` менее оптимальное чем создание таблицы типа `Distributed`, поскольку в этом случае соединение с сервером переустанавливается на каждый запрос. При обработке большого количества запросов, всегда создавайте `Distributed` таблицу заранее и не используйте табличные функции `cluster` и `clusterAllReplicas`.
+Использование табличных функций `cluster` и `clusterAllReplicas` менее оптимально, чем создание таблицы типа `Distributed`, поскольку в этом случае при каждом новом запросе устанавливается новое соединение с сервером. При обработке большого количества запросов всегда создавайте `Distributed` таблицу заранее и не используйте табличные функции `cluster` и `clusterAllReplicas`.

 Табличные функции `cluster` and `clusterAllReplicas` могут быть полезны в следующих случаях:

@ -30,7 +52,7 @@ clusterAllReplicas('cluster_name', db, table)

 Настройки соединения `user`, `password`, `host`, `post`, `compression`, `secure` берутся из секции `<remote_servers>` файлов конфигурации. См. подробности в разделе [Distributed](../../engines/table-engines/special/distributed.md)

-**See Also**
+**См. также**

 -   [skip_unavailable_shards](../../operations/settings/settings.md#settings-skip_unavailable_shards)
 -   [load_balancing](../../operations/settings/settings.md#settings-load_balancing)
--- a/docs/zh/operations/backup.md
+++ b/docs/zh/operations/backup.md
@ -1,13 +1,11 @@
 ---
-machine_translated: true
-machine_translated_rev: 72537a2d527c63c07aa5d2361a8829f3895cf2bd
 toc_priority: 49
 toc_title: "\u6570\u636E\u5907\u4EFD"
 ---

 # 数据备份 {#data-backup}

-尽管 [副本] (../engines/table-engines/mergetree-family/replication.md) 可以提供针对硬件的错误防护, 但是它不能预防人为操作失误: 数据的意外删除, 错误表的删除或者错误集群上表的删除, 以及导致错误数据处理或者数据损坏的软件bug. 在很多案例中，这类意外可能会影响所有的副本. ClickHouse 有内置的保护措施可以预防一些错误 — 例如, 默认情况下 [不能人工删除使用带有MergeTree引擎且包含超过50Gb数据的表] (server-configuration-parameters/settings.md#max-table-size-to-drop). 但是，这些保护措施不能覆盖所有可能情况，并且这些措施可以被绕过。
+尽管 [副本](../engines/table-engines/mergetree-family/replication.md) 可以提供针对硬件的错误防护, 但是它不能预防人为操作失误: 数据的意外删除, 错误表的删除或者错误集群上表的删除, 以及导致错误数据处理或者数据损坏的软件bug. 在很多案例中，这类意外可能会影响所有的副本. ClickHouse 有内置的保护措施可以预防一些错误 — 例如, 默认情况下 [不能人工删除使用带有MergeTree引擎且包含超过50Gb数据的表](server-configuration-parameters/settings.md#max-table-size-to-drop). 但是，这些保护措施不能覆盖所有可能情况，并且这些措施可以被绕过。

 为了有效地减少可能的人为错误，您应该 **提前** 仔细的准备备份和数据还原的策略.

@ -18,26 +16,26 @@ toc_title: "\u6570\u636E\u5907\u4EFD"

 ## 将源数据复制到其它地方 {#duplicating-source-data-somewhere-else}

-通常摄入到ClickHouse的数据是通过某种持久队列传递的，例如 [Apache Kafka] (https://kafka.apache.org). 在这种情况下，可以配置一组额外的订阅服务器，这些订阅服务器将在写入ClickHouse时读取相同的数据流，并将其存储在冷存储中。 大多数公司已经有一些默认推荐的冷存储，可能是对象存储或分布式文件系统，如 [HDFS] (https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html).
+通常摄入到ClickHouse的数据是通过某种持久队列传递的，例如 [Apache Kafka](https://kafka.apache.org). 在这种情况下，可以配置一组额外的订阅服务器，这些订阅服务器将在写入ClickHouse时读取相同的数据流，并将其存储在冷存储中。 大多数公司已经有一些默认推荐的冷存储，可能是对象存储或分布式文件系统，如 [HDFS](https://hadoop.apache.org/docs/stable/hadoop-project-dist/hadoop-hdfs/HdfsDesign.html).

 ## 文件系统快照 {#filesystem-snapshots}

-某些本地文件系统提供快照功能（例如, [ZFS] (https://en.wikipedia.org/wiki/ZFS)），但它们可能不是提供实时查询的最佳选择。 一个可能的解决方案是使用这种文件系统创建额外的副本，并将它们与用于`SELECT` 查询的 [分布式] (../engines/table-engines/special/distributed.md) 表分离。 任何修改数据的查询都无法访问此类副本上的快照。 作为回报，这些副本可能具有特殊的硬件配置，每个服务器附加更多的磁盘，这将是经济高效的。
+某些本地文件系统提供快照功能（例如, [ZFS](https://en.wikipedia.org/wiki/ZFS)），但它们可能不是提供实时查询的最佳选择。 一个可能的解决方案是使用这种文件系统创建额外的副本，并将它们与用于`SELECT` 查询的 [分布式](../engines/table-engines/special/distributed.md) 表分离。 任何修改数据的查询都无法访问此类副本上的快照。 作为回报，这些副本可能具有特殊的硬件配置，每个服务器附加更多的磁盘，这将是经济高效的。

 ## clickhouse-copier {#clickhouse-copier}

-[clickhouse-copier] (utilities/clickhouse-copier.md) 是一个多功能工具，最初创建它是为了用于重新切分pb大小的表。 因为它能够在ClickHouse表和集群之间可靠地复制数据，所以它也可用于备份和还原数据。
+[clickhouse-copier](utilities/clickhouse-copier.md) 是一个多功能工具，最初创建它是为了用于重新切分pb大小的表。 因为它能够在ClickHouse表和集群之间可靠地复制数据，所以它也可用于备份和还原数据。

 对于较小的数据量，一个简单的 `INSERT INTO ... SELECT ...` 到远程表也可以工作。

 ## part操作 {#manipulations-with-parts}

-ClickHouse允许使用 `ALTER TABLE ... FREEZE PARTITION ...` 查询以创建表分区的本地副本。 这是利用硬链接(hardlink)到 `/var/lib/clickhouse/shadow/` 文件夹中实现的，所以它通常不会因为旧数据而占用额外的磁盘空间。 创建的文件副本不由ClickHouse服务器处理，所以你可以把它们留在那里：你将有一个简单的备份，不需要任何额外的外部系统，但它仍然容易出现硬件问题。 出于这个原因，最好将它们远程复制到另一个位置，然后删除本地副本。 分布式文件系统和对象存储仍然是一个不错的选择，但是具有足够大容量的正常附加文件服务器也可以工作（在这种情况下，传输将通过网络文件系统或者也许是 [rsync] (https://en.wikipedia.org/wiki/Rsync) 来进行).
+ClickHouse允许使用 `ALTER TABLE ... FREEZE PARTITION ...` 查询以创建表分区的本地副本。 这是利用硬链接(hardlink)到 `/var/lib/clickhouse/shadow/` 文件夹中实现的，所以它通常不会因为旧数据而占用额外的磁盘空间。 创建的文件副本不由ClickHouse服务器处理，所以你可以把它们留在那里：你将有一个简单的备份，不需要任何额外的外部系统，但它仍然容易出现硬件问题。 出于这个原因，最好将它们远程复制到另一个位置，然后删除本地副本。 分布式文件系统和对象存储仍然是一个不错的选择，但是具有足够大容量的正常附加文件服务器也可以工作（在这种情况下，传输将通过网络文件系统或者也许是 [rsync](https://en.wikipedia.org/wiki/Rsync) 来进行).

 数据可以使用 `ALTER TABLE ... ATTACH PARTITION ...` 从备份中恢复。

-有关与分区操作相关的查询的详细信息，请参阅 [更改文档] (../sql-reference/statements/alter.md#alter_manipulations-with-partitions).
+有关与分区操作相关的查询的详细信息，请参阅 [更改文档](../sql-reference/statements/alter.md#alter_manipulations-with-partitions).

-第三方工具可用于自动化此方法: [clickhouse-backup] (https://github.com/AlexAkulov/clickhouse-backup).
+第三方工具可用于自动化此方法: [clickhouse-backup](https://github.com/AlexAkulov/clickhouse-backup).

-[原始文章] (https://clickhouse.tech/docs/en/operations/backup/) <!--hide-->
+[原始文章](https://clickhouse.tech/docs/en/operations/backup/) <!--hide-->
--- a/docs/zh/operations/system-tables/data_type_families.md
+++ b/docs/zh/operations/system-tables/data_type_families.md
@ -1,9 +1,5 @@
---
-machine_translated: true
-machine_translated_rev: 5decc73b5dc60054f19087d3690c4eb99446a6c3
---

-# 系统。data_type_families {#system_tables-data_type_families}
+# system.data_type_families {#system_tables-data_type_families}

 包含有关受支持的[数据类型](../../sql-reference/data-types/)的信息.

--- a/docs/zh/operations/troubleshooting.md
+++ b/docs/zh/operations/troubleshooting.md
@ -26,7 +26,7 @@ toc_title: "常见问题"

 ### 服务器未运行 {#server-is-not-running}

-**检查服务器是否运行nnig**
+**检查服务器是否正在运行**

 命令:

--- a/docs/zh/sql-reference/functions/ym-dict-functions.md
+++ b/docs/zh/sql-reference/functions/ym-dict-functions.md
@ -1,8 +1,8 @@
-# 功能与Yandex的工作。梅特里卡词典 {#functions-for-working-with-yandex-metrica-dictionaries}
+# 使用 Yandex.Metrica 字典函数 {#functions-for-working-with-yandex-metrica-dictionaries}

-为了使下面的功能正常工作，服务器配置必须指定获取所有Yandex的路径和地址。梅特里卡字典. 字典在任何这些函数的第一次调用时加载。 如果无法加载引用列表，则会引发异常。
+为了使下面的功能正常工作，服务器配置必须指定获取所有 Yandex.Metrica 字典的路径和地址。Yandex.Metrica 字典在任何这些函数的第一次调用时加载。 如果无法加载引用列表，则会引发异常。

-For information about creating reference lists, see the section «Dictionaries».
+有关创建引用列表的信息，请参阅 «字典» 部分.

 ## 多个地理基 {#multiple-geobases}

@ -17,18 +17,18 @@ ClickHouse支持同时使用多个备选地理基（区域层次结构），以

 所有字典都在运行时重新加载（每隔一定数量的秒重新加载一次，如builtin_dictionaries_reload_interval config参数中定义，或默认情况下每小时一次）。 但是，可用字典列表在服务器启动时定义一次。

-All functions for working with regions have an optional argument at the end – the dictionary key. It is referred to as the geobase.
+所有处理区域的函数都在末尾有一个可选参数—字典键。它被称为地理基（geobase）。
 示例:

-    regionToCountry(RegionID) – Uses the default dictionary: /opt/geo/regions_hierarchy.txt
-    regionToCountry(RegionID, '') – Uses the default dictionary: /opt/geo/regions_hierarchy.txt
-    regionToCountry(RegionID, 'ua') – Uses the dictionary for the 'ua' key: /opt/geo/regions_hierarchy_ua.txt
+    regionToCountry(RegionID) – 使用默认字典: /opt/geo/regions_hierarchy.txt
+    regionToCountry(RegionID, '') – 使用默认字典: /opt/geo/regions_hierarchy.txt
+    regionToCountry(RegionID, 'ua') – 使用 'ua' 键对应的字典: /opt/geo/regions_hierarchy_ua.txt

-### ﾂ环板(ｮﾂ嘉ｯﾂ偲青regionｼﾂ氾ｶﾂ鉄ﾂ工ﾂ渉\]) {#regiontocityid-geobase}
+### regionToCity(id[, geobase]) {#regiontocityid-geobase}

-Accepts a UInt32 number – the region ID from the Yandex geobase. If this region is a city or part of a city, it returns the region ID for the appropriate city. Otherwise, returns 0.
+从 Yandex geobase 接收一个 UInt32 数字类型的区域ID 。如果该区域是一个城市或城市的一部分，它将返回相应城市的区域ID。否则,返回0。

-### 虏茅驴麓卤戮碌禄路戮鲁拢\]) {#regiontoareaid-geobase}
+### regionToArea(id[, geobase]) {#regiontoareaid-geobase}

 将区域转换为区域（地理数据库中的类型5）。 在所有其他方式，这个功能是一样的 ‘regionToCity’.

@ -84,36 +84,58 @@ LIMIT 15
    │ Federation of Bosnia and Herzegovina                     │
    └──────────────────────────────────────────────────────────┘

-### 虏茅驴麓卤戮碌禄路戮鲁拢(陆毛隆隆(803)888-8325\]) {#regiontocountryid-geobase}
+### regionToCountry(id[, geobase]) {#regiontocountryid-geobase}

 将区域转换为国家。 在所有其他方式，这个功能是一样的 ‘regionToCity’.
 示例: `regionToCountry(toUInt32(213)) = 225` 转换莫斯科（213）到俄罗斯（225）。

-### 掳胫((禄脢鹿脷露胫鲁隆鹿((酶-11-16""\[脪陆,ase\]) {#regiontocontinentid-geobase}
+### regionToContinent(id[, geobase]) {#regiontocontinentid-geobase}

 将区域转换为大陆。 在所有其他方式，这个功能是一样的 ‘regionToCity’.
 示例: `regionToContinent(toUInt32(213)) = 10001` 将莫斯科（213）转换为欧亚大陆（10001）。

-### ﾂ环板(ｮﾂ嘉ｯﾂ偲青regionｬﾂ静ｬﾂ青ｻﾂ催ｬﾂ渉\]) {#regiontopopulationid-geobase}
+### regionToTopContinent(id[, geobase]) {#regiontotopcontinent-regiontotopcontinent}
+
+查找该区域层次结构中最高的大陆。
+
+**语法**
+
+``` sql
+regionToTopContinent(id[, geobase])
+```
+
+**参数**
+
+-   `id` — Yandex geobase 的区域 ID. [UInt32](../../sql-reference/data-types/int-uint.md).
+-   `geobase` — 字典的键. 参阅 [多个地理基](#multiple-geobases). [String](../../sql-reference/data-types/string.md). 可选.
+
+**返回值**
+
+-   顶级大陆的标识符（沿区域层次结构向上查找时遇到的最后一个大陆）。
+-   0，如果没有。
+
+类型: `UInt32`.
+
+### regionToPopulation(id\[, geobase\]) {#regiontopopulationid-geobase}

 获取区域的人口。
-The population can be recorded in files with the geobase. See the section «External dictionaries».
+人口可以与地理基一起记录在文件中。请参阅«外部词典»部分。
 如果没有为该区域记录人口，则返回0。
 在Yandex地理数据库中，可能会为子区域记录人口，但不会为父区域记录人口。

 ### regionIn(lhs,rhs\[,地理数据库\]) {#regioninlhs-rhs-geobase}

 检查是否 ‘lhs’ 属于一个区域 ‘rhs’ 区域。 如果属于UInt8，则返回等于1的数字，如果不属于则返回0。
-The relationship is reflexive – any region also belongs to itself.
+这种关系是自反的——任何区域也属于其自身。

-### ﾂ暗ｪﾂ氾环催ﾂ団ﾂ法ﾂ人\]) {#regionhierarchyid-geobase}
+### regionHierarchy(id\[, geobase\]) {#regionhierarchyid-geobase}

-Accepts a UInt32 number – the region ID from the Yandex geobase. Returns an array of region IDs consisting of the passed region and all parents along the chain.
+从 Yandex geobase 接收一个 UInt32 数字类型的区域ID。返回一个区域ID数组，由传递的区域和链上的所有父节点组成。
 示例: `regionHierarchy(toUInt32(213)) = [213,1,3,225,10001,10000]`.

-### 地区名称(id\[,郎\]) {#regiontonameid-lang}
+### regionToName(id\[, lang\]) {#regiontonameid-lang}

-Accepts a UInt32 number – the region ID from the Yandex geobase. A string with the name of the language can be passed as a second argument. Supported languages are: ru, en, ua, uk, by, kz, tr. If the second argument is omitted, the language ‘ru’ is used. If the language is not supported, an exception is thrown. Returns a string – the name of the region in the corresponding language. If the region with the specified ID doesn’t exist, an empty string is returned.
+从 Yandex geobase 接收一个 UInt32 数字类型的区域ID。带有语言名称的字符串可以作为第二个参数传递。支持的语言有:ru, en, ua, uk, by, kz, tr。如果省略第二个参数，则使用 'ru' 语言。如果不支持该语言，则抛出异常。返回一个字符串——对应语言的区域名称。如果指定ID的区域不存在，则返回一个空字符串。

 `ua` 和 `uk` 都意味着乌克兰。

--- a/programs/client/Client.cpp
+++ b/programs/client/Client.cpp
@ -2,6 +2,7 @@
 #include "Common/MemoryTracker.h"
 #include "Columns/ColumnsNumber.h"
 #include "ConnectionParameters.h"
+#include "IO/CompressionMethod.h"
 #include "QueryFuzzer.h"
 #include "Suggest.h"
 #include "TestHint.h"
@ -128,6 +129,7 @@ namespace ErrorCodes
    extern const int UNRECOGNIZED_ARGUMENTS;
    extern const int SYNTAX_ERROR;
    extern const int TOO_DEEP_RECURSION;
+    extern const int AUTHENTICATION_FAILED;
 }


@ -772,31 +774,50 @@ private:
                      << connection_parameters.host << ":" << connection_parameters.port
                      << (!connection_parameters.user.empty() ? " as user " + connection_parameters.user : "") << "." << std::endl;

-        connection = std::make_unique<Connection>(
-            connection_parameters.host,
-            connection_parameters.port,
-            connection_parameters.default_database,
-            connection_parameters.user,
-            connection_parameters.password,
-            "", /* cluster */
-            "", /* cluster_secret */
-            "client",
-            connection_parameters.compression,
-            connection_parameters.security);
-
        String server_name;
        UInt64 server_version_major = 0;
        UInt64 server_version_minor = 0;
        UInt64 server_version_patch = 0;

-        if (max_client_network_bandwidth)
+        try
        {
-            ThrottlerPtr throttler = std::make_shared<Throttler>(max_client_network_bandwidth, 0, "");
-            connection->setThrottler(throttler);
-        }
+            connection = std::make_unique<Connection>(
+                connection_parameters.host,
+                connection_parameters.port,
+                connection_parameters.default_database,
+                connection_parameters.user,
+                connection_parameters.password,
+                "", /* cluster */
+                "", /* cluster_secret */
+                "client",
+                connection_parameters.compression,
+                connection_parameters.security);

-        connection->getServerVersion(
-            connection_parameters.timeouts, server_name, server_version_major, server_version_minor, server_version_patch, server_revision);
+            if (max_client_network_bandwidth)
+            {
+                ThrottlerPtr throttler = std::make_shared<Throttler>(max_client_network_bandwidth, 0, "");
+                connection->setThrottler(throttler);
+            }
+
+            connection->getServerVersion(
+                connection_parameters.timeouts, server_name, server_version_major, server_version_minor, server_version_patch, server_revision);
+        }
+        catch (const Exception & e)
+        {
+            /// It is typical when users install ClickHouse, type some password and instantly forget it.
+            if ((connection_parameters.user.empty() || connection_parameters.user == "default")
+                && e.code() == DB::ErrorCodes::AUTHENTICATION_FAILED)
+            {
+                std::cerr << std::endl
+                    << "If you have installed ClickHouse and forgot password you can reset it in the configuration file." << std::endl
+                    << "The password for default user is typically located at /etc/clickhouse-server/users.d/default-password.xml" << std::endl
+                    << "and deleting this file will reset the password." << std::endl
+                    << "See also /etc/clickhouse-server/users.xml on the server where ClickHouse is installed." << std::endl
+                    << std::endl;
+            }
+
+            throw;
+        }

        server_version = toString(server_version_major) + "." + toString(server_version_minor) + "." + toString(server_version_patch);

@ -1823,7 +1844,7 @@ private:
    void processInsertQuery()
    {
        const auto parsed_insert_query = parsed_query->as<ASTInsertQuery &>();
-        if (!parsed_insert_query.data && (is_interactive || (!stdin_is_a_tty && std_in.eof())))
+        if ((!parsed_insert_query.data && !parsed_insert_query.infile) && (is_interactive || (!stdin_is_a_tty && std_in.eof())))
            throw Exception("No data to insert", ErrorCodes::NO_DATA_TO_INSERT);

        connection->sendQuery(
@ -1894,7 +1915,24 @@ private:
        if (!parsed_insert_query)
            return;

-        if (parsed_insert_query->data)
+        if (parsed_insert_query->infile)
+        {
+            const auto & in_file_node = parsed_insert_query->infile->as<ASTLiteral &>();
+            const auto in_file = in_file_node.value.safeGet<std::string>();
+
+            auto in_buffer = wrapReadBufferWithCompressionMethod(std::make_unique<ReadBufferFromFile>(in_file), chooseCompressionMethod(in_file, ""));
+
+            try
+            {
+                sendDataFrom(*in_buffer, sample, columns_description);
+            }
+            catch (Exception & e)
+            {
+                e.addMessage("data for INSERT was parsed from file");
+                throw;
+            }
+        }
+        else if (parsed_insert_query->data)
        {
            /// Send data contained in the query.
            ReadBufferFromMemory data_in(parsed_insert_query->data, parsed_insert_query->end - parsed_insert_query->data);
--- a/programs/keeper/Keeper.cpp
+++ b/programs/keeper/Keeper.cpp
@ -17,6 +17,7 @@
 #include <Poco/Version.h>
 #include <Poco/Environment.h>
 #include <Common/getMultipleKeysFromConfig.h>
+#include <Core/ServerUUID.h>
 #include <filesystem>
 #include <IO/UseSSL.h>

@ -326,6 +327,8 @@ int Keeper::main(const std::vector<std::string> & /*args*/)
        }
    }

+    DB::ServerUUID::load(path + "/uuid", log);
+
    const Settings & settings = global_context->getSettingsRef();

    GlobalThreadPool::initialize(config().getUInt("max_thread_pool_size", 100));
--- a/programs/local/LocalServer.cpp
+++ b/programs/local/LocalServer.cpp
@ -12,6 +12,7 @@
 #include <Interpreters/executeQuery.h>
 #include <Interpreters/loadMetadata.h>
 #include <Interpreters/DatabaseCatalog.h>
+#include <Interpreters/Session.h>
 #include <Common/Exception.h>
 #include <Common/Macros.h>
 #include <Common/Config/ConfigProcessor.h>
@ -374,14 +375,13 @@ void LocalServer::processQueries()
    if (!parse_res.second)
        throw Exception("Cannot parse and execute the following part of query: " + String(parse_res.first), ErrorCodes::SYNTAX_ERROR);

-    /// we can't mutate global global_context (can lead to races, as it was already passed to some background threads)
-    /// so we can't reuse it safely as a query context and need a copy here
-    auto context = Context::createCopy(global_context);
+    /// Authenticate and create a context to execute queries.
+    Session session{global_context, ClientInfo::Interface::TCP};
+    session.authenticate("default", "", Poco::Net::SocketAddress{});

-    context->makeSessionContext();
-    context->makeQueryContext();
-
-    context->setUser("default", "", Poco::Net::SocketAddress{});
+    /// Use the same context for all queries.
+    auto context = session.makeQueryContext();
+    context->makeSessionContext(); /// initial_create_query requires a session context to be set.
    context->setCurrentQueryId("");
    applyCmdSettings(context);

--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -39,6 +39,7 @@
 #include <Common/getMappedArea.h>
 #include <Common/remapExecutable.h>
 #include <Common/TLDListsHolder.h>
+#include <Core/ServerUUID.h>
 #include <IO/HTTPCommon.h>
 #include <IO/ReadHelpers.h>
 #include <IO/UseSSL.h>
@ -79,7 +80,6 @@
 #include <Server/HTTP/HTTPServer.h>
 #include <filesystem>

-
 #if !defined(ARCADIA_BUILD)
 #   include "config_core.h"
 #   include "Common/config_version.h"
@ -146,7 +146,6 @@ static bool jemallocOptionEnabled(const char *name)
 static bool jemallocOptionEnabled(const char *) { return 0; }
 #endif

-
 int mainEntryClickHouseServer(int argc, char ** argv)
 {
    DB::Server app;
@ -667,13 +666,14 @@ if (ThreadFuzzer::instance().isEffective())

    global_context->setRemoteHostFilter(config());

-    std::string path = getCanonicalPath(config().getString("path", DBMS_DEFAULT_PATH));
+    std::string path_str = getCanonicalPath(config().getString("path", DBMS_DEFAULT_PATH));
+    fs::path path = path_str;
    std::string default_database = config().getString("default_database", "default");

    /// Check that the process user id matches the owner of the data.
    const auto effective_user_id = geteuid();
    struct stat statbuf;
-    if (stat(path.c_str(), &statbuf) == 0 && effective_user_id != statbuf.st_uid)
+    if (stat(path_str.c_str(), &statbuf) == 0 && effective_user_id != statbuf.st_uid)
    {
        const auto effective_user = getUserName(effective_user_id);
        const auto data_owner = getUserName(statbuf.st_uid);
@ -690,9 +690,11 @@ if (ThreadFuzzer::instance().isEffective())
        }
    }

-    global_context->setPath(path);
+    global_context->setPath(path_str);

-    StatusFile status{path + "status", StatusFile::write_full_info};
+    StatusFile status{path / "status", StatusFile::write_full_info};
+
+    DB::ServerUUID::load(path / "uuid", log);

    /// Try to increase limit on number of open files.
    {
@ -726,7 +728,7 @@ if (ThreadFuzzer::instance().isEffective())

    /// Storage with temporary data for processing of heavy queries.
    {
-        std::string tmp_path = config().getString("tmp_path", path + "tmp/");
+        std::string tmp_path = config().getString("tmp_path", path / "tmp/");
        std::string tmp_policy = config().getString("tmp_policy", "");
        const VolumePtr & volume = global_context->setTemporaryStorage(tmp_path, tmp_policy);
        for (const DiskPtr & disk : volume->getDisks())
@ -738,7 +740,7 @@ if (ThreadFuzzer::instance().isEffective())
      * Examples: do repair of local data; clone all replicated tables from replica.
      */
    {
-        auto flags_path = fs::path(path) / "flags/";
+        auto flags_path = path / "flags/";
        fs::create_directories(flags_path);
        global_context->setFlagsPath(flags_path);
    }
@ -747,29 +749,29 @@ if (ThreadFuzzer::instance().isEffective())
      */
    {

-        std::string user_files_path = config().getString("user_files_path", fs::path(path) / "user_files/");
+        std::string user_files_path = config().getString("user_files_path", path / "user_files/");
        global_context->setUserFilesPath(user_files_path);
        fs::create_directories(user_files_path);
    }

    {
-        std::string dictionaries_lib_path = config().getString("dictionaries_lib_path", fs::path(path) / "dictionaries_lib/");
+        std::string dictionaries_lib_path = config().getString("dictionaries_lib_path", path / "dictionaries_lib/");
        global_context->setDictionariesLibPath(dictionaries_lib_path);
        fs::create_directories(dictionaries_lib_path);
    }

    /// top_level_domains_lists
    {
-        const std::string & top_level_domains_path = config().getString("top_level_domains_path", fs::path(path) / "top_level_domains/");
+        const std::string & top_level_domains_path = config().getString("top_level_domains_path", path / "top_level_domains/");
        TLDListsHolder::getInstance().parseConfig(fs::path(top_level_domains_path) / "", config());
    }

    {
-        fs::create_directories(fs::path(path) / "data/");
-        fs::create_directories(fs::path(path) / "metadata/");
+        fs::create_directories(path / "data/");
+        fs::create_directories(path / "metadata/");

        /// Directory with metadata of tables, which was marked as dropped by Atomic database
-        fs::create_directories(fs::path(path) / "metadata_dropped/");
+        fs::create_directories(path / "metadata_dropped/");
    }

    if (config().has("interserver_http_port") && config().has("interserver_https_port"))
@ -952,7 +954,7 @@ if (ThreadFuzzer::instance().isEffective())
 #endif

    /// Set path for format schema files
-    fs::path format_schema_path(config().getString("format_schema_path", fs::path(path) / "format_schemas/"));
+    fs::path format_schema_path(config().getString("format_schema_path", path / "format_schemas/"));
    global_context->setFormatSchemaPath(format_schema_path);
    fs::create_directories(format_schema_path);

@ -1088,7 +1090,7 @@ if (ThreadFuzzer::instance().isEffective())
    /// system logs may copy global context.
    global_context->setCurrentDatabaseNameInGlobalContext(default_database);

-    LOG_INFO(log, "Loading metadata from {}", path);
+    LOG_INFO(log, "Loading metadata from {}", path_str);

    try
    {
@ -1428,7 +1430,6 @@ if (ThreadFuzzer::instance().isEffective())

        /// Must be done after initialization of `servers`, because async_metrics will access `servers` variable from its thread.
        async_metrics.start();
-        global_context->enableNamedSessions();

        {
            String level_str = config().getString("text_log.level", "");
--- a/src/Access/AccessControlManager.cpp
+++ b/src/Access/AccessControlManager.cpp
@ -1,6 +1,7 @@
 #include <Access/AccessControlManager.h>
 #include <Access/MultipleAccessStorage.h>
 #include <Access/MemoryAccessStorage.h>
+#include <Access/ReplicatedAccessStorage.h>
 #include <Access/UsersConfigAccessStorage.h>
 #include <Access/DiskAccessStorage.h>
 #include <Access/LDAPAccessStorage.h>
@ -225,6 +226,22 @@ void AccessControlManager::startPeriodicReloadingUsersConfigs()
    }
 }

+void AccessControlManager::addReplicatedStorage(
+    const String & storage_name_,
+    const String & zookeeper_path_,
+    const zkutil::GetZooKeeper & get_zookeeper_function_)
+{ /// Adds a ReplicatedAccessStorage built from the given name, ZooKeeper path and ZooKeeper getter, unless one is already registered.
+    auto storages = getStoragesPtr();
+    for (const auto & storage : *storages)
+    {
+        if (auto replicated_storage = typeid_cast<std::shared_ptr<ReplicatedAccessStorage>>(storage))
+            return; /// A replicated storage already exists: do nothing, the arguments of this call are ignored.
+    }
+    auto new_storage = std::make_shared<ReplicatedAccessStorage>(storage_name_, zookeeper_path_, get_zookeeper_function_);
+    addStorage(new_storage);
+    LOG_DEBUG(getLogger(), "Added {} access storage '{}'", String(new_storage->getStorageType()), new_storage->getStorageName());
+    new_storage->startup(); /// startup() is called only after the storage has been added to the list of storages.
+}

 void AccessControlManager::addDiskStorage(const String & directory_, bool readonly_)
 {
@ -322,6 +339,11 @@ void AccessControlManager::addStoragesFromUserDirectoriesConfig(
        {
            addLDAPStorage(name, config, prefix);
        }
+        else if (type == ReplicatedAccessStorage::STORAGE_TYPE)
+        {
+            String zookeeper_path = config.getString(prefix + ".zookeeper_path");
+            addReplicatedStorage(name, zookeeper_path, get_zookeeper_function);
+        }
        else
            throw Exception("Unknown storage type '" + type + "' at " + prefix + " in config", ErrorCodes::UNKNOWN_ELEMENT_IN_CONFIG);
    }
--- a/src/Access/AccessControlManager.h
+++ b/src/Access/AccessControlManager.h
@ -84,6 +84,10 @@ public:
    /// Adds LDAPAccessStorage which allows querying remote LDAP server for user info.
    void addLDAPStorage(const String & storage_name_, const Poco::Util::AbstractConfiguration & config_, const String & prefix_);

+    void addReplicatedStorage(const String & storage_name,
+                              const String & zookeeper_path,
+                              const zkutil::GetZooKeeper & get_zookeeper_function);
+
    /// Adds storages from <users_directories> config.
    void addStoragesFromUserDirectoriesConfig(const Poco::Util::AbstractConfiguration & config,
                                              const String & key,
--- a/src/Access/AccessEntityIO.cpp
+++ b/src/Access/AccessEntityIO.cpp
@ -0,0 +1,175 @@
+#include <Access/AccessEntityIO.h>
+#include <Access/IAccessEntity.h>
+#include <Access/IAccessStorage.h>
+#include <Access/Quota.h>
+#include <Access/Role.h>
+#include <Access/RowPolicy.h>
+#include <Access/SettingsProfile.h>
+#include <Access/User.h>
+#include <Core/Defines.h>
+#include <Interpreters/InterpreterCreateQuotaQuery.h>
+#include <Interpreters/InterpreterCreateRoleQuery.h>
+#include <Interpreters/InterpreterCreateRowPolicyQuery.h>
+#include <Interpreters/InterpreterCreateSettingsProfileQuery.h>
+#include <Interpreters/InterpreterCreateUserQuery.h>
+#include <Interpreters/InterpreterGrantQuery.h>
+#include <Interpreters/InterpreterShowCreateAccessEntityQuery.h>
+#include <Interpreters/InterpreterShowGrantsQuery.h>
+#include <Parsers/ASTCreateQuotaQuery.h>
+#include <Parsers/ASTCreateRoleQuery.h>
+#include <Parsers/ASTCreateRowPolicyQuery.h>
+#include <Parsers/ASTCreateSettingsProfileQuery.h>
+#include <Parsers/ASTCreateUserQuery.h>
+#include <Parsers/ASTGrantQuery.h>
+#include <Parsers/ParserCreateQuotaQuery.h>
+#include <Parsers/ParserCreateRoleQuery.h>
+#include <Parsers/ParserCreateRowPolicyQuery.h>
+#include <Parsers/ParserCreateSettingsProfileQuery.h>
+#include <Parsers/ParserCreateUserQuery.h>
+#include <Parsers/ParserGrantQuery.h>
+#include <Parsers/formatAST.h>
+#include <Parsers/parseQuery.h>
+#include <boost/range/algorithm/copy.hpp>
+#include <boost/range/algorithm_ext/push_back.hpp>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int INCORRECT_ACCESS_ENTITY_DEFINITION;
+}
+
+using EntityType = IAccessStorage::EntityType;
+using EntityTypeInfo = IAccessStorage::EntityTypeInfo;
+
+namespace
+{
+    /// Special parser for the 'ATTACH access entity' queries: tries the user, role, row policy, quota, settings profile and grant parsers, each in attach mode.
+    class ParserAttachAccessEntity : public IParserBase
+    {
+    protected:
+        const char * getName() const override { return "ATTACH access entity query"; }
+
+        bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override
+        {
+            ParserCreateUserQuery create_user_p;
+            ParserCreateRoleQuery create_role_p;
+            ParserCreateRowPolicyQuery create_policy_p;
+            ParserCreateQuotaQuery create_quota_p;
+            ParserCreateSettingsProfileQuery create_profile_p;
+            ParserGrantQuery grant_p;
+
+            create_user_p.useAttachMode(); /// Every sub-parser is switched to attach mode before it is used.
+            create_role_p.useAttachMode();
+            create_policy_p.useAttachMode();
+            create_quota_p.useAttachMode();
+            create_profile_p.useAttachMode();
+            grant_p.useAttachMode();
+
+            return create_user_p.parse(pos, node, expected) || create_role_p.parse(pos, node, expected) /// Tried in this order; `||` short-circuits, so the first parser that succeeds produces `node`.
+                || create_policy_p.parse(pos, node, expected) || create_quota_p.parse(pos, node, expected)
+                || create_profile_p.parse(pos, node, expected) || grant_p.parse(pos, node, expected);
+        }
+    };
+
+}
+
+
+String serializeAccessEntity(const IAccessEntity & entity)
+{
+    /// Build list of ATTACH queries: first the entity's own attach query, then (for users and roles only) its attach-grant queries.
+    ASTs queries;
+    queries.push_back(InterpreterShowCreateAccessEntityQuery::getAttachQuery(entity));
+    if ((entity.getType() == EntityType::USER) || (entity.getType() == EntityType::ROLE))
+        boost::range::push_back(queries, InterpreterShowGrantsQuery::getAttachGrantQueries(entity));
+
+    /// Serialize the list of ATTACH queries to a string; every query is followed by ";\n".
+    WriteBufferFromOwnString buf;
+    for (const ASTPtr & query : queries)
+    {
+        formatAST(*query, buf, false, true); /// NOTE(review): presumably hilite = false, one_line = true - confirm against formatAST's signature.
+        buf.write(";\n", 2);
+    }
+    return buf.str();
+}
+
+AccessEntityPtr deserializeAccessEntity(const String & definition, const String & path)
+{ /// Parses `definition` (a sequence of ATTACH queries) and builds a single access entity from it; `path` is used only in error messages.
+    ASTs queries;
+    ParserAttachAccessEntity parser;
+    const char * begin = definition.data(); /// begin of current query
+    const char * pos = begin; /// parser moves pos from begin to the end of current query
+    const char * end = begin + definition.size();
+    while (pos < end) /// Parse query after query until the whole definition has been consumed.
+    {
+        queries.emplace_back(parseQueryAndMovePosition(parser, pos, end, "", true, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH));
+        while (isWhitespaceASCII(*pos) || *pos == ';') /// Skip separators. NOTE(review): no `pos < end` check here - presumably relies on the terminating '\0' of `definition`; confirm.
+            ++pos;
+    }
+
+    /// Interpret the AST to build an access entity. `res` is the result; the typed pointers alias it so that later GRANT queries can update the user or role.
+    std::shared_ptr<User> user;
+    std::shared_ptr<Role> role;
+    std::shared_ptr<RowPolicy> policy;
+    std::shared_ptr<Quota> quota;
+    std::shared_ptr<SettingsProfile> profile;
+    AccessEntityPtr res;
+
+    for (const auto & query : queries)
+    {
+        if (auto * create_user_query = query->as<ASTCreateUserQuery>())
+        {
+            if (res) /// Only one entity-creating query is allowed per definition (the same check is repeated in every branch below).
+                throw Exception("Two access entities attached in " + path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
+            res = user = std::make_unique<User>();
+            InterpreterCreateUserQuery::updateUserFromQuery(*user, *create_user_query);
+        }
+        else if (auto * create_role_query = query->as<ASTCreateRoleQuery>())
+        {
+            if (res)
+                throw Exception("Two access entities attached in " + path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
+            res = role = std::make_unique<Role>();
+            InterpreterCreateRoleQuery::updateRoleFromQuery(*role, *create_role_query);
+        }
+        else if (auto * create_policy_query = query->as<ASTCreateRowPolicyQuery>())
+        {
+            if (res)
+                throw Exception("Two access entities attached in " + path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
+            res = policy = std::make_unique<RowPolicy>();
+            InterpreterCreateRowPolicyQuery::updateRowPolicyFromQuery(*policy, *create_policy_query);
+        }
+        else if (auto * create_quota_query = query->as<ASTCreateQuotaQuery>())
+        {
+            if (res)
+                throw Exception("Two access entities attached in " + path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
+            res = quota = std::make_unique<Quota>();
+            InterpreterCreateQuotaQuery::updateQuotaFromQuery(*quota, *create_quota_query);
+        }
+        else if (auto * create_profile_query = query->as<ASTCreateSettingsProfileQuery>())
+        {
+            if (res)
+                throw Exception("Two access entities attached in " + path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
+            res = profile = std::make_unique<SettingsProfile>();
+            InterpreterCreateSettingsProfileQuery::updateSettingsProfileFromQuery(*profile, *create_profile_query);
+        }
+        else if (auto * grant_query = query->as<ASTGrantQuery>())
+        {
+            if (!user && !role) /// A grant is applied to the user or role attached earlier in the same definition, so one must already exist.
+                throw Exception(
+                    "A user or role should be attached before grant in " + path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
+            if (user)
+                InterpreterGrantQuery::updateUserFromQuery(*user, *grant_query);
+            else
+                InterpreterGrantQuery::updateRoleFromQuery(*role, *grant_query);
+        }
+        else
+            throw Exception("No interpreter found for query " + query->getID(), ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
+    }
+
+    if (!res) /// No entity-creating query was found at all (e.g. the definition is empty).
+        throw Exception("No access entities attached in " + path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
+
+    return res;
+}
+
+}
--- a/src/Access/AccessEntityIO.h
+++ b/src/Access/AccessEntityIO.h
@ -0,0 +1,12 @@
+#pragma once
+
+#include <Access/IAccessEntity.h>
+
+namespace DB
+{
+/// Serializes an access entity into a string of ATTACH queries, each followed by ";\n".
+String serializeAccessEntity(const IAccessEntity & entity);
+/// Builds an access entity from a string of ATTACH queries; `path` is used only in error messages. Throws INCORRECT_ACCESS_ENTITY_DEFINITION if the definition attaches no entity or more than one.
+AccessEntityPtr deserializeAccessEntity(const String & definition, const String & path);
+
+}
--- a/src/Access/ContextAccess.h
+++ b/src/Access/ContextAccess.h
@ -70,6 +70,7 @@ public:
    /// Returns the current user. The function can return nullptr.
    UserPtr getUser() const;
    String getUserName() const;
+    std::optional<UUID> getUserID() const { return getParams().user_id; }

    /// Returns information about current and enabled roles.
    std::shared_ptr<const EnabledRolesInfo> getRolesInfo() const;
--- a/src/Access/Credentials.h
+++ b/src/Access/Credentials.h
@ -26,6 +26,8 @@ protected:
    String user_name;
 };

+/// Does not check the password/credentials and that the specified host is allowed.
+/// (Used only internally in cluster, if the secret matches)
 class AlwaysAllowCredentials
    : public Credentials
 {
--- a/src/Access/DiskAccessStorage.cpp
+++ b/src/Access/DiskAccessStorage.cpp
@ -4,41 +4,20 @@
 #include <IO/ReadBufferFromFile.h>
 #include <IO/WriteBufferFromFile.h>
 #include <IO/ReadBufferFromString.h>
+#include <Access/AccessEntityIO.h>
 #include <Access/User.h>
 #include <Access/Role.h>
 #include <Access/RowPolicy.h>
 #include <Access/Quota.h>
-#include <Access/SettingsProfile.h>
 #include <Parsers/ASTCreateUserQuery.h>
-#include <Parsers/ASTCreateRoleQuery.h>
-#include <Parsers/ASTCreateRowPolicyQuery.h>
-#include <Parsers/ASTCreateQuotaQuery.h>
-#include <Parsers/ASTCreateSettingsProfileQuery.h>
-#include <Parsers/ASTGrantQuery.h>
-#include <Parsers/ParserCreateUserQuery.h>
-#include <Parsers/ParserCreateRoleQuery.h>
-#include <Parsers/ParserCreateRowPolicyQuery.h>
-#include <Parsers/ParserCreateQuotaQuery.h>
-#include <Parsers/ParserCreateSettingsProfileQuery.h>
-#include <Parsers/ParserGrantQuery.h>
 #include <Parsers/formatAST.h>
-#include <Parsers/parseQuery.h>
 #include <Interpreters/InterpreterCreateUserQuery.h>
-#include <Interpreters/InterpreterCreateRoleQuery.h>
-#include <Interpreters/InterpreterCreateRowPolicyQuery.h>
-#include <Interpreters/InterpreterCreateQuotaQuery.h>
-#include <Interpreters/InterpreterCreateSettingsProfileQuery.h>
-#include <Interpreters/InterpreterGrantQuery.h>
-#include <Interpreters/InterpreterShowCreateAccessEntityQuery.h>
 #include <Interpreters/InterpreterShowGrantsQuery.h>
 #include <Common/quoteString.h>
-#include <Core/Defines.h>
 #include <Poco/JSON/JSON.h>
 #include <Poco/JSON/Object.h>
 #include <Poco/JSON/Stringifier.h>
 #include <boost/range/adaptor/map.hpp>
-#include <boost/range/algorithm/copy.hpp>
-#include <boost/range/algorithm_ext/push_back.hpp>
 #include <filesystem>
 #include <fstream>

@ -49,7 +28,6 @@ namespace ErrorCodes
 {
    extern const int DIRECTORY_DOESNT_EXIST;
    extern const int FILE_DOESNT_EXIST;
-    extern const int INCORRECT_ACCESS_ENTITY_DEFINITION;
 }


@ -58,34 +36,6 @@ namespace
    using EntityType = IAccessStorage::EntityType;
    using EntityTypeInfo = IAccessStorage::EntityTypeInfo;

-    /// Special parser for the 'ATTACH access entity' queries.
-    class ParserAttachAccessEntity : public IParserBase
-    {
-    protected:
-        const char * getName() const override { return "ATTACH access entity query"; }
-
-        bool parseImpl(Pos & pos, ASTPtr & node, Expected & expected) override
-        {
-            ParserCreateUserQuery create_user_p;
-            ParserCreateRoleQuery create_role_p;
-            ParserCreateRowPolicyQuery create_policy_p;
-            ParserCreateQuotaQuery create_quota_p;
-            ParserCreateSettingsProfileQuery create_profile_p;
-            ParserGrantQuery grant_p;
-
-            create_user_p.useAttachMode();
-            create_role_p.useAttachMode();
-            create_policy_p.useAttachMode();
-            create_quota_p.useAttachMode();
-            create_profile_p.useAttachMode();
-            grant_p.useAttachMode();
-
-            return create_user_p.parse(pos, node, expected) || create_role_p.parse(pos, node, expected)
-                || create_policy_p.parse(pos, node, expected) || create_quota_p.parse(pos, node, expected)
-                || create_profile_p.parse(pos, node, expected) || grant_p.parse(pos, node, expected);
-        }
-    };
-

    /// Reads a file containing ATTACH queries and then parses it to build an access entity.
    AccessEntityPtr readEntityFile(const String & file_path)
@ -96,80 +46,7 @@ namespace
        readStringUntilEOF(file_contents, in);

        /// Parse the file contents.
-        ASTs queries;
-        ParserAttachAccessEntity parser;
-        const char * begin = file_contents.data(); /// begin of current query
-        const char * pos = begin; /// parser moves pos from begin to the end of current query
-        const char * end = begin + file_contents.size();
-        while (pos < end)
-        {
-            queries.emplace_back(parseQueryAndMovePosition(parser, pos, end, "", true, 0, DBMS_DEFAULT_MAX_PARSER_DEPTH));
-            while (isWhitespaceASCII(*pos) || *pos == ';')
-                ++pos;
-        }
-
-        /// Interpret the AST to build an access entity.
-        std::shared_ptr<User> user;
-        std::shared_ptr<Role> role;
-        std::shared_ptr<RowPolicy> policy;
-        std::shared_ptr<Quota> quota;
-        std::shared_ptr<SettingsProfile> profile;
-        AccessEntityPtr res;
-
-        for (const auto & query : queries)
-        {
-            if (auto * create_user_query = query->as<ASTCreateUserQuery>())
-            {
-                if (res)
-                    throw Exception("Two access entities in one file " + file_path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
-                res = user = std::make_unique<User>();
-                InterpreterCreateUserQuery::updateUserFromQuery(*user, *create_user_query);
-            }
-            else if (auto * create_role_query = query->as<ASTCreateRoleQuery>())
-            {
-                if (res)
-                    throw Exception("Two access entities in one file " + file_path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
-                res = role = std::make_unique<Role>();
-                InterpreterCreateRoleQuery::updateRoleFromQuery(*role, *create_role_query);
-            }
-            else if (auto * create_policy_query = query->as<ASTCreateRowPolicyQuery>())
-            {
-                if (res)
-                    throw Exception("Two access entities in one file " + file_path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
-                res = policy = std::make_unique<RowPolicy>();
-                InterpreterCreateRowPolicyQuery::updateRowPolicyFromQuery(*policy, *create_policy_query);
-            }
-            else if (auto * create_quota_query = query->as<ASTCreateQuotaQuery>())
-            {
-                if (res)
-                    throw Exception("Two access entities are attached in the same file " + file_path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
-                res = quota = std::make_unique<Quota>();
-                InterpreterCreateQuotaQuery::updateQuotaFromQuery(*quota, *create_quota_query);
-            }
-            else if (auto * create_profile_query = query->as<ASTCreateSettingsProfileQuery>())
-            {
-                if (res)
-                    throw Exception("Two access entities are attached in the same file " + file_path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
-                res = profile = std::make_unique<SettingsProfile>();
-                InterpreterCreateSettingsProfileQuery::updateSettingsProfileFromQuery(*profile, *create_profile_query);
-            }
-            else if (auto * grant_query = query->as<ASTGrantQuery>())
-            {
-                if (!user && !role)
-                    throw Exception("A user or role should be attached before grant in file " + file_path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
-                if (user)
-                    InterpreterGrantQuery::updateUserFromQuery(*user, *grant_query);
-                else
-                    InterpreterGrantQuery::updateRoleFromQuery(*role, *grant_query);
-            }
-            else
-                throw Exception("No interpreter found for query " + query->getID(), ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
-        }
-
-        if (!res)
-            throw Exception("No access entities attached in file " + file_path, ErrorCodes::INCORRECT_ACCESS_ENTITY_DEFINITION);
-
-        return res;
+        return deserializeAccessEntity(file_contents, file_path);
    }


@ -186,24 +63,10 @@ namespace
        }
    }

-
    /// Writes ATTACH queries for building a specified access entity to a file.
    void writeEntityFile(const String & file_path, const IAccessEntity & entity)
    {
-        /// Build list of ATTACH queries.
-        ASTs queries;
-        queries.push_back(InterpreterShowCreateAccessEntityQuery::getAttachQuery(entity));
-        if ((entity.getType() == EntityType::USER) || (entity.getType() == EntityType::ROLE))
-            boost::range::push_back(queries, InterpreterShowGrantsQuery::getAttachGrantQueries(entity));
-
-        /// Serialize the list of ATTACH queries to a string.
-        WriteBufferFromOwnString buf;
-        for (const ASTPtr & query : queries)
-        {
-            formatAST(*query, buf, false, true);
-            buf.write(";\n", 2);
-        }
-        String file_contents = buf.str();
+        String file_contents = serializeAccessEntity(entity);

        /// First we save *.tmp file and then we rename if everything's ok.
        auto tmp_file_path = std::filesystem::path{file_path}.replace_extension(".tmp");
--- a/src/Access/ReplicatedAccessStorage.cpp
+++ b/src/Access/ReplicatedAccessStorage.cpp
@ -0,0 +1,618 @@
+#include <Access/AccessEntityIO.h>
+#include <Access/MemoryAccessStorage.h>
+#include <Access/ReplicatedAccessStorage.h>
+#include <IO/ReadHelpers.h>
+#include <boost/container/flat_set.hpp>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <Common/ZooKeeper/Types.h>
+#include <Common/ZooKeeper/ZooKeeper.h>
+#include <Common/escapeForFileName.h>
+#include <common/range.h>
+#include <common/sleep.h>
+
+
+namespace DB
+{
+namespace ErrorCodes
+{
+extern const int BAD_ARGUMENTS;
+extern const int NO_ZOOKEEPER;
+}
+
+/// Converts the textual form of a UUID into a UUID value by reading it from an in-memory buffer.
+static UUID parseUUID(const String & text)
+{
+    ReadBufferFromMemory in(text.data(), text.length());
+    UUID parsed = UUIDHelpers::Nil;
+    readUUIDText(parsed, in);
+    return parsed;
+}
+
+/// Creates a storage that keeps access entities under `zookeeper_path_`.
+/// The path is normalized: one trailing '/' is dropped and a leading '/' is added if missing.
+/// Throws BAD_ARGUMENTS if the given path is empty.
+/// No ZooKeeper request is made here; see startup().
+ReplicatedAccessStorage::ReplicatedAccessStorage(
+    const String & storage_name_,
+    const String & zookeeper_path_,
+    zkutil::GetZooKeeper get_zookeeper_)
+    : IAccessStorage(storage_name_)
+    , zookeeper_path(zookeeper_path_)
+    , get_zookeeper(get_zookeeper_)
+{
+    if (zookeeper_path.empty())
+        throw Exception("ZooKeeper path must be non-empty", ErrorCodes::BAD_ARGUMENTS);
+
+    if (zookeeper_path.back() == '/')
+        zookeeper_path.resize(zookeeper_path.size() - 1);
+
+    /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it.
+    /// The path can be empty here (the input was exactly "/"), and front() must not be called on an empty string.
+    if (zookeeper_path.empty() || zookeeper_path.front() != '/')
+        zookeeper_path = "/" + zookeeper_path;
+}
+
+/// Stops the worker thread (if it is still running) before the members are destroyed.
+ReplicatedAccessStorage::~ReplicatedAccessStorage()
+{
+    /// Qualified call: invokes this class's shutdown() explicitly rather than through virtual dispatch.
+    ReplicatedAccessStorage::shutdown();
+}
+
+
+/// Performs the initial synchronization with ZooKeeper and then starts the background worker thread.
+/// If initializeZookeeper() throws, the exception propagates and the worker thread is not started.
+void ReplicatedAccessStorage::startup()
+{
+    initializeZookeeper();
+    worker_thread = ThreadFromGlobalPool(&ReplicatedAccessStorage::runWorkerThread, this);
+}
+
+/// Asks the worker thread to stop and waits for it to finish.
+/// Only the first call (the one that flips `stop_flag`) does any work; later calls are no-ops.
+void ReplicatedAccessStorage::shutdown()
+{
+    bool prev_stop_flag = stop_flag.exchange(true);
+    if (!prev_stop_flag)
+    {
+        /// Notify the worker thread to stop waiting for new queue items
+        refresh_queue.push(UUIDHelpers::Nil);
+        /// The thread may never have been started (startup() not called, or it threw before creating the thread),
+        /// e.g. when shutdown() is reached from the destructor.
+        if (worker_thread.joinable())
+            worker_thread.join();
+    }
+}
+
+/// Calls `function` until it returns normally, making at most `attempts` calls.
+/// A zkutil::KeeperException whose code is a "user error" triggers another attempt, unless it was the last one;
+/// any other KeeperException, and the exception of the last attempt, is rethrown.
+/// With `attempts == 0` the function is never called.
+template <typename Func>
+static void retryOnZooKeeperUserError(size_t attempts, Func && function)
+{
+    for (; attempts != 0; --attempts)
+    {
+        try
+        {
+            function();
+            return;
+        }
+        catch (zkutil::KeeperException & keeper_exception)
+        {
+            const bool is_last_attempt = (attempts == 1);
+            if (is_last_attempt || !Coordination::isUserError(keeper_exception.code))
+                throw;
+        }
+    }
+}
+
+/// Inserts `new_entity` under a freshly generated id: first into ZooKeeper (with retries on ZooKeeper user errors),
+/// then into the local cache by re-reading the entity from ZooKeeper.
+/// Returns the generated id.
+UUID ReplicatedAccessStorage::insertImpl(const AccessEntityPtr & new_entity, bool replace_if_exists)
+{
+    const UUID id = generateRandomID();
+    const EntityTypeInfo type_info = EntityTypeInfo::get(new_entity->getType());
+    const String & name = new_entity->getName();
+    LOG_DEBUG(getLogger(), "Inserting entity of type {} named {} with id {}", type_info.name, name, toString(id));
+
+    auto zookeeper = get_zookeeper();
+    /// The same id is reused for every attempt.
+    retryOnZooKeeperUserError(10, [&]{ insertZooKeeper(zookeeper, id, new_entity, replace_if_exists); });
+
+    Notifications notifications;
+    /// Registered before the lock is taken, so notify() runs after the mutex has been released.
+    SCOPE_EXIT({ notify(notifications); });
+    std::lock_guard lock{mutex};
+    refreshEntityNoLock(zookeeper, id, notifications);
+    return id;
+}
+
+
+/// Creates the znodes of a new entity in a single ZooKeeper transaction:
+///   <zookeeper_path>/uuid/<id>                    - the serialized entity
+///   <zookeeper_path>/<type char>/<escaped name>   - contains the uuid of the entity owning that name
+/// On a uuid collision an exception describing the existing entity is thrown.
+/// On a name collision the existing entity is replaced if `replace_if_exists` is set, otherwise an exception is thrown.
+/// ZooKeeper errors are thrown as exceptions; the callers in this file wrap this call in retryOnZooKeeperUserError.
+void ReplicatedAccessStorage::insertZooKeeper(
+    const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const AccessEntityPtr & new_entity, bool replace_if_exists)
+{
+    const String & name = new_entity->getName();
+    const EntityType type = new_entity->getType();
+    const EntityTypeInfo type_info = EntityTypeInfo::get(type);
+
+    const String entity_uuid = toString(id);
+    /// The entity data will be stored here, this ensures all entities have unique ids
+    const String entity_path = zookeeper_path + "/uuid/" + entity_uuid;
+    /// Then we create a znode with the entity name, inside the znode of each entity type
+    /// This ensures all entities of the same type have a unique name
+    const String name_path = zookeeper_path + "/" + type_info.unique_char + "/" + escapeForFileName(name);
+
+    Coordination::Requests ops;
+    const String new_entity_definition = serializeAccessEntity(*new_entity);
+    ops.emplace_back(zkutil::makeCreateRequest(entity_path, new_entity_definition, zkutil::CreateMode::Persistent));
+    /// The content of the "name" znode is the uuid of the entity owning that name
+    ops.emplace_back(zkutil::makeCreateRequest(name_path, entity_uuid, zkutil::CreateMode::Persistent));
+
+    Coordination::Responses responses;
+    const Coordination::Error res = zookeeper->tryMulti(ops, responses);
+    if (res == Coordination::Error::ZNODEEXISTS)
+    {
+        /// responses[0] belongs to the create request of the uuid znode (the first op).
+        if (responses[0]->error == Coordination::Error::ZNODEEXISTS)
+        {
+            /// The UUID already exists, simply fail.
+
+            /// To fail with a nice error message, we need info about what already exists.
+            /// This itself could fail if the conflicting uuid disappears in the meantime.
+            /// If that happens, then we'll just retry from the start.
+            String existing_entity_definition = zookeeper->get(entity_path);
+
+            AccessEntityPtr existing_entity = deserializeAccessEntity(existing_entity_definition, entity_path);
+            EntityType existing_type = existing_entity->getType();
+            String existing_name = existing_entity->getName();
+            throwIDCollisionCannotInsert(id, type, name, existing_type, existing_name);
+        }
+        else if (replace_if_exists)
+        {
+            /// The name already exists for this type.
+            /// If asked to, we need to replace the existing entity.
+
+            /// First get the uuid of the existing entity
+            /// This itself could fail if the conflicting name disappears in the meantime.
+            /// If that happens, then we'll just retry from the start.
+            Coordination::Stat name_stat;
+            String existing_entity_uuid = zookeeper->get(name_path, &name_stat);
+
+            const String existing_entity_path = zookeeper_path + "/uuid/" + existing_entity_uuid;
+            Coordination::Requests replace_ops;
+            replace_ops.emplace_back(zkutil::makeRemoveRequest(existing_entity_path, -1));
+            replace_ops.emplace_back(zkutil::makeCreateRequest(entity_path, new_entity_definition, zkutil::CreateMode::Persistent));
+            /// The version check makes the transaction fail if the name znode changed since it was read above.
+            replace_ops.emplace_back(zkutil::makeSetRequest(name_path, entity_uuid, name_stat.version));
+
+            /// If this fails, then we'll just retry from the start.
+            zookeeper->multi(replace_ops);
+        }
+        else
+        {
+            throwNameCollisionCannotInsert(type, name);
+        }
+    }
+    else
+    {
+        zkutil::KeeperMultiException::check(res, ops, responses);
+    }
+}
+
+/// Removes the entity `id`: first from ZooKeeper (with retries on ZooKeeper user errors), then from the local cache.
+void ReplicatedAccessStorage::removeImpl(const UUID & id)
+{
+    LOG_DEBUG(getLogger(), "Removing entity {}", toString(id));
+
+    auto zookeeper = get_zookeeper();
+    retryOnZooKeeperUserError(10, [&] { removeZooKeeper(zookeeper, id); });
+
+    Notifications notifications;
+    /// Registered before the lock is taken, so notify() runs after the mutex has been released.
+    SCOPE_EXIT({ notify(notifications); });
+    std::lock_guard lock{mutex};
+    removeEntityNoLock(id, notifications);
+}
+
+
+/// Removes both znodes of the entity `id` (its uuid znode and its name znode) in one ZooKeeper transaction.
+/// Calls throwNotFound(id) if the uuid znode does not exist.
+void ReplicatedAccessStorage::removeZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id)
+{
+    const String entity_uuid = toString(id);
+    const String entity_path = zookeeper_path + "/uuid/" + entity_uuid;
+
+    String entity_definition;
+    Coordination::Stat entity_stat;
+    const bool uuid_exists = zookeeper->tryGet(entity_path, entity_definition, &entity_stat);
+    if (!uuid_exists)
+        throwNotFound(id);
+
+    /// The stored definition is needed to learn the type and name, which form the path of the name znode.
+    const AccessEntityPtr entity = deserializeAccessEntity(entity_definition, entity_path);
+    const EntityTypeInfo type_info = EntityTypeInfo::get(entity->getType());
+    const String & name = entity->getName();
+
+    const String entity_name_path = zookeeper_path + "/" + type_info.unique_char + "/" + escapeForFileName(name);
+
+    Coordination::Requests ops;
+    /// The uuid znode is removed only if its version still equals the one read above; the name znode is removed at any version.
+    ops.emplace_back(zkutil::makeRemoveRequest(entity_path, entity_stat.version));
+    ops.emplace_back(zkutil::makeRemoveRequest(entity_name_path, -1));
+    /// If this fails, then we'll just retry from the start.
+    zookeeper->multi(ops);
+}
+
+
+/// Applies `update_func` to the entity `id`: first in ZooKeeper (with retries on ZooKeeper user errors),
+/// then refreshes the local cache by re-reading the entity from ZooKeeper.
+void ReplicatedAccessStorage::updateImpl(const UUID & id, const UpdateFunc & update_func)
+{
+    LOG_DEBUG(getLogger(), "Updating entity {}", toString(id));
+
+    auto zookeeper = get_zookeeper();
+    retryOnZooKeeperUserError(10, [&] { updateZooKeeper(zookeeper, id, update_func); });
+
+    Notifications notifications;
+    /// Registered before the lock is taken, so notify() runs after the mutex has been released.
+    SCOPE_EXIT({ notify(notifications); });
+    std::lock_guard lock{mutex};
+    refreshEntityNoLock(zookeeper, id, notifications);
+}
+
+
+/// Reads the entity `id` from ZooKeeper, applies `update_func` to it and writes the result back
+/// in one transaction. If the name changed, the name znode is moved in the same transaction.
+/// Calls throwNotFound(id) if the uuid znode does not exist, throwBadCast if `update_func` changed the entity type,
+/// and throwNameCollisionCannotRename if the new name is already taken.
+void ReplicatedAccessStorage::updateZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const UpdateFunc & update_func)
+{
+    const String entity_uuid = toString(id);
+    const String entity_path = zookeeper_path + "/uuid/" + entity_uuid;
+
+    String old_entity_definition;
+    Coordination::Stat stat;
+    const bool uuid_exists = zookeeper->tryGet(entity_path, old_entity_definition, &stat);
+    if (!uuid_exists)
+        throwNotFound(id);
+
+    const AccessEntityPtr old_entity = deserializeAccessEntity(old_entity_definition, entity_path);
+    const AccessEntityPtr new_entity = update_func(old_entity);
+
+    if (!new_entity->isTypeOf(old_entity->getType()))
+        throwBadCast(id, new_entity->getType(), new_entity->getName(), old_entity->getType());
+
+    const EntityTypeInfo type_info = EntityTypeInfo::get(new_entity->getType());
+
+    Coordination::Requests ops;
+    const String new_entity_definition = serializeAccessEntity(*new_entity);
+    /// The set succeeds only if the znode version still equals the one read above.
+    ops.emplace_back(zkutil::makeSetRequest(entity_path, new_entity_definition, stat.version));
+
+    const String & old_name = old_entity->getName();
+    const String & new_name = new_entity->getName();
+    if (new_name != old_name)
+    {
+        /// Rename: drop the old name znode and create the new one pointing at the same uuid.
+        auto old_name_path = zookeeper_path + "/" + type_info.unique_char + "/" + escapeForFileName(old_name);
+        auto new_name_path = zookeeper_path + "/" + type_info.unique_char + "/" + escapeForFileName(new_name);
+        ops.emplace_back(zkutil::makeRemoveRequest(old_name_path, -1));
+        ops.emplace_back(zkutil::makeCreateRequest(new_name_path, entity_uuid, zkutil::CreateMode::Persistent));
+    }
+
+    Coordination::Responses responses;
+    const Coordination::Error res = zookeeper->tryMulti(ops, responses);
+    if (res == Coordination::Error::ZNODEEXISTS)
+    {
+        /// The only create request in `ops` is the one for the new name znode.
+        throwNameCollisionCannotRename(new_entity->getType(), old_name, new_name);
+    }
+    else if (res == Coordination::Error::ZNONODE)
+    {
+        throwNotFound(id);
+    }
+    else
+    {
+        zkutil::KeeperMultiException::check(res, ops, responses);
+    }
+}
+
+
+/// Body of the worker thread. Until `stop_flag` is set, each iteration (re)initializes the storage
+/// if needed and then handles at most one refresh request.
+/// Any exception is logged, the local state is reset, and the loop continues after a 5 second pause.
+void ReplicatedAccessStorage::runWorkerThread()
+{
+    LOG_DEBUG(getLogger(), "Started worker thread");
+    while (!stop_flag)
+    {
+        try
+        {
+            if (!initialized)
+                initializeZookeeper();
+            refresh();
+        }
+        catch (...)
+        {
+            tryLogCurrentException(getLogger(), "Unexpected error, will try to restart worker thread:");
+            resetAfterError();
+            sleepForSeconds(5);
+        }
+    }
+}
+
+/// Brings the storage back to the "not initialized" state after a failure in the worker thread:
+/// pending refresh requests are discarded and the local cache is emptied.
+void ReplicatedAccessStorage::resetAfterError()
+{
+    initialized = false;
+
+    /// Drain the queue, the popped ids are not used.
+    for (UUID discarded; refresh_queue.tryPop(discarded);) {}
+
+    std::lock_guard lock{mutex};
+    for (auto & entries_by_name : entries_by_name_and_type)
+        entries_by_name.clear();
+    entries_by_id.clear();
+}
+
+/// Obtains a ZooKeeper session, creates the root znodes if they are missing and loads all entities
+/// into the local cache. Sets `initialized` only after all of that succeeded.
+/// Throws NO_ZOOKEEPER if `get_zookeeper` returns null.
+void ReplicatedAccessStorage::initializeZookeeper()
+{
+    assert(!initialized);
+    auto zookeeper = get_zookeeper();
+
+    if (!zookeeper)
+        throw Exception("Can't have Replicated access without ZooKeeper", ErrorCodes::NO_ZOOKEEPER);
+
+    createRootNodes(zookeeper);
+
+    refreshEntities(zookeeper);
+
+    initialized = true;
+}
+
+/// Makes sure the znodes this storage relies on exist: the root path (and its ancestors),
+/// the "uuid" znode and one znode per entity type named by the type's `unique_char`.
+void ReplicatedAccessStorage::createRootNodes(const zkutil::ZooKeeperPtr & zookeeper)
+{
+    zookeeper->createAncestors(zookeeper_path);
+    zookeeper->createIfNotExists(zookeeper_path, "");
+    zookeeper->createIfNotExists(zookeeper_path + "/uuid", "");
+    for (const auto type : collections::range(EntityType::MAX))
+    {
+        /// Create a znode for each type of AccessEntity
+        const auto type_info = EntityTypeInfo::get(type);
+        zookeeper->createIfNotExists(zookeeper_path + "/" + type_info.unique_char, "");
+    }
+}
+
+/// Waits up to 10 seconds for one refresh request and handles it.
+/// A Nil id requests a refresh of the whole entities list, any other id a refresh of that single entity.
+/// Does nothing if no request arrived in time or if the storage is being stopped.
+void ReplicatedAccessStorage::refresh()
+{
+    UUID id;
+    if (!refresh_queue.tryPop(id, /* timeout_ms: */ 10000))
+        return;
+
+    /// shutdown() pushes a Nil id just to wake this thread up.
+    if (stop_flag)
+        return;
+
+    auto zookeeper = get_zookeeper();
+
+    const bool refresh_whole_list = (id == UUIDHelpers::Nil);
+    if (refresh_whole_list)
+        refreshEntities(zookeeper);
+    else
+        refreshEntity(zookeeper, id);
+}
+
+
+/// Synchronizes the set of locally cached entities with the children of <zookeeper_path>/uuid:
+/// entities missing in ZooKeeper are removed locally, entities missing locally are loaded.
+/// Entities present on both sides are left untouched here.
+/// Also installs a watch on the children list which enqueues a request to run this function again.
+void ReplicatedAccessStorage::refreshEntities(const zkutil::ZooKeeperPtr & zookeeper)
+{
+    LOG_DEBUG(getLogger(), "Refreshing entities list");
+
+    const String zookeeper_uuids_path = zookeeper_path + "/uuid";
+    /// A Nil id in the queue means "refresh the whole list" (see refresh()).
+    auto watch_entities_list = [this](const Coordination::WatchResponse &)
+    {
+        refresh_queue.push(UUIDHelpers::Nil);
+    };
+    Coordination::Stat stat;
+    const auto entity_uuid_strs = zookeeper->getChildrenWatch(zookeeper_uuids_path, &stat, watch_entities_list);
+
+    std::unordered_set<UUID> entity_uuids;
+    entity_uuids.reserve(entity_uuid_strs.size());
+    for (const String & entity_uuid_str : entity_uuid_strs)
+        entity_uuids.insert(parseUUID(entity_uuid_str));
+
+    Notifications notifications;
+    /// Registered before the lock is taken, so notify() runs after the mutex has been released.
+    SCOPE_EXIT({ notify(notifications); });
+    std::lock_guard lock{mutex};
+
+    /// The ids are collected first because removeEntityNoLock() erases from `entries_by_id`,
+    /// which must not happen while iterating over it.
+    std::vector<UUID> entities_to_remove;
+    /// Locally remove entities that were removed from ZooKeeper
+    for (const auto & pair : entries_by_id)
+    {
+        const UUID & entity_uuid = pair.first;
+        if (!entity_uuids.contains(entity_uuid))
+            entities_to_remove.push_back(entity_uuid);
+    }
+    for (const auto & entity_uuid : entities_to_remove)
+        removeEntityNoLock(entity_uuid, notifications);
+
+    /// Locally add entities that were added to ZooKeeper
+    for (const auto & entity_uuid : entity_uuids)
+    {
+        const auto it = entries_by_id.find(entity_uuid);
+        if (it == entries_by_id.end())
+            refreshEntityNoLock(zookeeper, entity_uuid, notifications);
+    }
+
+    LOG_DEBUG(getLogger(), "Refreshing entities list finished");
+}
+
+/// Locking wrapper around refreshEntityNoLock(): takes the mutex, refreshes the entity `id`
+/// and sends the collected notifications.
+void ReplicatedAccessStorage::refreshEntity(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id)
+{
+    Notifications notifications;
+    /// Registered before the lock is taken, so notify() runs after the mutex has been released.
+    SCOPE_EXIT({ notify(notifications); });
+    std::lock_guard lock{mutex};
+
+    refreshEntityNoLock(zookeeper, id, notifications);
+}
+
+/// Re-reads the entity `id` from ZooKeeper and updates the local cache accordingly:
+/// the entity is stored if its znode exists and removed from the cache otherwise.
+/// A watch is installed on the znode; it enqueues a refresh of this id on CHANGED events.
+/// The "NoLock" suffix means the function does not take `mutex` itself; callers in this file lock it first.
+void ReplicatedAccessStorage::refreshEntityNoLock(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, Notifications & notifications)
+{
+    LOG_DEBUG(getLogger(), "Refreshing entity {}", toString(id));
+
+    /// `id` is captured by value: the callback may run after this call has returned.
+    const auto watch_entity = [this, id](const Coordination::WatchResponse & response)
+    {
+        /// Other event types are ignored by this callback.
+        if (response.type == Coordination::Event::CHANGED)
+            refresh_queue.push(id);
+    };
+    Coordination::Stat entity_stat;
+    const String entity_path = zookeeper_path + "/uuid/" + toString(id);
+    String entity_definition;
+    const bool exists = zookeeper->tryGetWatch(entity_path, entity_definition, &entity_stat, watch_entity);
+    if (exists)
+    {
+        const AccessEntityPtr entity = deserializeAccessEntity(entity_definition, entity_path);
+        setEntityNoLock(id, entity, notifications);
+    }
+    else
+    {
+        removeEntityNoLock(id, notifications);
+    }
+}
+
+
+/// Stores `entity` under `id` in the local cache and keeps the by-name index consistent:
+///  - another entity that currently owns the same type+name is removed from the cache;
+///  - if `id` was cached under a different type+name, the stale name index record is dropped.
+/// Change notifications for the affected entries are appended to `notifications`.
+/// Does not take `mutex` itself; callers in this file lock it first.
+void ReplicatedAccessStorage::setEntityNoLock(const UUID & id, const AccessEntityPtr & entity, Notifications & notifications)
+{
+    LOG_DEBUG(getLogger(), "Setting id {} to entity named {}", toString(id), entity->getName());
+    const EntityType type = entity->getType();
+    const String & name = entity->getName();
+
+    /// If the type+name already exists and is a different entity, remove old entity
+    auto & entries_by_name = entries_by_name_and_type[static_cast<size_t>(type)];
+    if (auto it = entries_by_name.find(name); it != entries_by_name.end() && it->second->id != id)
+    {
+        /// Pass a copy of the id: `it->second->id` lives inside the entry that is about to be erased,
+        /// so a reference to it would dangle inside removeEntityNoLock().
+        const UUID conflicting_id = it->second->id;
+        removeEntityNoLock(conflicting_id, notifications);
+    }
+
+    /// If the entity already exists under a different type+name, remove old type+name
+    if (auto it = entries_by_id.find(id); it != entries_by_id.end())
+    {
+        const AccessEntityPtr & existing_entity = it->second.entity;
+        const EntityType existing_type = existing_entity->getType();
+        const String & existing_name = existing_entity->getName();
+        if (existing_type != type || existing_name != name)
+        {
+            auto & existing_entries_by_name = entries_by_name_and_type[static_cast<size_t>(existing_type)];
+            existing_entries_by_name.erase(existing_name);
+        }
+    }
+
+    /// operator[] creates the entry if it does not exist yet; existing subscriptions of the entry are kept.
+    auto & entry = entries_by_id[id];
+    entry.id = id;
+    entry.entity = entity;
+    entries_by_name[name] = &entry;
+    prepareNotifications(entry, false, notifications);
+}
+
+
+/// Removes the entity `id` from the local cache (both indexes) and appends "removed" notifications
+/// for its subscribers to `notifications`. An unknown id is ignored.
+/// Does not take `mutex` itself; callers in this file lock it first.
+void ReplicatedAccessStorage::removeEntityNoLock(const UUID & id, Notifications & notifications)
+{
+    /// `id` may refer to the `id` field of the very entry erased below (setEntityNoLock() passes
+    /// `it->second->id`), so work with a copy that stays valid after the erase.
+    const UUID removed_id = id;
+
+    LOG_DEBUG(getLogger(), "Removing entity with id {}", toString(removed_id));
+    const auto it = entries_by_id.find(removed_id);
+    if (it == entries_by_id.end())
+    {
+        LOG_DEBUG(getLogger(), "Id {} not found, ignoring removal", toString(removed_id));
+        return;
+    }
+
+    const Entry & entry = it->second;
+    const EntityType type = entry.entity->getType();
+    const String & name = entry.entity->getName();
+    /// Must happen before the erase: the handlers are copied out of the entry here.
+    prepareNotifications(entry, true, notifications);
+
+    /// Drop the name index record, but only if it really points at this entry.
+    auto & entries_by_name = entries_by_name_and_type[static_cast<size_t>(type)];
+    const auto name_it = entries_by_name.find(name);
+    if (name_it == entries_by_name.end())
+        LOG_WARNING(getLogger(), "Entity {} not found in names, ignoring removal of name", toString(removed_id));
+    else if (name_it->second != &(it->second))
+        LOG_WARNING(getLogger(), "Name {} not pointing to entity {}, ignoring removal of name", name, toString(removed_id));
+    else
+        entries_by_name.erase(name_it);
+
+    /// After this line `entry`, `name` and possibly `id` are dangling and must not be used.
+    entries_by_id.erase(it);
+    LOG_DEBUG(getLogger(), "Removed entity with id {}", toString(removed_id));
+}
+
+
+/// Looks up the id of the cached entity with the given type and name.
+/// Returns an empty optional if there is no such entity in the local cache.
+std::optional<UUID> ReplicatedAccessStorage::findImpl(EntityType type, const String & name) const
+{
+    std::lock_guard lock{mutex};
+    const auto & names_of_type = entries_by_name_and_type[static_cast<size_t>(type)];
+    if (const auto found = names_of_type.find(name); found != names_of_type.end())
+        return found->second->id;
+    return std::nullopt;
+}
+
+
+/// Returns the ids of all cached entities of the given type (in unspecified order).
+std::vector<UUID> ReplicatedAccessStorage::findAllImpl(EntityType type) const
+{
+    std::lock_guard lock{mutex};
+    std::vector<UUID> ids;
+    /// Upper bound: the cache holds entities of all types.
+    ids.reserve(entries_by_id.size());
+    for (const auto & id_and_entry : entries_by_id)
+    {
+        if (id_and_entry.second.entity->isTypeOf(type))
+            ids.emplace_back(id_and_entry.first);
+    }
+    return ids;
+}
+
+
+/// Tells whether an entity with the given id is present in the local cache.
+bool ReplicatedAccessStorage::existsImpl(const UUID & id) const
+{
+    std::lock_guard lock{mutex};
+    return entries_by_id.find(id) != entries_by_id.end();
+}
+
+
+/// Returns the cached entity with the given id; calls throwNotFound(id) if it is not in the local cache.
+AccessEntityPtr ReplicatedAccessStorage::readImpl(const UUID & id) const
+{
+    std::lock_guard lock{mutex};
+    const auto found = entries_by_id.find(id);
+    if (found == entries_by_id.end())
+        throwNotFound(id);
+    return found->second.entity;
+}
+
+
+/// Returns the name of the cached entity with the given id (fails the same way as readImpl() for an unknown id).
+String ReplicatedAccessStorage::readNameImpl(const UUID & id) const
+{
+    return readImpl(id)->getName();
+}
+
+
+/// Appends to `notifications` one record per subscriber interested in `entry`:
+/// first the handlers subscribed to this id, then the handlers subscribed to the entity's type.
+/// When `remove` is set the handlers will receive a null entity pointer.
+void ReplicatedAccessStorage::prepareNotifications(const Entry & entry, bool remove, Notifications & notifications) const
+{
+    AccessEntityPtr reported_entity;
+    if (!remove)
+        reported_entity = entry.entity;
+
+    const auto enqueue = [&](const OnChangedHandler & handler)
+    {
+        notifications.push_back({handler, entry.id, reported_entity});
+    };
+
+    for (const auto & handler : entry.handlers_by_id)
+        enqueue(handler);
+
+    const auto type_index = static_cast<size_t>(entry.entity->getType());
+    for (const auto & handler : handlers_by_type[type_index])
+        enqueue(handler);
+}
+
+
+/// Registers `handler` for changes of any entity of the given type.
+/// The returned guard unregisters the handler when it is executed.
+scope_guard ReplicatedAccessStorage::subscribeForChangesImpl(EntityType type, const OnChangedHandler & handler) const
+{
+    std::lock_guard lock{mutex};
+    auto & handlers = handlers_by_type[static_cast<size_t>(type)];
+    /// std::list iterators stay valid while other elements are added or removed.
+    auto handler_it = handlers.insert(handlers.end(), handler);
+
+    return [this, type, handler_it]
+    {
+        std::lock_guard unsubscribe_lock{mutex};
+        handlers_by_type[static_cast<size_t>(type)].erase(handler_it);
+    };
+}
+
+
+/// Registers `handler` for changes of the entity with the given id.
+/// Returns an empty guard if the id is not in the local cache; otherwise the returned guard
+/// unregisters the handler when it is executed (if the id is still cached at that moment).
+scope_guard ReplicatedAccessStorage::subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const
+{
+    std::lock_guard lock{mutex};
+    const auto it = entries_by_id.find(id);
+    if (it == entries_by_id.end())
+        return {};
+    const Entry & entry = it->second;
+    /// `handlers_by_id` is a mutable member, which is why it can be modified through a const Entry.
+    auto handler_it = entry.handlers_by_id.insert(entry.handlers_by_id.end(), handler);
+
+    return [this, id, handler_it]
+    {
+        std::lock_guard lock2{mutex};
+        auto it2 = entries_by_id.find(id);
+        if (it2 != entries_by_id.end())
+        {
+            /// NOTE(review): `handler_it` belongs to the list of the entry that existed at subscription time.
+            /// If that entry was erased and an entry with the same id was created again in between,
+            /// this would erase from a different list with a stale iterator - TODO confirm whether that can happen.
+            const Entry & entry2 = it2->second;
+            entry2.handlers_by_id.erase(handler_it);
+        }
+    };
+}
+
+
+/// Tells whether at least one handler is subscribed to the entity with the given id.
+/// Returns false if the id is not in the local cache.
+bool ReplicatedAccessStorage::hasSubscriptionImpl(const UUID & id) const
+{
+    std::lock_guard lock{mutex};
+    const auto found = entries_by_id.find(id);
+    return found != entries_by_id.end() && !found->second.handlers_by_id.empty();
+}
+
+
+/// Tells whether at least one handler is subscribed to entities of the given type.
+bool ReplicatedAccessStorage::hasSubscriptionImpl(EntityType type) const
+{
+    std::lock_guard lock{mutex};
+    return !handlers_by_type[static_cast<size_t>(type)].empty();
+}
+}
--- a/src/Access/ReplicatedAccessStorage.h
+++ b/src/Access/ReplicatedAccessStorage.h
@ -0,0 +1,87 @@
+#pragma once
+
+#include <Access/IAccessStorage.h>
+#include <Common/ThreadPool.h>
+#include <Common/ZooKeeper/Common.h>
+#include <Common/ZooKeeper/ZooKeeper.h>
+#include <common/scope_guard.h>
+#include <Coordination/ThreadSafeQueue.h>
+#include <atomic>
+#include <list>
+#include <memory>
+#include <mutex>
+#include <unordered_map>
+
+
+namespace DB
+{
+/// Implementation of IAccessStorage which keeps all data in zookeeper.
+/// Entities are also cached locally; a background worker thread keeps the cache
+/// up to date using ZooKeeper watches and a queue of refresh requests.
+class ReplicatedAccessStorage : public IAccessStorage
+{
+public:
+    static constexpr char STORAGE_TYPE[] = "replicated";
+
+    /// Only stores and normalizes the arguments; call startup() to connect and load entities.
+    ReplicatedAccessStorage(const String & storage_name, const String & zookeeper_path, zkutil::GetZooKeeper get_zookeeper);
+    /// Calls shutdown().
+    virtual ~ReplicatedAccessStorage() override;
+
+    const char * getStorageType() const override { return STORAGE_TYPE; }
+
+    /// Loads the entities from ZooKeeper and starts the worker thread.
+    virtual void startup();
+    /// Stops the worker thread and waits for it; repeated calls do nothing.
+    virtual void shutdown();
+
+private:
+    /// Root znode of this storage, normalized in the constructor (leading '/', no trailing '/').
+    String zookeeper_path;
+    /// Callback returning the ZooKeeper session to use.
+    zkutil::GetZooKeeper get_zookeeper;
+
+    /// True once the initial load from ZooKeeper succeeded; reset by resetAfterError().
+    std::atomic<bool> initialized = false;
+    /// Set by shutdown() to make the worker thread exit.
+    std::atomic<bool> stop_flag = false;
+    ThreadFromGlobalPool worker_thread;
+    /// Ids of entities to refresh. UUIDHelpers::Nil requests a refresh of the whole entities list.
+    ThreadSafeQueue<UUID> refresh_queue;
+
+    UUID insertImpl(const AccessEntityPtr & entity, bool replace_if_exists) override;
+    void removeImpl(const UUID & id) override;
+    void updateImpl(const UUID & id, const UpdateFunc & update_func) override;
+
+    /// The ZooKeeper side of the three operations above.
+    void insertZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const AccessEntityPtr & entity, bool replace_if_exists);
+    void removeZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id);
+    void updateZooKeeper(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, const UpdateFunc & update_func);
+
+    void runWorkerThread();
+    void resetAfterError();
+    void initializeZookeeper();
+    void createRootNodes(const zkutil::ZooKeeperPtr & zookeeper);
+
+    void refresh();
+    void refreshEntities(const zkutil::ZooKeeperPtr & zookeeper);
+    void refreshEntity(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id);
+    /// The *NoLock functions do not take `mutex` themselves; callers lock it first.
+    void refreshEntityNoLock(const zkutil::ZooKeeperPtr & zookeeper, const UUID & id, Notifications & notifications);
+
+    void setEntityNoLock(const UUID & id, const AccessEntityPtr & entity, Notifications & notifications);
+    void removeEntityNoLock(const UUID & id, Notifications & notifications);
+
+    /// One locally cached entity together with the handlers subscribed to its id.
+    struct Entry
+    {
+        UUID id;
+        AccessEntityPtr entity;
+        /// Mutable because subscriptions are added and removed from const member functions.
+        mutable std::list<OnChangedHandler> handlers_by_id;
+    };
+
+    std::optional<UUID> findImpl(EntityType type, const String & name) const override;
+    std::vector<UUID> findAllImpl(EntityType type) const override;
+    bool existsImpl(const UUID & id) const override;
+    AccessEntityPtr readImpl(const UUID & id) const override;
+    String readNameImpl(const UUID & id) const override;
+    bool canInsertImpl(const AccessEntityPtr &) const override { return true; }
+
+    void prepareNotifications(const Entry & entry, bool remove, Notifications & notifications) const;
+    scope_guard subscribeForChangesImpl(const UUID & id, const OnChangedHandler & handler) const override;
+    scope_guard subscribeForChangesImpl(EntityType type, const OnChangedHandler & handler) const override;
+    bool hasSubscriptionImpl(const UUID & id) const override;
+    bool hasSubscriptionImpl(EntityType type) const override;
+
+    /// Protects the containers below.
+    mutable std::mutex mutex;
+    /// Local cache of entities, owning the Entry objects.
+    std::unordered_map<UUID, Entry> entries_by_id;
+    /// Per entity type: name -> pointer to the Entry stored in `entries_by_id`.
+    std::unordered_map<String, Entry *> entries_by_name_and_type[static_cast<size_t>(EntityType::MAX)];
+    /// Per entity type: handlers subscribed to changes of any entity of that type.
+    mutable std::list<OnChangedHandler> handlers_by_type[static_cast<size_t>(EntityType::MAX)];
+};
+}
--- a/src/Access/ya.make
+++ b/src/Access/ya.make
@ -10,6 +10,7 @@ PEERDIR(

 SRCS(
    AccessControlManager.cpp
+    AccessEntityIO.cpp
    AccessRights.cpp
    AccessRightsElement.cpp
    AllowedClientHosts.cpp
@ -34,6 +35,7 @@ SRCS(
    Quota.cpp
    QuotaCache.cpp
    QuotaUsage.cpp
+    ReplicatedAccessStorage.cpp
    Role.cpp
    RoleCache.cpp
    RolesOrUsersSet.cpp
--- a/src/AggregateFunctions/UniqVariadicHash.h
+++ b/src/AggregateFunctions/UniqVariadicHash.h
@ -5,6 +5,7 @@
 #include <Common/SipHash.h>
 #include <Common/assert_cast.h>
 #include <Columns/ColumnTuple.h>
+#include <DataTypes/IDataType.h>


 namespace DB
--- a/src/Bridge/IBridgeHelper.cpp
+++ b/src/Bridge/IBridgeHelper.cpp
@ -5,6 +5,7 @@
 #include <Poco/Net/HTTPRequest.h>
 #include <Poco/URI.h>
 #include <filesystem>
+#include <thread>

 namespace fs = std::filesystem;

--- a/src/Client/MultiplexedConnections.cpp
+++ b/src/Client/MultiplexedConnections.cpp
@ -373,7 +373,9 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead
            except_list,
            is_draining ? drain_timeout : receive_timeout);

-        if (n == 0)
+        /// We treat any error as timeout for simplicity.
+        /// And we also check if read_list is still empty just in case.
+        if (n <= 0 || read_list.empty())
        {
            auto err_msg = fmt::format("Timeout exceeded while reading from {}", dumpAddressesUnlocked());
            for (ReplicaState & state : replica_states)
@ -389,9 +391,7 @@ MultiplexedConnections::ReplicaState & MultiplexedConnections::getReplicaForRead
        }
    }

-    /// TODO Absolutely wrong code: read_list could be empty; motivation of rand is unclear.
-    /// This code path is disabled by default.
-
+    /// TODO Motivation of rand is unclear.
    auto & socket = read_list[thread_local_rng() % read_list.size()];
    if (fd_to_replica_state_idx.empty())
    {
--- a/src/Columns/ColumnAggregateFunction.cpp
+++ b/src/Columns/ColumnAggregateFunction.cpp
@ -1,5 +1,6 @@
 #include <Columns/ColumnAggregateFunction.h>
 #include <Columns/ColumnsCommon.h>
+#include <Columns/MaskOperations.h>
 #include <Common/assert_cast.h>
 #include <DataStreams/ColumnGathererStream.h>
 #include <IO/WriteBufferFromArena.h>
@ -308,6 +309,10 @@ ColumnPtr ColumnAggregateFunction::filter(const Filter & filter, ssize_t result_
    return res;
 }

+/// Expands the column in place according to `mask` (or its inversion when `inverted` is true).
+/// All the work is delegated to expandDataByMask, instantiated for the `char *` elements of `data`.
+void ColumnAggregateFunction::expand(const Filter & mask, bool inverted)
+{
+    expandDataByMask<char *>(data, mask, inverted);
+}

 ColumnPtr ColumnAggregateFunction::permute(const Permutation & perm, size_t limit) const
 {
--- a/src/Columns/ColumnAggregateFunction.h
+++ b/src/Columns/ColumnAggregateFunction.h
@ -177,6 +177,8 @@ public:

    ColumnPtr filter(const Filter & filter, ssize_t result_size_hint) const override;

+    void expand(const Filter & mask, bool inverted) override;
+
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;

    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
--- a/src/Columns/ColumnArray.cpp
+++ b/src/Columns/ColumnArray.cpp
@ -8,6 +8,7 @@
 #include <Columns/ColumnConst.h>
 #include <Columns/ColumnsCommon.h>
 #include <Columns/ColumnCompressed.h>
+#include <Columns/MaskOperations.h>

 #include <common/unaligned.h>
 #include <common/sort.h>
@ -551,6 +552,34 @@ ColumnPtr ColumnArray::filter(const Filter & filt, ssize_t result_size_hint) con
    return filterGeneric(filt, result_size_hint);
 }

+/// Expands the array column in place so that it has mask.size() rows.
+/// Rows selected by the mask (mask[i] != 0, or mask[i] == 0 when `inverted` is true) receive the
+/// existing arrays in their original order; every other row becomes an empty array.
+/// Only the offsets are rewritten: a row that is not selected repeats the offset of the row
+/// before it, so the nested data is not touched.
+/// Throws LOGICAL_ERROR if the mask is shorter than the column or if the number of selected
+/// rows differs from the current number of rows.
+void ColumnArray::expand(const IColumn::Filter & mask, bool inverted)
+{
+    auto & offsets_data = getOffsets();
+    if (mask.size() < offsets_data.size())
+        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);
+
+    /// ssize_t rather than int: both indexes count down to -1, and int would truncate sizes above INT_MAX.
+    ssize_t index = static_cast<ssize_t>(mask.size()) - 1;
+    ssize_t from = static_cast<ssize_t>(offsets_data.size()) - 1;
+    offsets_data.resize(mask.size());
+    /// NOTE(review): when `from` is -1 this reads offsets_data[-1]; presumably the padded array
+    /// guarantees a readable zero there - confirm.
+    UInt64 last_offset = offsets_data[from];
+    /// Walk backwards: `index` never gets below `from`, so a source offset is read before its slot is overwritten.
+    while (index >= 0)
+    {
+        offsets_data[index] = last_offset;
+        if (!!mask[index] ^ inverted)
+        {
+            if (from < 0)
+                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);
+
+            --from;
+            last_offset = offsets_data[from];
+        }
+
+        --index;
+    }
+
+    if (from != -1)
+        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
+}
+
 template <typename T>
 ColumnPtr ColumnArray::filterNumber(const Filter & filt, ssize_t result_size_hint) const
 {
--- a/src/Columns/ColumnArray.h
+++ b/src/Columns/ColumnArray.h
@ -71,6 +71,7 @@ public:
    void insertDefault() override;
    void popBack(size_t n) override;
    ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
+    void expand(const Filter & mask, bool inverted) override;
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;
    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
    template <typename Type> ColumnPtr indexImpl(const PaddedPODArray<Type> & indexes, size_t limit) const;
--- a/src/Columns/ColumnCompressed.h
+++ b/src/Columns/ColumnCompressed.h
@ -90,6 +90,7 @@ public:
    void updateWeakHash32(WeakHash32 &) const override { throwMustBeDecompressed(); }
    void updateHashFast(SipHash &) const override { throwMustBeDecompressed(); }
    ColumnPtr filter(const Filter &, ssize_t) const override { throwMustBeDecompressed(); }
+    void expand(const Filter &, bool) override { throwMustBeDecompressed(); }
    ColumnPtr permute(const Permutation &, size_t) const override { throwMustBeDecompressed(); }
    ColumnPtr index(const IColumn &, size_t) const override { throwMustBeDecompressed(); }
    int compareAt(size_t, size_t, const IColumn &, int) const override { throwMustBeDecompressed(); }
--- a/src/Columns/ColumnConst.cpp
+++ b/src/Columns/ColumnConst.cpp
@ -59,9 +59,28 @@ ColumnPtr ColumnConst::filter(const Filter & filt, ssize_t /*result_size_hint*/)
        throw Exception("Size of filter (" + toString(filt.size()) + ") doesn't match size of column (" + toString(s) + ")",
            ErrorCodes::SIZES_OF_COLUMNS_DOESNT_MATCH);

-    return ColumnConst::create(data, countBytesInFilter(filt));
+    size_t new_size = countBytesInFilter(filt);
+    return ColumnConst::create(data, new_size);
 }

+/// Expands the constant column to mask.size() rows.
+/// The number of rows selected by the mask (ones, or zeros when `inverted` is true)
+/// must be exactly the current size `s`; otherwise LOGICAL_ERROR is thrown.
+void ColumnConst::expand(const Filter & mask, bool inverted)
+{
+    const size_t mask_size = mask.size();
+    if (mask_size < s)
+        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);
+
+    const size_t ones_in_mask = countBytesInFilter(mask);
+    const size_t selected_rows = inverted ? mask_size - ones_in_mask : ones_in_mask;
+
+    if (selected_rows < s)
+        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
+    if (selected_rows > s)
+        throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);
+
+    /// The value is the same for every row, so only the row count has to change.
+    s = mask_size;
+}
+
+
 ColumnPtr ColumnConst::replicate(const Offsets & offsets) const
 {
    if (s != offsets.size())
--- a/src/Columns/ColumnConst.h
+++ b/src/Columns/ColumnConst.h
@ -181,6 +181,8 @@ public:
    }

    ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
+    void expand(const Filter & mask, bool inverted) override;
+
    ColumnPtr replicate(const Offsets & offsets) const override;
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;
    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
--- a/src/Columns/ColumnDecimal.cpp
+++ b/src/Columns/ColumnDecimal.cpp
@ -15,6 +15,7 @@
 #include <Columns/ColumnsCommon.h>
 #include <Columns/ColumnDecimal.h>
 #include <Columns/ColumnCompressed.h>
+#include <Columns/MaskOperations.h>
 #include <DataStreams/ColumnGathererStream.h>


@ -320,6 +321,12 @@ ColumnPtr ColumnDecimal<T>::filter(const IColumn::Filter & filt, ssize_t result_
    return res;
 }

+/// Expands the column in place according to `mask` (or its inversion when `inverted` is true)
+/// by delegating to expandDataByMask over `data`.
+template <typename T>
+void ColumnDecimal<T>::expand(const IColumn::Filter & mask, bool inverted)
+{
+    expandDataByMask<T>(data, mask, inverted);
+}
+
 template <typename T>
 ColumnPtr ColumnDecimal<T>::index(const IColumn & indexes, size_t limit) const
 {
--- a/src/Columns/ColumnDecimal.h
+++ b/src/Columns/ColumnDecimal.h
@ -151,6 +151,8 @@ public:
    bool isDefaultAt(size_t n) const override { return data[n].value == 0; }

    ColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override;
+    void expand(const IColumn::Filter & mask, bool inverted) override;
+
    ColumnPtr permute(const IColumn::Permutation & perm, size_t limit) const override;
    ColumnPtr index(const IColumn & indexes, size_t limit) const override;

--- a/src/Columns/ColumnFixedString.cpp
+++ b/src/Columns/ColumnFixedString.cpp
@ -344,6 +344,32 @@ ColumnPtr ColumnFixedString::filter(const IColumn::Filter & filt, ssize_t result
    return res;
 }

+/// Expands the column in place so that it holds mask.size() values of `n` bytes each.
+/// Rows selected by the mask (mask[i] != 0, or mask[i] == 0 when `inverted` is true) receive the
+/// existing values in their original order; the other rows are not written by the loop.
+/// Throws LOGICAL_ERROR if the mask is shorter than the column or if the number of selected
+/// rows differs from the current number of rows.
+void ColumnFixedString::expand(const IColumn::Filter & mask, bool inverted)
+{
+    if (mask.size() < size())
+        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);
+
+    /// ssize_t rather than int: both indexes count down to -1, and int would truncate sizes above INT_MAX.
+    ssize_t index = static_cast<ssize_t>(mask.size()) - 1;
+    ssize_t from = static_cast<ssize_t>(size()) - 1;
+    chars.resize_fill(mask.size() * n, 0);
+    /// Walk backwards: `index` never gets below `from`, so a source value is read before its slot is overwritten.
+    while (index >= 0)
+    {
+        if (!!mask[index] ^ inverted)
+        {
+            if (from < 0)
+                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);
+
+            /// Copy only if the value is not already in place: memcpy must not be given overlapping ranges.
+            if (index != from)
+                memcpy(&chars[index * n], &chars[from * n], n);
+            --from;
+        }
+
+        --index;
+    }
+
+    if (from != -1)
+        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
+}
+
 ColumnPtr ColumnFixedString::permute(const Permutation & perm, size_t limit) const
 {
    size_t col_size = size();
--- a/src/Columns/ColumnFixedString.h
+++ b/src/Columns/ColumnFixedString.h
@ -147,6 +147,8 @@ public:

    ColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override;

+    void expand(const IColumn::Filter & mask, bool inverted) override;
+
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;

    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
--- a/src/Columns/ColumnFunction.cpp
+++ b/src/Columns/ColumnFunction.cpp
@ -2,9 +2,15 @@
 #include <Columns/ColumnFunction.h>
 #include <Columns/ColumnsCommon.h>
 #include <Common/PODArray.h>
+#include <Common/ProfileEvents.h>
 #include <IO/WriteHelpers.h>
 #include <Functions/IFunction.h>

+namespace ProfileEvents
+{
+    extern const Event FunctionExecute;
+    extern const Event CompiledFunctionExecute;
+}

 namespace DB
 {
@ -15,8 +21,8 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

-ColumnFunction::ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture)
-        : size_(size), function(function_)
+/// Creates a function column of `size` rows around `function_`, stores the two flags and then
+/// passes `columns_to_capture` to appendArguments().
+/// is_short_circuit_argument_ - the column is a lazily executed argument of a short-circuit function.
+/// is_function_compiled_ - the wrapped function is compiled; used for profiling.
+ColumnFunction::ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture, bool is_short_circuit_argument_, bool is_function_compiled_)
+        : size_(size), function(function_), is_short_circuit_argument(is_short_circuit_argument_), is_function_compiled(is_function_compiled_)
 {
     appendArguments(columns_to_capture);
 }
@ -27,7 +33,7 @@ MutableColumnPtr ColumnFunction::cloneResized(size_t size) const
    for (auto & column : capture)
        column.column = column.column->cloneResized(size);

-    return ColumnFunction::create(size, function, capture);
+    return ColumnFunction::create(size, function, capture, is_short_circuit_argument, is_function_compiled);
 }

 ColumnPtr ColumnFunction::replicate(const Offsets & offsets) const
@ -41,7 +47,7 @@ ColumnPtr ColumnFunction::replicate(const Offsets & offsets) const
        column.column = column.column->replicate(offsets);

    size_t replicated_size = 0 == size_ ? 0 : offsets.back();
-    return ColumnFunction::create(replicated_size, function, capture);
+    return ColumnFunction::create(replicated_size, function, capture, is_short_circuit_argument, is_function_compiled);
 }

 ColumnPtr ColumnFunction::cut(size_t start, size_t length) const
@ -50,7 +56,7 @@ ColumnPtr ColumnFunction::cut(size_t start, size_t length) const
    for (auto & column : capture)
        column.column = column.column->cut(start, length);

-    return ColumnFunction::create(length, function, capture);
+    return ColumnFunction::create(length, function, capture, is_short_circuit_argument, is_function_compiled);
 }

 ColumnPtr ColumnFunction::filter(const Filter & filt, ssize_t result_size_hint) const
@ -65,11 +71,24 @@ ColumnPtr ColumnFunction::filter(const Filter & filt, ssize_t result_size_hint)

    size_t filtered_size = 0;
    if (capture.empty())
+    {
        filtered_size = countBytesInFilter(filt);
+    }
    else
        filtered_size = capture.front().column->size();

-    return ColumnFunction::create(filtered_size, function, capture);
+    return ColumnFunction::create(filtered_size, function, capture, is_short_circuit_argument, is_function_compiled);
+}
+
+/// Expands every captured argument column by `mask` (or its inversion when `inverted` is true)
+/// and sets the row count of this column to mask.size().
+void ColumnFunction::expand(const Filter & mask, bool inverted)
+{
+    for (auto & column : captured_columns)
+    {
+        /// Replace the captured column with a same-sized clone before modifying it in place;
+        /// presumably because the captured column may be shared with other owners - confirm.
+        column.column = column.column->cloneResized(column.column->size());
+        column.column->assumeMutable()->expand(mask, inverted);
+    }
+
+    size_ = mask.size();
+}

 ColumnPtr ColumnFunction::permute(const Permutation & perm, size_t limit) const
@ -87,7 +106,7 @@ ColumnPtr ColumnFunction::permute(const Permutation & perm, size_t limit) const
    for (auto & column : capture)
        column.column = column.column->permute(perm, limit);

-    return ColumnFunction::create(limit, function, capture);
+    return ColumnFunction::create(limit, function, capture, is_short_circuit_argument, is_function_compiled);
 }

 ColumnPtr ColumnFunction::index(const IColumn & indexes, size_t limit) const
@ -96,7 +115,7 @@ ColumnPtr ColumnFunction::index(const IColumn & indexes, size_t limit) const
    for (auto & column : capture)
        column.column = column.column->index(indexes, limit);

-    return ColumnFunction::create(limit, function, capture);
+    return ColumnFunction::create(limit, function, capture, is_short_circuit_argument, is_function_compiled);
 }

 std::vector<MutableColumnPtr> ColumnFunction::scatter(IColumn::ColumnIndex num_columns,
@ -125,7 +144,7 @@ std::vector<MutableColumnPtr> ColumnFunction::scatter(IColumn::ColumnIndex num_c
    {
        auto & capture = captures[part];
        size_t capture_size = capture.empty() ? counts[part] : capture.front().column->size();
-        columns.emplace_back(ColumnFunction::create(capture_size, function, std::move(capture)));
+        /// Keep is_function_compiled as well, otherwise the scattered parts are no longer counted as compiled in reduce().
+        columns.emplace_back(ColumnFunction::create(capture_size, function, std::move(capture), is_short_circuit_argument, is_function_compiled));
    }

    return columns;
@ -179,7 +198,7 @@ void ColumnFunction::appendArgument(const ColumnWithTypeAndName & column)
    const auto & argumnet_types = function->getArgumentTypes();

    auto index = captured_columns.size();
-    if (!column.type->equals(*argumnet_types[index]))
+    if (!is_short_circuit_argument && !column.type->equals(*argumnet_types[index]))
        throw Exception("Cannot capture column " + std::to_string(argumnet_types.size()) +
                        " because it has incompatible type: got " + column.type->getName() +
                        ", but " + argumnet_types[index]->getName() + " is expected.", ErrorCodes::LOGICAL_ERROR);
@ -187,6 +206,11 @@ void ColumnFunction::appendArgument(const ColumnWithTypeAndName & column)
    captured_columns.push_back(column);
 }

+/// Returns the result type reported by the wrapped function.
+DataTypePtr ColumnFunction::getResultType() const
+{
+    return function->getResultType();
+}
+
 ColumnWithTypeAndName ColumnFunction::reduce() const
 {
    auto args = function->getArgumentTypes().size();
@ -196,11 +220,33 @@ ColumnWithTypeAndName ColumnFunction::reduce() const
        throw Exception("Cannot call function " + function->getName() + " because is has " + toString(args) +
                        "arguments but " + toString(captured) + " columns were captured.", ErrorCodes::LOGICAL_ERROR);

-    auto columns = captured_columns;
+    ColumnsWithTypeAndName columns = captured_columns;
+    if (is_short_circuit_argument)
+    {
+        /// Arguments of lazy executed function can also be lazy executed.
+        for (auto & col : columns)
+        {
+            if (const ColumnFunction * arg = checkAndGetShortCircuitArgument(col.column))
+                col = arg->reduce();
+        }
+    }
+
    ColumnWithTypeAndName res{nullptr, function->getResultType(), ""};

+    ProfileEvents::increment(ProfileEvents::FunctionExecute);
+    if (is_function_compiled)
+        ProfileEvents::increment(ProfileEvents::CompiledFunctionExecute);
+
    res.column = function->execute(columns, res.type, size_);
    return res;
 }

+/// Returns `column` as a ColumnFunction if it is one and is marked as a short-circuit argument;
+/// returns nullptr in every other case.
+const ColumnFunction * checkAndGetShortCircuitArgument(const ColumnPtr & column)
+{
+    const auto * function_column = typeid_cast<const ColumnFunction *>(column.get());
+    if (!function_column || !function_column->isShortCircuitArgument())
+        return nullptr;
+
+    return function_column;
+}
+
 }
--- a/src/Columns/ColumnFunction.h
+++ b/src/Columns/ColumnFunction.h
@ -5,9 +5,6 @@
 #include <Core/ColumnsWithTypeAndName.h>
 #include <Columns/IColumn.h>

-class IFunctionBase;
-using FunctionBasePtr = std::shared_ptr<IFunctionBase>;
-

 namespace DB
 {
@ -16,6 +13,8 @@ namespace ErrorCodes
    extern const int NOT_IMPLEMENTED;
 }

+class IFunctionBase;
+using FunctionBasePtr = std::shared_ptr<IFunctionBase>;

 /** A column containing a lambda expression.
  * Behaves like a constant-column. Contains an expression, but not input or output data.
@ -25,7 +24,7 @@ class ColumnFunction final : public COWHelper<IColumn, ColumnFunction>
 private:
    friend class COWHelper<IColumn, ColumnFunction>;

-    ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture);
+    ColumnFunction(size_t size, FunctionBasePtr function_, const ColumnsWithTypeAndName & columns_to_capture, bool is_short_circuit_argument_ = false, bool is_function_compiled_ = false);

 public:
    const char * getFamilyName() const override { return "Function"; }
@ -38,6 +37,7 @@ public:
    ColumnPtr cut(size_t start, size_t length) const override;
    ColumnPtr replicate(const Offsets & offsets) const override;
    ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
+    void expand(const Filter & mask, bool inverted) override;
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;
    ColumnPtr index(const IColumn & indexes, size_t limit) const override;

@ -153,12 +153,29 @@ public:
        throw Exception("Method gather is not supported for " + getName(), ErrorCodes::NOT_IMPLEMENTED);
    }

+    bool isShortCircuitArgument() const { return is_short_circuit_argument; }
+
+    DataTypePtr getResultType() const;
+
 private:
    size_t size_;
    FunctionBasePtr function;
    ColumnsWithTypeAndName captured_columns;

+    /// Determine if it's used as a lazy executed argument for short-circuit function.
+    /// It's needed to distinguish between lazy executed argument and
+    /// argument with ColumnFunction column (some functions can return it)
+    /// See ExpressionActions.cpp for details.
+    bool is_short_circuit_argument;
+
+    /// Determine if passed function is compiled. Used for profiling.
+    bool is_function_compiled;
+
    void appendArgument(const ColumnWithTypeAndName & column);
+
+    void addOffsetsForReplication(const IColumn::Offsets & offsets);
 };

+const ColumnFunction * checkAndGetShortCircuitArgument(const ColumnPtr & column);
+
 }
--- a/src/Columns/ColumnLowCardinality.h
+++ b/src/Columns/ColumnLowCardinality.h
@ -110,6 +110,11 @@ public:
        return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().filter(filt, result_size_hint));
    }

+    /// Expands the column in place by expanding the positions column of `idx`;
+    /// nothing else is modified here.
+    void expand(const Filter & mask, bool inverted) override
+    {
+        idx.getPositionsPtr()->expand(mask, inverted);
+    }
+
    ColumnPtr permute(const Permutation & perm, size_t limit) const override
    {
        return ColumnLowCardinality::create(dictionary.getColumnUniquePtr(), getIndexes().permute(perm, limit));
--- a/src/Columns/ColumnMap.cpp
+++ b/src/Columns/ColumnMap.cpp
@ -149,6 +149,11 @@ ColumnPtr ColumnMap::filter(const Filter & filt, ssize_t result_size_hint) const
    return ColumnMap::create(filtered);
 }

+/// Expands the map column in place by forwarding the call to the `nested` column.
+void ColumnMap::expand(const IColumn::Filter & mask, bool inverted)
+{
+    nested->expand(mask, inverted);
+}
+
 ColumnPtr ColumnMap::permute(const Permutation & perm, size_t limit) const
 {
    auto permuted = nested->permute(perm, limit);
--- a/src/Columns/ColumnMap.h
+++ b/src/Columns/ColumnMap.h
@ -64,6 +64,7 @@ public:
    void updateHashFast(SipHash & hash) const override;
    void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
    ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
+    void expand(const Filter & mask, bool inverted) override;
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;
    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
    ColumnPtr replicate(const Offsets & offsets) const override;
--- a/src/Columns/ColumnNullable.cpp
+++ b/src/Columns/ColumnNullable.cpp
@ -221,6 +221,12 @@ ColumnPtr ColumnNullable::filter(const Filter & filt, ssize_t result_size_hint)
    return ColumnNullable::create(filtered_data, filtered_null_map);
 }

+/// Expands the nullable column in place: the nested column and the null map are both
+/// expanded with the same mask and the same `inverted` flag.
+void ColumnNullable::expand(const IColumn::Filter & mask, bool inverted)
+{
+    nested_column->expand(mask, inverted);
+    null_map->expand(mask, inverted);
+}
+
 ColumnPtr ColumnNullable::permute(const Permutation & perm, size_t limit) const
 {
    ColumnPtr permuted_data = getNestedColumn().permute(perm, limit);
--- a/src/Columns/ColumnNullable.h
+++ b/src/Columns/ColumnNullable.h
@ -88,6 +88,7 @@ public:

    void popBack(size_t n) override;
    ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
+    void expand(const Filter & mask, bool inverted) override;
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;
    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
    int compareAt(size_t n, size_t m, const IColumn & rhs_, int null_direction_hint) const override;
--- a/src/Columns/ColumnString.cpp
+++ b/src/Columns/ColumnString.cpp
@ -3,6 +3,7 @@
 #include <Columns/Collator.h>
 #include <Columns/ColumnsCommon.h>
 #include <Columns/ColumnCompressed.h>
+#include <Columns/MaskOperations.h>
 #include <DataStreams/ColumnGathererStream.h>
 #include <Common/Arena.h>
 #include <Common/HashTable/Hash.h>
@ -157,6 +158,53 @@ ColumnPtr ColumnString::filter(const Filter & filt, ssize_t result_size_hint) co
    return res;
 }

+/// Expands the string column in place so that it has mask.size() rows.
+/// Rows selected by the mask (mask[i] != 0, or mask[i] == 0 when `inverted` is true) receive the
+/// existing strings in their original order; every other row becomes a single zero byte.
+/// Throws LOGICAL_ERROR if the mask is shorter than the column or if the number of selected
+/// rows differs from the current number of rows.
+void ColumnString::expand(const IColumn::Filter & mask, bool inverted)
+{
+    auto & offsets_data = getOffsets();
+    auto & chars_data = getChars();
+    if (mask.size() < offsets_data.size())
+        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);
+
+    /// We cannot change only the offsets, because each string should end with a terminating zero byte.
+    /// So, we insert one zero byte for every row that is not selected by the mask.
+
+    /// ssize_t rather than int: both indexes count down to -1, and int would truncate sizes above INT_MAX.
+    ssize_t index = static_cast<ssize_t>(mask.size()) - 1;
+    ssize_t from = static_cast<ssize_t>(offsets_data.size()) - 1;
+    /// mask.size() - offsets_data.size() should be equal to the number of rows that are not selected
+    /// (if not, one of the exceptions below will be thrown), so we can calculate the resulting chars size.
+    /// NOTE(review): offsets_data[from] and offsets_data[from - 1] can be read at index -1;
+    /// presumably the padded array guarantees a readable zero there - confirm.
+    UInt64 last_offset = offsets_data[from] + (mask.size() - offsets_data.size());
+    offsets_data.resize(mask.size());
+    chars_data.resize_fill(last_offset, 0);
+    while (index >= 0)
+    {
+        offsets_data[index] = last_offset;
+        if (!!mask[index] ^ inverted)
+        {
+            if (from < 0)
+                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);
+
+            size_t len = offsets_data[from] - offsets_data[from - 1];
+
+            /// Copy only if the string is not already in place. It's important to copy backward, because
+            /// the ranges can overlap, and the destination is always to the right of the source.
+            if (last_offset - len != offsets_data[from - 1])
+                std::copy_backward(&chars_data[offsets_data[from - 1]], &chars_data[offsets_data[from]], &chars_data[last_offset]);
+            last_offset -= len;
+            --from;
+        }
+        else
+        {
+            /// The row is not selected: it consists of the terminating zero byte only.
+            chars_data[last_offset - 1] = 0;
+            --last_offset;
+        }
+
+        --index;
+    }
+
+    if (from != -1)
+        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
+}
+

 ColumnPtr ColumnString::permute(const Permutation & perm, size_t limit) const
 {
--- a/src/Columns/ColumnString.h
+++ b/src/Columns/ColumnString.h
@ -212,6 +212,8 @@ public:

    ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;

+    void expand(const Filter & mask, bool inverted) override;
+
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;

    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
--- a/src/Columns/ColumnTuple.cpp
+++ b/src/Columns/ColumnTuple.cpp
@ -232,6 +232,12 @@ ColumnPtr ColumnTuple::filter(const Filter & filt, ssize_t result_size_hint) con
    return ColumnTuple::create(new_columns);
 }

+/// Expands the tuple column in place by expanding each element column with the same
+/// mask and the same `inverted` flag.
+void ColumnTuple::expand(const Filter & mask, bool inverted)
+{
+    for (auto & column : columns)
+        column->expand(mask, inverted);
+}
+
 ColumnPtr ColumnTuple::permute(const Permutation & perm, size_t limit) const
 {
    const size_t tuple_size = columns.size();
--- a/src/Columns/ColumnTuple.h
+++ b/src/Columns/ColumnTuple.h
@ -1,6 +1,6 @@
 #pragma once

-#include <Core/Block.h>
+#include <Columns/IColumn.h>


 namespace DB
@ -67,6 +67,7 @@ public:
    void updateHashFast(SipHash & hash) const override;
    void insertRangeFrom(const IColumn & src, size_t start, size_t length) override;
    ColumnPtr filter(const Filter & filt, ssize_t result_size_hint) const override;
+    void expand(const Filter & mask, bool inverted) override;
    ColumnPtr permute(const Permutation & perm, size_t limit) const override;
    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
    ColumnPtr replicate(const Offsets & offsets) const override;
--- a/src/Columns/ColumnUnique.h
+++ b/src/Columns/ColumnUnique.h
@ -304,7 +304,7 @@ size_t ColumnUnique<ColumnType>::uniqueInsert(const Field & x)
    if (x.getType() == Field::Types::Null)
        return getNullValueIndex();

-    if (isNumeric())
+    if (valuesHaveFixedSize())
        return uniqueInsertData(&x.reinterpret<char>(), size_of_value_if_fixed);

    auto & val = x.get<String>();
--- a/src/Columns/ColumnVector.cpp
+++ b/src/Columns/ColumnVector.cpp
@ -3,6 +3,7 @@
 #include <pdqsort.h>
 #include <Columns/ColumnsCommon.h>
 #include <Columns/ColumnCompressed.h>
+#include <Columns/MaskOperations.h>
 #include <DataStreams/ColumnGathererStream.h>
 #include <IO/WriteHelpers.h>
 #include <Common/Arena.h>
@ -408,6 +409,12 @@ ColumnPtr ColumnVector<T>::filter(const IColumn::Filter & filt, ssize_t result_s
    return res;
 }

+/// Expands the column in place according to `mask` (or its inversion when `inverted` is true)
+/// by delegating to expandDataByMask over `data`.
+template <typename T>
+void ColumnVector<T>::expand(const IColumn::Filter & mask, bool inverted)
+{
+    expandDataByMask<T>(data, mask, inverted);
+}
+
 template <typename T>
 void ColumnVector<T>::applyZeroMap(const IColumn::Filter & filt, bool inverted)
 {
--- a/src/Columns/ColumnVector.h
+++ b/src/Columns/ColumnVector.h
@ -239,6 +239,7 @@ public:
        return data[n];
    }

+
    void get(size_t n, Field & res) const override
    {
        res = (*this)[n];
@ -284,6 +285,8 @@ public:

    ColumnPtr filter(const IColumn::Filter & filt, ssize_t result_size_hint) const override;

+    void expand(const IColumn::Filter & mask, bool inverted) override;
+
    ColumnPtr permute(const IColumn::Permutation & perm, size_t limit) const override;

    ColumnPtr index(const IColumn & indexes, size_t limit) const override;
--- a/src/Columns/IColumn.cpp
+++ b/src/Columns/IColumn.cpp
@ -3,7 +3,6 @@
 #include <Columns/IColumn.h>
 #include <Columns/ColumnNullable.h>
 #include <Columns/ColumnConst.h>
-#include <Columns/ColumnArray.h>
 #include <Core/Field.h>


--- a/src/Columns/IColumn.h
+++ b/src/Columns/IColumn.h
@ -230,12 +230,20 @@ public:
    /** Removes elements that don't match the filter.
      * Is used in WHERE and HAVING operations.
      * If result_size_hint > 0, then makes advance reserve(result_size_hint) for the result column;
-      *  if 0, then don't makes reserve(),
-      *  otherwise (i.e. < 0), makes reserve() using size of source column.
+      * if 0, then don't makes reserve(),
+      * otherwise (i.e. < 0), makes reserve() using size of source column.
      */
    using Filter = PaddedPODArray<UInt8>;
    virtual Ptr filter(const Filter & filt, ssize_t result_size_hint) const = 0;

+    /** Expands the column in place according to the mask. After expansion the
+      * column satisfies the following: filtering it by the same mask yields
+      * the initial column. Values at indexes i where mask[i] = 0
+      * must not be used after expanding.
+      * If inverted is true, the inverted mask is used.
+      */
+    virtual void expand(const Filter & /*mask*/, bool /*inverted*/) = 0;
+
    /// Permutes elements using specified permutation. Is used in sorting.
    /// limit - if it isn't 0, puts only first limit elements in the result.
    using Permutation = PaddedPODArray<size_t>;
--- a/src/Columns/IColumnDummy.h
+++ b/src/Columns/IColumnDummy.h
@ -100,7 +100,16 @@ public:

    ColumnPtr filter(const Filter & filt, ssize_t /*result_size_hint*/) const override
    {
-        return cloneDummy(countBytesInFilter(filt));
+        size_t bytes = countBytesInFilter(filt);
+        return cloneDummy(bytes);
+    }
+
+    /// Expands the dummy column to mask.size() rows.
+    /// A dummy column only stores its row count, and filter() returns a column of
+    /// countBytesInFilter(filt) rows, so the expanded column must have mask.size() rows
+    /// for filtering by the same mask to give back the initial size.
+    /// The `inverted` flag does not affect the resulting row count.
+    void expand(const IColumn::Filter & mask, bool /*inverted*/) override
+    {
+        s = mask.size();
+    }

    ColumnPtr permute(const Permutation & perm, size_t limit) const override
--- a/src/Columns/IColumnUnique.h
+++ b/src/Columns/IColumnUnique.h
@ -139,6 +139,11 @@ public:
        throw Exception("Method filter is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
    }

+    /// Not supported for ColumnUnique: always throws NOT_IMPLEMENTED.
+    void expand(const IColumn::Filter &, bool) override
+    {
+        throw Exception("Method expand is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
+    }
+
    ColumnPtr permute(const IColumn::Permutation &, size_t) const override
    {
        throw Exception("Method permute is not supported for ColumnUnique.", ErrorCodes::NOT_IMPLEMENTED);
--- a/src/Columns/MaskOperations.cpp
+++ b/src/Columns/MaskOperations.cpp
@ -0,0 +1,316 @@
+#include <Columns/MaskOperations.h>
+#include <Columns/ColumnFunction.h>
+#include <Columns/ColumnNullable.h>
+#include <Columns/ColumnNothing.h>
+#include <Columns/ColumnsCommon.h>
+#include <Columns/ColumnConst.h>
+#include <algorithm>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+    extern const int ILLEGAL_COLUMN;
+}
+
+/// Expands `data` in place to mask.size() elements. The original elements are moved,
+/// in order, to the positions selected by the mask (mask[i] != 0, or mask[i] == 0 when
+/// `inverted` is true); all other positions are filled with T().
+/// Throws LOGICAL_ERROR if the mask is shorter than the data, or if the number of
+/// selected positions does not match the original data size.
+template <typename T>
+void expandDataByMask(PaddedPODArray<T> & data, const PaddedPODArray<UInt8> & mask, bool inverted)
+{
+    if (mask.size() < data.size())
+        throw Exception("Mask size should be no less than data size.", ErrorCodes::LOGICAL_ERROR);
+
+    /// Both cursors must be signed (they run down to -1) and wide enough for any array
+    /// size: a plain int would overflow for arrays with more than INT_MAX elements.
+    ssize_t from = static_cast<ssize_t>(data.size()) - 1;
+    ssize_t index = static_cast<ssize_t>(mask.size()) - 1;
+    data.resize(mask.size());
+
+    /// Walk from the end: the destination index is never less than the source index,
+    /// so a not-yet-moved source element is never overwritten.
+    while (index >= 0)
+    {
+        if (!!mask[index] ^ inverted)
+        {
+            if (from < 0)
+                throw Exception("Too many bytes in mask", ErrorCodes::LOGICAL_ERROR);
+
+            /// Copy only if it makes sense.
+            if (index != from)
+                data[index] = data[from];
+            --from;
+        }
+        else
+            data[index] = T();
+
+        --index;
+    }
+
+    /// Every original element must have been consumed by a selected position.
+    if (from != -1)
+        throw Exception("Not enough bytes in mask", ErrorCodes::LOGICAL_ERROR);
+}
+
+/// Explicit instantiations, so that the implementation of expandDataByMask can stay in this
+/// file instead of the header. One instantiation is emitted per element type listed below.
+#define INSTANTIATE(TYPE) \
+template void expandDataByMask<TYPE>(PaddedPODArray<TYPE> &, const PaddedPODArray<UInt8> &, bool);
+
+INSTANTIATE(UInt8)
+INSTANTIATE(UInt16)
+INSTANTIATE(UInt32)
+INSTANTIATE(UInt64)
+INSTANTIATE(UInt128)
+INSTANTIATE(UInt256)
+INSTANTIATE(Int8)
+INSTANTIATE(Int16)
+INSTANTIATE(Int32)
+INSTANTIATE(Int64)
+INSTANTIATE(Int128)
+INSTANTIATE(Int256)
+INSTANTIATE(Float32)
+INSTANTIATE(Float64)
+INSTANTIATE(Decimal32)
+INSTANTIATE(Decimal64)
+INSTANTIATE(Decimal128)
+INSTANTIATE(Decimal256)
+INSTANTIATE(DateTime64)
+INSTANTIATE(char *)
+INSTANTIATE(UUID)
+
+/// The helper macro is not needed past this point.
+#undef INSTANTIATE
+
+/// Refines `mask` in place using the values of `data` and returns the number of ones
+/// left in the rows that were set on entry. Rows where mask is already 0 are skipped.
+///  - column_is_short: `data` has one value per set mask row (read sequentially);
+///    otherwise `data` is indexed by the mask row itself.
+///  - null_bytemap (optional): where it is set for the source row, `null_value` is used
+///    instead of the data value and, if `nulls` is given, nulls[row] is set to 1.
+///  - inverted: the resulting value is logically negated before it is stored.
+template <bool inverted, bool column_is_short, typename Container>
+size_t extractMaskNumericImpl(
+    PaddedPODArray<UInt8> & mask,
+    const Container & data,
+    UInt8 null_value,
+    const PaddedPODArray<UInt8> * null_bytemap,
+    PaddedPODArray<UInt8> * nulls)
+{
+    size_t ones = 0;
+    size_t next_short_row = 0;
+    const size_t rows = mask.size();
+
+    for (size_t row = 0; row < rows; ++row)
+    {
+        /// Only rows that are currently 1 may change.
+        if (mask[row])
+        {
+            size_t source_row = row;
+            if constexpr (column_is_short)
+                source_row = next_short_row++;
+
+            const bool is_null = null_bytemap && (*null_bytemap)[source_row];
+            if (is_null && nulls)
+                (*nulls)[row] = 1;
+
+            UInt8 bit = is_null ? null_value : static_cast<UInt8>(!!data[source_row]);
+            if constexpr (inverted)
+                bit = !bit;
+
+            if (bit)
+                ++ones;
+
+            mask[row] = bit;
+        }
+    }
+
+    return ones;
+}
+
+/// Tries to treat `column` as ColumnVector<NumericType>. Returns false (touching nothing)
+/// if the column has another type. Otherwise refines `mask` from the column values,
+/// fills `mask_info` and returns true. A column shorter than the mask is read as
+/// "one value per set mask row"; otherwise values are taken at the mask row index.
+template <bool inverted, typename NumericType>
+bool extractMaskNumeric(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    UInt8 null_value,
+    const PaddedPODArray<UInt8> * null_bytemap,
+    PaddedPODArray<UInt8> * nulls,
+    MaskInfo & mask_info)
+{
+    const auto * typed_column = checkAndGetColumn<ColumnVector<NumericType>>(column.get());
+    if (typed_column == nullptr)
+        return false;
+
+    const auto & values = typed_column->getData();
+    const bool column_is_short = column->size() < mask.size();
+    const size_t ones = column_is_short
+        ? extractMaskNumericImpl<inverted, true>(mask, values, null_value, null_bytemap, nulls)
+        : extractMaskNumericImpl<inverted, false>(mask, values, null_value, null_bytemap, nulls);
+
+    mask_info.has_ones = ones > 0;
+    mask_info.has_zeros = ones != mask.size();
+    return true;
+}
+
+/// Mask extraction for columns with a single value for all rows: the value is
+/// `null_value` when column->onlyNull(), otherwise column->getBool(0), negated when
+/// `inverted`. If the column is only-null and `nulls` is given, all of `nulls` is set to 1.
+/// A true value leaves `mask` as it is; a false value zeroes the whole mask.
+template <bool inverted>
+MaskInfo extractMaskFromConstOrNull(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    UInt8 null_value,
+    PaddedPODArray<UInt8> * nulls = nullptr)
+{
+    const bool is_null = column->onlyNull();
+    if (is_null && nulls)
+        std::fill(nulls->begin(), nulls->end(), 1);
+
+    UInt8 value = is_null ? null_value : static_cast<UInt8>(column->getBool(0));
+    if constexpr (inverted)
+        value = !value;
+
+    if (!value)
+    {
+        /// Nothing can stay selected.
+        std::fill(mask.begin(), mask.end(), 0);
+        return {.has_ones = false, .has_zeros = !mask.empty()};
+    }
+
+    /// The mask is unchanged, only its statistics are needed.
+    const size_t ones = countBytesInFilter(mask);
+    return {.has_ones = ones > 0, .has_zeros = ones != mask.size()};
+}
+
+/// Common implementation of extractMask / extractInvertedMask.
+/// Dispatches on the column kind: only-null and Const columns go to
+/// extractMaskFromConstOrNull, Nullable columns recurse into the nested column with
+/// their null map, and everything else must be one of the numeric vector types tried
+/// below. Throws ILLEGAL_COLUMN if none of them matches.
+template <bool inverted>
+MaskInfo extractMaskImpl(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    UInt8 null_value,
+    const PaddedPODArray<UInt8> * null_bytemap,
+    PaddedPODArray<UInt8> * nulls = nullptr)
+{
+    /// Special implementation for Null and Const columns.
+    const bool is_const_or_null = column->onlyNull() || checkAndGetColumn<ColumnConst>(*column);
+    if (is_const_or_null)
+        return extractMaskFromConstOrNull<inverted>(mask, column, null_value, nulls);
+
+    if (const auto * nullable = checkAndGetColumn<ColumnNullable>(*column))
+    {
+        const PaddedPODArray<UInt8> & nested_null_map = nullable->getNullMapData();
+        return extractMaskImpl<inverted>(mask, nullable->getNestedColumnPtr(), null_value, &nested_null_map, nulls);
+    }
+
+    MaskInfo info;
+
+    /// The first type that matches the column does the work; the rest are not tried.
+    const bool extracted
+        = extractMaskNumeric<inverted, UInt8>(mask, column, null_value, null_bytemap, nulls, info)
+        || extractMaskNumeric<inverted, UInt16>(mask, column, null_value, null_bytemap, nulls, info)
+        || extractMaskNumeric<inverted, UInt32>(mask, column, null_value, null_bytemap, nulls, info)
+        || extractMaskNumeric<inverted, UInt64>(mask, column, null_value, null_bytemap, nulls, info)
+        || extractMaskNumeric<inverted, Int8>(mask, column, null_value, null_bytemap, nulls, info)
+        || extractMaskNumeric<inverted, Int16>(mask, column, null_value, null_bytemap, nulls, info)
+        || extractMaskNumeric<inverted, Int32>(mask, column, null_value, null_bytemap, nulls, info)
+        || extractMaskNumeric<inverted, Int64>(mask, column, null_value, null_bytemap, nulls, info)
+        || extractMaskNumeric<inverted, Float32>(mask, column, null_value, null_bytemap, nulls, info)
+        || extractMaskNumeric<inverted, Float64>(mask, column, null_value, null_bytemap, nulls, info);
+
+    if (!extracted)
+        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot convert column {} to mask.", column->getName());
+
+    return info;
+}
+
+/// Refines `mask` in place from the values of `column` (non-inverted) and returns
+/// whether the resulting mask has ones / zeros. `null_value` is used for Null values.
+MaskInfo extractMask(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    UInt8 null_value)
+{
+    return extractMaskImpl<false>(mask, column, null_value, nullptr);
+}
+
+/// Same as extractMask above, but the value taken from the column is negated
+/// before it is written to `mask`.
+MaskInfo extractInvertedMask(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    UInt8 null_value)
+{
+    return extractMaskImpl<true>(mask, column, null_value, nullptr);
+}
+
+/// Non-inverted mask extraction that additionally records Null positions in `nulls`
+/// (passed through to extractMaskImpl, which sets entries to 1 for Null values).
+MaskInfo extractMask(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    PaddedPODArray<UInt8> * nulls,
+    UInt8 null_value)
+{
+    return extractMaskImpl<false>(mask, column, null_value, nullptr, nulls);
+}
+
+/// Inverted mask extraction that additionally records Null positions in `nulls`
+/// (passed through to extractMaskImpl, which sets entries to 1 for Null values).
+MaskInfo extractInvertedMask(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    PaddedPODArray<UInt8> * nulls,
+    UInt8 null_value)
+{
+    return extractMaskImpl<true>(mask, column, null_value, nullptr, nulls);
+}
+
+
+/// Negates every entry of `mask` in place (non-zero becomes 0, zero becomes 1)
+/// and swaps the has_ones / has_zeros flags of `mask_info` accordingly.
+void inverseMask(PaddedPODArray<UInt8> & mask, MaskInfo & mask_info)
+{
+    for (auto & flag : mask)
+        flag = !flag;
+
+    std::swap(mask_info.has_ones, mask_info.has_zeros);
+}
+
+/// Executes a lazily evaluated (short-circuit) argument for the rows selected by `mask`
+/// and stores the result back into `column`. Columns for which
+/// checkAndGetShortCircuitArgument returns null are left untouched.
+/// `mask_info` selects the cheapest way to do it, see the branches below.
+void maskedExecute(ColumnWithTypeAndName & column, const PaddedPODArray<UInt8> & mask, const MaskInfo & mask_info)
+{
+    const auto * lazy_function = checkAndGetShortCircuitArgument(column.column);
+    if (lazy_function == nullptr)
+        return;
+
+    ColumnWithTypeAndName executed;
+    if (mask_info.has_ones && !mask_info.has_zeros)
+    {
+        /// The mask has no zeros: there is nothing to filter out, execute as is.
+        executed = lazy_function->reduce();
+    }
+    else if (mask_info.has_ones)
+    {
+        /// Mixed mask: filter the function column by the mask, then execute the rest.
+        auto selected_rows = lazy_function->filter(mask, -1);
+        executed = typeid_cast<const ColumnFunction *>(selected_rows.get())->reduce();
+    }
+    else
+    {
+        /// The mask contains only zeros: an empty column
+        /// of the execution result type is enough.
+        auto result_type = lazy_function->getResultType();
+        auto empty_column = result_type->createColumn();
+        executed = {std::move(empty_column), result_type, ""};
+    }
+
+    column = std::move(executed);
+}
+
+/// If `column` holds a lazily evaluated (short-circuit) argument, replaces it with its
+/// execution result. With `empty` set, nothing is executed: only column.column is
+/// replaced, by an empty column of the function's result type.
+/// Other columns are left untouched.
+void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty)
+{
+    const auto * lazy_function = checkAndGetShortCircuitArgument(column.column);
+    if (lazy_function == nullptr)
+        return;
+
+    if (empty)
+    {
+        column.column = lazy_function->getResultType()->createColumn();
+        return;
+    }
+
+    column = lazy_function->reduce();
+}
+
+/// Returns the index of the last argument for which checkAndGetShortCircuitArgument
+/// succeeds, or -1 if there is no such argument.
+int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments)
+{
+    int last_found = -1;
+    const size_t arguments_count = arguments.size();
+
+    for (size_t position = 0; position < arguments_count; ++position)
+        if (checkAndGetShortCircuitArgument(arguments[position].column))
+            last_found = position;
+
+    return last_found;
+}
+
+/// Copies the contents of `from` into `to`. Both masks must already have the same
+/// size, otherwise LOGICAL_ERROR is thrown. Copying an empty mask is a no-op.
+void copyMask(const PaddedPODArray<UInt8> & from, PaddedPODArray<UInt8> & to)
+{
+    const size_t mask_size = from.size();
+    if (mask_size != to.size())
+        throw Exception("Cannot copy mask, because source and destination have different size", ErrorCodes::LOGICAL_ERROR);
+
+    /// memcpy is not called at all for an empty mask.
+    if (mask_size > 0)
+        memcpy(to.data(), from.data(), mask_size * sizeof(UInt8));
+}
+
+}
--- a/src/Columns/MaskOperations.h
+++ b/src/Columns/MaskOperations.h
@ -0,0 +1,73 @@
+#pragma once
+
+#include <Core/ColumnWithTypeAndName.h>
+#include <Core/ColumnsWithTypeAndName.h>
+#include <Core/Field.h>
+#include <Common/PODArray.h>
+
+namespace DB
+{
+
+/// Expand data by mask. After expanding data will satisfy the following: if we filter data
+/// by given mask, we get initial data. In places where mask[i] = 0 we insert default value.
+/// If inverted is true, we will work with inverted mask. This function is used in implementations of
+/// expand() method in IColumn interface.
+template <typename T>
+void expandDataByMask(PaddedPODArray<T> & data, const PaddedPODArray<UInt8> & mask, bool inverted);
+
+/// Summary of a mask's contents, returned by the mask extraction functions below.
+struct MaskInfo
+{
+    /// True if the mask contains at least one non-zero entry.
+    bool has_ones;
+    /// True if the mask contains at least one zero entry.
+    bool has_zeros;
+};
+
+/// The next functions are used to extract UInt8 mask from a column,
+/// filtered by some condition (mask). We will use value from a column
+/// only when value in condition is 1. Column should satisfy the
+/// condition: sum(mask) = column.size() or mask.size() = column.size().
+/// You can set flag 'inverted' to use inverted values
+/// from a column. You can also determine value that will be used when
+/// column value is Null (argument null_value).
+
+MaskInfo extractMask(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    UInt8 null_value = 0);
+
+MaskInfo extractInvertedMask(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    UInt8 null_value = 0);
+
+/// The same as extractMask, but fills
+/// nulls so that nulls[i] = 1 when column[i] = Null.
+MaskInfo extractMask(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    PaddedPODArray<UInt8> * nulls,
+    UInt8 null_value = 0);
+
+MaskInfo extractInvertedMask(
+    PaddedPODArray<UInt8> & mask,
+    const ColumnPtr & column,
+    PaddedPODArray<UInt8> * nulls,
+    UInt8 null_value = 0);
+
+/// Inplace inversion.
+void inverseMask(PaddedPODArray<UInt8> & mask, MaskInfo & mask_info);
+
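+/// If the given column is a lazily executed argument (ColumnFunction with isShortCircuitArgument() = true),
+/// filter it by mask and then reduce. mask_info describes the mask: if it has no ones, an empty column
+/// of the result type is created instead; if it has no zeros, the filtering step is skipped.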
+void maskedExecute(ColumnWithTypeAndName & column, const PaddedPODArray<UInt8> & mask, const MaskInfo & mask_info);
+
+/// If given column is lazy executed argument, reduce it. If empty is true,
+/// create an empty column with the execution result type.
+void executeColumnIfNeeded(ColumnWithTypeAndName & column, bool empty = false);
+
+/// Check whether the arguments contain a lazily executed argument. If they do, return the index
+/// of the last one, otherwise return -1.
+int checkShirtCircuitArguments(const ColumnsWithTypeAndName & arguments);
+
+void copyMask(const PaddedPODArray<UInt8> & from, PaddedPODArray<UInt8> & to);
+
+}
--- a/src/Columns/ya.make
+++ b/src/Columns/ya.make
@ -35,6 +35,7 @@ SRCS(
    ColumnsCommon.cpp
    FilterDescription.cpp
    IColumn.cpp
+    MaskOperations.cpp
    getLeastSuperColumn.cpp

 )
--- a/Show More
+++ b/Show More