Merge branch 'master' into persistent_nukeeper_log_storage

2024-11-21 23:21:59 +00:00 · 2021-02-22 23:52:26 +03:00 · 2021-02-22 23:52:26 +03:00 · 5b888e1193
commit 5b888e1193
parent d7c3dfe5fa 6946514c0b
213 changed files with 5046 additions and 1569 deletions
--- a/.github/codecov.yml
+++ b/.github/codecov.yml
@ -1,5 +1,5 @@
 codecov:
-  max_report_age: off
+  max_report_age: "off"
  strict_yaml_branch: "master"

 ignore:
--- a/.github/workflows/anchore-analysis.yml
+++ b/.github/workflows/anchore-analysis.yml
@ -8,7 +8,7 @@

 name: Docker Container Scan (clickhouse-server)

-on:
+"on":
  pull_request:
    paths:
      - docker/server/Dockerfile
--- a/.yamllint
+++ b/.yamllint
@ -0,0 +1,15 @@
+# vi: ft=yaml
+extends: default
+
+rules:
+    indentation:
+        level: warning
+        indent-sequences: consistent
+    line-length:
+        # there are some bash -c "", so this is OK
+        max: 300
+        level: warning
+    comments:
+        min-spaces-from-content: 1
+    document-start:
+        present: false
--- a/base/common/DateLUTImpl.h
+++ b/base/common/DateLUTImpl.h
@ -7,6 +7,7 @@
 #include <ctime>
 #include <string>

+
 #define DATE_LUT_MAX (0xFFFFFFFFU - 86400)
 #define DATE_LUT_MAX_DAY_NUM (0xFFFFFFFFU / 86400)
 /// Table size is bigger than DATE_LUT_MAX_DAY_NUM to fill all indices within UInt16 range: this allows to remove extra check.
@ -249,7 +250,7 @@ public:
    {
        DayNum index = findIndex(t);

-        if (unlikely(index == 0))
+        if (unlikely(index == 0 || index > DATE_LUT_MAX_DAY_NUM))
            return t + offset_at_start_of_epoch;

        time_t res = t - lut[index].date;
@ -264,18 +265,18 @@ public:
    {
        DayNum index = findIndex(t);

-        /// If it is not 1970 year (findIndex found nothing appropriate),
-        ///  than limit number of hours to avoid insane results like 1970-01-01 89:28:15
-        if (unlikely(index == 0))
+        /// If it is overflow case,
+        ///  then limit number of hours to avoid insane results like 1970-01-01 89:28:15
+        if (unlikely(index == 0 || index > DATE_LUT_MAX_DAY_NUM))
            return static_cast<unsigned>((t + offset_at_start_of_epoch) / 3600) % 24;

-        time_t res = t - lut[index].date;
+        time_t time = t - lut[index].date;

-        /// Data is cleaned to avoid possibility of underflow.
-        if (res >= lut[index].time_at_offset_change)
-            res += lut[index].amount_of_offset_change;
+        if (time >= lut[index].time_at_offset_change)
+            time += lut[index].amount_of_offset_change;

-        return res / 3600;
+        unsigned res = time / 3600;
+        return res <= 23 ? res : 0;
    }

    /** Calculating offset from UTC in seconds.
@ -314,12 +315,12 @@ public:
      *  each minute, with added or subtracted leap second, spans exactly 60 unix timestamps.
      */

-    inline unsigned toSecond(time_t t) const { return t % 60; }
+    inline unsigned toSecond(time_t t) const { return UInt32(t) % 60; }

    inline unsigned toMinute(time_t t) const
    {
        if (offset_is_whole_number_of_hours_everytime)
-            return (t / 60) % 60;
+            return (UInt32(t) / 60) % 60;

        UInt32 date = find(t).date;
        return (UInt32(t) - date) / 60 % 60;
@ -555,9 +556,7 @@ public:
        }
    }

-    /*
-     * check and change mode to effective
-     */
+    /// Check and change mode to effective.
    inline UInt8 check_week_mode(UInt8 mode) const
    {
        UInt8 week_format = (mode & 7);
@ -566,8 +565,7 @@ public:
        return week_format;
    }

-    /*
-     * Calc weekday from d
+    /** Calculate weekday from d.
      * Returns 0 for monday, 1 for tuesday...
      */
    inline unsigned calc_weekday(DayNum d, bool sunday_first_day_of_week) const
@ -578,7 +576,7 @@ public:
            return toDayOfWeek(DayNum(d + 1)) - 1;
    }

-    /* Calc days in one year. */
+    /// Calculate days in one year.
    inline unsigned calc_days_in_year(UInt16 year) const
    {
        return ((year & 3) == 0 && (year % 100 || (year % 400 == 0 && year)) ? 366 : 365);
--- a/base/common/arithmeticOverflow.h
+++ b/base/common/arithmeticOverflow.h
@ -6,6 +6,25 @@

 namespace common
 {
+    /// Multiply x by y and ignore overflow: returns the result of `x * y` as is. NOTE(review): NO_SANITIZE_UNDEFINED is defined elsewhere; presumably it disables UBSan checks for this function - confirm.
+    template <typename T1, typename T2>
+    inline auto NO_SANITIZE_UNDEFINED mulIgnoreOverflow(T1 x, T2 y)
+    {
+        return x * y;
+    }
+
+    template <typename T1, typename T2>
+    inline auto NO_SANITIZE_UNDEFINED addIgnoreOverflow(T1 x, T2 y)
+    {
+        return x + y;
+    }
+
+    template <typename T1, typename T2>
+    inline auto NO_SANITIZE_UNDEFINED subIgnoreOverflow(T1 x, T2 y)
+    {
+        return x - y;
+    }
+
    template <typename T>
    inline bool addOverflow(T x, T y, T & res)
    {
@ -35,14 +54,14 @@ namespace common
    {
        static constexpr __int128 min_int128 = minInt128();
        static constexpr __int128 max_int128 = maxInt128();
-        res = x + y;
+        res = addIgnoreOverflow(x, y);
        return (y > 0 && x > max_int128 - y) || (y < 0 && x < min_int128 - y);
    }

    template <>
    inline bool addOverflow(wInt256 x, wInt256 y, wInt256 & res)
    {
-        res = x + y;
+        res = addIgnoreOverflow(x, y);
        return (y > 0 && x > std::numeric_limits<wInt256>::max() - y) ||
            (y < 0 && x < std::numeric_limits<wInt256>::min() - y);
    }
@ -50,7 +69,7 @@ namespace common
    template <>
    inline bool addOverflow(wUInt256 x, wUInt256 y, wUInt256 & res)
    {
-        res = x + y;
+        res = addIgnoreOverflow(x, y);
        return x > std::numeric_limits<wUInt256>::max() - y;
    }

@ -83,14 +102,14 @@ namespace common
    {
        static constexpr __int128 min_int128 = minInt128();
        static constexpr __int128 max_int128 = maxInt128();
-        res = x - y;
+        res = subIgnoreOverflow(x, y);
        return (y < 0 && x > max_int128 + y) || (y > 0 && x < min_int128 + y);
    }

    template <>
    inline bool subOverflow(wInt256 x, wInt256 y, wInt256 & res)
    {
-        res = x - y;
+        res = subIgnoreOverflow(x, y);
        return (y < 0 && x > std::numeric_limits<wInt256>::max() + y) ||
            (y > 0 && x < std::numeric_limits<wInt256>::min() + y);
    }
@ -98,7 +117,7 @@ namespace common
    template <>
    inline bool subOverflow(wUInt256 x, wUInt256 y, wUInt256 & res)
    {
-        res = x - y;
+        res = subIgnoreOverflow(x, y);
        return x < y;
    }

@ -129,40 +148,33 @@ namespace common
    template <>
    inline bool mulOverflow(__int128 x, __int128 y, __int128 & res)
    {
-        res = static_cast<unsigned __int128>(x) * static_cast<unsigned __int128>(y);    /// Avoid signed integer overflow.
+        res = mulIgnoreOverflow(x, y);
        if (!x || !y)
            return false;

        unsigned __int128 a = (x > 0) ? x : -x;
        unsigned __int128 b = (y > 0) ? y : -y;
-        return (a * b) / b != a;
+        return mulIgnoreOverflow(a, b) / b != a;
    }

    template <>
    inline bool mulOverflow(wInt256 x, wInt256 y, wInt256 & res)
    {
-        res = x * y;
+        res = mulIgnoreOverflow(x, y);
        if (!x || !y)
            return false;

        wInt256 a = (x > 0) ? x : -x;
        wInt256 b = (y > 0) ? y : -y;
-        return (a * b) / b != a;
+        return mulIgnoreOverflow(a, b) / b != a;
    }

    template <>
    inline bool mulOverflow(wUInt256 x, wUInt256 y, wUInt256 & res)
    {
-        res = x * y;
+        res = mulIgnoreOverflow(x, y);
        if (!x || !y)
            return false;
-        return (x * y) / y != x;
-    }
-
-    /// Multiply and ignore overflow.
-    template <typename T1, typename T2>
-    inline auto NO_SANITIZE_UNDEFINED mulIgnoreOverflow(T1 x, T2 y)
-    {
-        return x * y;
+        return res / y != x;
    }
 }
--- a/docker/test/fasttest/run.sh
+++ b/docker/test/fasttest/run.sh
@ -340,7 +340,7 @@ function run_tests
        # Look at DistributedFilesToInsert, so cannot run in parallel.
        01460_DistributedFilesToInsert

-        01541_max_memory_usage_for_user
+        01541_max_memory_usage_for_user_long

        # Require python libraries like scipy, pandas and numpy
        01322_ttest_scipy
--- a/docker/test/integration/runner/compose/docker_compose_mysql_5_7_for_materialize_mysql.yml
+++ b/docker/test/integration/runner/compose/docker_compose_mysql_5_7_for_materialize_mysql.yml
@ -7,4 +7,8 @@ services:
            MYSQL_ROOT_PASSWORD: clickhouse
        ports:
            - 3308:3306
-        command: --server_id=100 --log-bin='mysql-bin-1.log' --default-time-zone='+3:00' --gtid-mode="ON" --enforce-gtid-consistency
+        command: --server_id=100 --log-bin='mysql-bin-1.log'
+            --default-time-zone='+3:00'
+            --gtid-mode="ON"
+            --enforce-gtid-consistency
+            --log-error-verbosity=3
--- a/docker/test/integration/runner/compose/docker_compose_mysql_8_0_for_materialize_mysql.yml
+++ b/docker/test/integration/runner/compose/docker_compose_mysql_8_0_for_materialize_mysql.yml
@ -7,4 +7,9 @@ services:
            MYSQL_ROOT_PASSWORD: clickhouse
        ports:
            - 33308:3306
-        command: --server_id=100 --log-bin='mysql-bin-1.log' --default_authentication_plugin='mysql_native_password' --default-time-zone='+3:00' --gtid-mode="ON" --enforce-gtid-consistency
+        command: --server_id=100 --log-bin='mysql-bin-1.log'
+            --default_authentication_plugin='mysql_native_password'
+            --default-time-zone='+3:00'
+            --gtid-mode="ON"
+            --enforce-gtid-consistency
+            --log-error-verbosity=3
--- a/docker/test/performance-comparison/compare.sh
+++ b/docker/test/performance-comparison/compare.sh
@ -97,6 +97,7 @@ function configure
    rm -r right/db ||:
    rm -r db0/preprocessed_configs ||:
    rm -r db0/{data,metadata}/system ||:
+    rm db0/status ||:
    cp -al db0/ left/db/
    cp -al db0/ right/db/
 }
--- a/docker/test/stateful/run.sh
+++ b/docker/test/stateful/run.sh
@ -60,4 +60,8 @@ fi
 # more ideologically correct.
 read -ra ADDITIONAL_OPTIONS <<< "${ADDITIONAL_OPTIONS:-}"

+if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
+    ADDITIONAL_OPTIONS+=('--replicated-database')
+fi
+
 clickhouse-test --testname --shard --zookeeper --no-stateless --hung-check --print-time "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" "$SKIP_TESTS_OPTION" 2>&1 | ts '%Y-%m-%d %H:%M:%S' | tee test_output/test_result.txt
--- a/docker/test/stateless/run.sh
+++ b/docker/test/stateless/run.sh
@ -57,6 +57,10 @@ function run_tests()
        ADDITIONAL_OPTIONS+=('4')
    fi

+    if [[ -n "$USE_DATABASE_REPLICATED" ]] && [[ "$USE_DATABASE_REPLICATED" -eq 1 ]]; then
+        ADDITIONAL_OPTIONS+=('--replicated-database')
+    fi
+
    clickhouse-test --testname --shard --zookeeper --hung-check --print-time \
            --test-runs "$NUM_TRIES" \
            "$SKIP_LIST_OPT" "${ADDITIONAL_OPTIONS[@]}" 2>&1 \
--- a/docker/test/stress/stress
+++ b/docker/test/stress/stress
@ -23,12 +23,15 @@ def get_options(i):
    if 0 < i:
        options += " --order=random"

-    if i % 2 == 1:
+    if i % 3 == 1:
        options += " --db-engine=Ordinary"

+    if i % 3 == 2:
+        options += ''' --db-engine="Replicated('/test/db/test_{}', 's1', 'r1')"'''.format(i)
+
    # If database name is not specified, new database is created for each functional test.
    # Run some threads with one database for all tests.
-    if i % 3 == 1:
+    if i % 2 == 1:
        options += " --database=test_{}".format(i)

    if i == 13:
--- a/docker/test/style/Dockerfile
+++ b/docker/test/style/Dockerfile
@ -1,7 +1,14 @@
 # docker build -t yandex/clickhouse-style-test .
 FROM ubuntu:20.04

-RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes shellcheck libxml2-utils git python3-pip pylint && pip3 install codespell
+RUN apt-get update && env DEBIAN_FRONTEND=noninteractive apt-get install --yes \
+    shellcheck \
+    libxml2-utils \
+    git \
+    python3-pip \
+    pylint \
+    yamllint \
+    && pip3 install codespell


 # For |& syntax
--- a/docs/en/operations/settings/settings-users.md
+++ b/docs/en/operations/settings/settings-users.md
@ -139,7 +139,7 @@ You can assign a quotas set for the user. For a detailed description of quotas c

 ### user_name/databases {#user-namedatabases}

-In this section, you can you can limit rows that are returned by ClickHouse for `SELECT` queries made by the current user, thus implementing basic row-level security.
+In this section, you can limit rows that are returned by ClickHouse for `SELECT` queries made by the current user, thus implementing basic row-level security.

 **Example**

--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@ -2659,3 +2659,23 @@ Result:
 Note that this setting influences [Materialized view](../../sql-reference/statements/create/view.md#materialized) and [MaterializeMySQL](../../engines/database-engines/materialize-mysql.md) behaviour.

 [Original article](https://clickhouse.tech/docs/en/operations/settings/settings/) <!-- hide -->
+
+## engine_file_empty_if_not_exists {#engine-file-empty_if-not-exists}
+
+Allows selecting data from a File engine table when the underlying file does not exist.
+
+Possible values:
+- 0 — `SELECT` throws exception.
+- 1 — `SELECT` returns empty result.
+
+Default value: `0`.
+
+## engine_file_truncate_on_insert {#engine-file-truncate-on-insert}
+
+Enables or disables truncation of the file before insert in File engine tables.
+
+Possible values:
+- 0 — Disabled.
+- 1 — Enabled.
+
+Default value: `0`.
--- a/docs/en/sql-reference/statements/create/view.md
+++ b/docs/en/sql-reference/statements/create/view.md
@ -41,7 +41,6 @@ SELECT a, b, c FROM (SELECT ...)
 CREATE MATERIALIZED VIEW [IF NOT EXISTS] [db.]table_name [ON CLUSTER] [TO[db.]name] [ENGINE = engine] [POPULATE] AS SELECT ...
 ```

-
 Materialized views store data transformed by the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query.

 When creating a materialized view without `TO [db].[table]`, you must specify `ENGINE` – the table engine for storing data.
@ -65,4 +64,191 @@ Views look the same as normal tables. For example, they are listed in the result

 There isn’t a separate query for deleting views. To delete a view, use [DROP TABLE](../../../sql-reference/statements/drop.md).

+## Live View (Experimental) {#live-view}
+
+!!! important "Important"
+    This is an experimental feature that may change in backwards-incompatible ways in the future releases.
+    Enable usage of live views and `WATCH` query using `set allow_experimental_live_view = 1`.
+
+
+```sql
+CREATE LIVE VIEW [IF NOT EXISTS] [db.]table_name [WITH [TIMEOUT [value_in_sec] [AND]] [REFRESH [value_in_sec]]] AS SELECT ...
+```
+
+Live views store the result of the corresponding [SELECT](../../../sql-reference/statements/select/index.md) query and are updated any time the result of the query changes. The query result, as well as the partial result needed to combine with new data, is stored in memory, providing increased performance for repeated queries. Live views can provide push notifications when the query result changes using the [WATCH](../../../sql-reference/statements/watch.md) query.
+
+Live views are triggered by inserts into the innermost table specified in the query.
+
+Live views work similarly to how a query in a distributed table works. But instead of combining partial results from different servers they combine partial result from current data with partial result from the new data. When a live view query includes a subquery then the cached partial result is only stored for the innermost subquery.
+
+!!! info "Limitations"
+    - [Table function](../../../sql-reference/table-functions/index.md) is not supported as the innermost table.
+    - Tables that do not have inserts such as a [dictionary](../../../sql-reference/dictionaries/index.md), [system table](../../../operations/system-tables/index.md), a [normal view](#normal), or a [materialized view](#materialized) will not trigger a live view.
+    - Only queries where one can combine partial result from the old data plus partial result from the new data will work. Live view will not work for queries that require the complete data set to compute the final result or aggregations where the state of the aggregation must be preserved.
+    - Does not work with replicated or distributed tables where inserts are performed on different nodes.
+    - Can't be triggered by multiple tables.
+
+    See [WITH REFRESH](#live-view-with-refresh) to force periodic updates of a live view that in some cases can be used as a workaround.
+
+You can watch for changes in the live view query result using the [WATCH](../../../sql-reference/statements/watch.md) query:
+
+```sql
+WATCH [db.]live_view
+```
+
+**Example:**
+
+```sql
+CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x;
+CREATE LIVE VIEW lv AS SELECT sum(x) FROM mt;
+```
+
+Watch a live view while doing a parallel insert into the source table.
+
+```sql
+WATCH lv
+```
+
+```bash
+┌─sum(x)─┬─_version─┐
+│      1 │        1 │
+└────────┴──────────┘
+┌─sum(x)─┬─_version─┐
+│      2 │        2 │
+└────────┴──────────┘
+┌─sum(x)─┬─_version─┐
+│      6 │        3 │
+└────────┴──────────┘
+...
+```
+
+```sql
+INSERT INTO mt VALUES (1);
+INSERT INTO mt VALUES (2);
+INSERT INTO mt VALUES (3);
+```
+
+or add [EVENTS](../../../sql-reference/statements/watch.md#events-clause) clause to just get change events.
+
+```sql
+WATCH [db.]live_view EVENTS
+```
+
+**Example:**
+
+```sql
+WATCH lv EVENTS
+```
+
+```bash
+┌─version─┐
+│       1 │
+└─────────┘
+┌─version─┐
+│       2 │
+└─────────┘
+┌─version─┐
+│       3 │
+└─────────┘
+...
+```
+
+You can execute [SELECT](../../../sql-reference/statements/select/index.md) query on a live view in the same way as for any regular view or a table. If the query result is cached it will return the result immediately without running the stored query on the underlying tables.
+
+```sql
+SELECT * FROM [db.]live_view WHERE ...
+```
+
+### Force Refresh {#live-view-alter-refresh}
+
+You can force live view refresh using the `ALTER LIVE VIEW [db.]table_name REFRESH` statement.
+
+### With Timeout {#live-view-with-timeout}
+
+When a live view is created with a `WITH TIMEOUT` clause then the live view will be dropped automatically after the specified number of seconds elapse since the end of the last [WATCH](../../../sql-reference/statements/watch.md) query that was watching the live view.
+
+```sql
+CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AS SELECT ...
+```
+
+If the timeout value is not specified then the value specified by the `temporary_live_view_timeout` setting is used.
+
+**Example:**
+
+```sql
+CREATE TABLE mt (x Int8) Engine = MergeTree ORDER BY x;
+CREATE LIVE VIEW lv WITH TIMEOUT 15 AS SELECT sum(x) FROM mt;
+```
+
+### With Refresh {#live-view-with-refresh}
+
+When a live view is created with a `WITH REFRESH` clause then it will be automatically refreshed after the specified number of seconds elapse since the last refresh or trigger.
+
+```sql
+CREATE LIVE VIEW [db.]table_name WITH REFRESH [value_in_sec] AS SELECT ...
+```
+
+If the refresh value is not specified then the value specified by the `periodic_live_view_refresh` setting is used.
+
+**Example:**
+
+```sql
+CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now();
+WATCH lv
+```
+
+```bash
+┌───────────────now()─┬─_version─┐
+│ 2021-02-21 08:47:05 │        1 │
+└─────────────────────┴──────────┘
+┌───────────────now()─┬─_version─┐
+│ 2021-02-21 08:47:10 │        2 │
+└─────────────────────┴──────────┘
+┌───────────────now()─┬─_version─┐
+│ 2021-02-21 08:47:15 │        3 │
+└─────────────────────┴──────────┘
+```
+
+You can combine `WITH TIMEOUT` and `WITH REFRESH` clauses using an `AND` clause. 
+
+```sql
+CREATE LIVE VIEW [db.]table_name WITH TIMEOUT [value_in_sec] AND REFRESH [value_in_sec] AS SELECT ...
+```
+
+**Example:**
+
+```sql
+CREATE LIVE VIEW lv WITH TIMEOUT 15 AND REFRESH 5 AS SELECT now();
+```
+
+After 15 sec the live view will be automatically dropped if there are no active `WATCH` queries.
+
+```sql
+WATCH lv
+```
+
+```
+Code: 60. DB::Exception: Received from localhost:9000. DB::Exception: Table default.lv doesn't exist.. 
+```
+
+### Usage
+
+Most common uses of live view tables include:
+
+- Providing push notifications for query result changes to avoid polling.
+- Caching results of most frequent queries to provide immediate query results.
+- Watching for table changes and triggering follow-up select queries.
+- Watching metrics from system tables using periodic refresh.
+
+### Settings {#live-view-settings}
+
+You can use the following settings to control the behaviour of live views.
+
+- `allow_experimental_live_view` - enable live views. Default is `0`.
+- `live_view_heartbeat_interval` - the heartbeat interval in seconds to indicate live query is alive. Default is `15` seconds.
+- `max_live_view_insert_blocks_before_refresh` - maximum number of inserted blocks after which
+   mergeable blocks are dropped and query is re-executed. Default is `64` inserts.
+- `temporary_live_view_timeout` - interval after which live view with timeout is deleted. Default is `5` seconds.
+- `periodic_live_view_refresh` - interval after which periodically refreshed live view is forced to refresh. Default is `60` seconds.
+
 [Original article](https://clickhouse.tech/docs/en/sql-reference/statements/create/view/) <!--hide-->
--- a/docs/en/sql-reference/statements/watch.md
+++ b/docs/en/sql-reference/statements/watch.md
@ -0,0 +1,106 @@
+---
+toc_priority: 53
+toc_title: WATCH
+---
+
+# WATCH Statement (Experimental) {#watch}
+
+!!! important "Important"
+    This is an experimental feature that may change in backwards-incompatible ways in the future releases.
+    Enable live views and `WATCH` query using `set allow_experimental_live_view = 1`.
+
+
+``` sql
+WATCH [db.]live_view
+[EVENTS]
+[LIMIT n]
+[FORMAT format]
+```
+
+The `WATCH` query performs continuous data retrieval from a [live view](./create/view.md#live-view) table. Unless the `LIMIT` clause is specified it provides an infinite stream of query results from a [live view](./create/view.md#live-view).
+
+```sql
+WATCH [db.]live_view
+```
+
+The virtual `_version` column in the query result indicates the current result version.
+
+**Example:**
+
+```sql
+CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now();
+WATCH lv
+```
+
+```bash
+┌───────────────now()─┬─_version─┐
+│ 2021-02-21 09:17:21 │        1 │
+└─────────────────────┴──────────┘
+┌───────────────now()─┬─_version─┐
+│ 2021-02-21 09:17:26 │        2 │
+└─────────────────────┴──────────┘
+┌───────────────now()─┬─_version─┐
+│ 2021-02-21 09:17:31 │        3 │
+└─────────────────────┴──────────┘
+...
+```
+
+By default, the requested data is returned to the client, while in conjunction with [INSERT INTO](../../sql-reference/statements/insert-into.md) it can be forwarded to a different table.
+
+```sql
+INSERT INTO [db.]table WATCH [db.]live_view ...
+```
+
+## EVENTS Clause {#events-clause}
+
+The `EVENTS` clause can be used to obtain a short form of the `WATCH` query where instead of the query result you will just get the latest query result version.
+
+```sql
+WATCH [db.]live_view EVENTS
+```
+
+**Example:**
+
+```sql
+CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now();
+WATCH lv EVENTS
+```
+
+```bash
+┌─version─┐
+│       1 │
+└─────────┘
+┌─version─┐
+│       2 │
+└─────────┘
+...
+```
+
+## LIMIT Clause {#limit-clause}
+
+The `LIMIT n` clause specifies the number of updates the `WATCH` query should wait for before terminating. By default there is no limit on the number of updates and therefore the query will not terminate. The value of `0` indicates that the `WATCH` query should not wait for any new query results and therefore will return immediately once the query is evaluated.
+
+```sql
+WATCH [db.]live_view LIMIT 1
+```
+
+**Example:**
+
+```sql
+CREATE LIVE VIEW lv WITH REFRESH 5 AS SELECT now();
+WATCH lv EVENTS LIMIT 1
+```
+
+```bash
+┌─version─┐
+│       1 │
+└─────────┘
+```
+
+## FORMAT Clause {#format-clause}
+
+The `FORMAT` clause works the same way as for the [SELECT](../../sql-reference/statements/select/format.md#format-clause).
+
+!!! info "Note"
+    The [JSONEachRowWithProgress](../../interfaces/formats.md#jsoneachrowwithprogress) format should be used when watching [live view](./create/view.md#live-view) tables over the HTTP interface. The progress messages will be added to the output to keep the long-lived HTTP connection alive until the query result changes. The interval between progress messages is controlled using the [live_view_heartbeat_interval](./create/view.md#live-view-settings) setting.
+
--- a/programs/odbc-bridge/ColumnInfoHandler.cpp
+++ b/programs/odbc-bridge/ColumnInfoHandler.cpp
@ -160,7 +160,15 @@ void ODBCColumnsInfoHandler::handleRequest(HTTPServerRequest & request, HTTPServ
        }

        WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout);
+        try
+        {
            writeStringBinary(columns.toString(), out);
+            out.finalize();
+        }
+        catch (...)
+        {
+            out.finalize();
+        }
    }
    catch (...)
    {
--- a/programs/odbc-bridge/IdentifierQuoteHandler.cpp
+++ b/programs/odbc-bridge/IdentifierQuoteHandler.cpp
@ -50,7 +50,15 @@ void IdentifierQuoteHandler::handleRequest(HTTPServerRequest & request, HTTPServ
        auto identifier = getIdentifierQuote(hdbc);

        WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout);
+        try
+        {
            writeStringBinary(identifier, out);
+            out.finalize();
+        }
+        catch (...)
+        {
+            out.finalize();
+        }
    }
    catch (...)
    {
--- a/programs/odbc-bridge/MainHandler.cpp
+++ b/programs/odbc-bridge/MainHandler.cpp
@ -187,9 +187,27 @@ void ODBCHandler::handleRequest(HTTPServerRequest & request, HTTPServerResponse
        auto message = getCurrentExceptionMessage(true);
        response.setStatusAndReason(
                Poco::Net::HTTPResponse::HTTP_INTERNAL_SERVER_ERROR); // can't call process_error, because of too soon response sending
+
+        try
+        {
            writeStringBinary(message, out);
+            out.finalize();
+        }
+        catch (...)
+        {
            tryLogCurrentException(log);
+        }

+        tryLogCurrentException(log);
+    }
+
+    try
+    {
+        out.finalize();
+    }
+    catch (...)
+    {
+        tryLogCurrentException(log);
    }
 }

--- a/programs/odbc-bridge/SchemaAllowedHandler.cpp
+++ b/programs/odbc-bridge/SchemaAllowedHandler.cpp
@ -61,7 +61,15 @@ void SchemaAllowedHandler::handleRequest(HTTPServerRequest & request, HTTPServer
        bool result = isSchemaAllowed(hdbc);

        WriteBufferFromHTTPServerResponse out(response, request.getMethod() == Poco::Net::HTTPRequest::HTTP_HEAD, keep_alive_timeout);
+        try
+        {
            writeBoolText(result, out);
+            out.finalize();
+        }
+        catch (...)
+        {
+            out.finalize();
+        }
    }
    catch (...)
    {
--- a/programs/server/Server.cpp
+++ b/programs/server/Server.cpp
@ -103,6 +103,7 @@ namespace CurrentMetrics
    extern const Metric Revision;
    extern const Metric VersionInteger;
    extern const Metric MemoryTracking;
+    extern const Metric MaxDDLEntryID;
 }


@ -1012,7 +1013,8 @@ int Server::main(const std::vector<std::string> & /*args*/)
        int pool_size = config().getInt("distributed_ddl.pool_size", 1);
        if (pool_size < 1)
            throw Exception("distributed_ddl.pool_size should be greater then 0", ErrorCodes::ARGUMENT_OUT_OF_BOUND);
-        global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, *global_context, &config(), "distributed_ddl"));
+        global_context->setDDLWorker(std::make_unique<DDLWorker>(pool_size, ddl_zookeeper_path, *global_context, &config(),
+                                                                 "distributed_ddl", "DDLWorker", &CurrentMetrics::MaxDDLEntryID));
    }

    std::unique_ptr<DNSCacheUpdater> dns_cache_updater;
--- a/src/AggregateFunctions/AggregateFunctionFactory.cpp
+++ b/src/AggregateFunctions/AggregateFunctionFactory.cpp
@ -30,6 +30,10 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

+const String & getAggregateFunctionCanonicalNameIfAny(const String & name)
+{
+    return AggregateFunctionFactory::instance().getCanonicalNameIfAny(name);
+}

 void AggregateFunctionFactory::registerFunction(const String & name, Value creator_with_properties, CaseSensitiveness case_sensitiveness)
 {
@ -41,10 +45,14 @@ void AggregateFunctionFactory::registerFunction(const String & name, Value creat
        throw Exception("AggregateFunctionFactory: the aggregate function name '" + name + "' is not unique",
            ErrorCodes::LOGICAL_ERROR);

-    if (case_sensitiveness == CaseInsensitive
-        && !case_insensitive_aggregate_functions.emplace(Poco::toLower(name), creator_with_properties).second)
+    if (case_sensitiveness == CaseInsensitive)
+    {
+        auto key = Poco::toLower(name);
+        if (!case_insensitive_aggregate_functions.emplace(key, creator_with_properties).second)
            throw Exception("AggregateFunctionFactory: the case insensitive aggregate function name '" + name + "' is not unique",
                ErrorCodes::LOGICAL_ERROR);
+        case_insensitive_name_mapping[key] = name;
+    }
 }

 static DataTypes convertLowCardinalityTypesToNested(const DataTypes & types)
--- a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h
+++ b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.h
@ -52,7 +52,7 @@ struct MovingSumData : public MovingData<T>
 {
    static constexpr auto name = "groupArrayMovingSum";

-    T get(size_t idx, UInt64 window_size) const
+    T NO_SANITIZE_UNDEFINED get(size_t idx, UInt64 window_size) const
    {
        if (idx < window_size)
            return this->value[idx];
@ -66,7 +66,7 @@ struct MovingAvgData : public MovingData<T>
 {
    static constexpr auto name = "groupArrayMovingAvg";

-    T get(size_t idx, UInt64 window_size) const
+    T NO_SANITIZE_UNDEFINED get(size_t idx, UInt64 window_size) const
    {
        if (idx < window_size)
            return this->value[idx] / window_size;
@ -114,7 +114,7 @@ public:
            return std::make_shared<DataTypeArray>(std::make_shared<DataTypeResult>());
    }

-    void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
+    void NO_SANITIZE_UNDEFINED add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena * arena) const override
    {
        auto value = static_cast<const ColumnSource &>(*columns[0]).getData()[row_num];
        this->data(place).add(static_cast<ResultT>(value), arena);
--- a/src/AggregateFunctions/AggregateFunctionSumMap.h
+++ b/src/AggregateFunctions/AggregateFunctionSumMap.h
@ -115,6 +115,13 @@ public:
                        "Values for {} are expected to be Numeric, Float or Decimal, passed type {}",
                        getName(), value_type->getName()};

+                WhichDataType value_type_to_check(value_type);
+
+                /// Do not promote decimal because of implementation issues of this function design.
+                /// If we decide to make this function more efficient, we should promote the decimal type during summation.
+                if (value_type_to_check.isDecimal())
+                    result_type = value_type_without_nullable;
+                else
                    result_type = value_type_without_nullable->promoteNumericType();
            }

--- a/src/Columns/ColumnAggregateFunction.cpp
+++ b/src/Columns/ColumnAggregateFunction.cpp
@ -75,29 +75,9 @@ void ColumnAggregateFunction::set(const AggregateFunctionPtr & func_)
 ColumnAggregateFunction::~ColumnAggregateFunction()
 {
    if (!func->hasTrivialDestructor() && !src)
-    {
-        if (copiedDataInfo.empty())
-        {
        for (auto * val : data)
-            {
            func->destroy(val);
 }
-        }
-        else
-        {
-            size_t pos;
-            for (Map::iterator it = copiedDataInfo.begin(), it_end = copiedDataInfo.end(); it != it_end; ++it)
-            {
-                pos = it->getValue().second;
-                if (data[pos] != nullptr)
-                {
-                    func->destroy(data[pos]);
-                    data[pos] = nullptr;
-                }
-            }
-        }
-    }
-}

 void ColumnAggregateFunction::addArena(ConstArenaPtr arena_)
 {
@ -475,37 +455,14 @@ void ColumnAggregateFunction::insertFrom(const IColumn & from, size_t n)
    ///  (only as a whole, see comment above).
    ensureOwnership();
    insertDefault();
-    insertCopyFrom(assert_cast<const ColumnAggregateFunction &>(from).data[n]);
+    insertMergeFrom(from, n);
 }

 void ColumnAggregateFunction::insertFrom(ConstAggregateDataPtr place)
 {
    ensureOwnership();
    insertDefault();
-    insertCopyFrom(place);
-}
-
-void ColumnAggregateFunction::insertCopyFrom(ConstAggregateDataPtr place)
-{
-    Map::LookupResult result;
-    result = copiedDataInfo.find(place);
-    if (result == nullptr)
-    {
-        copiedDataInfo[place] = data.size()-1;
-        func->merge(data.back(), place, &createOrGetArena());
-    }
-    else
-    {
-        size_t pos = result->getValue().second;
-        if (pos != data.size() - 1)
-        {
-            data[data.size() - 1] = data[pos];
-        }
-        else /// insert same data to same pos, merge them.
-        {
-            func->merge(data.back(), place, &createOrGetArena());
-        }
-    }
+    insertMergeFrom(place);
 }

 void ColumnAggregateFunction::insertMergeFrom(ConstAggregateDataPtr place)
@ -740,4 +697,5 @@ MutableColumnPtr ColumnAggregateFunction::cloneResized(size_t size) const
        return cloned_col;
    }
 }
+
 }
--- a/src/Columns/ColumnAggregateFunction.h
+++ b/src/Columns/ColumnAggregateFunction.h
@ -13,8 +13,6 @@

 #include <Functions/FunctionHelpers.h>

-#include <Common/HashTable/HashMap.h>
-
 namespace DB
 {

@ -84,17 +82,6 @@ private:
    /// Name of the type to distinguish different aggregation states.
    String type_string;

-    /// MergedData records, used to avoid duplicated data copy.
-    ///key: src pointer, val:  pos in current column.
-    using Map = HashMap<
-        ConstAggregateDataPtr,
-        size_t,
-        DefaultHash<ConstAggregateDataPtr>,
-        HashTableGrower<3>,
-        HashTableAllocatorWithStackMemory<sizeof(std::pair<ConstAggregateDataPtr, size_t>) * (1 << 3)>>;
-
-    Map copiedDataInfo;
-
    ColumnAggregateFunction() {}

    /// Create a new column that has another column as a source.
@ -153,8 +140,6 @@ public:

    void insertFrom(ConstAggregateDataPtr place);

-    void insertCopyFrom(ConstAggregateDataPtr place);
-
    /// Merge state at last row with specified state in another column.
    void insertMergeFrom(ConstAggregateDataPtr place);

--- a/src/Common/ErrorCodes.cpp
+++ b/src/Common/ErrorCodes.cpp
@ -537,6 +537,7 @@
    M(568, RAFT_ERROR) \
    M(569, MULTIPLE_COLUMNS_SERIALIZED_TO_SAME_PROTOBUF_FIELD) \
    M(570, DATA_TYPE_INCOMPATIBLE_WITH_PROTOBUF_FIELD) \
+    M(571, DATABASE_REPLICATION_FAILED) \
    \
    M(999, KEEPER_EXCEPTION) \
    M(1000, POCO_EXCEPTION) \
--- a/src/Common/HashTable/HashTable.h
+++ b/src/Common/HashTable/HashTable.h
@ -69,16 +69,11 @@ namespace ZeroTraits
 {

 template <typename T>
-inline bool check(const T x) { return x == 0; }
+bool check(const T x) { return x == 0; }

 template <typename T>
-inline void set(T & x) { x = 0; }
+void set(T & x) { x = 0; }

-template <>
-inline bool check(const char * x) { return x == nullptr; }
-
-template <>
-inline void set(const char *& x){ x = nullptr; }
 }


--- a/src/Common/IFactoryWithAliases.h
+++ b/src/Common/IFactoryWithAliases.h
@ -35,6 +35,8 @@ protected:
            return name;
    }

+    std::unordered_map<String, String> case_insensitive_name_mapping;
+
 public:
    /// For compatibility with SQL, it's possible to specify that certain function name is case insensitive.
    enum CaseSensitiveness
@ -68,9 +70,12 @@ public:
                factory_name + ": the alias name '" + alias_name + "' is already registered as real name", ErrorCodes::LOGICAL_ERROR);

        if (case_sensitiveness == CaseInsensitive)
+        {
            if (!case_insensitive_aliases.emplace(alias_name_lowercase, real_dict_name).second)
                throw Exception(
                    factory_name + ": case insensitive alias name '" + alias_name + "' is not unique", ErrorCodes::LOGICAL_ERROR);
+            case_insensitive_name_mapping[alias_name_lowercase] = real_name;
+        }

        if (!aliases.emplace(alias_name, real_dict_name).second)
            throw Exception(factory_name + ": alias name '" + alias_name + "' is not unique", ErrorCodes::LOGICAL_ERROR);
@ -111,6 +116,15 @@ public:
        return getMap().count(name) || getCaseInsensitiveMap().count(name) || isAlias(name);
    }

+    /// Return the canonical name (the name used in registration) if it's different from `name`.
+    const String & getCanonicalNameIfAny(const String & name) const
+    {
+        auto it = case_insensitive_name_mapping.find(Poco::toLower(name));
+        if (it != case_insensitive_name_mapping.end())
+            return it->second;
+        return name;
+    }
+
    virtual ~IFactoryWithAliases() override {}

 private:
--- a/src/Common/ZooKeeper/IKeeper.cpp
+++ b/src/Common/ZooKeeper/IKeeper.cpp
@ -59,7 +59,7 @@ static void addRootPath(String & path, const String & root_path)
        throw Exception("Path cannot be empty", Error::ZBADARGUMENTS);

    if (path[0] != '/')
-        throw Exception("Path must begin with /", Error::ZBADARGUMENTS);
+        throw Exception("Path must begin with /, got " + path, Error::ZBADARGUMENTS);

    if (root_path.empty())
        return;
--- a/src/Common/ZooKeeper/ZooKeeper.h
+++ b/src/Common/ZooKeeper/ZooKeeper.h
@ -194,6 +194,7 @@ public:
    void removeChildren(const std::string & path);

    using WaitCondition = std::function<bool()>;
+
    /// Wait for the node to disappear or return immediately if it doesn't exist.
    /// If condition is specified, it is used to return early (when condition returns false)
    /// The function returns true if waited and false if waiting was interrupted by condition.
@ -314,8 +315,15 @@ public:
        return std::make_shared<EphemeralNodeHolder>(path, zookeeper, false, false, "");
    }

+    void setAlreadyRemoved()
+    {
+        need_remove = false;
+    }
+
    ~EphemeralNodeHolder()
    {
+        if (!need_remove)
+            return;
        try
        {
            zookeeper.tryRemove(path);
@ -331,6 +339,7 @@ private:
    std::string path;
    ZooKeeper & zookeeper;
    CurrentMetrics::Increment metric_increment{CurrentMetrics::EphemeralNode};
+    bool need_remove = true;
 };

 using EphemeralNodeHolderPtr = EphemeralNodeHolder::Ptr;
--- a/src/Core/DecimalComparison.h
+++ b/src/Core/DecimalComparison.h
@ -78,7 +78,7 @@ public:

    static bool compare(A a, B b, UInt32 scale_a, UInt32 scale_b)
    {
-        static const UInt32 max_scale = DecimalUtils::maxPrecision<Decimal256>();
+        static const UInt32 max_scale = DecimalUtils::max_precision<Decimal256>;
        if (scale_a > max_scale || scale_b > max_scale)
            throw Exception("Bad scale of decimal field", ErrorCodes::DECIMAL_OVERFLOW);

--- a/src/Core/DecimalFunctions.h
+++ b/src/Core/DecimalFunctions.h
@ -24,13 +24,13 @@ namespace ErrorCodes
 namespace DecimalUtils
 {

-static constexpr size_t minPrecision() { return 1; }
-template <typename T> static constexpr size_t maxPrecision() { return 0; }
-template <> constexpr size_t maxPrecision<Decimal32>() { return 9; }
-template <> constexpr size_t maxPrecision<Decimal64>() { return 18; }
-template <> constexpr size_t maxPrecision<DateTime64>() { return 18; }
-template <> constexpr size_t maxPrecision<Decimal128>() { return 38; }
-template <> constexpr size_t maxPrecision<Decimal256>() { return 76; }
+inline constexpr size_t min_precision = 1;
+template <typename T> inline constexpr size_t max_precision = 0;
+template <> inline constexpr size_t max_precision<Decimal32> = 9;
+template <> inline constexpr size_t max_precision<Decimal64> = 18;
+template <> inline constexpr size_t max_precision<DateTime64> = 18;
+template <> inline constexpr size_t max_precision<Decimal128> = 38;
+template <> inline constexpr size_t max_precision<Decimal256> = 76;

 template <typename T>
 inline auto scaleMultiplier(UInt32 scale)
@ -87,7 +87,7 @@ struct DataTypeDecimalTrait
  *
  * Sign of `whole` controls sign of result: negative whole => negative result, positive whole => positive result.
  * Sign of `fractional` is expected to be positive, otherwise result is undefined.
-  * If `scale` is to big (scale > maxPrecision<DecimalType::NativeType>), result is undefined.
+  * If `scale` is too big (scale > max_precision<DecimalType::NativeType>), result is undefined.
  */
 template <typename DecimalType>
 inline DecimalType decimalFromComponentsWithMultiplier(
@ -287,21 +287,21 @@ inline auto binaryOpResult(const DecimalType<T> & tx, const DecimalType<U> & ty)
        scale = (tx.getScale() > ty.getScale() ? tx.getScale() : ty.getScale());

    if constexpr (sizeof(T) < sizeof(U))
-        return DataTypeDecimalTrait<U>(DecimalUtils::maxPrecision<U>(), scale);
+        return DataTypeDecimalTrait<U>(DecimalUtils::max_precision<U>, scale);
    else
-        return DataTypeDecimalTrait<T>(DecimalUtils::maxPrecision<T>(), scale);
+        return DataTypeDecimalTrait<T>(DecimalUtils::max_precision<T>, scale);
 }

 template <bool, bool, typename T, typename U, template <typename> typename DecimalType>
 inline const DataTypeDecimalTrait<T> binaryOpResult(const DecimalType<T> & tx, const DataTypeNumber<U> &)
 {
-    return DataTypeDecimalTrait<T>(DecimalUtils::maxPrecision<T>(), tx.getScale());
+    return DataTypeDecimalTrait<T>(DecimalUtils::max_precision<T>, tx.getScale());
 }

 template <bool, bool, typename T, typename U, template <typename> typename DecimalType>
 inline const DataTypeDecimalTrait<U> binaryOpResult(const DataTypeNumber<T> &, const DecimalType<U> & ty)
 {
-    return DataTypeDecimalTrait<U>(DecimalUtils::maxPrecision<U>(), ty.getScale());
+    return DataTypeDecimalTrait<U>(DecimalUtils::max_precision<U>, ty.getScale());
 }

 }
--- a/src/Core/MySQL/MySQLReplication.cpp
+++ b/src/Core/MySQL/MySQLReplication.cpp
@ -475,11 +475,11 @@ namespace MySQLReplication
                    {
                        const auto & dispatch = [](const size_t & precision, const size_t & scale, const auto & function) -> Field
                        {
-                            if (precision <= DecimalUtils::maxPrecision<Decimal32>())
+                            if (precision <= DecimalUtils::max_precision<Decimal32>)
                                return Field(function(precision, scale, Decimal32()));
-                            else if (precision <= DecimalUtils::maxPrecision<Decimal64>())
+                            else if (precision <= DecimalUtils::max_precision<Decimal64>)
                                return Field(function(precision, scale, Decimal64()));
-                            else if (precision <= DecimalUtils::maxPrecision<Decimal128>())
+                            else if (precision <= DecimalUtils::max_precision<Decimal128>)
                                return Field(function(precision, scale, Decimal128()));

                            return Field(function(precision, scale, Decimal256()));
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@ -100,7 +100,7 @@ class IColumn;
    M(UInt64, min_count_to_compile_expression, 3, "The number of identical expressions before they are JIT-compiled", 0) \
    M(UInt64, group_by_two_level_threshold, 100000, "From what number of keys, a two-level aggregation starts. 0 - the threshold is not set.", 0) \
    M(UInt64, group_by_two_level_threshold_bytes, 100000000, "From what size of the aggregation state in bytes, a two-level aggregation begins to be used. 0 - the threshold is not set. Two-level aggregation is used when at least one of the thresholds is triggered.", 0) \
-    M(Bool, distributed_aggregation_memory_efficient, false, "Is the memory-saving mode of distributed aggregation enabled.", 0) \
+    M(Bool, distributed_aggregation_memory_efficient, true, "Is the memory-saving mode of distributed aggregation enabled.", 0) \
    M(UInt64, aggregation_memory_efficient_merge_threads, 0, "Number of threads to use for merge intermediate aggregation results in memory efficient mode. When bigger, then more memory is consumed. 0 means - same as 'max_threads'.", 0) \
    \
    M(UInt64, max_parallel_replicas, 1, "The maximum number of replicas of each shard used when the query is executed. For consistency (to get different parts of the same partition), this option only works for the specified sampling key. The lag of the replicas is not controlled.", 0) \
@ -383,6 +383,7 @@ class IColumn;
    M(Bool, optimize_if_chain_to_multiif, false, "Replace if(cond1, then1, if(cond2, ...)) chains to multiIf. Currently it's not beneficial for numeric types.", 0) \
    M(Bool, optimize_if_transform_strings_to_enum, false, "Replaces string-type arguments in If and Transform to enum. Disabled by default cause it could make inconsistent change in distributed query that would lead to its fail.", 0) \
    M(Bool, optimize_monotonous_functions_in_order_by, true, "Replace monotonous function with its argument in ORDER BY", 0) \
+    M(Bool, normalize_function_names, true, "Normalize function names to their canonical names", 0) \
    M(Bool, allow_experimental_alter_materialized_view_structure, false, "Allow atomic alter on Materialized views. Work in progress.", 0) \
    M(Bool, enable_early_constant_folding, true, "Enable query optimization where we analyze function and subqueries results and rewrite query if there're constants there", 0) \
    M(Bool, deduplicate_blocks_in_dependent_materialized_views, false, "Should deduplicate blocks for materialized views if the block is not a duplicate for the table. Use true to always deduplicate in dependent tables.", 0) \
@ -421,6 +422,11 @@ class IColumn;
    M(Bool, optimize_rewrite_sum_if_to_count_if, true, "Rewrite sumIf() and sum(if()) function countIf() function when logically equivalent", 0) \
    M(UInt64, insert_shard_id, 0, "If non zero, when insert into a distributed table, the data will be inserted into the shard `insert_shard_id` synchronously. Possible values range from 1 to `shards_number` of corresponding distributed table", 0) \
    M(Bool, allow_experimental_query_deduplication, false, "Allow sending parts' UUIDs for a query in order to deduplicate data parts if any", 0) \
+    M(Bool, engine_file_empty_if_not_exists, false, "Allows to select data from a file engine table without file", 0) \
+    M(Bool, engine_file_truncate_on_insert, false, "Enables or disables truncate before insert in file engine tables", 0) \
+    M(Bool, allow_experimental_database_replicated, false, "Allow to create databases with Replicated engine", 0) \
+    M(UInt64, database_replicated_initial_query_timeout_sec, 300, "How long initial DDL query should wait for Replicated database to precess previous DDL queue entries", 0) \
+    M(Bool, database_replicated_ddl_output, true, "Return table with query execution status as a result of DDL query", 0) \
    \
    /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \
    \
--- a/src/DataTypes/DataTypeDateTime64.cpp
+++ b/src/DataTypes/DataTypeDateTime64.cpp
@ -28,7 +28,7 @@ namespace ErrorCodes
 static constexpr UInt32 max_scale = 9;

 DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const std::string & time_zone_name)
-    : DataTypeDecimalBase<DateTime64>(DecimalUtils::maxPrecision<DateTime64>(), scale_),
+    : DataTypeDecimalBase<DateTime64>(DecimalUtils::max_precision<DateTime64>, scale_),
      TimezoneMixin(time_zone_name)
 {
    if (scale > max_scale)
@ -37,7 +37,7 @@ DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const std::string & time_z
 }

 DataTypeDateTime64::DataTypeDateTime64(UInt32 scale_, const TimezoneMixin & time_zone_info)
-    : DataTypeDecimalBase<DateTime64>(DecimalUtils::maxPrecision<DateTime64>(), scale_),
+    : DataTypeDecimalBase<DateTime64>(DecimalUtils::max_precision<DateTime64>, scale_),
      TimezoneMixin(time_zone_info)
 {
    if (scale > max_scale)
--- a/src/DataTypes/DataTypeDecimalBase.h
+++ b/src/DataTypes/DataTypeDecimalBase.h
@ -65,7 +65,7 @@ public:

    static constexpr bool is_parametric = true;

-    static constexpr size_t maxPrecision() { return DecimalUtils::maxPrecision<T>(); }
+    static constexpr size_t maxPrecision() { return DecimalUtils::max_precision<T>; }

    DataTypeDecimalBase(UInt32 precision_, UInt32 scale_)
    :   precision(precision_),
@ -197,17 +197,17 @@ inline const DecimalType<U> decimalResultType(const DataTypeNumber<T> & tx, cons
 template <template <typename> typename DecimalType>
 inline DataTypePtr createDecimal(UInt64 precision_value, UInt64 scale_value)
 {
-    if (precision_value < DecimalUtils::minPrecision() || precision_value > DecimalUtils::maxPrecision<Decimal256>())
+    if (precision_value < DecimalUtils::min_precision || precision_value > DecimalUtils::max_precision<Decimal256>)
        throw Exception("Wrong precision", ErrorCodes::ARGUMENT_OUT_OF_BOUND);

    if (static_cast<UInt64>(scale_value) > precision_value)
        throw Exception("Negative scales and scales larger than precision are not supported", ErrorCodes::ARGUMENT_OUT_OF_BOUND);

-    if (precision_value <= DecimalUtils::maxPrecision<Decimal32>())
+    if (precision_value <= DecimalUtils::max_precision<Decimal32>)
        return std::make_shared<DecimalType<Decimal32>>(precision_value, scale_value);
-    else if (precision_value <= DecimalUtils::maxPrecision<Decimal64>())
+    else if (precision_value <= DecimalUtils::max_precision<Decimal64>)
        return std::make_shared<DecimalType<Decimal64>>(precision_value, scale_value);
-    else if (precision_value <= DecimalUtils::maxPrecision<Decimal128>())
+    else if (precision_value <= DecimalUtils::max_precision<Decimal128>)
       return std::make_shared<DecimalType<Decimal128>>(precision_value, scale_value);
    return std::make_shared<DecimalType<Decimal256>>(precision_value, scale_value);
 }
--- a/src/DataTypes/DataTypesDecimal.cpp
+++ b/src/DataTypes/DataTypesDecimal.cpp
@ -141,7 +141,7 @@ static DataTypePtr createExact(const ASTPtr & arguments)
    if (!scale_arg || !(scale_arg->value.getType() == Field::Types::Int64 || scale_arg->value.getType() == Field::Types::UInt64))
        throw Exception("Decimal data type family must have a two numbers as its arguments", ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT);

-    UInt64 precision = DecimalUtils::maxPrecision<T>();
+    UInt64 precision = DecimalUtils::max_precision<T>;
    UInt64 scale = scale_arg->value.get<UInt64>();

    return createDecimal<DataTypeDecimal>(precision, scale);
--- a/src/DataTypes/DataTypesDecimal.h
+++ b/src/DataTypes/DataTypesDecimal.h
@ -270,7 +270,7 @@ tryConvertToDecimal(const typename FromDataType::FieldType & value, UInt32 scale
 template <typename T>
 inline DataTypePtr createDecimalMaxPrecision(UInt64 scale)
 {
-    return std::make_shared<DataTypeDecimal<T>>(DecimalUtils::maxPrecision<T>(), scale);
+    return std::make_shared<DataTypeDecimal<T>>(DecimalUtils::max_precision<T>, scale);
 }

 }
--- a/src/DataTypes/convertMySQLDataType.cpp
+++ b/src/DataTypes/convertMySQLDataType.cpp
@ -103,11 +103,11 @@ DataTypePtr convertMySQLDataType(MultiEnum<MySQLDataTypesSupport> type_support,
    }
    else if (type_support.isSet(MySQLDataTypesSupport::DECIMAL) && (type_name == "numeric" || type_name == "decimal"))
    {
-        if (precision <= DecimalUtils::maxPrecision<Decimal32>())
+        if (precision <= DecimalUtils::max_precision<Decimal32>)
            res = std::make_shared<DataTypeDecimal<Decimal32>>(precision, scale);
-        else if (precision <= DecimalUtils::maxPrecision<Decimal64>())
+        else if (precision <= DecimalUtils::max_precision<Decimal64>)
            res = std::make_shared<DataTypeDecimal<Decimal64>>(precision, scale);
-        else if (precision <= DecimalUtils::maxPrecision<Decimal128>())
+        else if (precision <= DecimalUtils::max_precision<Decimal128>)
            res = std::make_shared<DataTypeDecimal<Decimal128>>(precision, scale);
    }

--- a/src/Databases/DatabaseAtomic.cpp
+++ b/src/Databases/DatabaseAtomic.cpp
@ -4,13 +4,14 @@
 #include <Poco/Path.h>
 #include <IO/ReadHelpers.h>
 #include <IO/WriteHelpers.h>
+#include <IO/ReadBufferFromFile.h>
 #include <Parsers/formatAST.h>
 #include <Common/renameat2.h>
 #include <Storages/StorageMaterializedView.h>
 #include <Interpreters/Context.h>
 #include <Interpreters/ExternalDictionariesLoader.h>
 #include <filesystem>
-
+#include <Interpreters/DDLTask.h>

 namespace DB
 {
@ -34,7 +35,6 @@ public:
    UUID uuid() const override { return table()->getStorageID().uuid; }
 };

-
 DatabaseAtomic::DatabaseAtomic(String name_, String metadata_path_, UUID uuid, const String & logger_name, const Context & context_)
    : DatabaseOrdinary(name_, std::move(metadata_path_), "store/", logger_name, context_)
    , path_to_table_symlinks(global_context.getPath() + "data/" + escapeForFileName(name_) + "/")
@ -106,7 +106,7 @@ StoragePtr DatabaseAtomic::detachTable(const String & name)
    return table;
 }

-void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool no_delay)
+void DatabaseAtomic::dropTable(const Context & context, const String & table_name, bool no_delay)
 {
    String table_metadata_path = getObjectMetadataPath(table_name);
    String table_metadata_path_drop;
@ -115,6 +115,16 @@ void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool
        std::unique_lock lock(mutex);
        table = getTableUnlocked(table_name, lock);
        table_metadata_path_drop = DatabaseCatalog::instance().getPathForDroppedMetadata(table->getStorageID());
+        auto txn = context.getZooKeeperMetadataTransaction();
+        if (txn && !context.isInternalSubquery())
+            txn->commit();      /// Commit point (a sort of) for Replicated database
+
+        /// NOTE: replica will be lost if server crashes before the following rename
+        /// We apply changes in ZooKeeper before applying changes in local metadata file
+        /// to reduce probability of failures between these operations
+        /// (it's more likely to lose the connection than to fail before applying local changes).
+        /// TODO better detection and recovery
+
        Poco::File(table_metadata_path).renameTo(table_metadata_path_drop);    /// Mark table as dropped
        DatabaseWithDictionaries::detachTableUnlocked(table_name, lock);       /// Should never throw
        table_name_to_path.erase(table_name);
@ -124,7 +134,7 @@ void DatabaseAtomic::dropTable(const Context &, const String & table_name, bool
    /// Remove the inner table (if any) to avoid deadlock
    /// (due to attempt to execute DROP from the worker thread)
    if (auto * mv = dynamic_cast<StorageMaterializedView *>(table.get()))
-        mv->dropInnerTable(no_delay);
+        mv->dropInnerTable(no_delay, context);
    /// Notify DatabaseCatalog that table was dropped. It will remove table data in background.
    /// Cleanup is performed outside of database to allow easily DROP DATABASE without waiting for cleanup to complete.
    DatabaseCatalog::instance().enqueueDroppedTableCleanup(table->getStorageID(), table, table_metadata_path_drop, no_delay);
@ -144,6 +154,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n

    if (exchange && dictionary)
        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot exchange dictionaries");
+    if (exchange && !supportsRenameat2())
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "RENAME EXCHANGE is not supported");

    auto & other_db = dynamic_cast<DatabaseAtomic &>(to_database);
    bool inside_database = this == &other_db;
@ -232,6 +244,13 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n
    }

    /// Table renaming actually begins here
+    auto txn = context.getZooKeeperMetadataTransaction();
+    if (txn && !context.isInternalSubquery())
+        txn->commit();     /// Commit point (a sort of) for Replicated database
+
+    /// NOTE: replica will be lost if server crashes before the following rename
+    /// TODO better detection and recovery
+
    if (exchange)
        renameExchange(old_metadata_path, new_metadata_path);
    else
@ -267,7 +286,8 @@ void DatabaseAtomic::renameTable(const Context & context, const String & table_n
 }

 void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
-                                       const String & table_metadata_tmp_path, const String & table_metadata_path)
+                                       const String & table_metadata_tmp_path, const String & table_metadata_path,
+                                       const Context & query_context)
 {
    DetachedTables not_in_use;
    auto table_data_path = getTableDataPath(query);
@ -284,6 +304,14 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora
        /// We will get en exception if some table with the same UUID exists (even if it's detached table or table from another database)
        DatabaseCatalog::instance().addUUIDMapping(query.uuid);
        locked_uuid = true;
+
+        auto txn = query_context.getZooKeeperMetadataTransaction();
+        if (txn && !query_context.isInternalSubquery())
+            txn->commit();     /// Commit point (a sort of) for Replicated database
+
+        /// NOTE: replica will be lost if server crashes before the following renameNoReplace(...)
+        /// TODO better detection and recovery
+
        /// It throws if `table_metadata_path` already exists (it's possible if table was detached)
        renameNoReplace(table_metadata_tmp_path, table_metadata_path);  /// Commit point (a sort of)
        attachTableUnlocked(query.table, table, lock);   /// Should never throw
@ -300,7 +328,8 @@ void DatabaseAtomic::commitCreateTable(const ASTCreateQuery & query, const Stora
        tryCreateSymlink(query.table, table_data_path);
 }

-void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path)
+void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path,
+                                      const String & /*statement*/, const Context & query_context)
 {
    bool check_file_exists = true;
    SCOPE_EXIT({ std::error_code code; if (check_file_exists) std::filesystem::remove(table_metadata_tmp_path, code); });
@ -311,6 +340,13 @@ void DatabaseAtomic::commitAlterTable(const StorageID & table_id, const String &
    if (table_id.uuid != actual_table_id.uuid)
        throw Exception("Cannot alter table because it was renamed", ErrorCodes::CANNOT_ASSIGN_ALTER);

+    auto txn = query_context.getZooKeeperMetadataTransaction();
+    if (txn && !query_context.isInternalSubquery())
+        txn->commit();      /// Commit point (a sort of) for Replicated database
+
+    /// NOTE: replica will be lost if server crashes before the following rename
+    /// TODO better detection and recovery
+
    check_file_exists = renameExchangeIfSupported(table_metadata_tmp_path, table_metadata_path);
    if (!check_file_exists)
        std::filesystem::rename(table_metadata_tmp_path, table_metadata_path);
@ -329,6 +365,12 @@ void DatabaseAtomic::assertDetachedTableNotInUse(const UUID & uuid)
              ", because it was detached but still used by some query. Retry later.", ErrorCodes::TABLE_ALREADY_EXISTS);
 }

+void DatabaseAtomic::setDetachedTableNotInUseForce(const UUID & uuid)
+{
+    std::unique_lock lock{mutex};
+    detached_tables.erase(uuid);
+}
+
 DatabaseAtomic::DetachedTables DatabaseAtomic::cleanupDetachedTables()
 {
    DetachedTables not_in_use;
--- a/src/Databases/DatabaseAtomic.h
+++ b/src/Databases/DatabaseAtomic.h
@ -58,11 +58,12 @@ public:
    void tryRemoveSymlink(const String & table_name);

    void waitDetachedTableNotInUse(const UUID & uuid) override;
+    void setDetachedTableNotInUseForce(const UUID & uuid);

-private:
-    void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path) override;
+protected:
+    void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context) override;
    void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
-                           const String & table_metadata_tmp_path, const String & table_metadata_path) override;
+                           const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context) override;

    void assertDetachedTableNotInUse(const UUID & uuid);
    typedef std::unordered_map<UUID, StoragePtr> DetachedTables;
--- a/src/Databases/DatabaseFactory.cpp
+++ b/src/Databases/DatabaseFactory.cpp
@ -1,6 +1,7 @@
 #include <Databases/DatabaseFactory.h>

 #include <Databases/DatabaseAtomic.h>
+#include <Databases/DatabaseReplicated.h>
 #include <Databases/DatabaseDictionary.h>
 #include <Databases/DatabaseLazy.h>
 #include <Databases/DatabaseMemory.h>
@ -13,6 +14,7 @@
 #include <Poco/File.h>
 #include <Poco/Path.h>
 #include <Interpreters/Context.h>
+#include <Common/Macros.h>

 #if !defined(ARCADIA_BUILD)
 #    include "config_core.h"
@ -96,11 +98,16 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String
    const String & engine_name = engine_define->engine->name;
    const UUID & uuid = create.uuid;

-    if (engine_name != "MySQL" && engine_name != "MaterializeMySQL" && engine_name != "Lazy" && engine_name != "PostgreSQL" && engine_define->engine->arguments)
+    bool engine_may_have_arguments = engine_name == "MySQL" || engine_name == "MaterializeMySQL" || engine_name == "Lazy" ||
+                                     engine_name == "Replicated" || engine_name == "PostgreSQL";
+    if (engine_define->engine->arguments && !engine_may_have_arguments)
        throw Exception("Database engine " + engine_name + " cannot have arguments", ErrorCodes::BAD_ARGUMENTS);

-    if (engine_define->engine->parameters || engine_define->partition_by || engine_define->primary_key || engine_define->order_by ||
-        engine_define->sample_by || (!endsWith(engine_name, "MySQL") && engine_define->settings))
+    bool has_unexpected_element = engine_define->engine->parameters || engine_define->partition_by ||
+                                  engine_define->primary_key || engine_define->order_by ||
+                                  engine_define->sample_by;
+    bool may_have_settings = endsWith(engine_name, "MySQL") || engine_name == "Replicated";
+    if (has_unexpected_element || (!may_have_settings && engine_define->settings))
        throw Exception("Database engine " + engine_name + " cannot have parameters, primary_key, order_by, sample_by, settings",
                        ErrorCodes::UNKNOWN_ELEMENT_IN_AST);

@ -184,6 +191,32 @@ DatabasePtr DatabaseFactory::getImpl(const ASTCreateQuery & create, const String
        return std::make_shared<DatabaseLazy>(database_name, metadata_path, cache_expiration_time_seconds, context);
    }

+    else if (engine_name == "Replicated")
+    {
+        const ASTFunction * engine = engine_define->engine;
+
+        if (!engine->arguments || engine->arguments->children.size() != 3)
+            throw Exception("Replicated database requires 3 arguments: zookeeper path, shard name and replica name", ErrorCodes::BAD_ARGUMENTS);
+
+        const auto & arguments = engine->arguments->children;
+
+        String zookeeper_path = safeGetLiteralValue<String>(arguments[0], "Replicated");
+        String shard_name = safeGetLiteralValue<String>(arguments[1], "Replicated");
+        String replica_name  = safeGetLiteralValue<String>(arguments[2], "Replicated");
+
+        zookeeper_path = context.getMacros()->expand(zookeeper_path);
+        shard_name = context.getMacros()->expand(shard_name);
+        replica_name = context.getMacros()->expand(replica_name);
+
+        DatabaseReplicatedSettings database_replicated_settings{};
+        if (engine_define->settings)
+            database_replicated_settings.loadFromQuery(*engine_define);
+
+        return std::make_shared<DatabaseReplicated>(database_name, metadata_path, uuid,
+                                                    zookeeper_path, shard_name, replica_name,
+                                                    std::move(database_replicated_settings), context);
+    }
+
 #if USE_LIBPQXX

    else if (engine_name == "PostgreSQL")
--- a/src/Databases/DatabaseOnDisk.cpp
+++ b/src/Databases/DatabaseOnDisk.cpp
@ -129,6 +129,60 @@ String getObjectDefinitionFromCreateQuery(const ASTPtr & query)
    return statement_buf.str();
 }

+/// Rewrites the CREATE query AST in place so that it matches `metadata`:
+///  - columns, indices and constraints are always regenerated from the metadata;
+///  - the SELECT part is replaced when the metadata carries a select query;
+///  - ORDER BY / PRIMARY KEY / SAMPLE BY / TTL / SETTINGS are refreshed only for the extended storage definition.
+/// Throws NOT_IMPLEMENTED if the table was created AS table function and has no column list in the query.
+void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemoryMetadata & metadata)
+{
+    auto & create = query->as<ASTCreateQuery &>();
+
+    const bool has_structure = create.columns_list && create.columns_list->columns;
+    if (!has_structure && create.as_table_function)
+        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function"
+                                                     " and doesn't have structure in metadata", backQuote(create.table));
+    assert(has_structure);
+
+    /// Format all replacements before touching the AST.
+    ASTPtr columns_ast = InterpreterCreateQuery::formatColumns(metadata.columns);
+    ASTPtr indices_ast = InterpreterCreateQuery::formatIndices(metadata.secondary_indices);
+    ASTPtr constraints_ast = InterpreterCreateQuery::formatConstraints(metadata.constraints);
+
+    auto * columns_list = create.columns_list;
+    columns_list->replace(columns_list->columns, columns_ast);
+    columns_list->setOrReplace(columns_list->indices, indices_ast);
+    columns_list->setOrReplace(columns_list->constraints, constraints_ast);
+
+    if (metadata.select.select_query)
+        query->replace(create.select, metadata.select.select_query);
+
+    /// Some CREATE queries have no storage clause at all (MaterializedView is one of them).
+    if (!create.storage)
+        return;
+
+    ASTStorage & storage = *create.storage;
+
+    /// Nothing else is rewritten unless the query uses at least one clause of the extended storage definition.
+    const bool is_extended_storage_def
+        = storage.partition_by || storage.primary_key || storage.order_by || storage.sample_by || storage.settings;
+    if (!is_extended_storage_def)
+        return;
+
+    if (metadata.sorting_key.definition_ast)
+        storage.set(storage.order_by, metadata.sorting_key.definition_ast);
+
+    if (metadata.primary_key.definition_ast)
+        storage.set(storage.primary_key, metadata.primary_key.definition_ast);
+
+    if (metadata.sampling_key.definition_ast)
+        storage.set(storage.sample_by, metadata.sampling_key.definition_ast);
+
+    if (metadata.table_ttl.definition_ast)
+        storage.set(storage.ttl_table, metadata.table_ttl.definition_ast);
+    else
+        storage.ttl_table = nullptr; /// TTL was removed (no-op if there was no TTL)
+
+    if (metadata.settings_changes)
+        storage.set(storage.settings, metadata.settings_changes);
+}
+
+
 DatabaseOnDisk::DatabaseOnDisk(
    const String & name,
    const String & metadata_path_,
@ -214,7 +268,7 @@ void DatabaseOnDisk::createTable(
        out.close();
    }

-    commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path);
+    commitCreateTable(create, table, table_metadata_tmp_path, table_metadata_path, context);

    removeDetachedPermanentlyFlag(table_name, table_metadata_path);
 }
@ -238,7 +292,8 @@ void DatabaseOnDisk::removeDetachedPermanentlyFlag(const String & table_name, co
 }

 void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
-                                       const String & table_metadata_tmp_path, const String & table_metadata_path)
+                                       const String & table_metadata_tmp_path, const String & table_metadata_path,
+                                       const Context & /*query_context*/)
 {
    try
    {
@ -256,7 +311,7 @@ void DatabaseOnDisk::commitCreateTable(const ASTCreateQuery & query, const Stora
    }
 }

-void DatabaseOnDisk::detachTablePermanently(const String & table_name)
+void DatabaseOnDisk::detachTablePermanently(const Context &, const String & table_name)
 {
    auto table = detachTable(table_name);

@ -352,6 +407,8 @@ void DatabaseOnDisk::renameTable(
            from_ordinary_to_atomic = true;
        else if (typeid_cast<DatabaseAtomic *>(this) && typeid_cast<DatabaseOrdinary *>(&to_database))
            from_atomic_to_ordinary = true;
+        else if (dynamic_cast<DatabaseAtomic *>(this) && typeid_cast<DatabaseOrdinary *>(&to_database) && getEngineName() == "Replicated")
+            from_atomic_to_ordinary = true;
        else
            throw Exception("Moving tables between databases of different engines is not supported", ErrorCodes::NOT_IMPLEMENTED);
    }
@ -363,6 +420,7 @@ void DatabaseOnDisk::renameTable(
    /// DatabaseLazy::detachTable may return nullptr even if table exists, so we need tryGetTable for this case.
    StoragePtr table = tryGetTable(table_name, global_context);
    detachTable(table_name);
+    UUID prev_uuid = UUIDHelpers::Nil;
    try
    {
        table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout);
@ -375,7 +433,7 @@ void DatabaseOnDisk::renameTable(
        if (from_ordinary_to_atomic)
            create.uuid = UUIDHelpers::generateV4();
        if (from_atomic_to_ordinary)
-            create.uuid = UUIDHelpers::Nil;
+            std::swap(create.uuid, prev_uuid);

        if (auto * target_db = dynamic_cast<DatabaseOnDisk *>(&to_database))
            target_db->checkMetadataFilenameAvailability(to_table_name);
@ -400,12 +458,16 @@ void DatabaseOnDisk::renameTable(

    Poco::File(table_metadata_path).remove();

+    if (from_atomic_to_ordinary)
+    {
+        auto & atomic_db = dynamic_cast<DatabaseAtomic &>(*this);
        /// Special case: usually no actions with symlinks are required when detaching/attaching table,
        /// but not when moving from Atomic database to Ordinary
-    if (from_atomic_to_ordinary && table->storesDataOnDisk())
-    {
-        auto & atomic_db = assert_cast<DatabaseAtomic &>(*this);
+        if (table->storesDataOnDisk())
            atomic_db.tryRemoveSymlink(table_name);
+        /// Forget about UUID, now it's possible to reuse it for new table
+        DatabaseCatalog::instance().removeUUIDMappingFinally(prev_uuid);
+        atomic_db.setDetachedTableNotInUseForce(prev_uuid);
    }
 }

--- a/src/Databases/DatabaseOnDisk.h
+++ b/src/Databases/DatabaseOnDisk.h
@ -25,6 +25,8 @@ std::pair<String, StoragePtr> createTableFromAST(
  */
 String getObjectDefinitionFromCreateQuery(const ASTPtr & query);

+void applyMetadataChangesToCreateQuery(const ASTPtr & query, const StorageInMemoryMetadata & metadata);
+

 /* Class to provide basic operations with tables when metadata is stored on disk in .sql files.
 */
@ -39,7 +41,7 @@ public:
        const StoragePtr & table,
        const ASTPtr & query) override;

-    void detachTablePermanently(const String & table_name) override;
+    void detachTablePermanently(const Context & context, const String & table_name) override;

    void dropTable(
        const Context & context,
@ -90,7 +92,7 @@ protected:
    ASTPtr getCreateQueryFromMetadata(const String & metadata_path, bool throw_on_error) const;

    virtual void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
-                                   const String & table_metadata_tmp_path, const String & table_metadata_path);
+                                   const String & table_metadata_tmp_path, const String & table_metadata_path, const Context & query_context);

    const String metadata_path;
    const String data_path;
--- a/src/Databases/DatabaseOrdinary.cpp
+++ b/src/Databases/DatabaseOrdinary.cpp
@ -33,11 +33,6 @@ static constexpr size_t PRINT_MESSAGE_EACH_N_OBJECTS = 256;
 static constexpr size_t PRINT_MESSAGE_EACH_N_SECONDS = 5;
 static constexpr size_t METADATA_FILE_BUFFER_SIZE = 32768;

-namespace ErrorCodes
-{
-    extern const int NOT_IMPLEMENTED;
-}
-
 namespace
 {
    void tryAttachTable(
@ -272,55 +267,7 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab
        0,
        context.getSettingsRef().max_parser_depth);

-    auto & ast_create_query = ast->as<ASTCreateQuery &>();
-
-    bool has_structure = ast_create_query.columns_list && ast_create_query.columns_list->columns;
-    if (ast_create_query.as_table_function && !has_structure)
-        throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Cannot alter table {} because it was created AS table function"
-                                                     " and doesn't have structure in metadata", backQuote(table_name));
-
-    assert(has_structure);
-    ASTPtr new_columns = InterpreterCreateQuery::formatColumns(metadata.columns);
-    ASTPtr new_indices = InterpreterCreateQuery::formatIndices(metadata.secondary_indices);
-    ASTPtr new_constraints = InterpreterCreateQuery::formatConstraints(metadata.constraints);
-
-    ast_create_query.columns_list->replace(ast_create_query.columns_list->columns, new_columns);
-    ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->indices, new_indices);
-    ast_create_query.columns_list->setOrReplace(ast_create_query.columns_list->constraints, new_constraints);
-
-    if (metadata.select.select_query)
-    {
-        ast->replace(ast_create_query.select, metadata.select.select_query);
-    }
-
-    /// MaterializedView is one type of CREATE query without storage.
-    if (ast_create_query.storage)
-    {
-        ASTStorage & storage_ast = *ast_create_query.storage;
-
-        bool is_extended_storage_def
-            = storage_ast.partition_by || storage_ast.primary_key || storage_ast.order_by || storage_ast.sample_by || storage_ast.settings;
-
-        if (is_extended_storage_def)
-        {
-            if (metadata.sorting_key.definition_ast)
-                storage_ast.set(storage_ast.order_by, metadata.sorting_key.definition_ast);
-
-            if (metadata.primary_key.definition_ast)
-                storage_ast.set(storage_ast.primary_key, metadata.primary_key.definition_ast);
-
-            if (metadata.sampling_key.definition_ast)
-                storage_ast.set(storage_ast.sample_by, metadata.sampling_key.definition_ast);
-
-            if (metadata.table_ttl.definition_ast)
-                storage_ast.set(storage_ast.ttl_table, metadata.table_ttl.definition_ast);
-            else if (storage_ast.ttl_table != nullptr) /// TTL was removed
-                storage_ast.ttl_table = nullptr;
-
-            if (metadata.settings_changes)
-                storage_ast.set(storage_ast.settings, metadata.settings_changes);
-        }
-    }
+    applyMetadataChangesToCreateQuery(ast, metadata);

    statement = getObjectDefinitionFromCreateQuery(ast);
    {
@ -332,10 +279,10 @@ void DatabaseOrdinary::alterTable(const Context & context, const StorageID & tab
        out.close();
    }

-    commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path);
+    commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, context);
 }

-void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path)
+void DatabaseOrdinary::commitAlterTable(const StorageID &, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & /*statement*/, const Context & /*query_context*/)
 {
    try
    {
--- a/src/Databases/DatabaseOrdinary.h
+++ b/src/Databases/DatabaseOrdinary.h
@ -30,7 +30,7 @@ public:
        const StorageInMemoryMetadata & metadata) override;

 protected:
-    virtual void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path);
+    virtual void commitAlterTable(const StorageID & table_id, const String & table_metadata_tmp_path, const String & table_metadata_path, const String & statement, const Context & query_context);

    void startupTables(ThreadPool & thread_pool);
 };
--- a/src/Databases/DatabaseReplicated.cpp
+++ b/src/Databases/DatabaseReplicated.cpp
@ -0,0 +1,719 @@
+#include <DataTypes/DataTypeString.h>
+#include <Databases/DatabaseReplicated.h>
+#include <IO/ReadBufferFromFile.h>
+#include <IO/ReadBufferFromString.h>
+#include <IO/ReadHelpers.h>
+#include <IO/WriteHelpers.h>
+#include <Interpreters/Context.h>
+#include <Interpreters/executeQuery.h>
+#include <Parsers/queryToString.h>
+#include <Common/Exception.h>
+#include <Common/Stopwatch.h>
+#include <Common/ZooKeeper/KeeperException.h>
+#include <Common/ZooKeeper/Types.h>
+#include <Common/ZooKeeper/ZooKeeper.h>
+#include <Databases/DatabaseReplicatedWorker.h>
+#include <Interpreters/DDLTask.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
+#include <Interpreters/Cluster.h>
+#include <common/getFQDNOrHostName.h>
+#include <Parsers/ASTAlterQuery.h>
+#include <Parsers/ParserCreateQuery.h>
+#include <Parsers/parseQuery.h>
+#include <Interpreters/InterpreterCreateQuery.h>
+#include <Parsers/formatAST.h>
+
+namespace DB
+{
+namespace ErrorCodes
+{
+    extern const int NO_ZOOKEEPER;
+    extern const int LOGICAL_ERROR;
+    extern const int BAD_ARGUMENTS;
+    extern const int REPLICA_IS_ALREADY_EXIST;
+    extern const int DATABASE_REPLICATION_FAILED;
+    extern const int UNKNOWN_DATABASE;
+    extern const int UNKNOWN_TABLE;
+    extern const int NOT_IMPLEMENTED;
+    extern const int INCORRECT_QUERY;
+    extern const int ALL_CONNECTION_TRIES_FAILED;
+}
+
+static constexpr const char * DROPPED_MARK = "DROPPED";
+static constexpr const char * BROKEN_TABLES_SUFFIX = "_broken_tables";
+
+
+/// Returns the ZooKeeper client obtained from the global context.
+zkutil::ZooKeeperPtr DatabaseReplicated::getZooKeeper() const
+{
+    auto zookeeper = global_context.getZooKeeper();
+    return zookeeper;
+}
+
+/// Builds the identifier that a replica writes to its node in ZooKeeper:
+/// "<address of this host and its TCP port>:<UUID of the database>".
+static inline String getHostID(const Context & global_context, const UUID & db_uuid)
+{
+    const String host_and_port = Cluster::Address::toString(getFQDNOrHostName(), global_context.getTCPPort());
+    return host_and_port + ':' + toString(db_uuid);
+}
+
+
+/// Destructor is defaulted out of line.
+/// NOTE(review): presumably so that members like ddl_worker are destroyed in this translation unit,
+/// where DatabaseReplicatedWorker.h is included - confirm against the class declaration.
+DatabaseReplicated::~DatabaseReplicated() = default;
+
+/// Creates a Replicated database object on top of DatabaseAtomic and normalizes its ZooKeeper path.
+/// Throws BAD_ARGUMENTS if the ZooKeeper path, shard name or replica name is empty,
+/// or if the shard or replica name contains '/' or '|'
+/// ('|' separates shard and replica in the full replica name, which is used as a ZooKeeper node name).
+/// After construction zookeeper_path always starts with '/' and has one trailing '/' (if any) removed.
+DatabaseReplicated::DatabaseReplicated(
+    const String & name_,
+    const String & metadata_path_,
+    UUID uuid,
+    const String & zookeeper_path_,
+    const String & shard_name_,
+    const String & replica_name_,
+    DatabaseReplicatedSettings db_settings_,
+    const Context & context_)
+    : DatabaseAtomic(name_, metadata_path_, uuid, "DatabaseReplicated (" + name_ + ")", context_)
+    , zookeeper_path(zookeeper_path_)
+    , shard_name(shard_name_)
+    , replica_name(replica_name_)
+    , db_settings(std::move(db_settings_))
+{
+    if (zookeeper_path.empty() || shard_name.empty() || replica_name.empty())
+        throw Exception("ZooKeeper path, shard and replica names must be non-empty", ErrorCodes::BAD_ARGUMENTS);
+    if (shard_name.find('/') != std::string::npos || replica_name.find('/') != std::string::npos)
+        throw Exception("Shard and replica names should not contain '/'", ErrorCodes::BAD_ARGUMENTS);
+    if (shard_name.find('|') != std::string::npos || replica_name.find('|') != std::string::npos)
+        throw Exception("Shard and replica names should not contain '|'", ErrorCodes::BAD_ARGUMENTS);
+
+    /// Remove one trailing slash. The path is known to be non-empty here, but it may become empty (if it was "/").
+    if (zookeeper_path.back() == '/')
+        zookeeper_path.resize(zookeeper_path.size() - 1);
+
+    /// If zookeeper chroot prefix is used, path should start with '/', because chroot concatenates without it.
+    /// Emptiness is checked first: calling front() on an empty string is undefined behaviour.
+    if (zookeeper_path.empty() || zookeeper_path.front() != '/')
+        zookeeper_path = "/" + zookeeper_path;
+}
+
+/// Returns the replica identifier in the form "shard_name|replica_name".
+String DatabaseReplicated::getFullReplicaName() const
+{
+    String full_name = shard_name;
+    full_name += '|';
+    full_name += replica_name;
+    return full_name;
+}
+
+/// Splits an identifier of the form "shard|replica" into {shard, replica}.
+/// Throws LOGICAL_ERROR unless the identifier contains exactly one '|'.
+std::pair<String, String> DatabaseReplicated::parseFullReplicaName(const String & name)
+{
+    const auto separator_pos = name.find('|');
+    const bool has_single_separator
+        = separator_pos != std::string::npos && name.find('|', separator_pos + 1) == std::string::npos;
+    if (!has_single_separator)
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Incorrect replica identifier: {}", name);
+
+    return {name.substr(0, separator_pos), name.substr(separator_pos + 1)};
+}
+
+/// Builds a Cluster from the replicas currently registered under <zookeeper_path>/replicas.
+/// Names of replica nodes have the form "shard|replica", values of the nodes are host IDs;
+/// the part of a host ID before the first ':' is used as the host name. Replicas marked as DROPPED are skipped.
+/// The list of replicas and their host IDs are read in several requests, so the read is retried (up to max_retries times)
+/// until the list of children is known to be unchanged while host IDs were being fetched.
+/// Throws LOGICAL_ERROR if there are no replicas and ALL_CONNECTION_TRIES_FAILED if a consistent snapshot cannot be obtained.
+ClusterPtr DatabaseReplicated::getCluster() const
+{
+    /// TODO Maintain up-to-date Cluster and allow to use it in Distributed tables
+    Strings hosts;
+    Strings host_ids;
+
+    auto zookeeper = global_context.getZooKeeper();
+    constexpr int max_retries = 10;
+    int iteration = 0;
+    bool success = false;
+    while (++iteration <= max_retries)
+    {
+        host_ids.resize(0);
+        Coordination::Stat stat;
+        hosts = zookeeper->getChildren(zookeeper_path + "/replicas", &stat);
+        if (hosts.empty())
+            throw Exception(ErrorCodes::LOGICAL_ERROR, "No hosts found");
+        Int32 cver = stat.cversion;
+        /// Sorted order puts replicas of the same shard next to each other (names start with the shard name).
+        std::sort(hosts.begin(), hosts.end());
+
+        std::vector<zkutil::ZooKeeper::FutureGet> futures;
+        futures.reserve(hosts.size());
+        host_ids.reserve(hosts.size());
+        for (const auto & host : hosts)
+            futures.emplace_back(zookeeper->asyncTryGet(zookeeper_path + "/replicas/" + host));
+
+        success = true;
+        for (auto & future : futures)
+        {
+            auto res = future.get();
+            if (res.error != Coordination::Error::ZOK)
+                success = false;
+            host_ids.emplace_back(res.data);
+        }
+
+        /// Check that the set of replicas did not change while we were reading host IDs.
+        /// It's the version of the list of children (cversion) that must be compared with the saved one,
+        /// the data version of the node says nothing about created or removed replicas.
+        zookeeper->get(zookeeper_path + "/replicas", &stat);
+        if (cver != stat.cversion)
+            success = false;
+        if (success)
+            break;
+    }
+    if (!success)
+        throw Exception(ErrorCodes::ALL_CONNECTION_TRIES_FAILED, "Cannot get consistent cluster snapshot, "
+                                                                 "because replicas are created or removed concurrently");
+
+    assert(!hosts.empty());
+    assert(hosts.size() == host_ids.size());
+    String current_shard = parseFullReplicaName(hosts.front()).first;
+    std::vector<Strings> shards;
+    shards.emplace_back();
+    for (size_t i = 0; i < hosts.size(); ++i)
+    {
+        const auto & id = host_ids[i];
+        if (id == DROPPED_MARK)
+            continue;
+        auto [shard, replica] = parseFullReplicaName(hosts[i]);
+        auto pos = id.find(':');
+        String host = id.substr(0, pos);
+        if (shard != current_shard)
+        {
+            current_shard = shard;
+            /// Do not leave an empty shard behind if all replicas of the previous shard were dropped.
+            if (!shards.back().empty())
+                shards.emplace_back();
+        }
+        shards.back().emplace_back(unescapeForFileName(host));
+    }
+
+    /// TODO make it configurable
+    String username = "default";
+    String password;
+
+    return std::make_shared<Cluster>(global_context.getSettingsRef(), shards, username, password, global_context.getTCPPort(), false);
+}
+
+/// Connects to ZooKeeper, creates nodes of the database (if it does not exist yet) and registers this replica.
+/// On success the database leaves readonly mode.
+/// If `force_attach` is set, any error is only logged; otherwise it is rethrown:
+///  - NO_ZOOKEEPER if ZooKeeper is not configured;
+///  - REPLICA_IS_ALREADY_EXIST if the replica node exists, but belongs to another host.
+void DatabaseReplicated::tryConnectToZooKeeperAndInitDatabase(bool force_attach)
+{
+    try
+    {
+        if (!global_context.hasZooKeeper())
+            throw Exception("Can't create replicated database without ZooKeeper", ErrorCodes::NO_ZOOKEEPER);
+
+        auto zookeeper = global_context.getZooKeeper();
+
+        /// Create new database if there is none, multiple nodes can execute it concurrently
+        if (!zookeeper->exists(zookeeper_path))
+            createDatabaseNodesInZooKeeper(zookeeper);
+
+        replica_path = zookeeper_path + "/replicas/" + getFullReplicaName();
+
+        String registered_host_id;
+        const bool replica_exists = zookeeper->tryGet(replica_path, registered_host_id);
+        if (!replica_exists)
+        {
+            /// Throws if replica with the same name already exists
+            createReplicaNodesInZooKeeper(zookeeper);
+        }
+        else
+        {
+            /// The replica node exists: it must have been created by this very host and database.
+            const String our_host_id = getHostID(global_context, db_uuid);
+            if (registered_host_id != our_host_id)
+                throw Exception(ErrorCodes::REPLICA_IS_ALREADY_EXIST,
+                                "Replica {} of shard {} of replicated database at {} already exists. Replica host ID: '{}', current host ID: '{}'",
+                                replica_name, shard_name, zookeeper_path, registered_host_id, our_host_id);
+        }
+
+        is_readonly = false;
+    }
+    catch (...)
+    {
+        /// It's server startup if force_attach is set, ignore error in that case.
+        /// Worker thread will try to setup ZooKeeper connection
+        if (force_attach)
+            tryLogCurrentException(log);
+        else
+            throw;
+    }
+}
+
+/// Creates the nodes of a new replicated database in ZooKeeper with one multi-request.
+/// Returns true if the nodes were created (it's the first replica) and false if the database already exists.
+/// Any other ZooKeeper error is thrown as an exception.
+bool DatabaseReplicated::createDatabaseNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper)
+{
+    current_zookeeper->createAncestors(zookeeper_path);
+
+    Coordination::Requests ops;
+    auto add_persistent_node = [&](const String & suffix, const String & data)
+    {
+        ops.emplace_back(zkutil::makeCreateRequest(zookeeper_path + suffix, data, zkutil::CreateMode::Persistent));
+    };
+
+    add_persistent_node("", "");
+    add_persistent_node("/log", "");
+    add_persistent_node("/replicas", "");
+    add_persistent_node("/counter", "");
+    /// We create and remove counter/cnt- node to increment sequential number of counter/ node and make log entry numbers start from 1.
+    /// New replicas are created with log pointer equal to 0 and log pointer is a number of the last executed entry.
+    /// It means that we cannot have log entry with number 0.
+    add_persistent_node("/counter/cnt-", "");
+    ops.emplace_back(zkutil::makeRemoveRequest(zookeeper_path + "/counter/cnt-", -1));
+    add_persistent_node("/metadata", "");
+    add_persistent_node("/max_log_ptr", "1");
+    add_persistent_node("/logs_to_keep", "1000");
+
+    Coordination::Responses responses;
+    const auto code = current_zookeeper->tryMulti(ops, responses);
+
+    /// ZOK: created new database (it's the first replica).
+    /// ZNODEEXISTS: database exists, we will add new replica.
+    const bool created = code == Coordination::Error::ZOK;
+    if (created || code == Coordination::Error::ZNODEEXISTS)
+        return created;
+
+    /// Other codes are unexpected, will throw
+    zkutil::KeeperMultiException::check(code, ops, responses);
+    assert(false);
+    __builtin_unreachable();
+}
+
+/// Registers this replica in ZooKeeper: creates the replica node (with host ID as its value) and its log pointer,
+/// and appends an empty entry to the log. Everything is done in one multi-request, which throws on failure.
+void DatabaseReplicated::createReplicaNodesInZooKeeper(const zkutil::ZooKeeperPtr & current_zookeeper)
+{
+    /// Write host name to replica_path, it will protect from multiple replicas with the same name
+    auto our_host_id = getHostID(global_context, db_uuid);
+
+    /// On replica creation add empty entry to log. Can be used to trigger some actions on other replicas (e.g. update cluster info).
+    DDLLogEntry empty_entry{};
+
+    /// The suffix of a sequential counter node is used as the number of the new log entry.
+    String counter_prefix = zookeeper_path + "/counter/cnt-";
+    String counter_path = current_zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential);
+    String entry_number = counter_path.substr(counter_prefix.size());
+    String log_entry_path = zookeeper_path + "/log/query-" + entry_number;
+
+    Coordination::Requests ops;
+    ops.emplace_back(zkutil::makeCreateRequest(replica_path, our_host_id, zkutil::CreateMode::Persistent));
+    ops.emplace_back(zkutil::makeCreateRequest(replica_path + "/log_ptr", "0", zkutil::CreateMode::Persistent));
+    ops.emplace_back(zkutil::makeCreateRequest(log_entry_path, empty_entry.toString(), zkutil::CreateMode::Persistent));
+    ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1));
+    current_zookeeper->multi(ops);
+}
+
+/// Loads the database: first tries to connect to ZooKeeper and register the replica
+/// (errors are rethrown unless force_attach is set), then loads stored objects as DatabaseAtomic does,
+/// and finally creates and starts the DDL worker of this database.
+void DatabaseReplicated::loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach)
+{
+    tryConnectToZooKeeperAndInitDatabase(force_attach);
+
+    DatabaseAtomic::loadStoredObjects(context, has_force_restore_data_flag, force_attach);
+
+    /// The worker is created only after local objects are loaded.
+    ddl_worker = std::make_unique<DatabaseReplicatedDDLWorker>(this, global_context);
+    ddl_worker->startup();
+}
+
+/// Writes a DDL query to the replicated log and executes it through the DDL worker.
+/// Returns an empty BlockIO if distributed_ddl_task_timeout is 0 or database_replicated_ddl_output is disabled,
+/// otherwise the input stream of the result reports the status of the query on replicas.
+/// Throws NO_ZOOKEEPER if the database is readonly, INCORRECT_QUERY if it's not an initial query
+/// and NOT_IMPLEMENTED if an ALTER query contains a command of unsupported type.
+BlockIO DatabaseReplicated::tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context)
+{
+    if (is_readonly)
+        throw Exception(ErrorCodes::NO_ZOOKEEPER, "Database is in readonly mode, because it cannot connect to ZooKeeper");
+
+    const bool is_initial_query = query_context.getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY;
+    if (!is_initial_query)
+        throw Exception(ErrorCodes::INCORRECT_QUERY, "It's not initial query. ON CLUSTER is not allowed for Replicated database.");
+
+    /// Replicas will set correct name of current database in query context (database name can be different on replicas)
+    if (auto * query_with_table = query->as<ASTQueryWithTableAndOutput>())
+        query_with_table->database.clear();
+
+    /// Every command of an ALTER query must be of a supported type.
+    if (const auto * alter = query->as<ASTAlterQuery>())
+    {
+        for (const auto & child : alter->command_list->children)
+        {
+            const auto & alter_command = child->as<ASTAlterCommand &>();
+            if (!isSupportedAlterType(alter_command.type))
+                throw Exception("Unsupported type of ALTER query", ErrorCodes::NOT_IMPLEMENTED);
+        }
+    }
+
+    LOG_DEBUG(log, "Proposing query: {}", queryToString(query));
+
+    /// TODO maybe write current settings to log entry?
+    DDLLogEntry entry;
+    entry.query = queryToString(query);
+    entry.initiator = ddl_worker->getCommonHostID();
+    String entry_path = ddl_worker->tryEnqueueAndExecuteEntry(entry, query_context);
+
+    BlockIO io;
+    const auto & settings = query_context.getSettingsRef();
+
+    /// Zero timeout: do not wait for other replicas at all.
+    if (settings.distributed_ddl_task_timeout == 0)
+        return io;
+
+    Strings hosts_to_wait = getZooKeeper()->getChildren(zookeeper_path + "/replicas");
+    auto status_stream = std::make_shared<DDLQueryStatusInputStream>(entry_path, entry, query_context, hosts_to_wait);
+    if (settings.database_replicated_ddl_output)
+        io.in = std::move(status_stream);
+    return io;
+}
+
+/// Returns UUID from the CREATE query stored in `metadata` if the engine of the table is Replicated*MergeTree,
+/// otherwise returns UUIDHelpers::Nil. The metadata is parsed only if it mentions "ReplicatedMergeTree".
+static UUID getTableUUIDIfReplicated(const String & metadata, const Context & context)
+{
+    /// Cheap check to avoid parsing metadata of all other tables.
+    if (metadata.find("ReplicatedMergeTree") == std::string::npos)
+        return UUIDHelpers::Nil;
+
+    const auto & settings = context.getSettingsRef();
+    ParserCreateQuery parser;
+    ASTPtr ast = parseQuery(parser, metadata, settings.max_query_size, settings.max_parser_depth);
+    const auto & create = ast->as<const ASTCreateQuery &>();
+
+    const bool has_engine = create.storage && create.storage->engine;
+    if (!has_engine)
+        return UUIDHelpers::Nil;
+
+    const String & engine_name = create.storage->engine->name;
+    const bool is_replicated_merge_tree = startsWith(engine_name, "Replicated") && endsWith(engine_name, "MergeTree");
+    if (!is_replicated_merge_tree)
+        return UUIDHelpers::Nil;
+
+    assert(create.uuid != UUIDHelpers::Nil);
+    return create.uuid;
+}
+
+void DatabaseReplicated::recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr)
+{
+    /// Let's compare local (possibly outdated) metadata with (most actual) metadata stored in ZooKeeper
+    /// and try to update the set of local tables.
+    /// We could drop all local tables and create the new ones just like it's new replica.
+    /// But it will cause all ReplicatedMergeTree tables to fetch all data parts again and data in other tables will be lost.
+
+    bool new_replica = our_log_ptr == 0;
+    if (new_replica)
+        LOG_INFO(log, "Will create new replica from log pointer {}", max_log_ptr);
+    else
+        LOG_WARNING(log, "Will recover replica with staled log pointer {} from log pointer {}", our_log_ptr, max_log_ptr);
+
+    if (new_replica && !empty())
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "It's new replica, but database is not empty");
+
+    auto table_name_to_metadata = tryGetConsistentMetadataSnapshot(current_zookeeper, max_log_ptr);
+
+    /// For ReplicatedMergeTree tables we can compare only UUIDs to ensure that it's the same table.
+    /// Metadata can be different, it's handled on table replication level.
+    /// We need to handle renamed tables only.
+    /// TODO maybe we should also update MergeTree SETTINGS if required?
+    std::unordered_map<UUID, String> zk_replicated_id_to_name;
+    for (const auto & zk_table : table_name_to_metadata)
+    {
+        UUID zk_replicated_id = getTableUUIDIfReplicated(zk_table.second, global_context);
+        if (zk_replicated_id != UUIDHelpers::Nil)
+            zk_replicated_id_to_name.emplace(zk_replicated_id, zk_table.first);
+    }
+
+    /// We will drop or move tables which exist only in local metadata
+    Strings tables_to_detach;
+    std::vector<std::pair<String, String>> replicated_tables_to_rename;
+    size_t total_tables = 0;
+    std::vector<UUID> replicated_ids;
+    for (auto existing_tables_it = getTablesIterator(global_context, {}); existing_tables_it->isValid(); existing_tables_it->next(), ++total_tables)
+    {
+        String name = existing_tables_it->name();
+        UUID local_replicated_id = UUIDHelpers::Nil;
+        if (existing_tables_it->table()->supportsReplication())
+        {
+            /// Check if replicated tables have the same UUID
+            local_replicated_id = existing_tables_it->table()->getStorageID().uuid;
+            auto it = zk_replicated_id_to_name.find(local_replicated_id);
+            if (it != zk_replicated_id_to_name.end())
+            {
+                if (name != it->second)
+                {
+                    /// Need just update table name
+                    replicated_tables_to_rename.emplace_back(name, it->second);
+                }
+                continue;
+            }
+        }
+
+        auto in_zk = table_name_to_metadata.find(name);
+        if (in_zk == table_name_to_metadata.end() || in_zk->second != readMetadataFile(name))
+        {
+            /// Local table does not exits in ZooKeeper or has different metadata
+            tables_to_detach.emplace_back(std::move(name));
+        }
+    }
+
+    String db_name = getDatabaseName();
+    String to_db_name = getDatabaseName() + BROKEN_TABLES_SUFFIX;
+    if (total_tables * db_settings.max_broken_tables_ratio < tables_to_detach.size())
+        throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Too many tables to recreate: {} of {}", tables_to_detach.size(), total_tables);
+    else if (!tables_to_detach.empty())
+    {
+        LOG_WARNING(log, "Will recreate {} broken tables to recover replica", tables_to_detach.size());
+        /// It's too dangerous to automatically drop tables, so we will move them to special database.
+        /// We use Ordinary engine for destination database, because it's the only way to discard table UUID
+        /// and make possible creation of new table with the same UUID.
+        String query = fmt::format("CREATE DATABASE IF NOT EXISTS {} ENGINE=Ordinary", backQuoteIfNeed(to_db_name));
+        Context query_context = global_context;
+        executeQuery(query, query_context, true);
+    }
+
+    size_t dropped_dicts = 0;
+    size_t moved_tables = 0;
+    std::vector<UUID> dropped_tables;
+    for (const auto & table_name : tables_to_detach)
+    {
+        DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, table_name);
+        if (getDatabaseName() != db_name)
+            throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database was renamed, will retry");
+
+        auto table = tryGetTable(table_name, global_context);
+        if (isDictionaryExist(table_name))
+        {
+            /// We can safely drop any dictionaries because they do not store data
+            LOG_DEBUG(log, "Will DROP DICTIONARY {}", backQuoteIfNeed(table_name));
+            DatabaseAtomic::removeDictionary(global_context, table_name);
+            ++dropped_dicts;
+        }
+        else if (!table->storesDataOnDisk())
+        {
+            LOG_DEBUG(log, "Will DROP TABLE {}, because it does not store data on disk and can be safely dropped", backQuoteIfNeed(table_name));
+            dropped_tables.push_back(tryGetTableUUID(table_name));
+            table->shutdown();
+            DatabaseAtomic::dropTable(global_context, table_name, true);
+        }
+        else
+        {
+            /// Table probably stores some data. Let's move it to another database.
+            String to_name = fmt::format("{}_{}_{}", table_name, max_log_ptr, thread_local_rng() % 1000);
+            LOG_DEBUG(log, "Will RENAME TABLE {} TO {}.{}", backQuoteIfNeed(table_name), backQuoteIfNeed(to_db_name), backQuoteIfNeed(to_name));
+            assert(db_name < to_db_name);
+            DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(to_db_name, to_name);
+            auto to_db_ptr = DatabaseCatalog::instance().getDatabase(to_db_name);
+            DatabaseAtomic::renameTable(global_context, table_name, *to_db_ptr, to_name, false, false);
+            ++moved_tables;
+        }
+    }
+
+    if (!tables_to_detach.empty())
+        LOG_WARNING(log, "Cleaned {} outdated objects: dropped {} dictionaries and {} tables, moved {} tables",
+                    tables_to_detach.size(), dropped_dicts, dropped_tables.size(), moved_tables);
+
+    /// Now database is cleared from outdated tables, let's rename ReplicatedMergeTree tables to actual names
+    for (const auto & old_to_new : replicated_tables_to_rename)
+    {
+        const String & from = old_to_new.first;
+        const String & to = old_to_new.second;
+
+        LOG_DEBUG(log, "Will RENAME TABLE {} TO {}", backQuoteIfNeed(from), backQuoteIfNeed(to));
+        /// TODO Maybe we should do it in two steps: rename all tables to temporary names and then rename them to actual names?
+        DDLGuardPtr table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::min(from, to));
+        DDLGuardPtr to_table_guard = DatabaseCatalog::instance().getDDLGuard(db_name, std::max(from, to));
+        DatabaseAtomic::renameTable(global_context, from, *this, to, false, false);
+    }
+
+    for (const auto & id : dropped_tables)
+        DatabaseCatalog::instance().waitTableFinallyDropped(id);
+
+    for (const auto & name_and_meta : table_name_to_metadata)
+    {
+        if (isTableExist(name_and_meta.first, global_context))
+        {
+            assert(name_and_meta.second == readMetadataFile(name_and_meta.first));
+            continue;
+        }
+
+        auto query_ast = parseQueryFromMetadataInZooKeeper(name_and_meta.first, name_and_meta.second);
+
+        Context query_context = global_context;
+        query_context.makeQueryContext();
+        query_context.getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;
+        query_context.setCurrentDatabase(database_name);
+        query_context.setCurrentQueryId(""); // generate random query_id
+
+        LOG_INFO(log, "Executing {}", serializeAST(*query_ast));
+        InterpreterCreateQuery(query_ast, query_context).execute();
+    }
+
+    current_zookeeper->set(replica_path + "/log_ptr", toString(max_log_ptr));
+}
+
+/// Reads all nodes under <zookeeper_path>/metadata and returns them as a map from unescaped node name to node data.
+/// A read is accepted only if every node was fetched successfully and <zookeeper_path>/max_log_ptr
+/// still equals `max_log_ptr` afterwards. `max_log_ptr` is an in/out argument: it is advanced
+/// whenever a greater value is observed. Throws DATABASE_REPLICATION_FAILED after 10 failed attempts.
+std::map<String, String> DatabaseReplicated::tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr)
+{
+    constexpr int max_attempts = 10;
+    const String metadata_root = zookeeper_path + "/metadata";
+
+    std::map<String, String> snapshot;
+    bool is_consistent = false;
+
+    for (int attempt = 1; attempt <= max_attempts; ++attempt)
+    {
+        snapshot.clear();
+        LOG_DEBUG(log, "Trying to get consistent metadata snapshot for log pointer {}", max_log_ptr);
+        const Strings node_names = zookeeper->getChildren(metadata_root);
+
+        /// Request all nodes first, collect the results afterwards
+        std::vector<zkutil::ZooKeeper::FutureGet> pending;
+        pending.reserve(node_names.size());
+        for (const auto & node_name : node_names)
+            pending.emplace_back(zookeeper->asyncTryGet(metadata_root + "/" + node_name));
+
+        /// Stop at the first failed read: an incomplete snapshot is detected below by comparing sizes
+        size_t idx = 0;
+        for (auto & future : pending)
+        {
+            auto response = future.get();
+            if (response.error != Coordination::Error::ZOK)
+                break;
+            snapshot.emplace(unescapeForFileName(node_names[idx]), response.data);
+            ++idx;
+        }
+
+        const UInt32 current_max_log_ptr = parse<UInt32>(zookeeper->get(zookeeper_path + "/max_log_ptr"));
+        const bool all_nodes_read = node_names.size() == snapshot.size();
+        if (current_max_log_ptr == max_log_ptr && all_nodes_read)
+        {
+            is_consistent = true;
+            break;
+        }
+
+        if (max_log_ptr < current_max_log_ptr)
+        {
+            LOG_DEBUG(log, "Log pointer moved from {} to {}, will retry", max_log_ptr, current_max_log_ptr);
+            max_log_ptr = current_max_log_ptr;
+        }
+        else
+        {
+            assert(max_log_ptr == current_max_log_ptr);
+            assert(!all_nodes_read);
+            LOG_DEBUG(log, "Cannot get metadata of some tables due to ZooKeeper error, will retry");
+        }
+    }
+
+    if (!is_consistent)
+        throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "Cannot get consistent metadata snapshot");
+
+    LOG_DEBUG(log, "Got consistent metadata snapshot for log pointer {}", max_log_ptr);
+
+    return snapshot;
+}
+
+/// Parses a CREATE query stored in the ZooKeeper metadata node `node_name` and fills in
+/// the database name and the table name (unescaped `node_name`). The stored query is expected
+/// to have a non-Nil UUID, the placeholder table name and no database; otherwise LOGICAL_ERROR is thrown.
+ASTPtr DatabaseReplicated::parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query)
+{
+    const String description = "in ZooKeeper " + zookeeper_path + "/metadata/" + node_name;
+    ParserCreateQuery create_parser;
+    auto ast = parseQuery(create_parser, query, description, 0, global_context.getSettingsRef().max_parser_depth);
+
+    auto & create = ast->as<ASTCreateQuery &>();
+
+    const bool has_uuid = create.uuid != UUIDHelpers::Nil;
+    const bool has_placeholder_name = create.table == TABLE_WITH_UUID_NAME_PLACEHOLDER;
+    const bool has_no_database = create.database.empty();
+    if (!(has_uuid && has_placeholder_name && has_no_database))
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Got unexpected query from {}: {}", node_name, query);
+
+    /// Turn the stored definition into a regular CREATE query for this database
+    create.attach = false;
+    create.database = getDatabaseName();
+    create.table = unescapeForFileName(node_name);
+
+    return ast;
+}
+
+/// Marks the replica as dropped in ZooKeeper, drops the local database and removes the replica's nodes.
+/// If the <zookeeper_path>/replicas node can then be removed, the whole <zookeeper_path> subtree is removed too.
+void DatabaseReplicated::drop(const Context & context_)
+{
+    auto zookeeper = getZooKeeper();
+
+    zookeeper->set(replica_path, DROPPED_MARK);
+    DatabaseAtomic::drop(context_);
+    zookeeper->tryRemoveRecursive(replica_path);
+
+    /// TODO it may leave garbage in ZooKeeper if the last node lost connection here
+    const bool was_last_replica = zookeeper->tryRemove(zookeeper_path + "/replicas") == Coordination::Error::ZOK;
+    if (!was_last_replica)
+        return;
+
+    /// It was the last replica, remove all metadata
+    zookeeper->tryRemoveRecursive(zookeeper_path);
+}
+
+/// Shuts down the DDL worker if there is one; does nothing otherwise.
+void DatabaseReplicated::stopReplication()
+{
+    if (!ddl_worker)
+        return;
+
+    ddl_worker->shutdown();
+}
+
+/// Stops replication, destroys the DDL worker and then shuts down the underlying Atomic database.
+void DatabaseReplicated::shutdown()
+{
+    stopReplication();
+    ddl_worker.reset();
+    DatabaseAtomic::shutdown();
+}
+
+
+/// Drops the table locally. For an initial query it first adds removal of the table's
+/// metadata node in ZooKeeper to the current metadata transaction.
+void DatabaseReplicated::dropTable(const Context & context, const String & table_name, bool no_delay)
+{
+    auto txn = context.getZooKeeperMetadataTransaction();
+    assert(!ddl_worker->isCurrentlyActive() || txn);
+
+    const bool is_initial_query = txn && txn->isInitialQuery();
+    if (is_initial_query)
+    {
+        const String node_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name);
+        txn->addOp(zkutil::makeRemoveRequest(node_path, -1));
+    }
+
+    DatabaseAtomic::dropTable(context, table_name, no_delay);
+}
+
+/// Renames (or exchanges) tables locally. For an initial query it validates the request and adds
+/// the corresponding metadata node changes in ZooKeeper to the current metadata transaction.
+/// Requires a metadata transaction in `context`.
+void DatabaseReplicated::renameTable(const Context & context, const String & table_name, IDatabase & to_database,
+                                     const String & to_table_name, bool exchange, bool dictionary)
+{
+    auto txn = context.getZooKeeperMetadataTransaction();
+    assert(txn);
+
+    if (txn->isInitialQuery())
+    {
+        const bool same_database = this == &to_database;
+        if (!same_database)
+            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Moving tables between databases is not supported for Replicated engine");
+        if (table_name == to_table_name)
+            throw Exception(ErrorCodes::INCORRECT_QUERY, "Cannot rename table to itself");
+        if (!isTableExist(table_name, context))
+            throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", table_name);
+        if (exchange && !to_database.isTableExist(to_table_name, context))
+            throw Exception(ErrorCodes::UNKNOWN_TABLE, "Table {} does not exist", to_table_name);
+
+        const String from_statement = readMetadataFile(table_name);
+        const String metadata_prefix = zookeeper_path + "/metadata/";
+        const String from_zk_path = metadata_prefix + escapeForFileName(table_name);
+        const String to_zk_path = metadata_prefix + escapeForFileName(to_table_name);
+
+        /// The source node is removed first; for EXCHANGE the destination node is removed as well
+        /// and the source node is re-created with the destination's definition.
+        txn->addOp(zkutil::makeRemoveRequest(from_zk_path, -1));
+        if (exchange)
+        {
+            const String to_statement = readMetadataFile(to_table_name);
+            txn->addOp(zkutil::makeRemoveRequest(to_zk_path, -1));
+            txn->addOp(zkutil::makeCreateRequest(from_zk_path, to_statement, zkutil::CreateMode::Persistent));
+        }
+        txn->addOp(zkutil::makeCreateRequest(to_zk_path, from_statement, zkutil::CreateMode::Persistent));
+    }
+
+    DatabaseAtomic::renameTable(context, table_name, to_database, to_table_name, exchange, dictionary);
+}
+
+/// Commits table creation locally. For an initial query it first adds creation of the table's
+/// metadata node in ZooKeeper to the current metadata transaction.
+void DatabaseReplicated::commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
+                       const String & table_metadata_tmp_path, const String & table_metadata_path,
+                       const Context & query_context)
+{
+    auto txn = query_context.getZooKeeperMetadataTransaction();
+    assert(!ddl_worker->isCurrentlyActive() || txn);
+
+    const bool is_initial_query = txn && txn->isInitialQuery();
+    if (is_initial_query)
+    {
+        const String node_path = zookeeper_path + "/metadata/" + escapeForFileName(query.table);
+        const String definition = getObjectDefinitionFromCreateQuery(query.clone());
+        /// zk::multi(...) will throw if `node_path` exists
+        txn->addOp(zkutil::makeCreateRequest(node_path, definition, zkutil::CreateMode::Persistent));
+    }
+
+    DatabaseAtomic::commitCreateTable(query, table, table_metadata_tmp_path, table_metadata_path, query_context);
+}
+
+/// Commits table alteration locally. For an initial query it first adds an update of the table's
+/// metadata node in ZooKeeper (new value is `statement`) to the current metadata transaction.
+void DatabaseReplicated::commitAlterTable(const StorageID & table_id,
+                                          const String & table_metadata_tmp_path, const String & table_metadata_path,
+                                          const String & statement, const Context & query_context)
+{
+    auto txn = query_context.getZooKeeperMetadataTransaction();
+
+    const bool is_initial_query = txn && txn->isInitialQuery();
+    if (is_initial_query)
+    {
+        const String node_path = zookeeper_path + "/metadata/" + escapeForFileName(table_id.table_name);
+        txn->addOp(zkutil::makeSetRequest(node_path, statement, -1));
+    }
+
+    DatabaseAtomic::commitAlterTable(table_id, table_metadata_tmp_path, table_metadata_path, statement, query_context);
+}
+
+/// Creates the dictionary locally. For an initial query it first adds creation of the dictionary's
+/// metadata node in ZooKeeper to the current metadata transaction.
+void DatabaseReplicated::createDictionary(const Context & context,
+                                          const String & dictionary_name,
+                                          const ASTPtr & query)
+{
+    auto txn = context.getZooKeeperMetadataTransaction();
+    assert(!ddl_worker->isCurrentlyActive() || txn);
+
+    const bool is_initial_query = txn && txn->isInitialQuery();
+    if (is_initial_query)
+    {
+        const String node_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name);
+        const String definition = getObjectDefinitionFromCreateQuery(query->clone());
+        txn->addOp(zkutil::makeCreateRequest(node_path, definition, zkutil::CreateMode::Persistent));
+    }
+
+    DatabaseAtomic::createDictionary(context, dictionary_name, query);
+}
+
+/// Removes the dictionary locally. For an initial query it first adds removal of the dictionary's
+/// metadata node in ZooKeeper to the current metadata transaction.
+void DatabaseReplicated::removeDictionary(const Context & context, const String & dictionary_name)
+{
+    auto txn = context.getZooKeeperMetadataTransaction();
+    assert(!ddl_worker->isCurrentlyActive() || txn);
+
+    const bool is_initial_query = txn && txn->isInitialQuery();
+    if (is_initial_query)
+    {
+        const String node_path = zookeeper_path + "/metadata/" + escapeForFileName(dictionary_name);
+        txn->addOp(zkutil::makeRemoveRequest(node_path, -1));
+    }
+
+    DatabaseAtomic::removeDictionary(context, dictionary_name);
+}
+
+/// Permanently detaches the table locally. For an initial query it first adds removal of the table's
+/// metadata node in ZooKeeper to the current metadata transaction.
+void DatabaseReplicated::detachTablePermanently(const Context & context, const String & table_name)
+{
+    auto txn = context.getZooKeeperMetadataTransaction();
+    assert(!ddl_worker->isCurrentlyActive() || txn);
+
+    const bool is_initial_query = txn && txn->isInitialQuery();
+    if (is_initial_query)
+    {
+        const String node_path = zookeeper_path + "/metadata/" + escapeForFileName(table_name);
+        txn->addOp(zkutil::makeRemoveRequest(node_path, -1));
+    }
+
+    DatabaseAtomic::detachTablePermanently(context, table_name);
+}
+
+/// Returns the whole content of the local metadata file of `table_name`.
+String DatabaseReplicated::readMetadataFile(const String & table_name) const
+{
+    ReadBufferFromFile metadata_file(getObjectMetadataPath(table_name), 4096);
+
+    String content;
+    readStringUntilEOF(content, metadata_file);
+    return content;
+}
+
+}
--- a/src/Databases/DatabaseReplicated.h
+++ b/src/Databases/DatabaseReplicated.h
@ -0,0 +1,91 @@
+#pragma once
+
+#include <Databases/DatabaseAtomic.h>
+#include <Databases/DatabaseReplicatedSettings.h>
+#include <Common/ZooKeeper/ZooKeeper.h>
+#include <Core/BackgroundSchedulePool.h>
+#include <DataStreams/BlockIO.h>
+#include <DataStreams/OneBlockInputStream.h>
+#include <Interpreters/Context.h>
+
+
+namespace DB
+{
+
+class DatabaseReplicatedDDLWorker;
+using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
+
+class Cluster;
+using ClusterPtr = std::shared_ptr<Cluster>;
+
+/// Database engine built on top of DatabaseAtomic that keeps table and dictionary metadata
+/// in ZooKeeper (under `zookeeper_path`) in addition to the local metadata files.
+class DatabaseReplicated : public DatabaseAtomic
+{
+public:
+    DatabaseReplicated(const String & name_, const String & metadata_path_, UUID uuid,
+                       const String & zookeeper_path_, const String & shard_name_, const String & replica_name_,
+                       DatabaseReplicatedSettings db_settings_,
+                       const Context & context);
+
+    ~DatabaseReplicated() override;
+
+    String getEngineName() const override { return "Replicated"; }
+
+    /// If current query is initial, then the following methods add metadata updating ZooKeeper operations to current ZooKeeperMetadataTransaction.
+    void dropTable(const Context &, const String & table_name, bool no_delay) override;
+    void renameTable(const Context & context, const String & table_name, IDatabase & to_database,
+                     const String & to_table_name, bool exchange, bool dictionary) override;
+    void commitCreateTable(const ASTCreateQuery & query, const StoragePtr & table,
+                           const String & table_metadata_tmp_path, const String & table_metadata_path,
+                           const Context & query_context) override;
+    void commitAlterTable(const StorageID & table_id,
+                          const String & table_metadata_tmp_path, const String & table_metadata_path,
+                          const String & statement, const Context & query_context) override;
+    void createDictionary(const Context & context,
+                          const String & dictionary_name,
+                          const ASTPtr & query) override;
+    void removeDictionary(const Context & context, const String & dictionary_name) override;
+    void detachTablePermanently(const Context & context, const String & table_name) override;
+
+    /// Try to execute DDL query on current host as initial query. If the query succeeds,
+    /// then it will be executed on all replicas.
+    BlockIO tryEnqueueReplicatedDDL(const ASTPtr & query, const Context & query_context);
+
+    /// Shuts down the DDL worker (if any).
+    void stopReplication();
+
+    String getFullReplicaName() const;
+    static std::pair<String, String> parseFullReplicaName(const String & name);
+
+    /// Returns cluster consisting of database replicas
+    ClusterPtr getCluster() const;
+
+    /// Drops the local database and removes this replica's nodes from ZooKeeper.
+    void drop(const Context & /*context*/) override;
+
+    void loadStoredObjects(Context & context, bool has_force_restore_data_flag, bool force_attach) override;
+    void shutdown() override;
+
+    friend struct DatabaseReplicatedTask;
+    friend class DatabaseReplicatedDDLWorker;
+private:
+    void tryConnectToZooKeeperAndInitDatabase(bool force_attach);
+    bool createDatabaseNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper);
+    void createReplicaNodesInZooKeeper(const ZooKeeperPtr & current_zookeeper);
+
+    /// Brings local tables in line with the metadata stored in ZooKeeper and updates the replica's log pointer.
+    void recoverLostReplica(const ZooKeeperPtr & current_zookeeper, UInt32 our_log_ptr, UInt32 max_log_ptr);
+    /// Reads all metadata nodes from ZooKeeper; `max_log_ptr` is updated if a newer value is observed.
+    std::map<String, String> tryGetConsistentMetadataSnapshot(const ZooKeeperPtr & zookeeper, UInt32 & max_log_ptr);
+
+    /// Parses a CREATE query stored in a ZooKeeper metadata node and fills in database and table names.
+    ASTPtr parseQueryFromMetadataInZooKeeper(const String & node_name, const String & query);
+    /// Returns the content of the local metadata file of the table.
+    String readMetadataFile(const String & table_name) const;
+
+    String zookeeper_path;
+    String shard_name;
+    String replica_name;
+    String replica_path;
+    DatabaseReplicatedSettings db_settings;
+
+    zkutil::ZooKeeperPtr getZooKeeper() const;
+
+    /// While true, the DDL worker retries tryConnectToZooKeeperAndInitDatabase() during its initialization.
+    std::atomic_bool is_readonly = true;
+    std::unique_ptr<DatabaseReplicatedDDLWorker> ddl_worker;
+};
+
+}
--- a/src/Databases/DatabaseReplicatedSettings.cpp
+++ b/src/Databases/DatabaseReplicatedSettings.cpp
@ -0,0 +1,23 @@
+#include <Databases/DatabaseReplicatedSettings.h>
+#include <Parsers/ASTFunction.h>
+#include <Parsers/ASTCreateQuery.h>
+
+namespace DB
+{
+
+IMPLEMENT_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS)
+
+/// Applies the changes from the SETTINGS clause of `storage_def`.
+/// If there is no SETTINGS clause, an empty non-standalone one is attached to `storage_def` instead.
+void DatabaseReplicatedSettings::loadFromQuery(ASTStorage & storage_def)
+{
+    if (!storage_def.settings)
+    {
+        auto empty_settings = std::make_shared<ASTSetQuery>();
+        empty_settings->is_standalone = false;
+        storage_def.set(storage_def.settings, empty_settings);
+        return;
+    }
+
+    applyChanges(storage_def.settings->changes);
+}
+
+}
--- a/src/Databases/DatabaseReplicatedSettings.h
+++ b/src/Databases/DatabaseReplicatedSettings.h
@ -0,0 +1,26 @@
+#pragma once
+#include <Core/Defines.h>
+#include <Core/BaseSettings.h>
+
+namespace DB
+{
+
+class ASTStorage;
+
+/// Settings of the Replicated database engine:
+///  - max_broken_tables_ratio: replica recovery fails if more than this share of local tables would have to be recreated;
+///  - max_replication_lag_to_enqueue: a replica lagging more than this many log entries refuses to enqueue queries;
+///  - wait_entry_commited_timeout_sec: how long replicas wait for the initiator to commit an entry before trying to cancel it.
+#define LIST_OF_DATABASE_REPLICATED_SETTINGS(M) \
+    M(Float, max_broken_tables_ratio, 0.5, "Do not recover replica automatically if the ratio of staled tables to all tables is greater", 0) \
+    M(UInt64, max_replication_lag_to_enqueue, 10, "Replica will throw exception on attempt to execute query if its replication lag greater", 0) \
+    M(UInt64, wait_entry_commited_timeout_sec, 3600, "Replicas will try to cancel query if timeout exceed, but initiator host has not executed it yet", 0) \
+
+DECLARE_SETTINGS_TRAITS(DatabaseReplicatedSettingsTraits, LIST_OF_DATABASE_REPLICATED_SETTINGS)
+
+
+/** Settings for the Replicated database engine.
+  * Could be loaded from a CREATE DATABASE query (SETTINGS clause).
+  */
+struct DatabaseReplicatedSettings : public BaseSettings<DatabaseReplicatedSettingsTraits>
+{
+    /// Applies changes from the SETTINGS clause of `storage_def`;
+    /// if the clause is absent, an empty one is added to `storage_def`.
+    void loadFromQuery(ASTStorage & storage_def);
+};
+
+}
--- a/src/Databases/DatabaseReplicatedWorker.cpp
+++ b/src/Databases/DatabaseReplicatedWorker.cpp
@ -0,0 +1,260 @@
+#include <Databases/DatabaseReplicatedWorker.h>
+#include <Databases/DatabaseReplicated.h>
+#include <Interpreters/DDLTask.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int LOGICAL_ERROR;
+    extern const int DATABASE_REPLICATION_FAILED;
+    extern const int NOT_A_LEADER;
+    extern const int UNFINISHED;
+}
+
+/// Creates a DDL worker bound to the database `db`. The base DDLWorker is given
+/// `<zookeeper_path>/log` (presumably its queue directory - enqueueQuery() creates entries under `queue_dir`)
+/// and a logger name that includes the database name.
+DatabaseReplicatedDDLWorker::DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_)
+    : DDLWorker(/* pool_size */ 1, db->zookeeper_path + "/log", context_, nullptr, {}, fmt::format("DDLWorker({})", db->getDatabaseName()))
+    , database(db)
+{
+    /// Pool size must be 1 to avoid reordering of log entries.
+    /// TODO Make a dependency graph of DDL queries. It will allow to execute independent entries in parallel.
+    /// We also need similar graph to load tables on server startup in order of topsort.
+}
+
+/// Initializes the worker: connects to ZooKeeper, initializes the database if it is still readonly
+/// and checks whether the replica has to be recovered. On any exception the error is logged and
+/// the whole sequence is retried after 5 seconds, until it succeeds or `stop_flag` is set.
+void DatabaseReplicatedDDLWorker::initializeMainThread()
+{
+    while (!stop_flag)
+    {
+        try
+        {
+            /// The returned pointer is not used directly here; initializeReplication() reads `current_zookeeper`
+            /// (presumably set by getAndSetZooKeeper() - confirm in DDLWorker).
+            auto zookeeper = getAndSetZooKeeper();
+            if (database->is_readonly)
+                database->tryConnectToZooKeeperAndInitDatabase(false);
+            initializeReplication();
+            initialized = true;
+            return;
+        }
+        catch (...)
+        {
+            tryLogCurrentException(log, fmt::format("Error on initialization of {}", database->getDatabaseName()));
+            sleepForSeconds(5);
+        }
+    }
+}
+
+/// Shuts down the base worker and wakes up all threads waiting on `wait_current_task_change`.
+void DatabaseReplicatedDDLWorker::shutdown()
+{
+    DDLWorker::shutdown();
+    /// Waiters in tryEnqueueAndExecuteEntry() re-check their predicate, which includes `stop_flag`
+    /// (presumably set by DDLWorker::shutdown() - confirm in DDLWorker).
+    wait_current_task_change.notify_all();
+}
+
+/// Reads the replica's log pointer, max_log_ptr and logs_to_keep from ZooKeeper and either
+/// starts replica recovery or remembers the log pointer as the last skipped entry.
+void DatabaseReplicatedDDLWorker::initializeReplication()
+{
+    /// Check if we need to recover replica.
+    /// Invariant: replica is lost if its log_ptr value is less than max_log_ptr - logs_to_keep.
+
+    const String our_log_ptr_str = current_zookeeper->get(database->replica_path + "/log_ptr");
+    const UInt32 our_log_ptr = parse<UInt32>(our_log_ptr_str);
+    const UInt32 max_log_ptr = parse<UInt32>(current_zookeeper->get(database->zookeeper_path + "/max_log_ptr"));
+    logs_to_keep = parse<UInt32>(current_zookeeper->get(database->zookeeper_path + "/logs_to_keep"));
+
+    const bool log_ptr_is_zero = our_log_ptr == 0;
+    const bool too_far_behind = our_log_ptr + logs_to_keep < max_log_ptr;
+    if (log_ptr_is_zero || too_far_behind)
+    {
+        database->recoverLostReplica(current_zookeeper, our_log_ptr, max_log_ptr);
+        return;
+    }
+
+    last_skipped_entry_name.emplace(our_log_ptr_str);
+}
+
+/// Creates a new log entry node `<queue_dir>/query-<seq>` with its `try`, `active` and `finished`
+/// children in a single ZooKeeper transaction and returns the path of the entry node.
+String DatabaseReplicatedDDLWorker::enqueueQuery(DDLLogEntry & entry)
+{
+    auto zookeeper = getAndSetZooKeeper();
+
+    /// We cannot create sequential node and it's ephemeral child in a single transaction, so allocate sequential number another way
+    const String counter_prefix = database->zookeeper_path + "/counter/cnt-";
+    const String counter_path = zookeeper->create(counter_prefix, "", zkutil::CreateMode::EphemeralSequential);
+    const String sequence_suffix = counter_path.substr(counter_prefix.size());
+    const String node_path = queue_dir + "/query-" + sequence_suffix;
+
+    Coordination::Requests ops;
+
+    /// Query is not committed yet, but we have to write it into log to avoid reordering
+    ops.emplace_back(zkutil::makeCreateRequest(node_path, entry.toString(), zkutil::CreateMode::Persistent));
+
+    /// '/try' will be replaced with '/committed' or will be removed due to expired session or other error
+    ops.emplace_back(zkutil::makeCreateRequest(node_path + "/try", database->getFullReplicaName(), zkutil::CreateMode::Ephemeral));
+
+    /// We don't need it anymore
+    ops.emplace_back(zkutil::makeRemoveRequest(counter_path, -1));
+
+    /// Create status dirs
+    for (const char * status_dir : {"/active", "/finished"})
+        ops.emplace_back(zkutil::makeCreateRequest(node_path + status_dir, "", zkutil::CreateMode::Persistent));
+
+    zookeeper->multi(ops);
+
+    return node_path;
+}
+
+/// Enqueues `entry` into the replicated DDL log and executes it on this host as the initial query.
+/// Returns the ZooKeeper path of the created log entry.
+/// Throws:
+///  - NOT_A_LEADER if this replica lags behind max_log_ptr by more than max_replication_lag_to_enqueue entries;
+///  - UNFINISHED if the worker thread has not reached the new entry within
+///    database_replicated_initial_query_timeout_sec seconds;
+///  - DATABASE_REPLICATION_FAILED if the ZooKeeper session expired or replication was stopped while waiting;
+///  - LOGICAL_ERROR if the task was processed but is not marked as executed.
+String DatabaseReplicatedDDLWorker::tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context)
+{
+    /// NOTE Possibly it would be better to execute initial query on the most up-to-date node,
+    /// but it requires more complex logic around /try node.
+
+    auto zookeeper = getAndSetZooKeeper();
+    UInt32 our_log_ptr = parse<UInt32>(zookeeper->get(database->replica_path + "/log_ptr"));
+    UInt32 max_log_ptr = parse<UInt32>(zookeeper->get(database->zookeeper_path + "/max_log_ptr"));
+    assert(our_log_ptr <= max_log_ptr);
+    if (database->db_settings.max_replication_lag_to_enqueue < max_log_ptr - our_log_ptr)
+        throw Exception(ErrorCodes::NOT_A_LEADER, "Cannot enqueue query on this replica, "
+                        "because it has replication lag of {} queries. Try other replica.", max_log_ptr - our_log_ptr);
+
+    String entry_path = enqueueQuery(entry);
+    /// The '/try' node was created by enqueueQuery(); wrap it into a holder bound to this session.
+    auto try_node = zkutil::EphemeralNodeHolder::existing(entry_path + "/try", *zookeeper);
+    String entry_name = entry_path.substr(entry_path.rfind('/') + 1);
+    auto task = std::make_unique<DatabaseReplicatedTask>(entry_name, entry_path, database);
+    task->entry = entry;
+    task->parseQueryFromEntry(context);
+    assert(!task->entry.query.empty());
+    assert(!zookeeper->exists(task->getFinishedNodePath()));
+    task->is_initial_query = true;
+
+    LOG_DEBUG(log, "Waiting for worker thread to process all entries before {}", entry_name);
+    UInt64 timeout = query_context.getSettingsRef().database_replicated_initial_query_timeout_sec;
+    {
+        /// `current_task` is advanced by initAndCheckTask() under the same mutex.
+        std::unique_lock lock{mutex};
+        bool processed = wait_current_task_change.wait_for(lock, std::chrono::seconds(timeout), [&]()
+        {
+            assert(zookeeper->expired() || current_task <= entry_name);
+            return zookeeper->expired() || current_task == entry_name || stop_flag;
+        });
+
+        if (!processed)
+            throw Exception(ErrorCodes::UNFINISHED, "Timeout: Cannot enqueue query on this replica, "
+                            "most likely because replica is busy with previous queue entries");
+    }
+
+    if (zookeeper->expired() || stop_flag)
+        throw Exception(ErrorCodes::DATABASE_REPLICATION_FAILED, "ZooKeeper session expired or replication stopped, try again");
+
+    processTask(*task, zookeeper);
+
+    if (!task->was_executed)
+    {
+        /// The format string has three placeholders: entry name, status code and status message.
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "Entry {} was executed, but was not committed: code {}: {}",
+                        entry_name, task->execution_status.code, task->execution_status.message);
+    }
+
+    /// NOTE(review): presumably tells the holder not to remove '/try' on destruction because
+    /// the node is already gone after a successful commit - confirm in EphemeralNodeHolder.
+    try_node->setAlreadyRemoved();
+
+    return entry_path;
+}
+
+/// Decides whether the log entry `entry_name` has to be executed by this replica.
+/// Returns the prepared task, or an empty pointer with the explanation in `out_reason`
+/// when the entry must be skipped. Throws UNFINISHED if replication is stopped while waiting
+/// for the initiator, and LOGICAL_ERROR if the entry node cannot be read.
+DDLTaskPtr DatabaseReplicatedDDLWorker::initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper)
+{
+    {
+        /// Publish progress for threads waiting in tryEnqueueAndExecuteEntry(); `current_task` only moves forward.
+        std::lock_guard lock{mutex};
+        if (current_task < entry_name)
+        {
+            current_task = entry_name;
+            wait_current_task_change.notify_all();
+        }
+    }
+
+    /// NOTE(review): this read goes through `current_zookeeper`, while the rest of the function uses the `zookeeper` argument.
+    UInt32 our_log_ptr = parse<UInt32>(current_zookeeper->get(database->replica_path + "/log_ptr"));
+    UInt32 entry_num = DatabaseReplicatedTask::getLogEntryNumber(entry_name);
+
+    if (entry_num <= our_log_ptr)
+    {
+        out_reason = fmt::format("Task {} already executed according to log pointer {}", entry_name, our_log_ptr);
+        return {};
+    }
+
+    String entry_path = queue_dir + "/" + entry_name;
+    auto task = std::make_unique<DatabaseReplicatedTask>(entry_name, entry_path, database);
+
+    String initiator_name;
+    zkutil::EventPtr wait_committed_or_failed = std::make_shared<Poco::Event>();
+
+    /// If '/try' exists, the initiator has neither committed nor rolled back the entry yet.
+    /// The event is passed as a watch on the '/try' node.
+    String try_node_path = entry_path + "/try";
+    if (zookeeper->tryGet(try_node_path, initiator_name, nullptr, wait_committed_or_failed))
+    {
+        task->is_initial_query = initiator_name == task->host_id_str;
+
+        /// Query is not committed yet. We cannot just skip it and execute next one, because reordering may break replication.
+        LOG_TRACE(log, "Waiting for initiator {} to commit or rollback entry {}", initiator_name, entry_path);
+        /// One iteration waits up to one second, so the number of iterations equals the timeout in seconds.
+        constexpr size_t wait_time_ms = 1000;
+        size_t max_iterations = database->db_settings.wait_entry_commited_timeout_sec;
+        size_t iteration = 0;
+
+        while (!wait_committed_or_failed->tryWait(wait_time_ms))
+        {
+            if (stop_flag)
+            {
+                /// We cannot return task to process and we cannot return nullptr too,
+                /// because nullptr means "task should not be executed".
+                /// We can only exit by exception.
+                throw Exception(ErrorCodes::UNFINISHED, "Replication was stopped");
+            }
+
+            if (max_iterations <= ++iteration)
+            {
+                /// What can we do if initiator hangs for some reason? Seems like we can remove /try node.
+                /// Initiator will fail to commit ZooKeeperMetadataTransaction (including ops for replicated table) if /try does not exist.
+                /// But it's questionable.
+
+                /// We use tryRemove(...) because multiple hosts (including initiator) may try to do it concurrently.
+                auto code = zookeeper->tryRemove(try_node_path);
+                if (code != Coordination::Error::ZOK && code != Coordination::Error::ZNONODE)
+                    throw Coordination::Exception(code, try_node_path);
+
+                /// '/try' is gone now; the entry is cancelled unless the initiator managed to commit it.
+                if (!zookeeper->exists(entry_path + "/committed"))
+                {
+                    out_reason = fmt::format("Entry {} was forcefully cancelled due to timeout", entry_name);
+                    return {};
+                }
+            }
+        }
+    }
+
+    /// No '/try' node (or it has changed) and no '/committed' node: the entry must not be executed.
+    if (!zookeeper->exists(entry_path + "/committed"))
+    {
+        out_reason = fmt::format("Entry {} hasn't been committed", entry_name);
+        return {};
+    }
+
+    /// The entry was initiated by this host, so it was already executed in tryEnqueueAndExecuteEntry().
+    if (task->is_initial_query)
+    {
+        assert(!zookeeper->exists(entry_path + "/try"));
+        assert(zookeeper->exists(entry_path + "/committed") == (zookeeper->get(task->getFinishedNodePath()) == ExecutionStatus(0).serializeText()));
+        out_reason = fmt::format("Entry {} has been executed as initial query", entry_name);
+        return {};
+    }
+
+    String node_data;
+    if (!zookeeper->tryGet(entry_path, node_data))
+    {
+        LOG_ERROR(log, "Cannot get log entry {}", entry_path);
+        throw Exception(ErrorCodes::LOGICAL_ERROR, "should be unreachable");
+    }
+
+    task->entry.parse(node_data);
+
+    /// Entries with an empty query carry nothing to execute.
+    if (task->entry.query.empty())
+    {
+        out_reason = fmt::format("Entry {} is a dummy task", entry_name);
+        return {};
+    }
+
+    task->parseQueryFromEntry(context);
+
+    if (zookeeper->exists(task->getFinishedNodePath()))
+    {
+        out_reason = fmt::format("Task {} has been already processed", entry_name);
+        return {};
+    }
+
+    return task;
+}
+
+/// Returns true if the entry's number is more than `logs_to_keep` behind the current max_log_ptr.
+bool DatabaseReplicatedDDLWorker::canRemoveQueueEntry(const String & entry_name, const Coordination::Stat &)
+{
+    const String max_log_ptr_path = database->zookeeper_path + "/max_log_ptr";
+
+    const UInt32 entry_number = DDLTaskBase::getLogEntryNumber(entry_name);
+    const UInt32 max_log_ptr = parse<UInt32>(getAndSetZooKeeper()->get(max_log_ptr_path));
+
+    const bool is_outdated = entry_number + logs_to_keep < max_log_ptr;
+    return is_outdated;
+}
+
+}
--- a/src/Databases/DatabaseReplicatedWorker.h
+++ b/src/Databases/DatabaseReplicatedWorker.h
@ -0,0 +1,46 @@
+#pragma once
+#include <Interpreters/DDLWorker.h>
+
+namespace DB
+{
+
+class DatabaseReplicated;
+
+/// It's similar to DDLWorker, but has the following differences:
+/// 1. DDL queue in ZooKeeper is not shared between multiple clusters and databases,
+///    each DatabaseReplicated has its own queue in ZooKeeper and its own DatabaseReplicatedDDLWorker object.
+/// 2. Shards and replicas are identified by shard_name and replica_name arguments of database engine,
+///    not by address:port pairs. Cluster (of multiple database replicas) is identified by its zookeeper_path.
+/// 3. After creation of an entry in DDL queue the initiator tries to execute the entry locally
+///    and other hosts wait for the query to finish on the initiator host.
+///    If the query succeeds on the initiator, then all hosts must execute it, so they will retry until it succeeds.
+///    We assume that the cluster is homogeneous, so if replicas are in consistent state and the query succeeds on one host,
+///    then all hosts can execute it (maybe after several retries).
+/// 4. Each database replica stores its log pointer in ZooKeeper. Cleanup thread removes an old entry
+///    if its number < max_log_ptr - logs_to_keep.
+class DatabaseReplicatedDDLWorker : public DDLWorker
+{
+public:
+    /// `db` is kept as a raw, non-owning pointer in `database`.
+    DatabaseReplicatedDDLWorker(DatabaseReplicated * db, const Context & context_);
+
+    String enqueueQuery(DDLLogEntry & entry) override;
+
+    /// NOTE(review): judging by the name, enqueues the entry and attempts to run it on the initiator
+    /// (see item 3 above); the meaning of the returned String is not visible here — confirm in the .cpp.
+    String tryEnqueueAndExecuteEntry(DDLLogEntry & entry, const Context & query_context);
+
+    void shutdown() override;
+
+private:
+    void initializeMainThread() override;
+    void initializeReplication();
+
+    /// NOTE(review): `out_reason` presumably receives an explanation when no task is returned — confirm in the .cpp.
+    DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper) override;
+    /// Returns true when the entry is more than `logs_to_keep` positions behind max_log_ptr (item 4 above).
+    bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat) override;
+
+    /// Database this worker belongs to; the pointer itself never changes after construction.
+    DatabaseReplicated * const database;
+    /// NOTE(review): presumably `mutex` guards `current_task` and the condition variable signals its changes — confirm in the .cpp.
+    mutable std::mutex mutex;
+    std::condition_variable wait_current_task_change;
+    String current_task;
+    /// How many of the most recent log entries the cleanup keeps. Defaults to the maximum UInt32 value.
+    UInt32 logs_to_keep = std::numeric_limits<UInt32>::max();
+};
+
+}
--- a/src/Databases/DatabaseWithDictionaries.cpp
+++ b/src/Databases/DatabaseWithDictionaries.cpp
@ -4,6 +4,7 @@
 #include <Interpreters/ExternalDictionariesLoader.h>
 #include <Interpreters/ExternalLoaderTempConfigRepository.h>
 #include <Interpreters/ExternalLoaderDatabaseConfigRepository.h>
+#include <Interpreters/DDLTask.h>
 #include <Dictionaries/getDictionaryConfigurationFromAST.h>
 #include <Dictionaries/DictionaryStructure.h>
 #include <Parsers/ASTCreateQuery.h>
@ -193,6 +194,10 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S
            detachDictionary(dictionary_name);
    });

+    auto txn = context.getZooKeeperMetadataTransaction();
+    if (txn && !context.isInternalSubquery())
+        txn->commit();      /// Commit point (a sort of) for Replicated database
+
    /// If it was ATTACH query and file with dictionary metadata already exist
    /// (so, ATTACH is done after DETACH), then rename atomically replaces old file with new one.
    Poco::File(dictionary_metadata_tmp_path).renameTo(dictionary_metadata_path);
@ -205,7 +210,7 @@ void DatabaseWithDictionaries::createDictionary(const Context & context, const S
    succeeded = true;
 }

-void DatabaseWithDictionaries::removeDictionary(const Context &, const String & dictionary_name)
+void DatabaseWithDictionaries::removeDictionary(const Context & context, const String & dictionary_name)
 {
    DictionaryAttachInfo attach_info;
    detachDictionaryImpl(dictionary_name, attach_info);
@ -213,6 +218,11 @@ void DatabaseWithDictionaries::removeDictionary(const Context &, const String &
    try
    {
        String dictionary_metadata_path = getObjectMetadataPath(dictionary_name);
+
+        auto txn = context.getZooKeeperMetadataTransaction();
+        if (txn && !context.isInternalSubquery())
+            txn->commit();      /// Commit point (a sort of) for Replicated database
+
        Poco::File(dictionary_metadata_path).remove();
        CurrentStatusInfo::unset(CurrentStatusInfo::DictionaryStatus,
                                 StorageID(attach_info.create_query).getInternalDictionaryName());
--- a/src/Databases/IDatabase.h
+++ b/src/Databases/IDatabase.h
@ -249,7 +249,7 @@ public:

    /// Forget about the table without deleting it's data, but rename metadata file to prevent reloading it
    /// with next restart. The database may not support this method.
-    virtual void detachTablePermanently(const String & /*name*/)
+    virtual void detachTablePermanently(const Context & /*context*/, const String & /*name*/)
    {
        throw Exception("There is no DETACH TABLE PERMANENTLY query for Database" + getEngineName(), ErrorCodes::NOT_IMPLEMENTED);
    }
--- a/src/Databases/MySQL/DatabaseConnectionMySQL.cpp
+++ b/src/Databases/MySQL/DatabaseConnectionMySQL.cpp
@ -395,7 +395,7 @@ void DatabaseConnectionMySQL::loadStoredObjects(Context &, bool, bool /*force_at
    }
 }

-void DatabaseConnectionMySQL::detachTablePermanently(const String & table_name)
+void DatabaseConnectionMySQL::detachTablePermanently(const Context &, const String & table_name)
 {
    std::lock_guard<std::mutex> lock{mutex};

@ -429,9 +429,9 @@ void DatabaseConnectionMySQL::detachTablePermanently(const String & table_name)
    table_iter->second.second->is_dropped = true;
 }

-void DatabaseConnectionMySQL::dropTable(const Context &, const String & table_name, bool /*no_delay*/)
+void DatabaseConnectionMySQL::dropTable(const Context & context, const String & table_name, bool /*no_delay*/)
 {
-    detachTablePermanently(table_name);
+    detachTablePermanently(context, table_name);
 }

 DatabaseConnectionMySQL::~DatabaseConnectionMySQL()
--- a/src/Databases/MySQL/DatabaseConnectionMySQL.h
+++ b/src/Databases/MySQL/DatabaseConnectionMySQL.h
@ -72,9 +72,9 @@ public:

    StoragePtr detachTable(const String & table_name) override;

-    void detachTablePermanently(const String & table_name) override;
+    void detachTablePermanently(const Context & context, const String & table_name) override;

-    void dropTable(const Context &, const String & table_name, bool no_delay) override;
+    void dropTable(const Context & context, const String & table_name, bool no_delay) override;

    void attachTable(const String & table_name, const StoragePtr & storage, const String & relative_table_path) override;

--- a/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp
+++ b/src/Databases/PostgreSQL/fetchPostgreSQLTableStructure.cpp
@ -59,13 +59,13 @@ static DataTypePtr convertPostgreSQLDataType(std::string & type, bool is_nullabl
        uint32_t precision = getDecimalPrecision(*res);
        uint32_t scale = getDecimalScale(*res);

-        if (precision <= DecimalUtils::maxPrecision<Decimal32>())
+        if (precision <= DecimalUtils::max_precision<Decimal32>)
            res = std::make_shared<DataTypeDecimal<Decimal32>>(precision, scale);
-        else if (precision <= DecimalUtils::maxPrecision<Decimal64>())
+        else if (precision <= DecimalUtils::max_precision<Decimal64>)
            res = std::make_shared<DataTypeDecimal<Decimal64>>(precision, scale);
-        else if (precision <= DecimalUtils::maxPrecision<Decimal128>())
+        else if (precision <= DecimalUtils::max_precision<Decimal128>)
            res = std::make_shared<DataTypeDecimal<Decimal128>>(precision, scale);
-        else if (precision <= DecimalUtils::maxPrecision<Decimal256>())
+        else if (precision <= DecimalUtils::max_precision<Decimal256>)
            res = std::make_shared<DataTypeDecimal<Decimal256>>(precision, scale);
    }

--- a/src/Databases/ya.make
+++ b/src/Databases/ya.make
@ -16,6 +16,9 @@ SRCS(
    DatabaseMemory.cpp
    DatabaseOnDisk.cpp
    DatabaseOrdinary.cpp
+    DatabaseReplicated.cpp
+    DatabaseReplicatedSettings.cpp
+    DatabaseReplicatedWorker.cpp
    DatabaseWithDictionaries.cpp
    DatabasesCommon.cpp
    MySQL/ConnectionMySQLSettings.cpp
--- a/src/Functions/FunctionFactory.cpp
+++ b/src/Functions/FunctionFactory.cpp
@ -21,6 +21,10 @@ namespace ErrorCodes
    extern const int LOGICAL_ERROR;
 }

+/// Free-function shortcut: forwards `name` to FunctionFactory::instance().getCanonicalNameIfAny()
+/// and returns the reference that call yields.
+const String & getFunctionCanonicalNameIfAny(const String & name)
+{
+    return FunctionFactory::instance().getCanonicalNameIfAny(name);
+}

 void FunctionFactory::registerFunction(const
    std::string & name,
@ -36,10 +40,13 @@ void FunctionFactory::registerFunction(const
        throw Exception("FunctionFactory: the function name '" + name + "' is already registered as alias",
                        ErrorCodes::LOGICAL_ERROR);

-    if (case_sensitiveness == CaseInsensitive
-        && !case_insensitive_functions.emplace(function_name_lowercase, creator).second)
+    if (case_sensitiveness == CaseInsensitive)
+    {
+        if (!case_insensitive_functions.emplace(function_name_lowercase, creator).second)
            throw Exception("FunctionFactory: the case insensitive function name '" + name + "' is not unique",
                ErrorCodes::LOGICAL_ERROR);
+        case_insensitive_name_mapping[function_name_lowercase] = name;
+    }
 }


--- a/src/Functions/FunctionsConversion.h
+++ b/src/Functions/FunctionsConversion.h
@ -516,6 +516,25 @@ struct ToDateTime64TransformSigned
        return DecimalUtils::decimalFromComponentsWithMultiplier<DateTime64>(from, 0, scale_multiplier);
    }
 };
+/// Transform used for converting floating-point values (FromType) to DateTime64.
+/// Negative inputs yield 0, inputs above 0xFFFFFFFF are clamped, and the result is
+/// produced by convertToDecimal with the configured scale.
+template <typename FromDataType, typename FromType>
+struct ToDateTime64TransformFloat
+{
+    static constexpr auto name = "toDateTime64";
+
+    /// The in-class initializer is always overridden by the constructor below.
+    const UInt32 scale = 1;
+
+    /// Intentionally not explicit. NOTE(review): callers may rely on implicit conversion from a UInt32 scale — confirm before changing.
+    ToDateTime64TransformFloat(UInt32 scale_ = 0)
+        : scale(scale_)
+    {}
+
+    inline NO_SANITIZE_UNDEFINED DateTime64::NativeType execute(FromType from, const DateLUTImpl &) const
+    {
+        /// Negative values are mapped to zero.
+        if (from < 0)
+            return 0;
+        /// Clamp from above to 0xFFFFFFFF (converted to FromType) before the decimal conversion.
+        /// NOTE(review): a NaN input passes both checks unchanged — confirm convertToDecimal handles it.
+        from = std::min<FromType>(from, FromType(0xFFFFFFFF));
+        return convertToDecimal<FromDataType, DataTypeDateTime64>(from, scale);
+    }
+};

 template <typename Name> struct ConvertImpl<DataTypeInt8, DataTypeDateTime64, Name>
    : DateTimeTransformImpl<DataTypeInt8, DataTypeDateTime64, ToDateTime64TransformSigned<Int8>> {};
@ -528,9 +547,9 @@ template <typename Name> struct ConvertImpl<DataTypeInt64, DataTypeDateTime64, N
 template <typename Name> struct ConvertImpl<DataTypeUInt64, DataTypeDateTime64, Name>
    : DateTimeTransformImpl<DataTypeUInt64, DataTypeDateTime64, ToDateTime64TransformUnsigned<UInt64>> {};
 template <typename Name> struct ConvertImpl<DataTypeFloat32, DataTypeDateTime64, Name>
-    : DateTimeTransformImpl<DataTypeFloat32, DataTypeDateTime64, ToDateTime64TransformSigned<Float32>> {};
+    : DateTimeTransformImpl<DataTypeFloat32, DataTypeDateTime64, ToDateTime64TransformFloat<DataTypeFloat32, Float32>> {};
 template <typename Name> struct ConvertImpl<DataTypeFloat64, DataTypeDateTime64, Name>
-    : DateTimeTransformImpl<DataTypeFloat64, DataTypeDateTime64, ToDateTime64TransformSigned<Float64>> {};
+    : DateTimeTransformImpl<DataTypeFloat64, DataTypeDateTime64, ToDateTime64TransformFloat<DataTypeFloat64, Float64>> {};

 /** Conversion of DateTime64 to Date or DateTime: discards fractional part.
 */
@ -1313,7 +1332,7 @@ public:
            else if constexpr (std::is_same_v<Name, NameToDecimal256>)
                return createDecimalMaxPrecision<Decimal256>(scale);

-            throw Exception("Something wrong with toDecimalNN()", ErrorCodes::LOGICAL_ERROR);
+            throw Exception("Unexpected branch in code of conversion function: it is a bug.", ErrorCodes::LOGICAL_ERROR);
        }
        else
        {
@ -1337,7 +1356,7 @@ public:
            if constexpr (std::is_same_v<ToDataType, DataTypeDateTime>)
                return std::make_shared<DataTypeDateTime>(extractTimeZoneNameFromFunctionArguments(arguments, timezone_arg_position, 0));
            else if constexpr (std::is_same_v<ToDataType, DataTypeDateTime64>)
-                throw Exception("LOGICAL ERROR: It is a bug.", ErrorCodes::LOGICAL_ERROR);
+                throw Exception("Unexpected branch in code of conversion function: it is a bug.", ErrorCodes::LOGICAL_ERROR);
            else
                return std::make_shared<ToDataType>();
        }
--- a/src/Functions/FunctionsRound.cpp
+++ b/src/Functions/FunctionsRound.cpp
@ -8,7 +8,7 @@ namespace DB
 void registerFunctionsRound(FunctionFactory & factory)
 {
    factory.registerFunction<FunctionRound>("round", FunctionFactory::CaseInsensitive);
-    factory.registerFunction<FunctionRoundBankers>("roundBankers", FunctionFactory::CaseInsensitive);
+    factory.registerFunction<FunctionRoundBankers>("roundBankers", FunctionFactory::CaseSensitive);
    factory.registerFunction<FunctionFloor>("floor", FunctionFactory::CaseInsensitive);
    factory.registerFunction<FunctionCeil>("ceil", FunctionFactory::CaseInsensitive);
    factory.registerFunction<FunctionTrunc>("trunc", FunctionFactory::CaseInsensitive);
--- a/src/Functions/array/arrayAggregation.cpp
+++ b/src/Functions/array/arrayAggregation.cpp
@ -103,7 +103,7 @@ struct ArrayAggregateImpl
            {
                using DecimalReturnType = ArrayAggregateResult<typename DataType::FieldType, aggregate_operation>;
                UInt32 scale = getDecimalScale(*expression_return);
-                result = std::make_shared<DataTypeDecimal<DecimalReturnType>>(DecimalUtils::maxPrecision<DecimalReturnType>(), scale);
+                result = std::make_shared<DataTypeDecimal<DecimalReturnType>>(DecimalUtils::max_precision<DecimalReturnType>, scale);

                return true;
            }
--- a/src/Functions/array/arrayCumSum.cpp
+++ b/src/Functions/array/arrayCumSum.cpp
@ -37,7 +37,7 @@ struct ArrayCumSumImpl
        if (which.isDecimal())
        {
            UInt32 scale = getDecimalScale(*expression_return);
-            DataTypePtr nested = std::make_shared<DataTypeDecimal<Decimal128>>(DecimalUtils::maxPrecision<Decimal128>(), scale);
+            DataTypePtr nested = std::make_shared<DataTypeDecimal<Decimal128>>(DecimalUtils::max_precision<Decimal128>, scale);
            return std::make_shared<DataTypeArray>(nested);
        }

--- a/src/Functions/array/arrayCumSumNonNegative.cpp
+++ b/src/Functions/array/arrayCumSumNonNegative.cpp
@ -40,7 +40,7 @@ struct ArrayCumSumNonNegativeImpl
        if (which.isDecimal())
        {
            UInt32 scale = getDecimalScale(*expression_return);
-            DataTypePtr nested = std::make_shared<DataTypeDecimal<Decimal128>>(DecimalUtils::maxPrecision<Decimal128>(), scale);
+            DataTypePtr nested = std::make_shared<DataTypeDecimal<Decimal128>>(DecimalUtils::max_precision<Decimal128>, scale);
            return std::make_shared<DataTypeArray>(nested);
        }

--- a/src/Functions/extractAllGroupsVertical.cpp
+++ b/src/Functions/extractAllGroupsVertical.cpp
@ -18,7 +18,7 @@ namespace DB
 void registerFunctionExtractAllGroupsVertical(FunctionFactory & factory)
 {
    factory.registerFunction<FunctionExtractAllGroups<VerticalImpl>>();
-    factory.registerAlias("extractAllGroups", VerticalImpl::Name, FunctionFactory::CaseInsensitive);
+    factory.registerAlias("extractAllGroups", VerticalImpl::Name, FunctionFactory::CaseSensitive);
 }

 }
--- a/src/Functions/isDecimalOverflow.cpp
+++ b/src/Functions/isDecimalOverflow.cpp
@ -133,7 +133,7 @@ private:
        static_assert(IsDecimalNumber<T>);
        using NativeT = typename T::NativeType;

-        if (precision > DecimalUtils::maxPrecision<T>())
+        if (precision > DecimalUtils::max_precision<T>)
            return false;

        NativeT pow10 = intExp10OfSize<NativeT>(precision);
--- a/src/Functions/transform.cpp
+++ b/src/Functions/transform.cpp
@ -1,4 +1,6 @@
 #include <mutex>
+#include <ext/bit_cast.h>
+
 #include <Common/FieldVisitors.h>
 #include <DataTypes/DataTypeArray.h>
 #include <Columns/ColumnString.h>
@ -13,6 +15,7 @@
 #include <Functions/FunctionHelpers.h>
 #include <Functions/FunctionFactory.h>
 #include <DataTypes/getLeastSupertype.h>
+#include <Interpreters/convertFieldToType.h>


 namespace DB
@ -491,7 +494,7 @@ private:
        dst.resize(size);
        for (size_t i = 0; i < size; ++i)
        {
-            auto it = table.find(src[i]);
+            const auto * it = table.find(ext::bit_cast<UInt64>(src[i]));
            if (it)
                memcpy(&dst[i], &it->getMapped(), sizeof(dst[i]));    /// little endian.
            else
@ -507,7 +510,7 @@ private:
        dst.resize(size);
        for (size_t i = 0; i < size; ++i)
        {
-            auto it = table.find(src[i]);
+            const auto * it = table.find(ext::bit_cast<UInt64>(src[i]));
            if (it)
                memcpy(&dst[i], &it->getMapped(), sizeof(dst[i]));    /// little endian.
            else
@ -523,7 +526,7 @@ private:
        dst.resize(size);
        for (size_t i = 0; i < size; ++i)
        {
-            auto it = table.find(src[i]);
+            const auto * it = table.find(ext::bit_cast<UInt64>(src[i]));
            if (it)
                memcpy(&dst[i], &it->getMapped(), sizeof(dst[i]));
            else
@ -541,7 +544,7 @@ private:
        ColumnString::Offset current_dst_offset = 0;
        for (size_t i = 0; i < size; ++i)
        {
-            auto it = table.find(src[i]);
+            const auto * it = table.find(ext::bit_cast<UInt64>(src[i]));
            StringRef ref = it ? it->getMapped() : dst_default;
            dst_data.resize(current_dst_offset + ref.size);
            memcpy(&dst_data[current_dst_offset], ref.data, ref.size);
@ -562,7 +565,8 @@ private:
        ColumnString::Offset current_dst_default_offset = 0;
        for (size_t i = 0; i < size; ++i)
        {
-            auto it = table.find(src[i]);
+            Field key = src[i];
+            const auto * it = table.find(key.reinterpret<UInt64>());
            StringRef ref;

            if (it)
@ -778,29 +782,44 @@ private:

        /// Note: Doesn't check the duplicates in the `from` array.

-        if (from[0].getType() != Field::Types::String && to[0].getType() != Field::Types::String)
+        const IDataType & from_type = *arguments[0].type;
+
+        if (from[0].getType() != Field::Types::String)
+        {
+            if (to[0].getType() != Field::Types::String)
            {
                cache.table_num_to_num = std::make_unique<Cache::NumToNum>();
                auto & table = *cache.table_num_to_num;
                for (size_t i = 0; i < size; ++i)
                {
+                    Field key = convertFieldToType(from[i], from_type);
+                    if (key.isNull())
+                        continue;
+
                    // Field may be of Float type, but for the purpose of bitwise
                    // equality we can treat them as UInt64, hence the reinterpret().
-                table[from[i].reinterpret<UInt64>()] = (*used_to)[i].reinterpret<UInt64>();
+                    table[key.reinterpret<UInt64>()] = (*used_to)[i].reinterpret<UInt64>();
                }
            }
-        else if (from[0].getType() != Field::Types::String && to[0].getType() == Field::Types::String)
+            else
            {
                cache.table_num_to_string = std::make_unique<Cache::NumToString>();
                auto & table = *cache.table_num_to_string;
                for (size_t i = 0; i < size; ++i)
                {
+                    Field key = convertFieldToType(from[i], from_type);
+                    if (key.isNull())
+                        continue;
+
                    const String & str_to = to[i].get<const String &>();
                    StringRef ref{cache.string_pool.insert(str_to.data(), str_to.size() + 1), str_to.size() + 1};
-                table[from[i].reinterpret<UInt64>()] = ref;
+                    table[key.reinterpret<UInt64>()] = ref;
                }
            }
-        else if (from[0].getType() == Field::Types::String && to[0].getType() != Field::Types::String)
+        }
+        else
+        {
+            if (to[0].getType() != Field::Types::String)
            {
                cache.table_string_to_num = std::make_unique<Cache::StringToNum>();
                auto & table = *cache.table_string_to_num;
@ -811,7 +830,7 @@ private:
                    table[ref] = (*used_to)[i].reinterpret<UInt64>();
                }
            }
-        else if (from[0].getType() == Field::Types::String && to[0].getType() == Field::Types::String)
+            else
            {
                cache.table_string_to_string = std::make_unique<Cache::StringToString>();
                auto & table = *cache.table_string_to_string;
@ -824,6 +843,7 @@ private:
                    table[ref_from] = ref_to;
                }
            }
+        }

        cache.initialized = true;
    }
--- a/src/IO/ReadHelpers.cpp
+++ b/src/IO/ReadHelpers.cpp
@ -831,14 +831,18 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
    static constexpr auto date_time_broken_down_length = 19;
    /// YYYY-MM-DD
    static constexpr auto date_broken_down_length = 10;
-    /// unix timestamp max length
-    static constexpr auto unix_timestamp_max_length = 10;

    char s[date_time_broken_down_length];
    char * s_pos = s;

-    /// A piece similar to unix timestamp.
-    while (s_pos < s + unix_timestamp_max_length && !buf.eof() && isNumericASCII(*buf.position()))
+    /** Read characters, that could represent unix timestamp.
+      * Only unix timestamp of at least 5 characters is supported.
+      * Then look at 5th character. If it is a number - treat whole as unix timestamp.
+      * If it is not a number - then parse datetime in YYYY-MM-DD hh:mm:ss or YYYY-MM-DD format.
+      */
+
+    /// A piece similar to unix timestamp, maybe scaled to subsecond precision.
+    while (s_pos < s + date_time_broken_down_length && !buf.eof() && isNumericASCII(*buf.position()))
    {
        *s_pos = *buf.position();
        ++s_pos;
@ -846,7 +850,7 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
    }

    /// 2015-01-01 01:02:03 or 2015-01-01
-    if (s_pos == s + 4 && !buf.eof() && (*buf.position() < '0' || *buf.position() > '9'))
+    if (s_pos == s + 4 && !buf.eof() && !isNumericASCII(*buf.position()))
    {
        const auto already_read_length = s_pos - s;
        const size_t remaining_date_time_size = date_time_broken_down_length - already_read_length;
@ -885,8 +889,7 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
    }
    else
    {
-        /// Only unix timestamp of 5-10 characters is supported. For consistency. See readDateTimeTextImpl.
-        if (s_pos - s >= 5 && s_pos - s <= 10)
+        if (s_pos - s >= 5)
        {
            /// Not very efficient.
            datetime = 0;
--- a/src/IO/ReadHelpers.h
+++ b/src/IO/ReadHelpers.h
@ -703,12 +703,6 @@ ReturnType readDateTimeTextFallback(time_t & datetime, ReadBuffer & buf, const D
 template <typename ReturnType = void>
 inline ReturnType readDateTimeTextImpl(time_t & datetime, ReadBuffer & buf, const DateLUTImpl & date_lut)
 {
-    /** Read 10 characters, that could represent unix timestamp.
-      * Only unix timestamp of 5-10 characters is supported.
-      * Then look at 5th character. If it is a number - treat whole as unix timestamp.
-      * If it is not a number - then parse datetime in YYYY-MM-DD hh:mm:ss or YYYY-MM-DD format.
-      */
-
    /// Optimistic path, when whole value is in buffer.
    const char * s = buf.position();

@ -779,6 +773,18 @@ inline ReturnType readDateTimeTextImpl(DateTime64 & datetime64, UInt32 scale, Re
        while (!buf.eof() && isNumericASCII(*buf.position()))
            ++buf.position();
    }
+    else if (scale && (whole >= 1000000000LL * scale))
+    {
+        /// Unix timestamp with subsecond precision, already scaled to integer.
+        /// For disambiguation we support only time since 2001-09-09 01:46:40 UTC and less than 30 000 years in future.
+        /// NOTE(review): the threshold multiplies by `scale` itself rather than by 10^scale — confirm this is intended.
+
+        /// Move the lowest `scale` decimal digits of the integer into the fractional part.
+        /// Every digit is weighted by its own power of ten so the digits keep their order
+        /// (appending them one by one to `fractional` would reverse them: ...123 would become .321).
+        /// Assumes `components.fractional` is still zero at this point.
+        DateTime64::NativeType digit_weight = 1;
+        for (size_t i = 0; i < scale; ++i)
+        {
+            components.fractional += (components.whole % 10) * digit_weight;
+            components.whole /= 10;
+            digit_weight *= 10;
+        }
+    }

    datetime64 = DecimalUtils::decimalFromComponents<DateTime64>(components, scale);

--- a/src/IO/WriteHelpers.h
+++ b/src/IO/WriteHelpers.h
@ -709,7 +709,7 @@ inline void writeUUIDText(const UUID & uuid, WriteBuffer & buf)
 template<typename DecimalType>
 inline void writeDecimalTypeFractionalText(typename DecimalType::NativeType fractional, UInt32 scale, WriteBuffer & buf)
 {
-    static constexpr UInt32 MaxScale = DecimalUtils::maxPrecision<DecimalType>();
+    static constexpr UInt32 MaxScale = DecimalUtils::max_precision<DecimalType>;

    char data[20] = {'0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0', '0'};
    static_assert(sizeof(data) >= MaxScale);
@ -831,19 +831,19 @@ inline void writeDateTimeText(time_t datetime, WriteBuffer & buf, const DateLUTI
 template <char date_delimeter = '-', char time_delimeter = ':', char between_date_time_delimiter = ' ', char fractional_time_delimiter = '.'>
 inline void writeDateTimeText(DateTime64 datetime64, UInt32 scale, WriteBuffer & buf, const DateLUTImpl & date_lut = DateLUT::instance())
 {
-    static constexpr UInt32 MaxScale = DecimalUtils::maxPrecision<DateTime64>();
+    static constexpr UInt32 MaxScale = DecimalUtils::max_precision<DateTime64>;
    scale = scale > MaxScale ? MaxScale : scale;

-    auto c = DecimalUtils::split(datetime64, scale);
-    const auto & values = date_lut.getValues(c.whole);
+    auto components = DecimalUtils::split(datetime64, scale);
+    const auto & values = date_lut.getValues(components.whole);
    writeDateTimeText<date_delimeter, time_delimeter, between_date_time_delimiter>(
        LocalDateTime(values.year, values.month, values.day_of_month,
-            date_lut.toHour(c.whole), date_lut.toMinute(c.whole), date_lut.toSecond(c.whole)), buf);
+            date_lut.toHour(components.whole), date_lut.toMinute(components.whole), date_lut.toSecond(components.whole)), buf);

    if (scale > 0)
    {
        buf.write(fractional_time_delimiter);
-        writeDecimalTypeFractionalText<DateTime64>(c.fractional, scale, buf);
+        writeDecimalTypeFractionalText<DateTime64>(components.fractional, scale, buf);
    }
 }

@ -887,16 +887,16 @@ inline void writeDateTimeTextISO(DateTime64 datetime64, UInt32 scale, WriteBuffe

 inline void writeDateTimeUnixTimestamp(DateTime64 datetime64, UInt32 scale, WriteBuffer & buf)
 {
-    static constexpr UInt32 MaxScale = DecimalUtils::maxPrecision<DateTime64>();
+    static constexpr UInt32 MaxScale = DecimalUtils::max_precision<DateTime64>;
    scale = scale > MaxScale ? MaxScale : scale;

-    auto c = DecimalUtils::split(datetime64, scale);
-    writeIntText(c.whole, buf);
+    auto components = DecimalUtils::split(datetime64, scale);
+    writeIntText(components.whole, buf);

-    if (scale > 0)
+    if (scale > 0) //-V547
    {
        buf.write('.');
-        writeDecimalTypeFractionalText<DateTime64>(c.fractional, scale, buf);
+        writeDecimalTypeFractionalText<DateTime64>(components.fractional, scale, buf);
    }
 }

--- a/src/Interpreters/AggregationCommon.h
+++ b/src/Interpreters/AggregationCommon.h
@ -271,9 +271,13 @@ static T inline packFixedShuffle(
    size_t idx,
    const uint8_t * __restrict masks)
 {
-    __m128i res{};
+    assert(num_srcs > 0);

-    for (size_t i = 0; i < num_srcs; ++i)
+    __m128i res = _mm_shuffle_epi8(
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(srcs[0] + elem_sizes[0] * idx)),
+        _mm_loadu_si128(reinterpret_cast<const __m128i *>(masks)));
+
+    for (size_t i = 1; i < num_srcs; ++i)
    {
        res = _mm_xor_si128(res,
            _mm_shuffle_epi8(
--- a/src/Interpreters/Aggregator.cpp
+++ b/src/Interpreters/Aggregator.cpp
@ -558,7 +558,7 @@ void NO_INLINE Aggregator::executeImplBatch(

    /// Generic case.

-    PODArray<AggregateDataPtr> places(rows);
+    std::unique_ptr<AggregateDataPtr[]> places(new AggregateDataPtr[rows]);

    /// For all rows.
    for (size_t i = 0; i < rows; ++i)
@ -589,9 +589,9 @@ void NO_INLINE Aggregator::executeImplBatch(
    for (AggregateFunctionInstruction * inst = aggregate_instructions; inst->that; ++inst)
    {
        if (inst->offsets)
-            inst->batch_that->addBatchArray(rows, places.data(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool);
+            inst->batch_that->addBatchArray(rows, places.get(), inst->state_offset, inst->batch_arguments, inst->offsets, aggregates_pool);
        else
-            inst->batch_that->addBatch(rows, places.data(), inst->state_offset, inst->batch_arguments, aggregates_pool);
+            inst->batch_that->addBatch(rows, places.get(), inst->state_offset, inst->batch_arguments, aggregates_pool);
    }
 }

--- a/src/Interpreters/Context.cpp
+++ b/src/Interpreters/Context.cpp
@ -342,6 +342,7 @@ struct ContextShared
    ReplicatedFetchList replicated_fetch_list;
    ConfigurationPtr users_config;                          /// Config with the users, profiles and quotas sections.
    InterserverIOHandler interserver_io_handler;            /// Handler for interserver communication.
+
    mutable std::optional<BackgroundSchedulePool> buffer_flush_schedule_pool; /// A thread pool that can do background flush for Buffer tables.
    mutable std::optional<BackgroundSchedulePool> schedule_pool;    /// A thread pool that can run different jobs in background (used in replicated tables)
    mutable std::optional<BackgroundSchedulePool> distributed_schedule_pool; /// A thread pool that can run different jobs in background (used for distributed sends)
@ -1552,6 +1553,7 @@ void Context::setDDLWorker(std::unique_ptr<DDLWorker> ddl_worker)
    auto lock = getLock();
    if (shared->ddl_worker)
        throw Exception("DDL background thread has already been initialized", ErrorCodes::LOGICAL_ERROR);
+    ddl_worker->startup();
    shared->ddl_worker = std::move(ddl_worker);
 }

@ -2551,6 +2553,19 @@ StorageID Context::resolveStorageIDImpl(StorageID storage_id, StorageNamespace w
    return StorageID::createEmpty();
 }

+/// Attaches `txn` to this context as its metadata transaction.
+/// Asserts that no transaction is attached yet and that, unless `attach_existing` is set,
+/// this context is its own query context.
+/// `attach_existing` is read only inside assert(), hence [[maybe_unused]].
+void Context::initZooKeeperMetadataTransaction(ZooKeeperMetadataTransactionPtr txn, [[maybe_unused]] bool attach_existing)
+{
+    assert(!metadata_transaction);
+    assert(attach_existing || query_context == this);
+    metadata_transaction = std::move(txn);
+}
+
+/// Returns the metadata transaction attached to this context; the pointer may be null.
+/// Asserts that a non-null transaction is only ever held by a context that has a query context.
+ZooKeeperMetadataTransactionPtr Context::getZooKeeperMetadataTransaction() const
+{
+    assert(!metadata_transaction || hasQueryContext());
+    return metadata_transaction;
+}
+
 PartUUIDsPtr Context::getPartUUIDs()
 {
    auto lock = getLock();
--- a/src/Interpreters/Context.h
+++ b/src/Interpreters/Context.h
@ -117,6 +117,8 @@ using VolumePtr = std::shared_ptr<IVolume>;
 struct NamedSession;
 struct BackgroundTaskSchedulingSettings;

+class ZooKeeperMetadataTransaction;
+using ZooKeeperMetadataTransactionPtr = std::shared_ptr<ZooKeeperMetadataTransaction>;

 #if USE_EMBEDDED_COMPILER
 class CompiledExpressionCache;
@ -279,6 +281,12 @@ private:
                                   /// to be customized in HTTP and TCP servers by overloading the customizeContext(DB::Context&)
                                   /// methods.

+    ZooKeeperMetadataTransactionPtr metadata_transaction;    /// Distributed DDL context. I'm not sure if it's a suitable place for this,
+                                                    /// but it's the easiest way to pass this through the whole stack from executeQuery(...)
+                                                    /// to DatabaseOnDisk::commitCreateTable(...) or IStorage::alter(...) without changing
+                                                    /// thousands of signatures.
+                                                    /// And I hope it will be replaced with more common Transaction sometime.
+
    /// Use copy constructor or createGlobal() instead
    Context();

@ -534,6 +542,7 @@ public:
    const Context & getQueryContext() const;
    Context & getQueryContext();
    bool hasQueryContext() const { return query_context != nullptr; }
+    bool isInternalSubquery() const { return hasQueryContext() && query_context != this; }

    const Context & getSessionContext() const;
    Context & getSessionContext();
@ -737,6 +746,11 @@ public:
    IHostContextPtr & getHostContext();
    const IHostContextPtr & getHostContext() const;

+    /// Initialize context of distributed DDL query with Replicated database.
+    void initZooKeeperMetadataTransaction(ZooKeeperMetadataTransactionPtr txn, bool attach_existing = false);
+    /// Returns context of current distributed DDL query or nullptr.
+    ZooKeeperMetadataTransactionPtr getZooKeeperMetadataTransaction() const;
+
    struct MySQLWireContext
    {
        uint8_t sequence_id = 0;
--- a/src/Interpreters/DDLTask.cpp
+++ b/src/Interpreters/DDLTask.cpp
@ -0,0 +1,344 @@
+#include <Interpreters/DDLTask.h>
+#include <Common/DNSResolver.h>
+#include <Common/isLocalAddress.h>
+#include <IO/WriteHelpers.h>
+#include <IO/ReadHelpers.h>
+#include <IO/Operators.h>
+#include <IO/ReadBufferFromString.h>
+#include <Poco/Net/NetException.h>
+#include <common/logger_useful.h>
+#include <Parsers/ParserQuery.h>
+#include <Parsers/parseQuery.h>
+#include <Parsers/ASTQueryWithOnCluster.h>
+#include <Parsers/ASTQueryWithTableAndOutput.h>
+#include <Databases/DatabaseReplicated.h>
+
+namespace DB
+{
+
+namespace ErrorCodes
+{
+    extern const int UNKNOWN_FORMAT_VERSION;
+    extern const int UNKNOWN_TYPE_OF_QUERY;
+    extern const int INCONSISTENT_CLUSTER_DEFINITION;
+}
+
+/// Builds a HostID from the string form understood by Cluster::Address::fromString(),
+/// which yields the host name and the port.
+HostID HostID::fromString(const String & host_port_str)
+{
+    auto [parsed_host, parsed_port] = Cluster::Address::fromString(host_port_str);
+
+    HostID parsed_id;
+    parsed_id.host_name = std::move(parsed_host);
+    parsed_id.port = parsed_port;
+    return parsed_id;
+}
+
+/// Resolves host_name:port through DNSResolver and passes the result to DB::isLocalAddress()
+/// together with `clickhouse_port`.
+/// A host that fails to resolve with Poco::Net::NetException is reported as non-local.
+bool HostID::isLocalAddress(UInt16 clickhouse_port) const
+{
+    try
+    {
+        const auto resolved_address = DNSResolver::instance().resolveAddress(host_name, port);
+        return DB::isLocalAddress(resolved_address, clickhouse_port);
+    }
+    catch (const Poco::Net::NetException &)
+    {
+        /// Avoid "Host not found" exceptions
+        return false;
+    }
+}
+
+
+/// Serializes the entry as text, one field per line:
+/// version, escaped query, list of host IDs, initiator.
+String DDLLogEntry::toString() const
+{
+    /// Convert every host to its string form first.
+    Strings host_id_strings;
+    host_id_strings.reserve(hosts.size());
+    for (const HostID & host : hosts)
+        host_id_strings.emplace_back(HostID::applyToString(host));
+
+    /// Stream a local copy of the constant rather than the static member itself.
+    auto version = CURRENT_VERSION;
+
+    WriteBufferFromOwnString wb;
+    wb << "version: " << version << "\n";
+    wb << "query: " << escape << query << "\n";
+    wb << "hosts: " << host_id_strings << "\n";
+    wb << "initiator: " << initiator << "\n";
+
+    return wb.str();
+}
+
+/// Parses the text form written by toString() and fills `query`, `hosts` and `initiator`.
+/// Throws UNKNOWN_FORMAT_VERSION if the stored version differs from CURRENT_VERSION.
+void DDLLogEntry::parse(const String & data)
+{
+    ReadBufferFromString rb(data);
+
+    int version;
+    rb >> "version: " >> version >> "\n";
+
+    if (version != CURRENT_VERSION)
+        throw Exception(ErrorCodes::UNKNOWN_FORMAT_VERSION, "Unknown DDLLogEntry format version: {}", version);
+
+    Strings host_id_strings;
+    rb >> "query: " >> escape >> query >> "\n";
+    rb >> "hosts: " >> host_id_strings >> "\n";
+
+    /// The initiator line is optional: if the data ends after the host list, the field is cleared.
+    if (!rb.eof())
+        rb >> "initiator: " >> initiator >> "\n";
+    else
+        initiator.clear();
+
+    /// Nothing may follow the last field.
+    assertEOF(rb);
+
+    /// Convert the host strings back to HostID objects.
+    hosts.resize(host_id_strings.size());
+    std::transform(host_id_strings.begin(), host_id_strings.end(), hosts.begin(), HostID::fromString);
+}
+
+
+/// Parses the query text stored in `entry` and saves the resulting AST in `query`.
+/// The parser depth limit is taken from the settings of `context`.
+void DDLTaskBase::parseQueryFromEntry(const Context & context)
+{
+    const char * query_begin = entry.query.data();
+    const char * query_end = query_begin + entry.query.size();
+
+    String description;
+    ParserQuery parser_query(query_end);
+    query = parseQuery(parser_query, query_begin, query_end, description, 0, context.getSettingsRef().max_parser_depth);
+}
+
+/// Creates a copy of `from_context` prepared for executing the task's query:
+/// it becomes a query context, gets a fresh query id and is marked as a secondary query.
+/// The ZooKeeper session is not used by the base implementation.
+std::unique_ptr<Context> DDLTaskBase::makeQueryContext(Context & from_context, const ZooKeeperPtr & /*zookeeper*/)
+{
+    auto task_context = std::make_unique<Context>(from_context);
+    task_context->makeQueryContext();
+    task_context->setCurrentQueryId(""); // generate random query_id
+    task_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;
+    return task_context;
+}
+
+
+/// Scans the hosts listed in the entry for one that refers to this server.
+/// The first such host is stored in host_id / host_id_str; any further local host only produces a warning.
+/// Returns true if a local host was found.
+bool DDLTask::findCurrentHostID(const Context & global_context, Poco::Logger * log)
+{
+    bool found_local_host = false;
+
+    for (const HostID & candidate : entry.hosts)
+    {
+        auto secure_port = global_context.getTCPPortSecure();
+
+        /// The port is considered local if it matches TCP or TCP secure port that the server is listening.
+        const bool refers_to_this_server = (secure_port && candidate.isLocalAddress(*secure_port))
+                                           || candidate.isLocalAddress(global_context.getTCPPort());
+
+        if (!refers_to_this_server)
+            continue;
+
+        if (!found_local_host)
+        {
+            /// First local host: remember it.
+            found_local_host = true;
+            host_id = candidate;
+            host_id_str = candidate.toString();
+            continue;
+        }
+
+        /// This check could be slow a little bit
+        LOG_WARNING(log, "There are two the same ClickHouse instances in task {}: {} and {}. Will use the first one only.",
+                         entry_name, host_id.readableString(), candidate.readableString());
+    }
+
+    return found_local_host;
+}
+
+/// Resolves the cluster named in the query, locates the current host in it and replaces `query`
+/// with its rewritten form without the ON CLUSTER clause.
+/// Throws UNKNOWN_TYPE_OF_QUERY if the query is not an ASTQueryWithOnCluster, and
+/// INCONSISTENT_CLUSTER_DEFINITION if the cluster is unknown here or the host cannot be found in it.
+void DDLTask::setClusterInfo(const Context & context, Poco::Logger * log)
+{
+    auto * query_on_cluster = dynamic_cast<ASTQueryWithOnCluster *>(query.get());
+    if (!query_on_cluster)
+        throw Exception("Received unknown DDL query", ErrorCodes::UNKNOWN_TYPE_OF_QUERY);
+
+    cluster_name = query_on_cluster->cluster;
+    cluster = context.tryGetCluster(cluster_name);
+
+    if (!cluster)
+        throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION,
+                        "DDL task {} contains current host {} in cluster {}, but there are no such cluster here.",
+                        entry_name, host_id.readableString(), cluster_name);
+
+    /// Try to find host from task host list in cluster
+    /// At the first, try find exact match (host name and ports should be literally equal)
+    /// If the attempt fails, try find it resolving host name of each instance
+
+    if (!tryFindHostInCluster())
+    {
+        LOG_WARNING(log, "Not found the exact match of host {} from task {} in cluster {} definition. Will try to find it using host name resolving.",
+                         host_id.readableString(), entry_name, cluster_name);
+
+        if (!tryFindHostInClusterViaResolving(context))
+            throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION, "Not found host {} in definition of cluster {}",
+                                                                 host_id.readableString(), cluster_name);
+
+        LOG_INFO(log, "Resolved host {} from task {} as host {} in definition of cluster {}",
+                 host_id.readableString(), entry_name, address_in_cluster.readableString(), cluster_name);
+    }
+
+    /// The rewritten query uses the default database of the address found above.
+    query = query_on_cluster->getRewrittenASTWithoutOnCluster(address_in_cluster.default_database);
+    /// `query` no longer refers to the AST this pointer was taken from, so drop the pointer.
+    query_on_cluster = nullptr;
+}
+
+/// Looks for a cluster address whose host name and port are literally equal to those of host_id.
+/// On a match, records host_shard_num, host_replica_num and address_in_cluster.
+/// Returns true if at least one match was found.
+/// Throws INCONSISTENT_CLUSTER_DEFINITION if the same host:port occurs twice with the same default database,
+/// or if it occurs with different default databases and the query's table is not qualified by a database name.
+bool DDLTask::tryFindHostInCluster()
+{
+    const auto & shards = cluster->getShardsAddresses();
+    bool found_exact_match = false;
+    /// Default database of the most recently recorded match.
+    String default_database;
+
+    for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num)
+    {
+        for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num)
+        {
+            const Cluster::Address & address = shards[shard_num][replica_num];
+
+            if (address.host_name == host_id.host_name && address.port == host_id.port)
+            {
+                if (found_exact_match)
+                {
+                    if (default_database == address.default_database)
+                    {
+                        throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION,
+                                        "There are two exactly the same ClickHouse instances {} in cluster {}",
+                                        address.readableString(), cluster_name);
+                    }
+                    else
+                    {
+                        /* Circular replication is used.
+                         * It is when every physical node contains
+                         * replicas of different shards of the same table.
+                         * To distinguish one replica from another on the same node,
+                         * every shard is placed into separate database.
+                         * */
+                        is_circular_replicated = true;
+                        auto * query_with_table = dynamic_cast<ASTQueryWithTableAndOutput *>(query.get());
+                        if (!query_with_table || query_with_table->database.empty())
+                        {
+                            throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION,
+                                            "For a distributed DDL on circular replicated cluster its table name must be qualified by database name.");
+                        }
+                        /// The previously recorded match already belongs to the query's database: keep it.
+                        if (default_database == query_with_table->database)
+                            return true;
+                    }
+                }
+                /// Record (or overwrite) the match with the current address.
+                found_exact_match = true;
+                host_shard_num = shard_num;
+                host_replica_num = replica_num;
+                address_in_cluster = address;
+                default_database = address.default_database;
+            }
+        }
+    }
+
+    return found_exact_match;
+}
+
+/// Looks for a cluster address whose resolved address is local with respect to this server's
+/// TCP port or, if one is configured, its secure TCP port.
+/// On a match, records host_shard_num, host_replica_num and address_in_cluster.
+/// Returns true if exactly one such address was found.
+/// Throws INCONSISTENT_CLUSTER_DEFINITION if more than one address matches.
+bool DDLTask::tryFindHostInClusterViaResolving(const Context & context)
+{
+    const auto & shards = cluster->getShardsAddresses();
+    bool found_via_resolving = false;
+
+    for (size_t shard_num = 0; shard_num < shards.size(); ++shard_num)
+    {
+        for (size_t replica_num = 0; replica_num < shards[shard_num].size(); ++replica_num)
+        {
+            const Cluster::Address & address = shards[shard_num][replica_num];
+
+            /// Addresses without a resolved address are skipped.
+            if (auto resolved = address.getResolvedAddress();
+                resolved && (isLocalAddress(*resolved, context.getTCPPort())
+                             || (context.getTCPPortSecure() && isLocalAddress(*resolved, *context.getTCPPortSecure()))))
+            {
+                if (found_via_resolving)
+                {
+                    throw Exception(ErrorCodes::INCONSISTENT_CLUSTER_DEFINITION,
+                                    "There are two the same ClickHouse instances in cluster {} : {} and {}",
+                                    cluster_name, address_in_cluster.readableString(), address.readableString());
+                }
+                else
+                {
+                    found_via_resolving = true;
+                    host_shard_num = shard_num;
+                    host_replica_num = replica_num;
+                    address_in_cluster = address;
+                }
+            }
+        }
+    }
+
+    return found_via_resolving;
+}
+
+/// Returns the name of the shard node for the shard this host belongs to.
+/// Throws std::out_of_range (from at()) if host_shard_num is not a valid shard index.
+String DDLTask::getShardID() const
+{
+    /// Generate unique name for shard node, it will be used to execute the query by only single host
+    /// Shard node name has format 'replica_name1,replica_name2,...,replica_nameN'
+    /// Where replica_name is 'replica_config_host_name:replica_port'
+
+    /// Bind by reference: copying the shard's address list on every call is unnecessary.
+    const auto & shards = cluster->getShardsAddresses();
+    const auto & shard_addresses = shards.at(host_shard_num);
+
+    Strings replica_names;
+    replica_names.reserve(shard_addresses.size());
+    for (const Cluster::Address & address : shard_addresses)
+        replica_names.emplace_back(address.readableString());
+    /// Sort so that the result does not depend on the order of replicas in the cluster definition.
+    std::sort(replica_names.begin(), replica_names.end());
+
+    /// Join with ',' by appending in place instead of building a temporary string per element.
+    String res;
+    for (auto it = replica_names.begin(); it != replica_names.end(); ++it)
+    {
+        if (it != replica_names.begin())
+            res += ',';
+        res += *it;
+    }
+
+    return res;
+}
+
+/// Creates a task bound to the given Replicated database.
+/// host_id_str is set to the database's full replica name.
+/// `database_` is dereferenced immediately, so it must not be null.
+DatabaseReplicatedTask::DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_)
+    : DDLTaskBase(name, path)
+    , database(database_)
+{
+    host_id_str = database->getFullReplicaName();
+}
+
+/// For a Replicated database task the shard ID is the database's shard_name.
+String DatabaseReplicatedTask::getShardID() const
+{
+    return database->shard_name;
+}
+
+/// Extends the base query context with what a Replicated database query needs:
+/// the current database is set to the Replicated database, and a ZooKeeperMetadataTransaction
+/// is attached and pre-filled with the requests that must be executed together with the query.
+std::unique_ptr<Context> DatabaseReplicatedTask::makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper)
+{
+    auto task_context = DDLTaskBase::makeQueryContext(from_context, zookeeper);
+    task_context->getClientInfo().query_kind = ClientInfo::QueryKind::SECONDARY_QUERY;
+    task_context->setCurrentDatabase(database->getDatabaseName());
+
+    auto txn = std::make_shared<ZooKeeperMetadataTransaction>(zookeeper, database->zookeeper_path, is_initial_query);
+    task_context->initZooKeeperMetadataTransaction(txn);
+
+    /// Number of this log entry in text form; evaluated at each point of use.
+    auto log_entry_number_str = [this] { return toString(getLogEntryNumber(entry_name)); };
+
+    if (is_initial_query)
+    {
+        /// Requests added only for the initial query: remove the "try" node, create the "committed" node
+        /// and update max_log_ptr of the database.
+        txn->addOp(zkutil::makeRemoveRequest(entry_path + "/try", -1));
+        txn->addOp(zkutil::makeCreateRequest(entry_path + "/committed", host_id_str, zkutil::CreateMode::Persistent));
+        txn->addOp(zkutil::makeSetRequest(database->zookeeper_path + "/max_log_ptr", log_entry_number_str(), -1));
+    }
+
+    /// Always update the log pointer of this replica.
+    txn->addOp(zkutil::makeSetRequest(database->replica_path + "/log_ptr", log_entry_number_str(), -1));
+
+    /// Hand over the requests accumulated in the task itself.
+    for (auto & op : ops)
+        txn->addOp(std::move(op));
+    ops.clear();
+
+    return task_context;
+}
+
+/// Builds the entry name for a log entry number: "query-" followed by the number
+/// left-padded with zeros to 10 digits, e.g. 42 -> "query-0000000042".
+String DDLTaskBase::getLogEntryName(UInt32 log_entry_number)
+{
+    constexpr size_t seq_node_digits = 10;
+    const String digits = toString(log_entry_number);
+
+    String entry_name_str = "query-";
+    entry_name_str.append(seq_node_digits - digits.size(), '0');
+    entry_name_str += digits;
+    return entry_name_str;
+}
+
+/// Inverse of getLogEntryName(): parses the number that follows the "query-" prefix.
+/// The prefix itself is only checked by an assertion.
+UInt32 DDLTaskBase::getLogEntryNumber(const String & log_entry_name)
+{
+    constexpr const char * prefix = "query-";
+    assert(startsWith(log_entry_name, prefix));
+
+    const String number_part = log_entry_name.substr(strlen(prefix));
+    return parse<UInt32>(number_part);
+}
+
+/// Executes all accumulated requests with a single multi() call on the ZooKeeper session.
+/// May only be called on a transaction in the CREATED state (asserted).
+void ZooKeeperMetadataTransaction::commit()
+{
+    assert(state == CREATED);
+    /// Mark as FAILED up front: if multi() throws, the transaction stays in the FAILED state.
+    state = FAILED;
+    current_zookeeper->multi(ops);
+    state = COMMITTED;
+}
+
+}
--- a/src/Interpreters/DDLTask.h
+++ b/src/Interpreters/DDLTask.h
@ -0,0 +1,195 @@
+#pragma once
+#include <Core/Types.h>
+#include <Interpreters/Cluster.h>
+#include <Common/ZooKeeper/Types.h>
+
+namespace Poco
+{
+class Logger;
+}
+
+namespace zkutil
+{
+class ZooKeeper;
+}
+
+namespace DB
+{
+
+class ASTQueryWithOnCluster;
+using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
+class DatabaseReplicated;
+
+class ZooKeeperMetadataTransaction;
+using ZooKeeperMetadataTransactionPtr = std::shared_ptr<ZooKeeperMetadataTransaction>;
+
+/// Identifies a host by its name and port.
+struct HostID
+{
+    String host_name;
+    /// Zero-initialized so that a default-constructed HostID never holds an indeterminate port.
+    UInt16 port = 0;
+
+    HostID() = default;
+
+    explicit HostID(const Cluster::Address & address)
+        : host_name(address.host_name), port(address.port) {}
+
+    /// Parses the string form understood by Cluster::Address::fromString().
+    static HostID fromString(const String & host_port_str);
+
+    /// String form produced by Cluster::Address::toString().
+    String toString() const
+    {
+        return Cluster::Address::toString(host_name, port);
+    }
+
+    /// Plain "host_name:port" form.
+    String readableString() const
+    {
+        return host_name + ":" + DB::toString(port);
+    }
+
+    bool isLocalAddress(UInt16 clickhouse_port) const;
+
+    /// Free-function style wrapper around toString(), usable as a callable in algorithms.
+    static String applyToString(const HostID & host_id)
+    {
+        return host_id.toString();
+    }
+};
+
+
+/// An entry of the distributed DDL log: the query text, the hosts listed for it and the initiator.
+struct DDLLogEntry
+{
+    String query;
+    std::vector<HostID> hosts;
+    String initiator; // optional
+
+    /// Version of the text format; parse() accepts only this version.
+    static constexpr int CURRENT_VERSION = 1;
+
+    /// Serializes the entry to its text form.
+    String toString() const;
+
+    /// Fills the entry from its text form; throws on an unknown format version.
+    void parse(const String & data);
+};
+
+/// Common state of a distributed DDL task; concrete tasks define how the shard ID is obtained
+/// and may customize the query context.
+struct DDLTaskBase
+{
+    /// Name of the log entry and the path it is stored under; the per-host node paths below are derived from the path.
+    const String entry_name;
+    const String entry_path;
+
+    DDLLogEntry entry;
+
+    /// String identifying the current host in the node paths below.
+    String host_id_str;
+    /// AST of the query; set by parseQueryFromEntry().
+    ASTPtr query;
+
+    bool is_initial_query = false;
+    bool is_circular_replicated = false;
+    bool execute_on_leader = false;
+
+    /// Extra requests of the task; DatabaseReplicatedTask::makeQueryContext() moves them into the metadata transaction.
+    Coordination::Requests ops;
+    ExecutionStatus execution_status;
+    bool was_executed = false;
+
+    std::atomic_bool completely_processed = false;
+
+    DDLTaskBase(const String & name, const String & path) : entry_name(name), entry_path(path) {}
+    DDLTaskBase(const DDLTaskBase &) = delete;
+    virtual ~DDLTaskBase() = default;
+
+    /// Parses entry.query into `query`.
+    void parseQueryFromEntry(const Context & context);
+
+    virtual String getShardID() const = 0;
+
+    /// Creates the context the task's query is executed with.
+    virtual std::unique_ptr<Context> makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper);
+
+    inline String getActiveNodePath() const { return entry_path + "/active/" + host_id_str; }
+    inline String getFinishedNodePath() const { return entry_path + "/finished/" + host_id_str; }
+    inline String getShardNodePath() const { return entry_path + "/shards/" + getShardID(); }
+
+    /// Conversions between a log entry number and its "query-NNNNNNNNNN" name.
+    static String getLogEntryName(UInt32 log_entry_number);
+    static UInt32 getLogEntryNumber(const String & log_entry_name);
+};
+
+/// Task whose hosts are looked up in a cluster definition.
+struct DDLTask : public DDLTaskBase
+{
+    DDLTask(const String & name, const String & path) : DDLTaskBase(name, path) {}
+
+    /// Finds the entry's host that refers to this server; returns false if there is none.
+    bool findCurrentHostID(const Context & global_context, Poco::Logger * log);
+
+    /// Resolves the cluster of the query and the position of the current host in it.
+    void setClusterInfo(const Context & context, Poco::Logger * log);
+
+    String getShardID() const override;
+
+private:
+    bool tryFindHostInCluster();
+    bool tryFindHostInClusterViaResolving(const Context & context);
+
+    HostID host_id;
+    String cluster_name;
+    ClusterPtr cluster;
+    Cluster::Address address_in_cluster;
+    /// Position of the current host inside `cluster`, set by the tryFindHostInCluster* methods.
+    /// Initialized so that the fields never hold indeterminate values before the host is located.
+    size_t host_shard_num = 0;
+    size_t host_replica_num = 0;
+};
+
+/// Task that belongs to a Replicated database.
+struct DatabaseReplicatedTask : public DDLTaskBase
+{
+    DatabaseReplicatedTask(const String & name, const String & path, DatabaseReplicated * database_);
+
+    String getShardID() const override;
+    /// Additionally attaches a ZooKeeperMetadataTransaction to the created context.
+    std::unique_ptr<Context> makeQueryContext(Context & from_context, const ZooKeeperPtr & zookeeper) override;
+
+    /// Raw pointer to the database the task belongs to.
+    /// NOTE(review): presumably non-owning and expected to outlive the task; confirm against the code that creates tasks.
+    DatabaseReplicated * database;
+};
+
+/// The main purpose of ZooKeeperMetadataTransaction is to execute all ZooKeeper operations related to a query
+/// in a single transaction, once all required checks have been performed and we are ready to "commit" the changes.
+/// For example, create ALTER_METADATA entry in ReplicatedMergeTree log,
+/// create path/to/entry/finished/host_id node in distributed DDL queue to mark query as executed and
+/// update metadata in path/to/replicated_database/metadata/table_name
+/// It's used for DatabaseReplicated.
+/// TODO we can also use it for ordinary ON CLUSTER queries
+class ZooKeeperMetadataTransaction
+{
+    /// Lifecycle: CREATED until the requests are committed or handed over;
+    /// FAILED if commit() was started but did not complete.
+    enum State
+    {
+        CREATED,
+        COMMITTED,
+        FAILED
+    };
+
+    State state = CREATED;
+    ZooKeeperPtr current_zookeeper;
+    String zookeeper_path;
+    bool is_initial_query;
+    /// Requests accumulated so far.
+    Coordination::Requests ops;
+
+public:
+    ZooKeeperMetadataTransaction(const ZooKeeperPtr & current_zookeeper_, const String & zookeeper_path_, bool is_initial_query_)
+    : current_zookeeper(current_zookeeper_)
+    , zookeeper_path(zookeeper_path_)
+    , is_initial_query(is_initial_query_)
+    {
+    }
+
+    bool isInitialQuery() const { return is_initial_query; }
+
+    /// True once commit() was attempted or the requests were handed over by moveOpsTo().
+    bool isExecuted() const { return state != CREATED; }
+
+    String getDatabaseZooKeeperPath() const { return zookeeper_path; }
+
+    /// Appends a request to the transaction. Must not be called after the transaction was executed (asserted).
+    void addOp(Coordination::RequestPtr && op)
+    {
+        assert(!isExecuted());
+        /// `op` is a named rvalue reference, i.e. an lvalue here: move explicitly instead of copying the pointer.
+        ops.emplace_back(std::move(op));
+    }
+
+    /// Hands all accumulated requests over to `other_ops` and marks the transaction as COMMITTED,
+    /// so the caller becomes responsible for executing them.
+    void moveOpsTo(Coordination::Requests & other_ops)
+    {
+        assert(!isExecuted());
+        std::move(ops.begin(), ops.end(), std::back_inserter(other_ops));
+        ops.clear();
+        state = COMMITTED;
+    }
+
+    /// Executes the accumulated requests in one multi-request.
+    void commit();
+
+    /// A transaction must not be destroyed unexecuted unless an exception is propagating.
+    /// std::uncaught_exceptions() replaces std::uncaught_exception(), which is deprecated in C++17 and removed in C++20.
+    ~ZooKeeperMetadataTransaction() { assert(isExecuted() || std::uncaught_exceptions() > 0); }
+};
+
+}
--- a/src/Interpreters/DDLWorker.cpp
+++ b/src/Interpreters/DDLWorker.cpp
--- a/src/Interpreters/DDLWorker.h
+++ b/src/Interpreters/DDLWorker.h
@ -1,15 +1,11 @@
 #pragma once

-#include <DataStreams/BlockIO.h>
-#include <Interpreters/Cluster.h>
-#include <Interpreters/Context.h>
-#include <Storages/IStorage_fwd.h>
-#include <Poco/Net/NetException.h>
 #include <Common/CurrentThread.h>
 #include <Common/DNSResolver.h>
 #include <Common/ThreadPool.h>
-#include <Common/isLocalAddress.h>
-#include <common/logger_useful.h>
+#include <Storages/IStorage_fwd.h>
+#include <Parsers/IAST_fwd.h>
+#include <Interpreters/Context.h>

 #include <atomic>
 #include <chrono>
@ -22,87 +18,36 @@ namespace zkutil
    class ZooKeeper;
 }

+namespace Poco
+{
+    class Logger;
+    namespace Util { class AbstractConfiguration; }
+}
+
+namespace Coordination
+{
+    struct Stat;
+}
+
 namespace DB
 {
-class Context;
 class ASTAlterQuery;
+struct DDLLogEntry;
+struct DDLTaskBase;
+using DDLTaskPtr = std::unique_ptr<DDLTaskBase>;
+using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
 class AccessRightsElements;

-struct HostID
-{
-    String host_name;
-    UInt16 port;
-
-    HostID() = default;
-
-    explicit HostID(const Cluster::Address & address) : host_name(address.host_name), port(address.port) { }
-
-    static HostID fromString(const String & host_port_str)
-    {
-        HostID res;
-        std::tie(res.host_name, res.port) = Cluster::Address::fromString(host_port_str);
-        return res;
-    }
-
-    String toString() const { return Cluster::Address::toString(host_name, port); }
-
-    String readableString() const { return host_name + ":" + DB::toString(port); }
-
-    bool isLocalAddress(UInt16 clickhouse_port) const
-    {
-        try
-        {
-            return DB::isLocalAddress(DNSResolver::instance().resolveAddress(host_name, port), clickhouse_port);
-        }
-        catch (const Poco::Net::NetException &)
-        {
-            /// Avoid "Host not found" exceptions
-            return false;
-        }
-    }
-
-    static String applyToString(const HostID & host_id) { return host_id.toString(); }
-};
-
-struct DDLLogEntry
-{
-    String query;
-    std::vector<HostID> hosts;
-    String initiator; // optional
-
-    static constexpr int CURRENT_VERSION = 1;
-
-public:
-    String toString();
-    void parse(const String & data);
-};
-
-struct DDLTask;
-using DDLTaskPtr = std::unique_ptr<DDLTask>;
-
-
-/// Pushes distributed DDL query to the queue
-BlockIO executeDDLQueryOnCluster(const ASTPtr & query_ptr, const Context & context);
-BlockIO executeDDLQueryOnCluster(
-    const ASTPtr & query_ptr,
-    const Context & context,
-    const AccessRightsElements & query_requires_access,
-    bool query_requires_grant_option = false);
-BlockIO executeDDLQueryOnCluster(
-    const ASTPtr & query_ptr,
-    const Context & context,
-    AccessRightsElements && query_requires_access,
-    bool query_requires_grant_option = false);
-

 class DDLWorker
 {
 public:
-    DDLWorker(int pool_size_, const std::string & zk_root_dir, Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix);
-    ~DDLWorker();
+    DDLWorker(int pool_size_, const std::string & zk_root_dir, const Context & context_, const Poco::Util::AbstractConfiguration * config, const String & prefix,
+              const String & logger_name = "DDLWorker", const CurrentMetrics::Metric * max_entry_metric_ = nullptr);
+    virtual ~DDLWorker();

    /// Pushes query into DDL queue, returns path to created node
-    String enqueueQuery(DDLLogEntry & entry);
+    virtual String enqueueQuery(DDLLogEntry & entry);

    /// Host ID (name:port) for logging purposes
    /// Note that in each task hosts are identified individually by name:port from initiator server cluster config
@ -111,30 +56,32 @@ public:
        return host_fqdn_id;
    }

-private:
-    using ZooKeeperPtr = std::shared_ptr<zkutil::ZooKeeper>;
+    void startup();
+    virtual void shutdown();
+
+    bool isCurrentlyActive() const { return initialized && !stop_flag; }
+
+protected:

    /// Returns cached ZooKeeper session (possibly expired).
    ZooKeeperPtr tryGetZooKeeper() const;
    /// If necessary, creates a new session and caches it.
    ZooKeeperPtr getAndSetZooKeeper();
-    /// ZooKeeper recover loop (while not stopped).
-    void recoverZooKeeper();

-    void checkCurrentTasks();
+    /// Iterates through queue tasks in ZooKeeper, runs execution of new tasks
    void scheduleTasks();
-    void saveTask(const String & entry_name);
+
+    DDLTaskBase & saveTask(DDLTaskPtr && task);

    /// Reads entry and check that the host belongs to host list of the task
    /// Returns non-empty DDLTaskPtr if entry parsed and the check is passed
-    DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper);
+    virtual DDLTaskPtr initAndCheckTask(const String & entry_name, String & out_reason, const ZooKeeperPtr & zookeeper);

-    void updateMaxDDLEntryID(const DDLTask & task);
-    void enqueueTask(DDLTaskPtr task);
-    void processTask(DDLTask & task);
+    void processTask(DDLTaskBase & task, const ZooKeeperPtr & zookeeper);
+    void updateMaxDDLEntryID(const String & entry_name);

    /// Check that query should be executed on leader replica only
-    static bool taskShouldBeExecutedOnLeader(const ASTPtr ast_ddl, StoragePtr storage);
+    static bool taskShouldBeExecutedOnLeader(const ASTPtr & ast_ddl, StoragePtr storage);

    /// Executes query only on leader replica in case of replicated table.
    /// Queries like TRUNCATE/ALTER .../OPTIMIZE have to be executed only on one node of shard.
@ -142,27 +89,27 @@ private:
    /// query via RemoteBlockOutputStream to leader, so to avoid such "2-phase" query execution we
    /// execute query directly on leader.
    bool tryExecuteQueryOnLeaderReplica(
-        DDLTask & task,
+        DDLTaskBase & task,
        StoragePtr storage,
        const String & rewritten_query,
        const String & node_path,
        const ZooKeeperPtr & zookeeper);

-    void parseQueryAndResolveHost(DDLTask & task);
-
-    bool tryExecuteQuery(const String & query, const DDLTask & task, ExecutionStatus & status);
+    bool tryExecuteQuery(const String & query, DDLTaskBase & task, const ZooKeeperPtr & zookeeper);

    /// Checks and cleanups queue's nodes
    void cleanupQueue(Int64 current_time_seconds, const ZooKeeperPtr & zookeeper);
+    virtual bool canRemoveQueueEntry(const String & entry_name, const Coordination::Stat & stat);

    /// Init task node
-    static void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper);
+    void createStatusDirs(const std::string & node_path, const ZooKeeperPtr & zookeeper);

+    virtual void initializeMainThread();

    void runMainThread();
    void runCleanupThread();

-private:
+protected:
    Context context;
    Poco::Logger * log;

@ -174,10 +121,12 @@ private:
    ZooKeeperPtr current_zookeeper;

    /// Save state of executed task to avoid duplicate execution on ZK error
-    std::vector<std::string> last_tasks;
+    std::optional<String> last_skipped_entry_name;
+    std::list<DDLTaskPtr> current_tasks;

    std::shared_ptr<Poco::Event> queue_updated_event = std::make_shared<Poco::Event>();
    std::shared_ptr<Poco::Event> cleanup_event = std::make_shared<Poco::Event>();
+    std::atomic<bool> initialized = false;
    std::atomic<bool> stop_flag = false;

    ThreadFromGlobalPool main_thread;
@ -195,9 +144,7 @@ private:
    size_t max_tasks_in_queue = 1000;

    std::atomic<UInt64> max_id = 0;
-
-    friend class DDLQueryStatusInputStream;
-    friend struct DDLTask;
+    const CurrentMetrics::Metric * max_entry_metric;
 };


--- a/src/Interpreters/DatabaseCatalog.cpp
+++ b/src/Interpreters/DatabaseCatalog.cpp
@ -609,7 +609,7 @@ DatabaseCatalog::updateDependency(const StorageID & old_from, const StorageID &
        view_dependencies[{new_from.getDatabaseName(), new_from.getTableName()}].insert(new_where);
 }

-std::unique_ptr<DDLGuard> DatabaseCatalog::getDDLGuard(const String & database, const String & table)
+DDLGuardPtr DatabaseCatalog::getDDLGuard(const String & database, const String & table)
 {
    std::unique_lock lock(ddl_guards_mutex);
    auto db_guard_iter = ddl_guards.try_emplace(database).first;
@ -956,36 +956,38 @@ DDLGuard::DDLGuard(Map & map_, std::shared_mutex & db_mutex_, std::unique_lock<s
    ++it->second.counter;
    guards_lock.unlock();
    table_lock = std::unique_lock(*it->second.mutex);
-    bool is_database = elem.empty();
-    if (!is_database)
+    is_database_guard = elem.empty();
+    if (!is_database_guard)
    {

        bool locked_database_for_read = db_mutex.try_lock_shared();
        if (!locked_database_for_read)
        {
-            removeTableLock();
+            releaseTableLock();
            throw Exception(ErrorCodes::UNKNOWN_DATABASE, "Database {} is currently dropped or renamed", database_name);
        }
    }
 }

-void DDLGuard::removeTableLock()
+void DDLGuard::releaseTableLock() noexcept
 {
+    if (table_lock_removed)
+        return;
+
+    table_lock_removed = true;
    guards_lock.lock();
-    --it->second.counter;
-    if (!it->second.counter)
-    {
+    UInt32 counter = --it->second.counter;
    table_lock.unlock();
+    if (counter == 0)
        map.erase(it);
-    }
+    guards_lock.unlock();
 }

 DDLGuard::~DDLGuard()
 {
-    bool is_database = it->first.empty();
-    if (!is_database)
+    if (!is_database_guard)
        db_mutex.unlock_shared();
-    removeTableLock();
+    releaseTableLock();
 }

 }
--- a/src/Interpreters/DatabaseCatalog.h
+++ b/src/Interpreters/DatabaseCatalog.h
@ -54,16 +54,21 @@ public:
    DDLGuard(Map & map_, std::shared_mutex & db_mutex_, std::unique_lock<std::mutex> guards_lock_, const String & elem, const String & database_name);
    ~DDLGuard();

+    /// Unlocks table name, keeps holding read lock for database name
+    void releaseTableLock() noexcept;
+
 private:
    Map & map;
    std::shared_mutex & db_mutex;
    Map::iterator it;
    std::unique_lock<std::mutex> guards_lock;
    std::unique_lock<std::mutex> table_lock;
-
-    void removeTableLock();
+    bool table_lock_removed = false;
+    bool is_database_guard = false;
 };

+using DDLGuardPtr = std::unique_ptr<DDLGuard>;
+

 /// Creates temporary table in `_temporary_and_external_tables` with randomly generated unique StorageID.
 /// Such table can be accessed from everywhere by its ID.
@ -117,7 +122,7 @@ public:
    void loadDatabases();

    /// Get an object that protects the table from concurrently executing multiple DDL operations.
-    std::unique_ptr<DDLGuard> getDDLGuard(const String & database, const String & table);
+    DDLGuardPtr getDDLGuard(const String & database, const String & table);
    /// Get an object that protects the database from concurrent DDL queries all tables in the database
    std::unique_lock<std::shared_mutex> getExclusiveDDLGuardForDatabase(const String & database);

--- a/src/Interpreters/FunctionNameNormalizer.cpp
+++ b/src/Interpreters/FunctionNameNormalizer.cpp
@ -0,0 +1,45 @@
+#include <Interpreters/FunctionNameNormalizer.h>
+
+#include <Parsers/ASTColumnDeclaration.h>
+#include <Parsers/ASTCreateQuery.h>
+
+namespace DB
+{
+
+const String & getFunctionCanonicalNameIfAny(const String & name);
+const String & getAggregateFunctionCanonicalNameIfAny(const String & name);
+
+/// Recursively replaces function names in the AST with the names returned by
+/// getFunctionCanonicalNameIfAny() and getAggregateFunctionCanonicalNameIfAny().
+/// A null `ast` is ignored.
+void FunctionNameNormalizer::visit(IAST * ast)
+{
+    if (ast == nullptr)
+        return;
+
+    // Normalize only selected children. Avoid normalizing engine clause because some engine might
+    // have the same name as function, e.g. Log.
+    if (auto * storage = ast->as<ASTStorage>())
+    {
+        visit(storage->partition_by);
+        visit(storage->primary_key);
+        visit(storage->order_by);
+        visit(storage->sample_by);
+        visit(storage->ttl_table);
+        return;
+    }
+
+    // Normalize only selected children. Avoid normalizing type clause because some type might
+    // have the same name as function, e.g. Date.
+    if (auto * column_declaration = ast->as<ASTColumnDeclaration>())
+    {
+        visit(column_declaration->default_expression.get());
+        visit(column_declaration->ttl.get());
+        return;
+    }
+
+    // A function node: look up the ordinary-function name first, then pass the result to the aggregate-function lookup.
+    if (auto * function = ast->as<ASTFunction>())
+        function->name = getAggregateFunctionCanonicalNameIfAny(getFunctionCanonicalNameIfAny(function->name));
+
+    // Any other node (and function nodes too): descend into all children.
+    for (auto & child_ast : ast->children)
+        visit(child_ast.get());
+}
+
+}
--- a/src/Interpreters/FunctionNameNormalizer.h
+++ b/src/Interpreters/FunctionNameNormalizer.h
@ -0,0 +1,14 @@
+#pragma once
+
+#include <Parsers/IAST.h>
+#include <Parsers/ASTFunction.h>
+
+namespace DB
+{
+
+struct FunctionNameNormalizer  // AST pass with no data members; the single static entry point does all the work.
+{
+    static void visit(IAST *);  // Defined in FunctionNameNormalizer.cpp: rewrites function names in the given subtree in place; a null pointer is a no-op.
+};
+
+}
--- a/src/Interpreters/InterpreterAlterQuery.cpp
+++ b/src/Interpreters/InterpreterAlterQuery.cpp
@ -1,5 +1,5 @@
 #include <Interpreters/InterpreterAlterQuery.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Interpreters/MutationsInterpreter.h>
 #include <Interpreters/AddDefaultDatabaseVisitor.h>
 #include <Interpreters/Context.h>
@ -16,6 +16,9 @@
 #include <Common/typeid_cast.h>
 #include <boost/range/algorithm_ext/push_back.hpp>
 #include <algorithm>
+#include <Databases/IDatabase.h>
+#include <Databases/DatabaseReplicated.h>
+#include <Databases/DatabaseFactory.h>


 namespace DB
@ -25,6 +28,7 @@ namespace ErrorCodes
 {
    extern const int LOGICAL_ERROR;
    extern const int INCORRECT_QUERY;
+    extern const int NOT_IMPLEMENTED;
 }


@ -38,11 +42,21 @@ BlockIO InterpreterAlterQuery::execute()
    BlockIO res;
    const auto & alter = query_ptr->as<ASTAlterQuery &>();

+
    if (!alter.cluster.empty())
        return executeDDLQueryOnCluster(query_ptr, context, getRequiredAccess());

    context.checkAccess(getRequiredAccess());
    auto table_id = context.resolveStorageID(alter, Context::ResolveOrdinary);
+
+    DatabasePtr database = DatabaseCatalog::instance().getDatabase(table_id.database_name);
+    if (typeid_cast<DatabaseReplicated *>(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY)
+    {
+        auto guard = DatabaseCatalog::instance().getDDLGuard(table_id.database_name, table_id.table_name);
+        guard->releaseTableLock();
+        return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context);
+    }
+
    StoragePtr table = DatabaseCatalog::instance().getTable(table_id, context);
    auto alter_lock = table->lockForAlter(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout);
    auto metadata_snapshot = table->getInMemoryMetadataPtr();
@ -80,6 +94,14 @@ BlockIO InterpreterAlterQuery::execute()
            throw Exception("Wrong parameter type in ALTER query", ErrorCodes::LOGICAL_ERROR);
    }

+    if (typeid_cast<DatabaseReplicated *>(database.get()))
+    {
+        int command_types_count = !mutation_commands.empty() + !partition_commands.empty() + !live_view_commands.empty() + !alter_commands.empty();
+        if (1 < command_types_count)
+            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "For Replicated databases it's not allowed "
+                                                         "to execute ALTERs of different types in single query");
+    }
+
    if (!mutation_commands.empty())
    {
        MutationsInterpreter(table, metadata_snapshot, mutation_commands, context, false).validate();
--- a/src/Interpreters/InterpreterCreateQuery.cpp
+++ b/src/Interpreters/InterpreterCreateQuery.cpp
@ -30,7 +30,8 @@
 #include <Storages/StorageInMemoryMetadata.h>

 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
+#include <Interpreters/Cluster.h>
 #include <Interpreters/ExpressionAnalyzer.h>
 #include <Interpreters/InterpreterCreateQuery.h>
 #include <Interpreters/InterpreterSelectWithUnionQuery.h>
@ -46,6 +47,7 @@
 #include <DataTypes/DataTypeNullable.h>

 #include <Databases/DatabaseFactory.h>
+#include <Databases/DatabaseReplicated.h>
 #include <Databases/IDatabase.h>
 #include <Databases/DatabaseOnDisk.h>

@ -56,6 +58,7 @@
 #include <Interpreters/InterpreterDropQuery.h>
 #include <Interpreters/QueryLog.h>
 #include <Interpreters/addTypeConversionToAST.h>
+#include <Interpreters/FunctionNameNormalizer.h>

 #include <TableFunctions/TableFunctionFactory.h>
 #include <common/logger_useful.h>
@ -79,6 +82,7 @@ namespace ErrorCodes
    extern const int ILLEGAL_SYNTAX_FOR_DATA_TYPE;
    extern const int ILLEGAL_COLUMN;
    extern const int LOGICAL_ERROR;
+    extern const int UNKNOWN_DATABASE;
    extern const int PATH_ACCESS_DENIED;
    extern const int NOT_IMPLEMENTED;
    extern const int UNKNOWN_TABLE;
@ -146,7 +150,7 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
        throw Exception(ErrorCodes::UNKNOWN_DATABASE_ENGINE, "Unknown database engine: {}", serializeAST(*create.storage));
    }

-    if (create.storage->engine->name == "Atomic")
+    if (create.storage->engine->name == "Atomic" || create.storage->engine->name == "Replicated")
    {
        if (create.attach && create.uuid == UUIDHelpers::Nil)
            throw Exception(ErrorCodes::INCORRECT_QUERY, "UUID must be specified for ATTACH. "
@ -205,6 +209,12 @@ BlockIO InterpreterCreateQuery::createDatabase(ASTCreateQuery & create)
                        "Enable allow_experimental_database_materialize_mysql to use it.", ErrorCodes::UNKNOWN_DATABASE_ENGINE);
    }

+    if (create.storage->engine->name == "Replicated" && !context.getSettingsRef().allow_experimental_database_replicated && !internal)
+    {
+        throw Exception("Replicated is an experimental database engine. "
+                        "Enable allow_experimental_database_replicated to use it.", ErrorCodes::UNKNOWN_DATABASE_ENGINE);
+    }
+
    DatabasePtr database = DatabaseFactory::get(create, metadata_path / "", context);

    if (create.uuid != UUIDHelpers::Nil)
@ -556,6 +566,11 @@ InterpreterCreateQuery::TableProperties InterpreterCreateQuery::setProperties(AS
    validateTableStructure(create, properties);
    /// Set the table engine if it was not specified explicitly.
    setEngine(create);
+
+    assert(as_database_saved.empty() && as_table_saved.empty());
+    std::swap(create.as_database, as_database_saved);
+    std::swap(create.as_table, as_table_saved);
+
    return properties;
 }

@ -702,6 +717,12 @@ void InterpreterCreateQuery::assertOrSetUUID(ASTCreateQuery & create, const Data
    const auto * kind = create.is_dictionary ? "Dictionary" : "Table";
    const auto * kind_upper = create.is_dictionary ? "DICTIONARY" : "TABLE";

+    if (database->getEngineName() == "Replicated" && context.getClientInfo().query_kind == ClientInfo::QueryKind::SECONDARY_QUERY && !internal)
+    {
+        if (create.uuid == UUIDHelpers::Nil)
+            throw Exception("Table UUID is not specified in DDL log", ErrorCodes::LOGICAL_ERROR);
+    }
+
    bool from_path = create.attach_from_path.has_value();

    if (database->getUUID() != UUIDHelpers::Nil)
@ -776,11 +797,11 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create)
            ErrorCodes::BAD_DATABASE_FOR_TEMPORARY_TABLE);

    String current_database = context.getCurrentDatabase();
+    auto database_name = create.database.empty() ? current_database : create.database;

    // If this is a stub ATTACH query, read the query definition from the database
    if (create.attach && !create.storage && !create.columns_list)
    {
-        auto database_name = create.database.empty() ? current_database : create.database;
        auto database = DatabaseCatalog::instance().getDatabase(database_name);
        bool if_not_exists = create.if_not_exists;

@ -800,19 +821,30 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create)

    if (create.attach_from_path)
    {
-        fs::path data_path = fs::path(*create.attach_from_path).lexically_normal();
        fs::path user_files = fs::path(context.getUserFilesPath()).lexically_normal();
+        fs::path root_path = fs::path(context.getPath()).lexically_normal();
+
+        if (context.getClientInfo().query_kind == ClientInfo::QueryKind::INITIAL_QUERY)
+        {
+            fs::path data_path = fs::path(*create.attach_from_path).lexically_normal();
            if (data_path.is_relative())
                data_path = (user_files / data_path).lexically_normal();
            if (!startsWith(data_path, user_files))
                throw Exception(ErrorCodes::PATH_ACCESS_DENIED,
                                "Data directory {} must be inside {} to attach it", String(data_path), String(user_files));

-        fs::path root_path = fs::path(context.getPath()).lexically_normal();
            /// Data path must be relative to root_path
            create.attach_from_path = fs::relative(data_path, root_path) / "";
        }
-    else if (create.attach && !create.attach_short_syntax)
+        else
+        {
+            fs::path data_path = (root_path / *create.attach_from_path).lexically_normal();
+            if (!startsWith(data_path, user_files))
+                throw Exception(ErrorCodes::PATH_ACCESS_DENIED,
+                                "Data directory {} must be inside {} to attach it", String(data_path), String(user_files));
+        }
+    }
+    else if (create.attach && !create.attach_short_syntax && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY)
    {
        auto * log = &Poco::Logger::get("InterpreterCreateQuery");
        LOG_WARNING(log, "ATTACH TABLE query with full table definition is not recommended: "
@ -836,11 +868,29 @@ BlockIO InterpreterCreateQuery::createTable(ASTCreateQuery & create)
    /// Set and retrieve list of columns, indices and constraints. Set table engine if needed. Rewrite query in canonical way.
    TableProperties properties = setProperties(create);

+    DatabasePtr database;
+    bool need_add_to_database = !create.temporary;
+    if (need_add_to_database)
+        database = DatabaseCatalog::instance().getDatabase(database_name);
+
+    if (need_add_to_database && database->getEngineName() == "Replicated")
+    {
+        auto guard = DatabaseCatalog::instance().getDDLGuard(create.database, create.table);
+        database = DatabaseCatalog::instance().getDatabase(create.database);
+        if (typeid_cast<DatabaseReplicated *>(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY)
+        {
+            assertOrSetUUID(create, database);
+            guard->releaseTableLock();
+            return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context);
+        }
+    }
+
    if (create.replace_table)
        return doCreateOrReplaceTable(create, properties);

    /// Actually creates table
    bool created = doCreateTable(create, properties);
+
    if (!created)   /// Table already exists
        return {};

@ -880,7 +930,8 @@ bool InterpreterCreateQuery::doCreateTable(ASTCreateQuery & create,
                drop_ast->table = create.table;
                drop_ast->no_ddl_lock = true;

-                InterpreterDropQuery interpreter(drop_ast, context);
+                Context drop_context = context;
+                InterpreterDropQuery interpreter(drop_ast, drop_context);
                interpreter.execute();
            }
            else
@ -1037,6 +1088,14 @@ BlockIO InterpreterCreateQuery::createDictionary(ASTCreateQuery & create)
    auto guard = DatabaseCatalog::instance().getDDLGuard(database_name, dictionary_name);
    DatabasePtr database = DatabaseCatalog::instance().getDatabase(database_name);

+    if (typeid_cast<DatabaseReplicated *>(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY)
+    {
+        if (!create.attach)
+            assertOrSetUUID(create, database);
+        guard->releaseTableLock();
+        return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context);
+    }
+
    if (database->isDictionaryExist(dictionary_name))
    {
        /// TODO Check structure of dictionary
@ -1118,6 +1177,7 @@ void InterpreterCreateQuery::prepareOnClusterQuery(ASTCreateQuery & create, cons

 BlockIO InterpreterCreateQuery::execute()
 {
+    FunctionNameNormalizer().visit(query_ptr.get());
    auto & create = query_ptr->as<ASTCreateQuery &>();
    if (!create.cluster.empty())
    {
@ -1189,15 +1249,14 @@ AccessRightsElements InterpreterCreateQuery::getRequiredAccess() const
    return required_access;
 }

-void InterpreterCreateQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, const Context &) const
+void InterpreterCreateQuery::extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr &, const Context &) const
 {
-    const auto & create = ast->as<const ASTCreateQuery &>();
    elem.query_kind = "Create";
-    if (!create.as_table.empty())
+    if (!as_table_saved.empty())
    {
-        String database = backQuoteIfNeed(create.as_database.empty() ? context.getCurrentDatabase() : create.as_database);
+        String database = backQuoteIfNeed(as_database_saved.empty() ? context.getCurrentDatabase() : as_database_saved);
        elem.query_databases.insert(database);
-        elem.query_tables.insert(database + "." + backQuoteIfNeed(create.as_table));
+        elem.query_tables.insert(database + "." + backQuoteIfNeed(as_table_saved));
    }
 }

--- a/src/Interpreters/InterpreterCreateQuery.h
+++ b/src/Interpreters/InterpreterCreateQuery.h
@ -95,5 +95,8 @@ private:
    /// Is this an internal query - not from the user.
    bool internal = false;
    bool force_attach = false;
+
+    mutable String as_database_saved;
+    mutable String as_table_saved;
 };
 }
--- a/src/Interpreters/InterpreterCreateQuotaQuery.cpp
+++ b/src/Interpreters/InterpreterCreateQuotaQuery.cpp
@ -2,7 +2,7 @@
 #include <Parsers/ASTCreateQuotaQuery.h>
 #include <Parsers/ASTRolesOrUsersSet.h>
 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Access/AccessControlManager.h>
 #include <Access/AccessFlags.h>
 #include <ext/range.h>
--- a/src/Interpreters/InterpreterCreateRoleQuery.cpp
+++ b/src/Interpreters/InterpreterCreateRoleQuery.cpp
@ -1,7 +1,7 @@
 #include <Interpreters/InterpreterCreateRoleQuery.h>
 #include <Parsers/ASTCreateRoleQuery.h>
 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Access/AccessControlManager.h>
 #include <Access/Role.h>

--- a/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp
+++ b/src/Interpreters/InterpreterCreateRowPolicyQuery.cpp
@ -4,7 +4,7 @@
 #include <Parsers/ASTRolesOrUsersSet.h>
 #include <Parsers/formatAST.h>
 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Access/AccessControlManager.h>
 #include <Access/AccessFlags.h>
 #include <boost/range/algorithm/sort.hpp>
--- a/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp
+++ b/src/Interpreters/InterpreterCreateSettingsProfileQuery.cpp
@ -2,7 +2,7 @@
 #include <Parsers/ASTCreateSettingsProfileQuery.h>
 #include <Parsers/ASTRolesOrUsersSet.h>
 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Access/AccessControlManager.h>
 #include <Access/SettingsProfile.h>
 #include <Access/AccessFlags.h>
--- a/src/Interpreters/InterpreterCreateUserQuery.cpp
+++ b/src/Interpreters/InterpreterCreateUserQuery.cpp
@ -1,7 +1,7 @@
 #include <Interpreters/InterpreterCreateUserQuery.h>
 #include <Interpreters/Context.h>
 #include <Interpreters/InterpreterSetRoleQuery.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Parsers/ASTCreateUserQuery.h>
 #include <Parsers/ASTUserNameWithHost.h>
 #include <Parsers/ASTRolesOrUsersSet.h>
--- a/src/Interpreters/InterpreterDropAccessEntityQuery.cpp
+++ b/src/Interpreters/InterpreterDropAccessEntityQuery.cpp
@ -2,7 +2,7 @@
 #include <Parsers/ASTDropAccessEntityQuery.h>
 #include <Parsers/ASTRowPolicyName.h>
 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Access/AccessControlManager.h>
 #include <Access/AccessFlags.h>
 #include <Access/User.h>
--- a/src/Interpreters/InterpreterDropQuery.cpp
+++ b/src/Interpreters/InterpreterDropQuery.cpp
@ -2,7 +2,7 @@

 #include <Databases/IDatabase.h>
 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Interpreters/InterpreterDropQuery.h>
 #include <Interpreters/ExternalDictionariesLoader.h>
 #include <Interpreters/QueryLog.h>
@ -12,6 +12,7 @@
 #include <Common/escapeForFileName.h>
 #include <Common/quoteString.h>
 #include <Common/typeid_cast.h>
+#include <Databases/DatabaseReplicated.h>

 #if !defined(ARCADIA_BUILD)
 #    include "config_core.h"
@ -32,6 +33,7 @@ namespace ErrorCodes
    extern const int UNKNOWN_TABLE;
    extern const int UNKNOWN_DICTIONARY;
    extern const int NOT_IMPLEMENTED;
+    extern const int INCORRECT_QUERY;
 }


@ -118,32 +120,55 @@ BlockIO InterpreterDropQuery::executeToTableImpl(const ASTDropQuery & query, Dat

    if (database && table)
    {
-        if (query_ptr->as<ASTDropQuery &>().is_view && !table->isView())
+        if (query.as<ASTDropQuery &>().is_view && !table->isView())
            throw Exception("Table " + table_id.getNameForLogs() + " is not a View", ErrorCodes::LOGICAL_ERROR);

        /// Now get UUID, so we can wait for table data to be finally dropped
        table_id.uuid = database->tryGetTableUUID(table_id.table_name);

+        /// Prevents recursive drop from drop database query. The original query must specify a table.
+        bool is_drop_or_detach_database = query_ptr->as<ASTDropQuery>()->table.empty();
+        bool is_replicated_ddl_query = typeid_cast<DatabaseReplicated *>(database.get()) &&
+                                       context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY &&
+                                       !is_drop_or_detach_database;
+        if (is_replicated_ddl_query)
+        {
+            if (query.kind == ASTDropQuery::Kind::Detach && !query.permanently)
+                throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH TABLE is not allowed for Replicated databases. "
+                                                             "Use DETACH TABLE PERMANENTLY or SYSTEM RESTART REPLICA");
+
+            if (query.kind == ASTDropQuery::Kind::Detach)
+                context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id);
+            else if (query.kind == ASTDropQuery::Kind::Truncate)
+                context.checkAccess(AccessType::TRUNCATE, table_id);
+            else if (query.kind == ASTDropQuery::Kind::Drop)
+                context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id);
+
+            ddl_guard->releaseTableLock();
+            table.reset();
+            return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query.clone(), context);
+        }
+
        if (query.kind == ASTDropQuery::Kind::Detach)
        {
            context.checkAccess(table->isView() ? AccessType::DROP_VIEW : AccessType::DROP_TABLE, table_id);
            table->checkTableCanBeDetached();
            table->shutdown();
            TableExclusiveLockHolder table_lock;
+
            if (database->getUUID() == UUIDHelpers::Nil)
                table_lock = table->lockExclusively(context.getCurrentQueryId(), context.getSettingsRef().lock_acquire_timeout);

            if (query.permanently)
            {
                /// Drop table from memory, don't touch data, metadata file renamed and will be skipped during server restart
-                database->detachTablePermanently(table_id.table_name);
+                database->detachTablePermanently(context, table_id.table_name);
            }
            else
            {
                /// Drop table from memory, don't touch data and metadata
                database->detachTable(table_id.table_name);
            }
-
        }
        else if (query.kind == ASTDropQuery::Kind::Truncate)
        {
@ -194,6 +219,21 @@ BlockIO InterpreterDropQuery::executeToDictionary(

    DatabasePtr database = tryGetDatabase(database_name, if_exists);

+    bool is_drop_or_detach_database = query_ptr->as<ASTDropQuery>()->table.empty();
+    bool is_replicated_ddl_query = typeid_cast<DatabaseReplicated *>(database.get()) &&
+                                   context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY &&
+                                   !is_drop_or_detach_database;
+    if (is_replicated_ddl_query)
+    {
+        if (kind == ASTDropQuery::Kind::Detach)
+            throw Exception(ErrorCodes::INCORRECT_QUERY, "DETACH DICTIONARY is not allowed for Replicated databases.");
+
+        context.checkAccess(AccessType::DROP_DICTIONARY, database_name, dictionary_name);
+
+        ddl_guard->releaseTableLock();
+        return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context);
+    }
+
    if (!database || !database->isDictionaryExist(dictionary_name))
    {
        if (!if_exists)
@ -307,6 +347,8 @@ BlockIO InterpreterDropQuery::executeToDatabaseImpl(const ASTDropQuery & query,
            if (database->getEngineName() == "MaterializeMySQL")
                stopDatabaseSynchronization(database);
 #endif
+            if (auto * replicated = typeid_cast<DatabaseReplicated *>(database.get()))
+                replicated->stopReplication();

            if (database->shouldBeEmptyOnDetach())
            {
--- a/src/Interpreters/InterpreterGrantQuery.cpp
+++ b/src/Interpreters/InterpreterGrantQuery.cpp
@ -2,7 +2,7 @@
 #include <Parsers/ASTGrantQuery.h>
 #include <Parsers/ASTRolesOrUsersSet.h>
 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Access/AccessControlManager.h>
 #include <Access/ContextAccess.h>
 #include <Access/RolesOrUsersSet.h>
--- a/src/Interpreters/InterpreterKillQueryQuery.cpp
+++ b/src/Interpreters/InterpreterKillQueryQuery.cpp
@ -2,7 +2,7 @@
 #include <Parsers/ASTKillQueryQuery.h>
 #include <Parsers/queryToString.h>
 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Interpreters/ProcessList.h>
 #include <Interpreters/executeQuery.h>
 #include <Interpreters/CancellationCode.h>
--- a/src/Interpreters/InterpreterOptimizeQuery.cpp
+++ b/src/Interpreters/InterpreterOptimizeQuery.cpp
@ -1,7 +1,7 @@
 #include <Storages/IStorage.h>
 #include <Parsers/ASTOptimizeQuery.h>
 #include <Interpreters/Context.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Interpreters/InterpreterOptimizeQuery.h>
 #include <Access/AccessRightsElement.h>
 #include <Common/typeid_cast.h>
--- a/src/Interpreters/InterpreterRenameQuery.cpp
+++ b/src/Interpreters/InterpreterRenameQuery.cpp
@ -3,14 +3,20 @@
 #include <Interpreters/Context.h>
 #include <Interpreters/InterpreterRenameQuery.h>
 #include <Storages/IStorage.h>
-#include <Interpreters/DDLWorker.h>
+#include <Interpreters/executeDDLQueryOnCluster.h>
 #include <Interpreters/QueryLog.h>
 #include <Access/AccessRightsElement.h>
+#include <Common/typeid_cast.h>
+#include <Databases/DatabaseReplicated.h>


 namespace DB
 {

+namespace ErrorCodes
+{
+    extern const int NOT_IMPLEMENTED;
+}

 InterpreterRenameQuery::InterpreterRenameQuery(const ASTPtr & query_ptr_, Context & context_)
    : query_ptr(query_ptr_), context(context_)
@ -61,10 +67,10 @@ BlockIO InterpreterRenameQuery::execute()
    if (rename.database)
        return executeToDatabase(rename, descriptions);
    else
-        return executeToTables(rename, descriptions);
+        return executeToTables(rename, descriptions, table_guards);
 }

-BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions)
+BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions, TableGuards & ddl_guards)
 {
    auto & database_catalog = DatabaseCatalog::instance();

@ -73,7 +79,22 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c
        if (!rename.exchange)
            database_catalog.assertTableDoesntExist(StorageID(elem.to_database_name, elem.to_table_name), context);

-        database_catalog.getDatabase(elem.from_database_name)->renameTable(
+        DatabasePtr database = database_catalog.getDatabase(elem.from_database_name);
+        if (typeid_cast<DatabaseReplicated *>(database.get()) && context.getClientInfo().query_kind != ClientInfo::QueryKind::SECONDARY_QUERY)
+        {
+            if (1 < descriptions.size())
+                throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Database {} is Replicated, "
+                                "it does not support renaming of multiple tables in single query.", elem.from_database_name);
+
+            UniqueTableName from(elem.from_database_name, elem.from_table_name);
+            UniqueTableName to(elem.to_database_name, elem.to_table_name);
+            ddl_guards[from]->releaseTableLock();
+            ddl_guards[to]->releaseTableLock();
+            return typeid_cast<DatabaseReplicated *>(database.get())->tryEnqueueReplicatedDDL(query_ptr, context);
+        }
+        else
+        {
+            database->renameTable(
                context,
                elem.from_table_name,
                *database_catalog.getDatabase(elem.to_database_name),
@ -81,6 +102,7 @@ BlockIO InterpreterRenameQuery::executeToTables(const ASTRenameQuery & rename, c
                rename.exchange,
                rename.dictionary);
        }
+    }

    return {};
 }
--- a/src/Interpreters/InterpreterRenameQuery.h
+++ b/src/Interpreters/InterpreterRenameQuery.h
@ -57,7 +57,7 @@ public:
    void extendQueryLogElemImpl(QueryLogElement & elem, const ASTPtr & ast, const Context &) const override;

 private:
-    BlockIO executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions);
+    BlockIO executeToTables(const ASTRenameQuery & rename, const RenameDescriptions & descriptions, TableGuards & ddl_guards);
    static BlockIO executeToDatabase(const ASTRenameQuery & rename, const RenameDescriptions & descriptions);

    AccessRightsElements getRequiredAccess() const;
--- a/src/Interpreters/InterpreterSelectQuery.cpp
+++ b/src/Interpreters/InterpreterSelectQuery.cpp
@ -561,10 +561,20 @@ Block InterpreterSelectQuery::getSampleBlockImpl()
    if (storage && !options.only_analyze)
        from_stage = storage->getQueryProcessingStage(*context, options.to_stage, query_info);

-    /// Do I need to perform the first part of the pipeline - running on remote servers during distributed processing.
+    /// Do I need to perform the first part of the pipeline?
+    /// Running on remote servers during distributed processing or if query is not distributed.
+    ///
+    /// Also note that with distributed_group_by_no_merge=1 or when there is
+    /// only one remote server, it is equal to local query in terms of query
+    /// stages (or when due to optimize_distributed_group_by_sharding_key the query was processed up to Complete stage).
    bool first_stage = from_stage < QueryProcessingStage::WithMergeableState
        && options.to_stage >= QueryProcessingStage::WithMergeableState;
-    /// Do I need to execute the second part of the pipeline - running on the initiating server during distributed processing.
+    /// Do I need to execute the second part of the pipeline?
+    /// Running on the initiating server during distributed processing or if query is not distributed.
+    ///
+    /// Also note that with distributed_group_by_no_merge=2 (i.e. when optimize_distributed_group_by_sharding_key takes place)
+    /// the query on the remote server will be processed up to WithMergeableStateAfterAggregation,
+    /// so it will do a partial second stage (second_stage=true), and the initiator will do the final part.
    bool second_stage = from_stage <= QueryProcessingStage::WithMergeableState
        && options.to_stage > QueryProcessingStage::WithMergeableState;

@ -1093,9 +1103,15 @@ void InterpreterSelectQuery::executeImpl(QueryPlan & query_plan, const BlockInpu
                /** If there is an ORDER BY for distributed query processing,
                  *  but there is no aggregation, then on the remote servers ORDER BY was made
                  *  - therefore, we merge the sorted streams from remote servers.
+                  *
+                  * Also, if the remote servers processed the query up to WithMergeableStateAfterAggregation
+                  * (distributed_group_by_no_merge=2 or optimize_distributed_group_by_sharding_key=1 takes place),
+                  * then merging the sorted streams is enough, since the remote servers already did the full ORDER BY.
                  */

-                if (!expressions.first_stage && !expressions.need_aggregate && !(query.group_by_with_totals && !aggregate_final))
+                if (from_aggregation_stage)
+                    executeMergeSorted(query_plan, "for ORDER BY");
+                else if (!expressions.first_stage && !expressions.need_aggregate && !(query.group_by_with_totals && !aggregate_final))
                    executeMergeSorted(query_plan, "for ORDER BY");
                else    /// Otherwise, just sort.
                    executeOrder(query_plan, query_info.input_order_info);
@ -1269,8 +1285,11 @@ void InterpreterSelectQuery::executeFetchColumns(
        const auto & desc = query_analyzer->aggregates()[0];
        const auto & func = desc.function;
        std::optional<UInt64> num_rows{};
+
        if (!query.prewhere() && !query.where())
+        {
            num_rows = storage->totalRows(settings);
+        }
        else // It's possible to optimize count() given only partition predicates
        {
            SelectQueryInfo temp_query_info;
@ -1280,6 +1299,7 @@ void InterpreterSelectQuery::executeFetchColumns(

            num_rows = storage->totalRowsByPartitionPredicate(temp_query_info, *context);
        }
+
        if (num_rows)
        {
            AggregateFunctionCount & agg_count = static_cast<AggregateFunctionCount &>(*func);
@ -1774,7 +1794,7 @@ void InterpreterSelectQuery::executeMergeAggregated(QueryPlan & query_plan, bool
    auto merging_aggregated = std::make_unique<MergingAggregatedStep>(
            query_plan.getCurrentDataStream(),
            std::move(transform_params),
-            settings.distributed_aggregation_memory_efficient,
+            settings.distributed_aggregation_memory_efficient && storage && storage->isRemote(),
            settings.max_threads,
            settings.aggregation_memory_efficient_merge_threads);

--- a/Show More
+++ b/Show More