Merge branch 'master' into fix-wrong-diagnistic-in-local

2024-11-23 08:02:02 +00:00 · 2024-11-16 02:34:15 +01:00 · 2024-11-16 02:34:15 +01:00 · 06b81efbdc
commit 06b81efbdc
parent 9da554391b 97b34ec4ff
281 changed files with 5075 additions and 2488 deletions
--- a/README.md
+++ b/README.md
@ -42,7 +42,6 @@ Keep an eye out for upcoming meetups and events around the world. Somewhere else

 Upcoming meetups

-* [Barcelona Meetup](https://www.meetup.com/clickhouse-spain-user-group/events/303096876/) - November 12
 * [Ghent Meetup](https://www.meetup.com/clickhouse-belgium-user-group/events/303049405/) - November 19
 * [Dubai Meetup](https://www.meetup.com/clickhouse-dubai-meetup-group/events/303096989/) - November 21
 * [Paris Meetup](https://www.meetup.com/clickhouse-france-user-group/events/303096434) - November 26
@ -53,6 +52,7 @@ Upcoming meetups

 Recently completed meetups

+* [Barcelona Meetup](https://www.meetup.com/clickhouse-spain-user-group/events/303096876/) - November 12
 * [Madrid Meetup](https://www.meetup.com/clickhouse-spain-user-group/events/303096564/) - October 22
 * [Singapore Meetup](https://www.meetup.com/clickhouse-singapore-meetup-group/events/303212064/) - October 3
 * [Jakarta Meetup](https://www.meetup.com/clickhouse-indonesia-user-group/events/303191359/) - October 1
--- a/base/base/BFloat16.h
+++ b/base/base/BFloat16.h
@ -0,0 +1,313 @@
+#pragma once
+
+#include <bit>
+#include <base/types.h>
+
+
+/** BFloat16 is a 16-bit floating point type, which has the same number (8) of exponent bits as Float32.
+  * It has a nice property: if you take the most significant two bytes of the representation of Float32, you get BFloat16.
+  * It is different from the IEEE Float16 (half precision) data type, which has fewer exponent bits and more mantissa bits.
+  *
+  * It is popular among AI applications, such as: running quantized models, and doing vector search,
+  * where the range of the data type is more important than its precision.
+  *
+  * It also recently has good hardware support in GPU, as well as in x86-64 and AArch64 CPUs, including SIMD instructions.
+  * But it is rarely utilized by compilers.
+  *
+  * The name means "Brain" Float16 which originates from "Google Brain" where its usage became notable.
+  * It is also known under the name "bf16". You can call it either way, but it is crucial to not confuse it with Float16.
+  *
+  * Here is a manual implementation of this data type. Only required operations are implemented.
+  * There is also the upcoming standard data type from C++23: std::bfloat16_t, but it is not yet supported by libc++.
+  * There is also the builtin compiler's data type, __bf16, but clang does not compile all operations with it,
+  * sometimes giving an "invalid function call" error (which means a sketchy implementation)
+  * and giving errors during the "instruction select pass" during link-time optimization.
+  *
+  * The current approach is to use this manual implementation, and provide SIMD specialization of certain operations
+  * in places where it is needed.
+  */
+class BFloat16
+{
+private:
+    /// Raw representation, from the most significant bit down: sign (1 bit), exponent (8 bits), mantissa (7 bits).
+    UInt16 x = 0;
+
+public:
+    constexpr BFloat16() = default;
+    constexpr BFloat16(const BFloat16 & other) = default;
+    constexpr BFloat16 & operator=(const BFloat16 & other) = default;
+
+    /// Keeps the most significant 16 bits of the Float32 representation;
+    /// the low 16 bits are simply discarded (truncation, no rounding).
+    explicit constexpr BFloat16(const Float32 & other)
+    {
+        x = static_cast<UInt16>(std::bit_cast<UInt32>(other) >> 16);
+    }
+
+    /// Any other source type is converted to Float32 first and then narrowed as above.
+    template <typename T>
+    explicit constexpr BFloat16(const T & other)
+        : BFloat16(Float32(other))
+    {
+    }
+
+    /// Assignment from an arbitrary type goes through the explicit converting constructor.
+    template <typename T>
+    constexpr BFloat16 & operator=(const T & other)
+    {
+        *this = BFloat16(other);
+        return *this;
+    }
+
+    /// Widening to Float32 is exact: the stored bits become the upper half, the lower half is zero.
+    explicit constexpr operator Float32() const
+    {
+        return std::bit_cast<Float32>(static_cast<UInt32>(x) << 16);
+    }
+
+    /// Conversion to any other type goes through Float32.
+    template <typename T>
+    explicit constexpr operator T() const
+    {
+        return T(Float32(*this));
+    }
+
+    /// False when all exponent bits are set (the encodings of infinities and NaNs).
+    constexpr bool isFinite() const
+    {
+        return (x & 0b0111111110000000) != 0b0111111110000000;
+    }
+
+    /// True when all exponent bits are set and the mantissa is non-zero.
+    constexpr bool isNaN() const
+    {
+        return !isFinite() && (x & 0b0000000001111111) != 0b0000000000000000;
+    }
+
+    /// True when the sign bit (the most significant bit) is set.
+    constexpr bool signBit() const
+    {
+        return x & 0b1000000000000000;
+    }
+
+    /// Absolute value: clears the sign bit and keeps the exponent and mantissa intact.
+    constexpr BFloat16 abs() const
+    {
+        BFloat16 res;
+        /// The mask must be applied with AND: OR-ing it in would set every exponent and mantissa bit
+        /// (producing a NaN bit pattern) while leaving the sign bit untouched.
+        res.x = x & 0b0111111111111111;
+        return res;
+    }
+
+    /// Equality of two BFloat16 values compares the raw bit patterns.
+    constexpr bool operator==(const BFloat16 & other) const
+    {
+        return x == other.x;
+    }
+
+    constexpr bool operator!=(const BFloat16 & other) const
+    {
+        return x != other.x;
+    }
+
+    /// Arithmetic is performed in Float32 and the result is narrowed back to BFloat16.
+    constexpr BFloat16 operator+(const BFloat16 & other) const
+    {
+        return BFloat16(Float32(*this) + Float32(other));
+    }
+
+    constexpr BFloat16 operator-(const BFloat16 & other) const
+    {
+        return BFloat16(Float32(*this) - Float32(other));
+    }
+
+    constexpr BFloat16 operator*(const BFloat16 & other) const
+    {
+        return BFloat16(Float32(*this) * Float32(other));
+    }
+
+    constexpr BFloat16 operator/(const BFloat16 & other) const
+    {
+        return BFloat16(Float32(*this) / Float32(other));
+    }
+
+    /// Compound assignments are expressed through the corresponding binary operators.
+    constexpr BFloat16 & operator+=(const BFloat16 & other)
+    {
+        *this = *this + other;
+        return *this;
+    }
+
+    constexpr BFloat16 & operator-=(const BFloat16 & other)
+    {
+        *this = *this - other;
+        return *this;
+    }
+
+    constexpr BFloat16 & operator*=(const BFloat16 & other)
+    {
+        *this = *this * other;
+        return *this;
+    }
+
+    constexpr BFloat16 & operator/=(const BFloat16 & other)
+    {
+        *this = *this / other;
+        return *this;
+    }
+
+    /// Negation flips the sign bit only.
+    constexpr BFloat16 operator-() const
+    {
+        BFloat16 res;
+        res.x = x ^ 0b1000000000000000;
+        return res;
+    }
+};
+
+
+/// Mixed-type equality: the BFloat16 operand is widened to Float32 and compared with the other operand.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator==(const BFloat16 & lhs, const T & rhs)
+{
+    return static_cast<Float32>(lhs) == rhs;
+}
+
+/// The same comparison with the BFloat16 operand on the right-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator==(const T & lhs, const BFloat16 & rhs)
+{
+    return lhs == static_cast<Float32>(rhs);
+}
+
+/// Mixed-type inequality: the BFloat16 operand is widened to Float32 and compared with the other operand.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator!=(const BFloat16 & lhs, const T & rhs)
+{
+    return static_cast<Float32>(lhs) != rhs;
+}
+
+/// The same comparison with the BFloat16 operand on the right-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator!=(const T & lhs, const BFloat16 & rhs)
+{
+    return lhs != static_cast<Float32>(rhs);
+}
+
+/// Mixed-type "less than": the BFloat16 operand is widened to Float32 before comparing.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator<(const BFloat16 & lhs, const T & rhs)
+{
+    return static_cast<Float32>(lhs) < rhs;
+}
+
+/// The same comparison with the BFloat16 operand on the right-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator<(const T & lhs, const BFloat16 & rhs)
+{
+    return lhs < static_cast<Float32>(rhs);
+}
+
+/// Two BFloat16 values are ordered by their Float32 values.
+constexpr bool operator<(BFloat16 lhs, BFloat16 rhs)
+{
+    return static_cast<Float32>(lhs) < static_cast<Float32>(rhs);
+}
+
+/// Mixed-type "greater than": the BFloat16 operand is widened to Float32 before comparing.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator>(const BFloat16 & lhs, const T & rhs)
+{
+    return static_cast<Float32>(lhs) > rhs;
+}
+
+/// The same comparison with the BFloat16 operand on the right-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator>(const T & lhs, const BFloat16 & rhs)
+{
+    return lhs > static_cast<Float32>(rhs);
+}
+
+/// Two BFloat16 values are ordered by their Float32 values.
+constexpr bool operator>(BFloat16 lhs, BFloat16 rhs)
+{
+    return static_cast<Float32>(lhs) > static_cast<Float32>(rhs);
+}
+
+
+/// Mixed-type "less than or equal": the BFloat16 operand is widened to Float32 before comparing.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator<=(const BFloat16 & lhs, const T & rhs)
+{
+    return static_cast<Float32>(lhs) <= rhs;
+}
+
+/// The same comparison with the BFloat16 operand on the right-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator<=(const T & lhs, const BFloat16 & rhs)
+{
+    return lhs <= static_cast<Float32>(rhs);
+}
+
+/// Two BFloat16 values are ordered by their Float32 values.
+constexpr bool operator<=(BFloat16 lhs, BFloat16 rhs)
+{
+    return static_cast<Float32>(lhs) <= static_cast<Float32>(rhs);
+}
+
+/// Mixed-type "greater than or equal": the BFloat16 operand is widened to Float32 before comparing.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator>=(const BFloat16 & lhs, const T & rhs)
+{
+    return static_cast<Float32>(lhs) >= rhs;
+}
+
+/// The same comparison with the BFloat16 operand on the right-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr bool operator>=(const T & lhs, const BFloat16 & rhs)
+{
+    return lhs >= static_cast<Float32>(rhs);
+}
+
+/// Two BFloat16 values are ordered by their Float32 values.
+constexpr bool operator>=(BFloat16 lhs, BFloat16 rhs)
+{
+    return static_cast<Float32>(lhs) >= static_cast<Float32>(rhs);
+}
+
+
+/// Mixed-type addition: the BFloat16 operand is widened to Float32;
+/// the result type is whatever `T + Float32` yields (it is not narrowed back to BFloat16).
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr auto operator+(T lhs, BFloat16 rhs)
+{
+    return lhs + static_cast<Float32>(rhs);
+}
+
+/// The same operation with the BFloat16 operand on the left-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr auto operator+(BFloat16 lhs, T rhs)
+{
+    return static_cast<Float32>(lhs) + rhs;
+}
+
+/// Mixed-type subtraction: the BFloat16 operand is widened to Float32;
+/// the result type is whatever `T - Float32` yields (it is not narrowed back to BFloat16).
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr auto operator-(T lhs, BFloat16 rhs)
+{
+    return lhs - static_cast<Float32>(rhs);
+}
+
+/// The same operation with the BFloat16 operand on the left-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr auto operator-(BFloat16 lhs, T rhs)
+{
+    return static_cast<Float32>(lhs) - rhs;
+}
+
+/// Mixed-type multiplication: the BFloat16 operand is widened to Float32;
+/// the result type is whatever `T * Float32` yields (it is not narrowed back to BFloat16).
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr auto operator*(T lhs, BFloat16 rhs)
+{
+    return lhs * static_cast<Float32>(rhs);
+}
+
+/// The same operation with the BFloat16 operand on the left-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr auto operator*(BFloat16 lhs, T rhs)
+{
+    return static_cast<Float32>(lhs) * rhs;
+}
+
+/// Mixed-type division: the BFloat16 operand is widened to Float32;
+/// the result type is whatever `T / Float32` yields (it is not narrowed back to BFloat16).
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr auto operator/(T lhs, BFloat16 rhs)
+{
+    return lhs / static_cast<Float32>(rhs);
+}
+
+/// The same operation with the BFloat16 operand on the left-hand side.
+template <typename T>
+requires(!std::is_same_v<T, BFloat16>)
+constexpr auto operator/(BFloat16 lhs, T rhs)
+{
+    return static_cast<Float32>(lhs) / rhs;
+}
--- a/base/base/DecomposedFloat.h
+++ b/base/base/DecomposedFloat.h
@ -10,6 +10,15 @@

 template <typename T> struct FloatTraits;

+/// Bit layout of BFloat16: 1 sign bit, 8 exponent bits and 7 mantissa bits in a 16-bit word.
+template <>
+struct FloatTraits<BFloat16>
+{
+    using UInt = uint16_t;
+
+    static constexpr size_t bits = 16;
+    static constexpr size_t exponent_bits = 8;
+    static constexpr size_t mantissa_bits = 7;
+
+    /// One bit is reserved for the sign.
+    static_assert(1 + exponent_bits + mantissa_bits == bits);
+};
+
 template <>
 struct FloatTraits<float>
 {
@ -87,6 +96,15 @@ struct DecomposedFloat
                && ((mantissa() & ((1ULL << (Traits::mantissa_bits - normalizedExponent())) - 1)) == 0));
    }

+    /// A value is finite unless every bit of its exponent field is set.
+    bool isFinite() const
+    {
+        constexpr auto all_ones_exponent = (1ULL << Traits::exponent_bits) - 1;
+        return exponent() != all_ones_exponent;
+    }
+
+    /// NaN: the value is not finite and its mantissa is non-zero.
+    bool isNaN() const
+    {
+        if (isFinite())
+            return false;
+        return mantissa() != 0;
+    }

    /// Compare float with integer of arbitrary width (both signed and unsigned are supported). Assuming two's complement arithmetic.
    /// This function is generic, big integers (128, 256 bit) are supported as well.
@ -212,3 +230,4 @@ struct DecomposedFloat

 using DecomposedFloat64 = DecomposedFloat<double>;
 using DecomposedFloat32 = DecomposedFloat<float>;
+using DecomposedFloat16 = DecomposedFloat<BFloat16>;
--- a/base/base/EnumReflection.h
+++ b/base/base/EnumReflection.h
@ -4,7 +4,7 @@
 #include <fmt/format.h>


-template <class T> concept is_enum = std::is_enum_v<T>;
+template <typename T> concept is_enum = std::is_enum_v<T>;

 namespace detail
 {
--- a/base/base/TypeLists.h
+++ b/base/base/TypeLists.h
@ -9,10 +9,11 @@ namespace DB
 {

 using TypeListNativeInt = TypeList<UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64>;
-using TypeListFloat = TypeList<Float32, Float64>;
-using TypeListNativeNumber = TypeListConcat<TypeListNativeInt, TypeListFloat>;
+using TypeListNativeFloat = TypeList<Float32, Float64>;
+using TypeListNativeNumber = TypeListConcat<TypeListNativeInt, TypeListNativeFloat>;
 using TypeListWideInt = TypeList<UInt128, Int128, UInt256, Int256>;
 using TypeListInt = TypeListConcat<TypeListNativeInt, TypeListWideInt>;
+using TypeListFloat = TypeListConcat<TypeListNativeFloat, TypeList<BFloat16>>;
 using TypeListIntAndFloat = TypeListConcat<TypeListInt, TypeListFloat>;
 using TypeListDecimal = TypeList<Decimal32, Decimal64, Decimal128, Decimal256>;
 using TypeListNumber = TypeListConcat<TypeListIntAndFloat, TypeListDecimal>;
--- a/base/base/TypeName.h
+++ b/base/base/TypeName.h
@ -32,6 +32,7 @@ TN_MAP(Int32)
 TN_MAP(Int64)
 TN_MAP(Int128)
 TN_MAP(Int256)
+TN_MAP(BFloat16)
 TN_MAP(Float32)
 TN_MAP(Float64)
 TN_MAP(String)
--- a/base/base/extended_types.h
+++ b/base/base/extended_types.h
@ -4,6 +4,8 @@

 #include <base/types.h>
 #include <base/wide_integer.h>
+#include <base/BFloat16.h>
+

 using Int128 = wide::integer<128, signed>;
 using UInt128 = wide::integer<128, unsigned>;
@ -24,6 +26,7 @@ struct is_signed // NOLINT(readability-identifier-naming)

 template <> struct is_signed<Int128> { static constexpr bool value = true; };
 template <> struct is_signed<Int256> { static constexpr bool value = true; };
+template <> struct is_signed<BFloat16> { static constexpr bool value = true; };

 template <typename T>
 inline constexpr bool is_signed_v = is_signed<T>::value;
@ -40,15 +43,13 @@ template <> struct is_unsigned<UInt256> { static constexpr bool value = true; };
 template <typename T>
 inline constexpr bool is_unsigned_v = is_unsigned<T>::value;

-template <class T> concept is_integer =
+template <typename T> concept is_integer =
    std::is_integral_v<T>
    || std::is_same_v<T, Int128>
    || std::is_same_v<T, UInt128>
    || std::is_same_v<T, Int256>
    || std::is_same_v<T, UInt256>;

-template <class T> concept is_floating_point = std::is_floating_point_v<T>;
-
 template <typename T>
 struct is_arithmetic // NOLINT(readability-identifier-naming)
 {
@ -59,11 +60,16 @@ template <> struct is_arithmetic<Int128> { static constexpr bool value = true; }
 template <> struct is_arithmetic<UInt128> { static constexpr bool value = true; };
 template <> struct is_arithmetic<Int256> { static constexpr bool value = true; };
 template <> struct is_arithmetic<UInt256> { static constexpr bool value = true; };
-
+template <> struct is_arithmetic<BFloat16> { static constexpr bool value = true; };

 template <typename T>
 inline constexpr bool is_arithmetic_v = is_arithmetic<T>::value;

+/// Satisfied by the standard floating point types and, additionally, by BFloat16.
+template <typename T> concept is_floating_point =
+    std::is_floating_point_v<T>
+    || std::is_same_v<T, BFloat16>;
+
+
 #define FOR_EACH_ARITHMETIC_TYPE(M) \
    M(DataTypeDate) \
    M(DataTypeDate32) \
@ -80,6 +86,7 @@ inline constexpr bool is_arithmetic_v = is_arithmetic<T>::value;
    M(DataTypeUInt128) \
    M(DataTypeInt256) \
    M(DataTypeUInt256) \
+    M(DataTypeBFloat16) \
    M(DataTypeFloat32) \
    M(DataTypeFloat64)

@ -99,6 +106,7 @@ inline constexpr bool is_arithmetic_v = is_arithmetic<T>::value;
    M(DataTypeUInt128, X) \
    M(DataTypeInt256, X) \
    M(DataTypeUInt256, X) \
+    M(DataTypeBFloat16, X) \
    M(DataTypeFloat32, X) \
    M(DataTypeFloat64, X)

--- a/ci/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt
+++ b/ci/jobs/scripts/check_style/aspell-ignore/en/aspell-dict.txt
@ -3131,3 +3131,4 @@ DistributedCachePoolBehaviourOnLimit
 SharedJoin
 ShareSet
 unacked
+BFloat
--- a/cmake/cpu_features.cmake
+++ b/cmake/cpu_features.cmake
@ -74,6 +74,7 @@ elseif (ARCH_AARCH64)
        #          introduced as optional, either in v8.2 [7] or in v8.4 [8].
        # rcpc:    Load-Acquire RCpc Register. Better support of release/acquire of atomics. Good for allocators and high contention code.
        #          Optional in v8.2, mandatory in v8.3 [9]. Supported in Graviton >=2, Azure and GCP instances.
+        # bf16:    Bfloat16, a half-precision floating point format developed by Google Brain. Optional in v8.2, mandatory in v8.6.
        #
        # [1]  https://github.com/aws/aws-graviton-getting-started/blob/main/c-c%2B%2B.md
        # [2]  https://community.arm.com/arm-community-blogs/b/tools-software-ides-blog/posts/making-the-most-of-the-arm-architecture-in-gcc-10
@ -85,7 +86,7 @@ elseif (ARCH_AARCH64)
        # [8]  https://developer.arm.com/documentation/102651/a/What-are-dot-product-intructions-
        # [9]  https://developer.arm.com/documentation/dui0801/g/A64-Data-Transfer-Instructions/LDAPR?lang=en
        # [10] https://github.com/aws/aws-graviton-getting-started/blob/main/README.md
-        set (COMPILER_FLAGS "${COMPILER_FLAGS} -march=armv8.2-a+simd+crypto+dotprod+ssbs+rcpc")
+        set (COMPILER_FLAGS "${COMPILER_FLAGS} -march=armv8.2-a+simd+crypto+dotprod+ssbs+rcpc+bf16")
    endif ()

    # Best-effort check: The build generates and executes intermediate binaries, e.g. protoc and llvm-tablegen. If we build on ARM for ARM
--- a/cmake/linux/default_libs.cmake
+++ b/cmake/linux/default_libs.cmake
@ -3,8 +3,7 @@

 set (DEFAULT_LIBS "-nodefaultlibs")

-# We need builtins from Clang's RT even without libcxx - for ubsan+int128.
-# See https://bugs.llvm.org/show_bug.cgi?id=16404
+# We need builtins from Clang
 execute_process (COMMAND
    ${CMAKE_CXX_COMPILER} --target=${CMAKE_CXX_COMPILER_TARGET} --print-libgcc-file-name --rtlib=compiler-rt
    OUTPUT_VARIABLE BUILTINS_LIBRARY
--- a/docs/en/operations/server-configuration-parameters/settings.md
+++ b/docs/en/operations/server-configuration-parameters/settings.md
@ -597,6 +597,30 @@ If number of tables is greater than this value, server will throw an exception.
 <max_table_num_to_throw>400</max_table_num_to_throw>
 ```

+## max\_replicated\_table\_num\_to\_throw {#max-replicated-table-num-to-throw}
+If the number of replicated tables is greater than this value, the server will throw an exception. 0 means no limitation. Only databases with the Atomic/Ordinary/Replicated/Lazy engines are taken into account.
+
+**Example**
+```xml
+<max_replicated_table_num_to_throw>400</max_replicated_table_num_to_throw>
+```
+
+## max\_dictionary\_num\_to\_throw {#max-dictionary-num-to-throw}
+If the number of dictionaries is greater than this value, the server will throw an exception. 0 means no limitation. Only databases with the Atomic/Ordinary/Replicated/Lazy engines are taken into account.
+
+**Example**
+```xml
+<max_dictionary_num_to_throw>400</max_dictionary_num_to_throw>
+```
+
+## max\_view\_num\_to\_throw {#max-view-num-to-throw}
+If the number of views is greater than this value, the server will throw an exception. 0 means no limitation. Only databases with the Atomic/Ordinary/Replicated/Lazy engines are taken into account.
+
+**Example**
+```xml
+<max_view_num_to_throw>400</max_view_num_to_throw>
+```
+
 ## max\_database\_num\_to\_throw {#max-database-num-to-throw}
 If the number of databases is greater than this value, the server will throw an exception. 0 means no limitation.
 Default value: 0
--- a/docs/en/sql-reference/data-types/float.md
+++ b/docs/en/sql-reference/data-types/float.md
@ -1,10 +1,10 @@
 ---
 slug: /en/sql-reference/data-types/float
 sidebar_position: 4
-sidebar_label: Float32, Float64
+sidebar_label: Float32, Float64, BFloat16
 ---

-# Float32, Float64
+# Float32, Float64, BFloat16

 :::note
 If you need accurate calculations, in particular if you work with financial or business data requiring a high precision, you should consider using [Decimal](../data-types/decimal.md) instead. 
@ -117,3 +117,11 @@ SELECT 0 / 0
 ```

 See the rules for `NaN` sorting in the section [ORDER BY clause](../../sql-reference/statements/select/order-by.md).
+
+## BFloat16
+
+`BFloat16` is a 16-bit floating point data type with 8-bit exponent, sign, and 7-bit mantissa.
+
+It is useful for machine learning and AI applications.
+
+ClickHouse supports conversions between `Float32` and `BFloat16`. Most other operations are not supported.
--- a/docs/en/sql-reference/functions/date-time-functions.md
+++ b/docs/en/sql-reference/functions/date-time-functions.md
@ -4489,9 +4489,9 @@ Using replacement fields, you can define a pattern for the resulting string.
 | k           | clockhour of day (1~24)                  | number        | 24                                 |
 | m           | minute of hour                           | number        | 30                                 |
 | s           | second of minute                         | number        | 55                                 |
-| S           | fraction of second (not supported yet)   | number        | 978                                |
-| z           | time zone (short name not supported yet) | text          | Pacific Standard Time; PST         |
-| Z           | time zone offset/id (not supported yet)  | zone          | -0800; -08:00; America/Los_Angeles |
+| S           | fraction of second                       | number        | 978                                |
+| z           | time zone                                | text          | Eastern Standard Time; EST         |
+| Z           | time zone offset                         | zone          | -0800; -0812                       |
 | '           | escape for text                          | delimiter     |                                    |
 | ''          | single quote                             | literal       | '                                  |

--- a/docs/en/sql-reference/functions/type-conversion-functions.md
+++ b/docs/en/sql-reference/functions/type-conversion-functions.md
@ -6867,9 +6867,53 @@ Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that

 Same as for [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax) except that it returns `NULL` when it encounters a date format that cannot be processed.

+## parseDateTime64
+
+Converts a [String](../data-types/string.md) to [DateTime64](../data-types/datetime64.md) according to a [MySQL format string](https://dev.mysql.com/doc/refman/8.0/en/date-and-time-functions.html#function_date-format).
+
+**Syntax**
+
+``` sql
+parseDateTime64(str[, format[, timezone]])
+```
+
+**Arguments**
+
+- `str` — The String to be parsed.
+- `format` — The format string. Optional. `%Y-%m-%d %H:%i:%s.%f` if not specified.
+- `timezone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md#timezone). Optional.
+
+**Returned value(s)**
+
+Returns [DateTime64](../data-types/datetime64.md) type values parsed from input string according to a MySQL-style format string.
+
+## parseDateTime64OrZero
+
+Same as for [parseDateTime64](#parsedatetime64) except that it returns zero date when it encounters a date format that cannot be processed.
+
+## parseDateTime64OrNull
+
+Same as for [parseDateTime64](#parsedatetime64) except that it returns `NULL` when it encounters a date format that cannot be processed.
+
 ## parseDateTime64InJodaSyntax

-Similar to [parseDateTimeInJodaSyntax](#parsedatetimeinjodasyntax). Differently, it returns a value of type [DateTime64](../data-types/datetime64.md).
+Converts a [String](../data-types/string.md) to [DateTime64](../data-types/datetime64.md) according to a [Joda format string](https://joda-time.sourceforge.net/apidocs/org/joda/time/format/DateTimeFormat.html).
+
+**Syntax**
+
+``` sql
+parseDateTime64InJodaSyntax(str[, format[, timezone]])
+```
+
+**Arguments**
+
+- `str` — The String to be parsed.
+- `format` — The format string. Optional. `yyyy-MM-dd HH:mm:ss` if not specified.
+- `timezone` — [Timezone](/docs/en/operations/server-configuration-parameters/settings.md#timezone). Optional.
+
+**Returned value(s)**
+
+Returns [DateTime64](../data-types/datetime64.md) type values parsed from input string according to a Joda-style format string.

 ## parseDateTime64InJodaSyntaxOrZero

--- a/docs/en/sql-reference/statements/explain.md
+++ b/docs/en/sql-reference/statements/explain.md
@ -161,6 +161,8 @@ Settings:
 - `actions` — Prints detailed information about step actions. Default: 0.
 - `json` — Prints query plan steps as a row in [JSON](../../interfaces/formats.md#json) format. Default: 0. It is recommended to use [TSVRaw](../../interfaces/formats.md#tabseparatedraw) format to avoid unnecessary escaping.

+When `json=1`, step names will contain an additional suffix with a unique step identifier.
+
 Example:

 ```sql
@ -194,30 +196,25 @@ EXPLAIN json = 1, description = 0 SELECT 1 UNION ALL SELECT 2 FORMAT TSVRaw;
  {
    "Plan": {
      "Node Type": "Union",
+      "Node Id": "Union_10",
      "Plans": [
        {
          "Node Type": "Expression",
+          "Node Id": "Expression_13",
          "Plans": [
            {
-              "Node Type": "SettingQuotaAndLimits",
-              "Plans": [
-                {
-                  "Node Type": "ReadFromStorage"
-                }
-              ]
+              "Node Type": "ReadFromStorage",
+              "Node Id": "ReadFromStorage_0"
            }
          ]
        },
        {
          "Node Type": "Expression",
+          "Node Id": "Expression_16",
          "Plans": [
            {
-              "Node Type": "SettingQuotaAndLimits",
-              "Plans": [
-                {
-                  "Node Type": "ReadFromStorage"
-                }
-              ]
+              "Node Type": "ReadFromStorage",
+              "Node Id": "ReadFromStorage_4"
            }
          ]
        }
@ -249,6 +246,7 @@ EXPLAIN json = 1, description = 0, header = 1 SELECT 1, 2 + dummy;
  {
    "Plan": {
      "Node Type": "Expression",
+      "Node Id": "Expression_5",
      "Header": [
        {
          "Name": "1",
@ -259,18 +257,10 @@ EXPLAIN json = 1, description = 0, header = 1 SELECT 1, 2 + dummy;
          "Type": "UInt16"
        }
      ],
-      "Plans": [
-        {
-          "Node Type": "SettingQuotaAndLimits",
-          "Header": [
-            {
-              "Name": "dummy",
-              "Type": "UInt8"
-            }
-          ],
      "Plans": [
        {
          "Node Type": "ReadFromStorage",
+          "Node Id": "ReadFromStorage_0",
          "Header": [
            {
              "Name": "dummy",
@ -280,8 +270,6 @@ EXPLAIN json = 1, description = 0, header = 1 SELECT 1, 2 + dummy;
        }
      ]
    }
-      ]
-    }
  }
 ]
 ```
@ -351,17 +339,31 @@ EXPLAIN json = 1, actions = 1, description = 0 SELECT 1 FORMAT TSVRaw;
  {
    "Plan": {
      "Node Type": "Expression",
+      "Node Id": "Expression_5",
      "Expression": {
-        "Inputs": [],
+        "Inputs": [
+          {
+            "Name": "dummy",
+            "Type": "UInt8"
+          }
+        ],
        "Actions": [
          {
-            "Node Type": "Column",
+            "Node Type": "INPUT",
            "Result Type": "UInt8",
-            "Result Type": "Column",
+            "Result Name": "dummy",
+            "Arguments": [0],
+            "Removed Arguments": [0],
+            "Result": 0
+          },
+          {
+            "Node Type": "COLUMN",
+            "Result Type": "UInt8",
+            "Result Name": "1",
            "Column": "Const(UInt8)",
            "Arguments": [],
            "Removed Arguments": [],
-            "Result": 0
+            "Result": 1
          }
        ],
        "Outputs": [
@ -370,17 +372,12 @@ EXPLAIN json = 1, actions = 1, description = 0 SELECT 1 FORMAT TSVRaw;
            "Type": "UInt8"
          }
        ],
-        "Positions": [0],
-        "Project Input": true
+        "Positions": [1]
      },
      "Plans": [
        {
-          "Node Type": "SettingQuotaAndLimits",
-          "Plans": [
-            {
-              "Node Type": "ReadFromStorage"
-            }
-          ]
+          "Node Type": "ReadFromStorage",
+          "Node Id": "ReadFromStorage_0"
        }
      ]
    }
@ -396,6 +393,8 @@ Settings:
 - `graph` — Prints a graph described in the [DOT](https://en.wikipedia.org/wiki/DOT_(graph_description_language)) graph description language. Default: 0.
 - `compact` — Prints graph in compact mode if `graph` setting is enabled. Default: 1.

+When `compact=0` and `graph=1`, processor names will contain an additional suffix with a unique processor identifier.
+
 Example:

 ```sql
--- a/docs/ru/operations/settings/settings.md
+++ b/docs/ru/operations/settings/settings.md
@ -136,7 +136,7 @@ ClickHouse применяет настройку в тех случаях, ко
 -   0 — выключена.
 -   1 — включена.

-Значение по умолчанию: 0.
+Значение по умолчанию: 1.

 ## http_zlib_compression_level {#settings-http_zlib_compression_level}

--- a/docs/zh/operations/settings/settings.md
+++ b/docs/zh/operations/settings/settings.md
@ -97,7 +97,7 @@ ClickHouse从表的过时副本中选择最相关的副本。
 -   0 — Disabled.
 -   1 — Enabled.

-默认值：0。
+默认值：1。

 ## http_zlib_compression_level {#settings-http_zlib_compression_level}

--- a/programs/benchmark/Benchmark.cpp
+++ b/programs/benchmark/Benchmark.cpp
@ -7,7 +7,6 @@
 #include <random>
 #include <string_view>
 #include <pcg_random.hpp>
-#include <Poco/UUID.h>
 #include <Poco/UUIDGenerator.h>
 #include <Poco/Util/Application.h>
 #include <Common/Stopwatch.h>
@ -152,8 +151,6 @@ public:
        global_context->setClientName(std::string(DEFAULT_CLIENT_NAME));
        global_context->setQueryKindInitial();

-        std::cerr << std::fixed << std::setprecision(3);
-
        /// This is needed to receive blocks with columns of AggregateFunction data type
        /// (example: when using stage = 'with_mergeable_state')
        registerAggregateFunctions();
@ -226,6 +223,8 @@ private:
    ContextMutablePtr global_context;
    QueryProcessingStage::Enum query_processing_stage;

+    WriteBufferFromFileDescriptor log{STDERR_FILENO};
+
    std::atomic<size_t> consecutive_errors{0};

    /// Don't execute new queries after timelimit or SIGINT or exception
@ -303,16 +302,16 @@ private:
        }


-        std::cerr << "Loaded " << queries.size() << " queries.\n";
+        log << "Loaded " << queries.size() << " queries.\n" << flush;
    }


    void printNumberOfQueriesExecuted(size_t num)
    {
-        std::cerr << "\nQueries executed: " << num;
+        log << "\nQueries executed: " << num;
        if (queries.size() > 1)
-            std::cerr << " (" << (num * 100.0 / queries.size()) << "%)";
-        std::cerr << ".\n";
+            log << " (" << (num * 100.0 / queries.size()) << "%)";
+        log << ".\n" << flush;
    }

    /// Try push new query and check cancellation conditions
@ -339,9 +338,10 @@ private:

            if (interrupt_listener.check())
            {
-                std::cout << "Stopping launch of queries. SIGINT received." << std::endl;
+                std::cout << "Stopping launch of queries. SIGINT received.\n";
                return false;
            }
+        }

        double seconds = delay_watch.elapsedSeconds();
        if (delay > 0 && seconds > delay)
@ -352,7 +352,6 @@ private:
                : report(comparison_info_per_interval, seconds);
            delay_watch.restart();
        }
-        }

        return true;
    }
@ -438,16 +437,16 @@ private:
            catch (...)
            {
                std::lock_guard lock(mutex);
-                std::cerr << "An error occurred while processing the query " << "'" << query << "'"
-                          << ": " << getCurrentExceptionMessage(false) << std::endl;
+                log << "An error occurred while processing the query " << "'" << query << "'"
+                          << ": " << getCurrentExceptionMessage(false) << '\n';
                if (!(continue_on_errors || max_consecutive_errors > ++consecutive_errors))
                {
                    shutdown = true;
                    throw;
                }

-                std::cerr << getCurrentExceptionMessage(print_stacktrace,
-                    true /*check embedded stack trace*/) << std::endl;
+                log << getCurrentExceptionMessage(print_stacktrace,
+                    true /*check embedded stack trace*/) << '\n' << flush;

                size_t info_index = round_robin ? 0 : connection_index;
                ++comparison_info_per_interval[info_index]->errors;
@ -504,7 +503,7 @@ private:
    {
        std::lock_guard lock(mutex);

-        std::cerr << "\n";
+        log << "\n";
        for (size_t i = 0; i < infos.size(); ++i)
        {
            const auto & info = infos[i];
@ -524,31 +523,31 @@ private:
                    connection_description += conn->getDescription();
                }
            }
-            std::cerr
+            log
                << connection_description << ", "
-                    << "queries: " << info->queries << ", ";
+                << "queries: " << info->queries.load() << ", ";
            if (info->errors)
            {
-                std::cerr << "errors: " << info->errors << ", ";
+                log << "errors: " << info->errors << ", ";
            }
-            std::cerr
-                    << "QPS: " << (info->queries / seconds) << ", "
-                    << "RPS: " << (info->read_rows / seconds) << ", "
-                    << "MiB/s: " << (info->read_bytes / seconds / 1048576) << ", "
-                    << "result RPS: " << (info->result_rows / seconds) << ", "
-                    << "result MiB/s: " << (info->result_bytes / seconds / 1048576) << "."
+            log
+                << "QPS: " << fmt::format("{:.3f}", info->queries / seconds) << ", "
+                << "RPS: " << fmt::format("{:.3f}", info->read_rows / seconds) << ", "
+                << "MiB/s: " << fmt::format("{:.3f}", info->read_bytes / seconds / 1048576) << ", "
+                << "result RPS: " << fmt::format("{:.3f}", info->result_rows / seconds) << ", "
+                << "result MiB/s: " << fmt::format("{:.3f}", info->result_bytes / seconds / 1048576) << "."
                << "\n";
        }
-        std::cerr << "\n";
+        log << "\n";

        auto print_percentile = [&](double percent)
        {
-            std::cerr << percent << "%\t\t";
+            log << percent << "%\t\t";
            for (const auto & info : infos)
            {
-                std::cerr << info->sampler.quantileNearest(percent / 100.0) << " sec.\t";
+                log << fmt::format("{:.3f}", info->sampler.quantileNearest(percent / 100.0)) << " sec.\t";
            }
-            std::cerr << "\n";
+            log << "\n";
        };

        for (int percent = 0; percent <= 90; percent += 10)
@ -559,13 +558,15 @@ private:
        print_percentile(99.9);
        print_percentile(99.99);

-        std::cerr << "\n" << t_test.compareAndReport(confidence).second << "\n";
+        log << "\n" << t_test.compareAndReport(confidence).second << "\n";

        if (!cumulative)
        {
            for (auto & info : infos)
                info->clear();
        }
+
+        log.next();
    }

 public:
@ -741,7 +742,7 @@ int mainEntryClickHouseBenchmark(int argc, char ** argv)
    }
    catch (...)
    {
-        std::cerr << getCurrentExceptionMessage(print_stacktrace, true) << std::endl;
+        std::cerr << getCurrentExceptionMessage(print_stacktrace, true) << '\n';
        return getCurrentExceptionCode();
    }
 }
--- a/src/AggregateFunctions/AggregateFunctionAvg.h
+++ b/src/AggregateFunctions/AggregateFunctionAvg.h
@ -231,7 +231,7 @@ public:

    void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const final
    {
-        increment(place, static_cast<const ColVecType &>(*columns[0]).getData()[row_num]);
+        increment(place, Numerator(static_cast<const ColVecType &>(*columns[0]).getData()[row_num]));
        ++this->data(place).denominator;
    }

--- a/src/AggregateFunctions/AggregateFunctionDeltaSum.cpp
+++ b/src/AggregateFunctions/AggregateFunctionDeltaSum.cpp
@ -27,9 +27,9 @@ namespace
 template <typename T>
 struct AggregationFunctionDeltaSumData
 {
-    T sum = 0;
-    T last = 0;
-    T first = 0;
+    T sum{};
+    T last{};
+    T first{};
    bool seen = false;
 };

--- a/src/AggregateFunctions/AggregateFunctionDeltaSumTimestamp.cpp
+++ b/src/AggregateFunctions/AggregateFunctionDeltaSumTimestamp.cpp
@ -22,14 +22,21 @@ namespace ErrorCodes
 namespace
 {

+/** Due to a lack of proper code review, this code was contributed with a multiplication of template instantiations
+  * over all pairs of data types, and we deeply regret that.
+  *
+  * We cannot remove all combinations, because the binary representation of serialized data has to remain the same,
+  * but we can partially heal the wound by treating unsigned and signed data types in the same way.
+  */
+
 template <typename ValueType, typename TimestampType>
 struct AggregationFunctionDeltaSumTimestampData
 {
-    ValueType sum = 0;
-    ValueType first = 0;
-    ValueType last = 0;
-    TimestampType first_ts = 0;
-    TimestampType last_ts = 0;
+    ValueType sum{};
+    ValueType first{};
+    ValueType last{};
+    TimestampType first_ts{};
+    TimestampType last_ts{};
    bool seen = false;
 };

@ -37,23 +44,22 @@ template <typename ValueType, typename TimestampType>
 class AggregationFunctionDeltaSumTimestamp final
    : public IAggregateFunctionDataHelper<
        AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType>,
-        AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>
-      >
+        AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>>
 {
 public:
    AggregationFunctionDeltaSumTimestamp(const DataTypes & arguments, const Array & params)
        : IAggregateFunctionDataHelper<
            AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType>,
-            AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>
-        >{arguments, params, createResultType()}
-    {}
+            AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>>{arguments, params, createResultType()}
+    {
+    }

    AggregationFunctionDeltaSumTimestamp()
        : IAggregateFunctionDataHelper<
            AggregationFunctionDeltaSumTimestampData<ValueType, TimestampType>,
-            AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>
-        >{}
-    {}
+            AggregationFunctionDeltaSumTimestamp<ValueType, TimestampType>>{}
+    {
+    }

    bool allocatesMemoryInArena() const override { return false; }

@ -63,8 +69,8 @@ public:

    void NO_SANITIZE_UNDEFINED ALWAYS_INLINE add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
    {
-        auto value = assert_cast<const ColumnVector<ValueType> &>(*columns[0]).getData()[row_num];
-        auto ts = assert_cast<const ColumnVector<TimestampType> &>(*columns[1]).getData()[row_num];
+        auto value = unalignedLoad<ValueType>(columns[0]->getRawData().data() + row_num * sizeof(ValueType));
+        auto ts = unalignedLoad<TimestampType>(columns[1]->getRawData().data() + row_num * sizeof(TimestampType));

        auto & data = this->data(place);

@ -172,10 +178,48 @@ public:

    void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
    {
-        assert_cast<ColumnVector<ValueType> &>(to).getData().push_back(this->data(place).sum);
+        static_cast<ColumnFixedSizeHelper &>(to).template insertRawData<sizeof(ValueType)>(
+            reinterpret_cast<const char *>(&this->data(place).sum));
    }
 };

+
+/// Second-stage dispatch: FirstType is already fixed, the second template argument is chosen from `second_type`.
+/// Int32/Int64 share the UInt32/UInt64 instantiations, Date uses UInt16, DateTime uses UInt32; other types give nullptr.
+/// NOTE(review): if the template orders values with `<`, negative signed inputs compare as large unsigned ones - confirm.
+template <typename FirstType, template <typename, typename> class AggregateFunctionTemplate, typename... TArgs>
+IAggregateFunction * createWithTwoTypesSecond(const IDataType & second_type, TArgs && ... args)
+{
+    switch (WhichDataType(second_type).idx)
+    {
+        case TypeIndex::Date: return new AggregateFunctionTemplate<FirstType, UInt16>(args...);
+        case TypeIndex::UInt32: case TypeIndex::Int32: case TypeIndex::DateTime: return new AggregateFunctionTemplate<FirstType, UInt32>(args...);
+        case TypeIndex::UInt64: case TypeIndex::Int64: return new AggregateFunctionTemplate<FirstType, UInt64>(args...);
+        case TypeIndex::Float32: return new AggregateFunctionTemplate<FirstType, Float32>(args...);
+        case TypeIndex::Float64: return new AggregateFunctionTemplate<FirstType, Float64>(args...);
+        default: return nullptr;
+    }
+}
+
+/// First-stage dispatch: chooses the first template argument from `first_type`, then lets
+/// createWithTwoTypesSecond choose the second one from `second_type`.
+/// Every signed integer width is routed to the unsigned instantiation of the same width.
+/// Returns nullptr when either type has no matching instantiation.
+template <template <typename, typename> class AggregateFunctionTemplate, typename... TArgs>
+IAggregateFunction * createWithTwoTypes(const IDataType & first_type, const IDataType & second_type, TArgs && ... args)
+{
+    switch (WhichDataType(first_type).idx)
+    {
+        case TypeIndex::UInt8: case TypeIndex::Int8: return createWithTwoTypesSecond<UInt8, AggregateFunctionTemplate>(second_type, args...);
+        case TypeIndex::UInt16: case TypeIndex::Int16: return createWithTwoTypesSecond<UInt16, AggregateFunctionTemplate>(second_type, args...);
+        case TypeIndex::UInt32: case TypeIndex::Int32: return createWithTwoTypesSecond<UInt32, AggregateFunctionTemplate>(second_type, args...);
+        case TypeIndex::UInt64: case TypeIndex::Int64: return createWithTwoTypesSecond<UInt64, AggregateFunctionTemplate>(second_type, args...);
+        case TypeIndex::Float32: return createWithTwoTypesSecond<Float32, AggregateFunctionTemplate>(second_type, args...);
+        case TypeIndex::Float64: return createWithTwoTypesSecond<Float64, AggregateFunctionTemplate>(second_type, args...);
+        default: return nullptr;
+    }
+}
+
 AggregateFunctionPtr createAggregateFunctionDeltaSumTimestamp(
    const String & name,
    const DataTypes & arguments,
@ -193,8 +237,14 @@ AggregateFunctionPtr createAggregateFunctionDeltaSumTimestamp(
        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function {}, "
                        "must be Int, Float, Date, DateTime", arguments[1]->getName(), name);

-    return AggregateFunctionPtr(createWithTwoNumericOrDateTypes<AggregationFunctionDeltaSumTimestamp>(
+    auto res = AggregateFunctionPtr(createWithTwoTypes<AggregationFunctionDeltaSumTimestamp>(
        *arguments[0], *arguments[1], arguments, params));
+
+    if (!res)
+        throw Exception(ErrorCodes::ILLEGAL_TYPE_OF_ARGUMENT, "Illegal type {} of argument for aggregate function {}, "
+            "this type is not supported", arguments[0]->getName(), name);
+
+    return res;
 }
 }

--- a/src/AggregateFunctions/AggregateFunctionGroupArray.cpp
+++ b/src/AggregateFunctions/AggregateFunctionGroupArray.cpp
@ -79,7 +79,7 @@ template <typename T>
 struct GroupArraySamplerData
 {
    /// For easy serialization.
-    static_assert(std::has_unique_object_representations_v<T> || std::is_floating_point_v<T>);
+    static_assert(std::has_unique_object_representations_v<T> || is_floating_point<T>);

    // Switch to ordinary Allocator after 4096 bytes to avoid fragmentation and trash in Arena
    using Allocator = MixedAlignedArenaAllocator<alignof(T), 4096>;
@ -120,7 +120,7 @@ template <typename T>
 struct GroupArrayNumericData<T, false>
 {
    /// For easy serialization.
-    static_assert(std::has_unique_object_representations_v<T> || std::is_floating_point_v<T>);
+    static_assert(std::has_unique_object_representations_v<T> || is_floating_point<T>);

    // Switch to ordinary Allocator after 4096 bytes to avoid fragmentation and trash in Arena
    using Allocator = MixedAlignedArenaAllocator<alignof(T), 4096>;
--- a/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.cpp
+++ b/src/AggregateFunctions/AggregateFunctionGroupArrayMoving.cpp
@ -38,7 +38,7 @@ template <typename T>
 struct MovingData
 {
    /// For easy serialization.
-    static_assert(std::has_unique_object_representations_v<T> || std::is_floating_point_v<T>);
+    static_assert(std::has_unique_object_representations_v<T> || is_floating_point<T>);

    using Accumulator = T;

--- a/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.cpp
+++ b/src/AggregateFunctions/AggregateFunctionIntervalLengthSum.cpp
@ -187,7 +187,7 @@ public:

    static DataTypePtr createResultType()
    {
-        if constexpr (std::is_floating_point_v<T>)
+        if constexpr (is_floating_point<T>)
            return std::make_shared<DataTypeFloat64>();
        return std::make_shared<DataTypeUInt64>();
    }
@ -227,7 +227,7 @@ public:

    void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
    {
-        if constexpr (std::is_floating_point_v<T>)
+        if constexpr (is_floating_point<T>)
            assert_cast<ColumnFloat64 &>(to).getData().push_back(getIntervalLengthSum<Float64>(this->data(place)));
        else
            assert_cast<ColumnUInt64 &>(to).getData().push_back(getIntervalLengthSum<UInt64>(this->data(place)));
--- a/src/AggregateFunctions/AggregateFunctionMaxIntersections.cpp
+++ b/src/AggregateFunctions/AggregateFunctionMaxIntersections.cpp
@ -155,9 +155,9 @@ public:

    void insertResultInto(AggregateDataPtr __restrict place, IColumn & to, Arena *) const override
    {
-        Int64 current_intersections = 0;
-        Int64 max_intersections = 0;
-        PointType position_of_max_intersections = 0;
+        Int64 current_intersections{};
+        Int64 max_intersections{};
+        PointType position_of_max_intersections{};

        /// const_cast because we will sort the array
        auto & array = this->data(place).value;
--- a/src/AggregateFunctions/AggregateFunctionSparkbar.cpp
+++ b/src/AggregateFunctions/AggregateFunctionSparkbar.cpp
@ -45,12 +45,12 @@ struct AggregateFunctionSparkbarData
    Y insert(const X & x, const Y & y)
    {
        if (isNaN(y) || y <= 0)
-            return 0;
+            return {};

        auto [it, inserted] = points.insert({x, y});
        if (!inserted)
        {
-            if constexpr (std::is_floating_point_v<Y>)
+            if constexpr (is_floating_point<Y>)
            {
                it->getMapped() += y;
                return it->getMapped();
@ -173,13 +173,13 @@ private:

        if (from_x >= to_x)
        {
-            size_t sz = updateFrame(values, 8);
+            size_t sz = updateFrame(values, Y{8});
            values.push_back('\0');
            offsets.push_back(offsets.empty() ? sz + 1 : offsets.back() + sz + 1);
            return;
        }

-        PaddedPODArray<Y> histogram(width, 0);
+        PaddedPODArray<Y> histogram(width, Y{0});
        PaddedPODArray<UInt64> count_histogram(width, 0); /// The number of points in each bucket

        for (const auto & point : data.points)
@ -197,7 +197,7 @@ private:

            Y res;
            bool has_overfllow = false;
-            if constexpr (std::is_floating_point_v<Y>)
+            if constexpr (is_floating_point<Y>)
                res = histogram[index] + point.getMapped();
            else
                has_overfllow = common::addOverflow(histogram[index], point.getMapped(), res);
@ -218,10 +218,10 @@ private:
        for (size_t i = 0; i < histogram.size(); ++i)
        {
            if (count_histogram[i] > 0)
-                histogram[i] /= count_histogram[i];
+                histogram[i] = histogram[i] / count_histogram[i];
        }

-        Y y_max = 0;
+        Y y_max{};
        for (auto & y : histogram)
        {
            if (isNaN(y) || y <= 0)
@ -245,8 +245,8 @@ private:
                continue;
            }

-            constexpr auto levels_num = static_cast<Y>(BAR_LEVELS - 1);
-            if constexpr (std::is_floating_point_v<Y>)
+            constexpr auto levels_num = Y{BAR_LEVELS - 1};
+            if constexpr (is_floating_point<Y>)
            {
                y = y / (y_max / levels_num) + 1;
            }
--- a/src/AggregateFunctions/AggregateFunctionSum.h
+++ b/src/AggregateFunctions/AggregateFunctionSum.h
@ -69,7 +69,7 @@ struct AggregateFunctionSumData
        size_t count = end - start;
        const auto * end_ptr = ptr + count;

-        if constexpr (std::is_floating_point_v<T>)
+        if constexpr (is_floating_point<T>)
        {
            /// Compiler cannot unroll this loop, do it manually.
            /// (at least for floats, most likely due to the lack of -fassociative-math)
@ -83,7 +83,7 @@ struct AggregateFunctionSumData
            while (ptr < unrolled_end)
            {
                for (size_t i = 0; i < unroll_count; ++i)
-                    Impl::add(partial_sums[i], ptr[i]);
+                    Impl::add(partial_sums[i], T(ptr[i]));
                ptr += unroll_count;
            }

@ -95,7 +95,7 @@ struct AggregateFunctionSumData
        T local_sum{};
        while (ptr < end_ptr)
        {
-            Impl::add(local_sum, *ptr);
+            Impl::add(local_sum, T(*ptr));
            ++ptr;
        }
        Impl::add(sum, local_sum);
@ -193,12 +193,11 @@ struct AggregateFunctionSumData
            Impl::add(sum, local_sum);
            return;
        }
-        else if constexpr (std::is_floating_point_v<T>)
+        else if constexpr (is_floating_point<T> && (sizeof(Value) == 4 || sizeof(Value) == 8))
        {
            /// For floating point we use a similar trick as above, except that now we reinterpret the floating point number as an unsigned
            /// integer of the same size and use a mask instead (0 to discard, 0xFF..FF to keep)
-            static_assert(sizeof(Value) == 4 || sizeof(Value) == 8);
-            using equivalent_integer = typename std::conditional_t<sizeof(Value) == 4, UInt32, UInt64>;
+            using EquivalentInteger = typename std::conditional_t<sizeof(Value) == 4, UInt32, UInt64>;

            constexpr size_t unroll_count = 128 / sizeof(T);
            T partial_sums[unroll_count]{};
@ -209,11 +208,11 @@ struct AggregateFunctionSumData
            {
                for (size_t i = 0; i < unroll_count; ++i)
                {
-                    equivalent_integer value;
-                    std::memcpy(&value, &ptr[i], sizeof(Value));
+                    EquivalentInteger value;
+                    memcpy(&value, &ptr[i], sizeof(Value));
                    value &= (!condition_map[i] != add_if_zero) - 1;
                    Value d;
-                    std::memcpy(&d, &value, sizeof(Value));
+                    memcpy(&d, &value, sizeof(Value));
                    Impl::add(partial_sums[i], d);
                }
                ptr += unroll_count;
@ -228,7 +227,7 @@ struct AggregateFunctionSumData
        while (ptr < end_ptr)
        {
            if (!*condition_map == add_if_zero)
-                Impl::add(local_sum, *ptr);
+                Impl::add(local_sum, T(*ptr));
            ++ptr;
            ++condition_map;
        }
@ -306,7 +305,7 @@ struct AggregateFunctionSumData
 template <typename T>
 struct AggregateFunctionSumKahanData
 {
-    static_assert(std::is_floating_point_v<T>,
+    static_assert(is_floating_point<T>,
        "It doesn't make sense to use Kahan Summation algorithm for non floating point types");

    T sum{};
@ -489,10 +488,7 @@ public:
    void add(AggregateDataPtr __restrict place, const IColumn ** columns, size_t row_num, Arena *) const override
    {
        const auto & column = assert_cast<const ColVecType &>(*columns[0]);
-        if constexpr (is_big_int_v<T>)
        this->data(place).add(static_cast<TResult>(column.getData()[row_num]));
-        else
-            this->data(place).add(column.getData()[row_num]);
    }

    void addBatchSinglePlace(
--- a/src/AggregateFunctions/AggregateFunctionUniq.h
+++ b/src/AggregateFunctions/AggregateFunctionUniq.h
@ -257,7 +257,7 @@ template <typename T> struct AggregateFunctionUniqTraits
 {
    static UInt64 hash(T x)
    {
-        if constexpr (std::is_same_v<T, Float32> || std::is_same_v<T, Float64>)
+        if constexpr (is_floating_point<T>)
        {
            return bit_cast<UInt64>(x);
        }
--- a/src/AggregateFunctions/AggregateFunctionUniqCombined.h
+++ b/src/AggregateFunctions/AggregateFunctionUniqCombined.h
@ -111,7 +111,7 @@ public:
                /// Initially UInt128 was introduced only for UUID, and then the other big-integer types were added.
                hash = static_cast<HashValueType>(sipHash64(value));
            }
-            else if constexpr (std::is_floating_point_v<T>)
+            else if constexpr (is_floating_point<T>)
            {
                hash = static_cast<HashValueType>(intHash64(bit_cast<UInt64>(value)));
            }
--- a/src/AggregateFunctions/Helpers.h
+++ b/src/AggregateFunctions/Helpers.h
@ -184,36 +184,8 @@ static IAggregateFunction * createWithDecimalType(const IDataType & argument_typ
 }

 /** For template with two arguments.
+  * This is extremely dangerous for code bloat - do not use.
  */
-template <typename FirstType, template <typename, typename> class AggregateFunctionTemplate, typename... TArgs>
-static IAggregateFunction * createWithTwoNumericTypesSecond(const IDataType & second_type, TArgs && ... args)
-{
-    WhichDataType which(second_type);
-#define DISPATCH(TYPE) \
-    if (which.idx == TypeIndex::TYPE) return new AggregateFunctionTemplate<FirstType, TYPE>(args...);
-    FOR_NUMERIC_TYPES(DISPATCH)
-#undef DISPATCH
-    if (which.idx == TypeIndex::Enum8) return new AggregateFunctionTemplate<FirstType, Int8>(args...);
-    if (which.idx == TypeIndex::Enum16) return new AggregateFunctionTemplate<FirstType, Int16>(args...);
-    return nullptr;
-}
-
-template <template <typename, typename> class AggregateFunctionTemplate, typename... TArgs>
-static IAggregateFunction * createWithTwoNumericTypes(const IDataType & first_type, const IDataType & second_type, TArgs && ... args)
-{
-    WhichDataType which(first_type);
-#define DISPATCH(TYPE) \
-    if (which.idx == TypeIndex::TYPE) \
-        return createWithTwoNumericTypesSecond<TYPE, AggregateFunctionTemplate>(second_type, args...);
-    FOR_NUMERIC_TYPES(DISPATCH)
-#undef DISPATCH
-    if (which.idx == TypeIndex::Enum8)
-        return createWithTwoNumericTypesSecond<Int8, AggregateFunctionTemplate>(second_type, args...);
-    if (which.idx == TypeIndex::Enum16)
-        return createWithTwoNumericTypesSecond<Int16, AggregateFunctionTemplate>(second_type, args...);
-    return nullptr;
-}
-
 template <typename FirstType, template <typename, typename> class AggregateFunctionTemplate, typename... TArgs>
 static IAggregateFunction * createWithTwoBasicNumericTypesSecond(const IDataType & second_type, TArgs && ... args)
 {
@ -237,46 +209,6 @@ static IAggregateFunction * createWithTwoBasicNumericTypes(const IDataType & fir
    return nullptr;
 }

-template <typename FirstType, template <typename, typename> class AggregateFunctionTemplate, typename... TArgs>
-static IAggregateFunction * createWithTwoNumericOrDateTypesSecond(const IDataType & second_type, TArgs && ... args)
-{
-    WhichDataType which(second_type);
-#define DISPATCH(TYPE) \
-    if (which.idx == TypeIndex::TYPE) return new AggregateFunctionTemplate<FirstType, TYPE>(args...);
-    FOR_NUMERIC_TYPES(DISPATCH)
-#undef DISPATCH
-    if (which.idx == TypeIndex::Enum8) return new AggregateFunctionTemplate<FirstType, Int8>(args...);
-    if (which.idx == TypeIndex::Enum16) return new AggregateFunctionTemplate<FirstType, Int16>(args...);
-
-    /// expects that DataTypeDate based on UInt16, DataTypeDateTime based on UInt32
-    if (which.idx == TypeIndex::Date) return new AggregateFunctionTemplate<FirstType, UInt16>(args...);
-    if (which.idx == TypeIndex::DateTime) return new AggregateFunctionTemplate<FirstType, UInt32>(args...);
-
-    return nullptr;
-}
-
-template <template <typename, typename> class AggregateFunctionTemplate, typename... TArgs>
-static IAggregateFunction * createWithTwoNumericOrDateTypes(const IDataType & first_type, const IDataType & second_type, TArgs && ... args)
-{
-    WhichDataType which(first_type);
-#define DISPATCH(TYPE) \
-    if (which.idx == TypeIndex::TYPE) \
-        return createWithTwoNumericOrDateTypesSecond<TYPE, AggregateFunctionTemplate>(second_type, args...);
-    FOR_NUMERIC_TYPES(DISPATCH)
-#undef DISPATCH
-    if (which.idx == TypeIndex::Enum8)
-        return createWithTwoNumericOrDateTypesSecond<Int8, AggregateFunctionTemplate>(second_type, args...);
-    if (which.idx == TypeIndex::Enum16)
-        return createWithTwoNumericOrDateTypesSecond<Int16, AggregateFunctionTemplate>(second_type, args...);
-
-    /// expects that DataTypeDate based on UInt16, DataTypeDateTime based on UInt32
-    if (which.idx == TypeIndex::Date)
-        return createWithTwoNumericOrDateTypesSecond<UInt16, AggregateFunctionTemplate>(second_type, args...);
-    if (which.idx == TypeIndex::DateTime)
-        return createWithTwoNumericOrDateTypesSecond<UInt32, AggregateFunctionTemplate>(second_type, args...);
-    return nullptr;
-}
-
 template <template <typename> class AggregateFunctionTemplate, typename... TArgs>
 static IAggregateFunction * createWithStringType(const IDataType & argument_type, TArgs && ... args)
 {
--- a/src/AggregateFunctions/QuantileTDigest.h
+++ b/src/AggregateFunctions/QuantileTDigest.h
@ -391,7 +391,7 @@ public:
    ResultType getImpl(Float64 level)
    {
        if (centroids.empty())
-            return std::is_floating_point_v<ResultType> ? std::numeric_limits<ResultType>::quiet_NaN() : 0;
+            return is_floating_point<ResultType> ? std::numeric_limits<ResultType>::quiet_NaN() : 0;

        compress();

--- a/src/AggregateFunctions/ReservoirSampler.h
+++ b/src/AggregateFunctions/ReservoirSampler.h
@ -276,6 +276,6 @@ private:
    {
        if (OnEmpty == ReservoirSamplerOnEmpty::THROW)
            throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Quantile of empty ReservoirSampler");
-        return NanLikeValueConstructor<ResultType, std::is_floating_point_v<ResultType>>::getValue();
+        return NanLikeValueConstructor<ResultType, is_floating_point<ResultType>>::getValue();
    }
 };
--- a/src/AggregateFunctions/ReservoirSamplerDeterministic.h
+++ b/src/AggregateFunctions/ReservoirSamplerDeterministic.h
@ -271,7 +271,7 @@ private:
    {
        if (OnEmpty == ReservoirSamplerDeterministicOnEmpty::THROW)
            throw DB::Exception(DB::ErrorCodes::LOGICAL_ERROR, "Quantile of empty ReservoirSamplerDeterministic");
-        return NanLikeValueConstructor<ResultType, std::is_floating_point_v<ResultType>>::getValue();
+        return NanLikeValueConstructor<ResultType, is_floating_point<ResultType>>::getValue();
    }
 };

--- a/src/Backups/BackupConcurrencyCheck.cpp
+++ b/src/Backups/BackupConcurrencyCheck.cpp
@ -14,12 +14,12 @@ namespace ErrorCodes


 BackupConcurrencyCheck::BackupConcurrencyCheck(
-    const UUID & backup_or_restore_uuid_,
    bool is_restore_,
    bool on_cluster_,
+    const String & zookeeper_path_,
    bool allow_concurrency_,
    BackupConcurrencyCounters & counters_)
-    : is_restore(is_restore_), backup_or_restore_uuid(backup_or_restore_uuid_), on_cluster(on_cluster_), counters(counters_)
+    : is_restore(is_restore_), on_cluster(on_cluster_), zookeeper_path(zookeeper_path_), counters(counters_)
 {
    std::lock_guard lock{counters.mutex};

@ -32,7 +32,7 @@ BackupConcurrencyCheck::BackupConcurrencyCheck(
            size_t num_on_cluster_restores = counters.on_cluster_restores.size();
            if (on_cluster)
            {
-                if (!counters.on_cluster_restores.contains(backup_or_restore_uuid))
+                if (!counters.on_cluster_restores.contains(zookeeper_path))
                    ++num_on_cluster_restores;
            }
            else
@ -47,7 +47,7 @@ BackupConcurrencyCheck::BackupConcurrencyCheck(
            size_t num_on_cluster_backups = counters.on_cluster_backups.size();
            if (on_cluster)
            {
-                if (!counters.on_cluster_backups.contains(backup_or_restore_uuid))
+                if (!counters.on_cluster_backups.contains(zookeeper_path))
                    ++num_on_cluster_backups;
            }
            else
@ -64,9 +64,9 @@ BackupConcurrencyCheck::BackupConcurrencyCheck(
    if (on_cluster)
    {
        if (is_restore)
-            ++counters.on_cluster_restores[backup_or_restore_uuid];
+            ++counters.on_cluster_restores[zookeeper_path];
        else
-            ++counters.on_cluster_backups[backup_or_restore_uuid];
+            ++counters.on_cluster_backups[zookeeper_path];
    }
    else
    {
@ -86,7 +86,7 @@ BackupConcurrencyCheck::~BackupConcurrencyCheck()
    {
        if (is_restore)
        {
-            auto it = counters.on_cluster_restores.find(backup_or_restore_uuid);
+            auto it = counters.on_cluster_restores.find(zookeeper_path);
            if (it != counters.on_cluster_restores.end())
            {
                if (!--it->second)
@ -95,7 +95,7 @@ BackupConcurrencyCheck::~BackupConcurrencyCheck()
        }
        else
        {
-            auto it = counters.on_cluster_backups.find(backup_or_restore_uuid);
+            auto it = counters.on_cluster_backups.find(zookeeper_path);
            if (it != counters.on_cluster_backups.end())
            {
                if (!--it->second)
--- a/src/Backups/BackupConcurrencyCheck.h
+++ b/src/Backups/BackupConcurrencyCheck.h
@ -1,7 +1,8 @@
 #pragma once

-#include <Core/UUID.h>
+#include <base/defines.h>
 #include <base/scope_guard.h>
+#include <base/types.h>
 #include <mutex>
 #include <unordered_map>

@ -19,9 +20,9 @@ public:
    /// Checks concurrency of a BACKUP operation or a RESTORE operation.
    /// Keep a constructed instance of BackupConcurrencyCheck until the operation is done.
    BackupConcurrencyCheck(
-        const UUID & backup_or_restore_uuid_,
        bool is_restore_,
        bool on_cluster_,
+        const String & zookeeper_path_,
        bool allow_concurrency_,
        BackupConcurrencyCounters & counters_);

@ -31,8 +32,8 @@ public:

 private:
    const bool is_restore;
-    const UUID backup_or_restore_uuid;
    const bool on_cluster;
+    const String zookeeper_path;
    BackupConcurrencyCounters & counters;
 };

@ -47,8 +48,8 @@ private:
    friend class BackupConcurrencyCheck;
    size_t local_backups TSA_GUARDED_BY(mutex) = 0;
    size_t local_restores TSA_GUARDED_BY(mutex) = 0;
-    std::unordered_map<UUID /* backup_uuid */, size_t /* num_refs */> on_cluster_backups TSA_GUARDED_BY(mutex);
-    std::unordered_map<UUID /* restore_uuid */, size_t /* num_refs */> on_cluster_restores TSA_GUARDED_BY(mutex);
+    std::unordered_map<String /* zookeeper_path */, size_t /* num_refs */> on_cluster_backups TSA_GUARDED_BY(mutex);
+    std::unordered_map<String /* zookeeper_path */, size_t /* num_refs */> on_cluster_restores TSA_GUARDED_BY(mutex);
    std::mutex mutex;
 };

--- a/src/Backups/BackupCoordinationCleaner.cpp
+++ b/src/Backups/BackupCoordinationCleaner.cpp
@ -4,31 +4,29 @@
 namespace DB
 {

-BackupCoordinationCleaner::BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_)
-    : zookeeper_path(zookeeper_path_), with_retries(with_retries_), log(log_)
+BackupCoordinationCleaner::BackupCoordinationCleaner(bool is_restore_, const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_)
+    : is_restore(is_restore_), zookeeper_path(zookeeper_path_), with_retries(with_retries_), log(log_)
 {
 }

-void BackupCoordinationCleaner::cleanup()
+bool BackupCoordinationCleaner::cleanup(bool throw_if_error)
 {
-    tryRemoveAllNodes(/* throw_if_error = */ true, /* retries_kind = */ WithRetries::kNormal);
+    WithRetries::Kind retries_kind = throw_if_error ? WithRetries::kNormal : WithRetries::kErrorHandling;
+    return cleanupImpl(throw_if_error, retries_kind);
 }

-bool BackupCoordinationCleaner::tryCleanupAfterError() noexcept
-{
-    return tryRemoveAllNodes(/* throw_if_error = */ false, /* retries_kind = */ WithRetries::kNormal);
-}
-
-bool BackupCoordinationCleaner::tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind)
+bool BackupCoordinationCleaner::cleanupImpl(bool throw_if_error, WithRetries::Kind retries_kind)
 {
    {
        std::lock_guard lock{mutex};
-        if (cleanup_result.succeeded)
-            return true;
-        if (cleanup_result.exception)
+        if (succeeded)
        {
-            if (throw_if_error)
-                std::rethrow_exception(cleanup_result.exception);
+            LOG_TRACE(log, "Nodes from ZooKeeper are already removed");
+            return true;
+        }
+        if (tried)
+        {
+            LOG_INFO(log, "Skipped removing nodes from ZooKeeper because earlier we failed to do that");
            return false;
        }
    }
@ -44,16 +42,18 @@ bool BackupCoordinationCleaner::tryRemoveAllNodes(bool throw_if_error, WithRetri
        });

        std::lock_guard lock{mutex};
-        cleanup_result.succeeded = true;
+        tried = true;
+        succeeded = true;
        return true;
    }
    catch (...)
    {
-        LOG_TRACE(log, "Caught exception while removing nodes from ZooKeeper for this restore: {}",
+        LOG_TRACE(log, "Caught exception while removing nodes from ZooKeeper for this {}: {}",
+                  is_restore ? "restore" : "backup",
                  getCurrentExceptionMessage(/* with_stacktrace= */ false, /* check_embedded_stacktrace= */ true));

        std::lock_guard lock{mutex};
-        cleanup_result.exception = std::current_exception();
+        tried = true;

        if (throw_if_error)
            throw;
--- a/src/Backups/BackupCoordinationCleaner.h
+++ b/src/Backups/BackupCoordinationCleaner.h
@ -12,14 +12,14 @@ namespace DB
 class BackupCoordinationCleaner
 {
 public:
-    BackupCoordinationCleaner(const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_);
+    BackupCoordinationCleaner(bool is_restore_, const String & zookeeper_path_, const WithRetries & with_retries_, LoggerPtr log_);

-    void cleanup();
-    bool tryCleanupAfterError() noexcept;
+    bool cleanup(bool throw_if_error);

 private:
-    bool tryRemoveAllNodes(bool throw_if_error, WithRetries::Kind retries_kind);
+    bool cleanupImpl(bool throw_if_error, WithRetries::Kind retries_kind);

+    const bool is_restore;
    const String zookeeper_path;

    /// A reference to a field of the parent object which is either BackupCoordinationOnCluster or RestoreCoordinationOnCluster.
@ -27,13 +27,8 @@ private:

    const LoggerPtr log;

-    struct CleanupResult
-    {
-        bool succeeded = false;
-        std::exception_ptr exception;
-    };
-    CleanupResult cleanup_result TSA_GUARDED_BY(mutex);
-
+    bool tried TSA_GUARDED_BY(mutex) = false;
+    bool succeeded TSA_GUARDED_BY(mutex) = false;
    std::mutex mutex;
 };

--- a/src/Backups/BackupCoordinationLocal.cpp
+++ b/src/Backups/BackupCoordinationLocal.cpp
@ -11,12 +11,11 @@ namespace DB
 {

 BackupCoordinationLocal::BackupCoordinationLocal(
-    const UUID & backup_uuid_,
    bool is_plain_backup_,
    bool allow_concurrent_backup_,
    BackupConcurrencyCounters & concurrency_counters_)
    : log(getLogger("BackupCoordinationLocal"))
-    , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ false, allow_concurrent_backup_, concurrency_counters_)
+    , concurrency_check(/* is_restore = */ false, /* on_cluster = */ false, /* zookeeper_path = */ "", allow_concurrent_backup_, concurrency_counters_)
    , file_infos(is_plain_backup_)
 {
 }
--- a/src/Backups/BackupCoordinationLocal.h
+++ b/src/Backups/BackupCoordinationLocal.h
@ -23,20 +23,19 @@ class BackupCoordinationLocal : public IBackupCoordination
 {
 public:
    explicit BackupCoordinationLocal(
-        const UUID & backup_uuid_,
        bool is_plain_backup_,
        bool allow_concurrent_backup_,
        BackupConcurrencyCounters & concurrency_counters_);

    ~BackupCoordinationLocal() override;

+    void setBackupQueryIsSentToOtherHosts() override {}
+    bool isBackupQuerySentToOtherHosts() const override { return false; }
    Strings setStage(const String &, const String &, bool) override { return {}; }
-    void setBackupQueryWasSentToOtherHosts() override {}
-    bool trySetError(std::exception_ptr) override { return true; }
-    void finish() override {}
-    bool tryFinishAfterError() noexcept override { return true; }
-    void waitForOtherHostsToFinish() override {}
-    bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; }
+    bool setError(std::exception_ptr, bool) override { return true; }
+    bool waitOtherHostsFinish(bool) const override { return true; }
+    bool finish(bool) override { return true; }
+    bool cleanup(bool) override { return true; }

    void addReplicatedPartNames(const String & table_zk_path, const String & table_name_for_logs, const String & replica_name,
                                const std::vector<PartNameAndChecksum> & part_names_and_checksums) override;
--- a/src/Backups/BackupCoordinationOnCluster.cpp
+++ b/src/Backups/BackupCoordinationOnCluster.cpp
@ -184,17 +184,21 @@ BackupCoordinationOnCluster::BackupCoordinationOnCluster(
    , plain_backup(is_plain_backup_)
    , log(getLogger("BackupCoordinationOnCluster"))
    , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); })
-    , concurrency_check(backup_uuid_, /* is_restore = */ false, /* on_cluster = */ true, allow_concurrent_backup_, concurrency_counters_)
-    , stage_sync(/* is_restore = */ false, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_backup_, with_retries, schedule_, process_list_element_, log)
-    , cleaner(zookeeper_path, with_retries, log)
+    , cleaner(/* is_restore = */ false, zookeeper_path, with_retries, log)
+    , stage_sync(/* is_restore = */ false, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_backup_, concurrency_counters_, with_retries, schedule_, process_list_element_, log)
+{
+    try
    {
        createRootNodes();
    }
-
-BackupCoordinationOnCluster::~BackupCoordinationOnCluster()
+    catch (...)
    {
-    tryFinishImpl();
+        stage_sync.setError(std::current_exception(), /* throw_if_error = */ false);
+        throw;
    }
+}
+
+BackupCoordinationOnCluster::~BackupCoordinationOnCluster() = default;

 void BackupCoordinationOnCluster::createRootNodes()
 {
@ -217,69 +221,52 @@ void BackupCoordinationOnCluster::createRootNodes()
    });
 }

+void BackupCoordinationOnCluster::setBackupQueryIsSentToOtherHosts()
+{
+    stage_sync.setQueryIsSentToOtherHosts();
+}
+
+bool BackupCoordinationOnCluster::isBackupQuerySentToOtherHosts() const
+{
+    return stage_sync.isQuerySentToOtherHosts();
+}
+
 Strings BackupCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync)
 {
    stage_sync.setStage(new_stage, message);
-
-    if (!sync)
+    if (sync)
+        return stage_sync.waitHostsReachStage(all_hosts_without_initiator, new_stage);
    return {};
-
-    return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator);
 }

-void BackupCoordinationOnCluster::setBackupQueryWasSentToOtherHosts()
+bool BackupCoordinationOnCluster::setError(std::exception_ptr exception, bool throw_if_error)
 {
-    backup_query_was_sent_to_other_hosts = true;
+    return stage_sync.setError(exception, throw_if_error);
 }

-bool BackupCoordinationOnCluster::trySetError(std::exception_ptr exception)
+bool BackupCoordinationOnCluster::waitOtherHostsFinish(bool throw_if_error) const
 {
-    return stage_sync.trySetError(exception);
+    return stage_sync.waitOtherHostsFinish(throw_if_error);
 }

-void BackupCoordinationOnCluster::finish()
+bool BackupCoordinationOnCluster::finish(bool throw_if_error)
 {
-    bool other_hosts_also_finished = false;
-    stage_sync.finish(other_hosts_also_finished);
-
-    if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts))
-        cleaner.cleanup();
+    return stage_sync.finish(throw_if_error);
 }

-bool BackupCoordinationOnCluster::tryFinishAfterError() noexcept
+bool BackupCoordinationOnCluster::cleanup(bool throw_if_error)
 {
-    return tryFinishImpl();
-}
-
-bool BackupCoordinationOnCluster::tryFinishImpl() noexcept
+    /// All the hosts must finish before we remove the coordination nodes.
+    bool expect_other_hosts_finished = stage_sync.isQuerySentToOtherHosts() || !stage_sync.isErrorSet();
+    bool all_hosts_finished = stage_sync.finished() && (stage_sync.otherHostsFinished() || !expect_other_hosts_finished);
+    if (!all_hosts_finished)
    {
-    bool other_hosts_also_finished = false;
-    if (!stage_sync.tryFinishAfterError(other_hosts_also_finished))
-        return false;
-
-    if ((current_host == kInitiator) && (other_hosts_also_finished || !backup_query_was_sent_to_other_hosts))
-    {
-        if (!cleaner.tryCleanupAfterError())
+        auto unfinished_hosts = expect_other_hosts_finished ? stage_sync.getUnfinishedHosts() : Strings{current_host};
+        LOG_INFO(log, "Skipping removing nodes from ZooKeeper because hosts {} didn't finish",
+                 BackupCoordinationStageSync::getHostsDesc(unfinished_hosts));
        return false;
    }
-
-    return true;
-}
-
-void BackupCoordinationOnCluster::waitForOtherHostsToFinish()
-{
-    if ((current_host != kInitiator) || !backup_query_was_sent_to_other_hosts)
-        return;
-    stage_sync.waitForOtherHostsToFinish();
-}
-
-bool BackupCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept
-{
-    if (current_host != kInitiator)
-        return false;
-    if (!backup_query_was_sent_to_other_hosts)
-        return true;
-    return stage_sync.tryWaitForOtherHostsToFinishAfterError();
+    return cleaner.cleanup(throw_if_error);
 }

 ZooKeeperRetriesInfo BackupCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const
--- a/src/Backups/BackupCoordinationOnCluster.h
+++ b/src/Backups/BackupCoordinationOnCluster.h
@ -1,7 +1,6 @@
 #pragma once

 #include <Backups/IBackupCoordination.h>
-#include <Backups/BackupConcurrencyCheck.h>
 #include <Backups/BackupCoordinationCleaner.h>
 #include <Backups/BackupCoordinationFileInfos.h>
 #include <Backups/BackupCoordinationReplicatedAccess.h>
@ -20,7 +19,7 @@ class BackupCoordinationOnCluster : public IBackupCoordination
 {
 public:
    /// Empty string as the current host is used to mark the initiator of a BACKUP ON CLUSTER query.
-    static const constexpr std::string_view kInitiator;
+    static const constexpr std::string_view kInitiator = BackupCoordinationStageSync::kInitiator;

    BackupCoordinationOnCluster(
        const UUID & backup_uuid_,
@ -37,13 +36,13 @@ public:

    ~BackupCoordinationOnCluster() override;

+    void setBackupQueryIsSentToOtherHosts() override;
+    bool isBackupQuerySentToOtherHosts() const override;
    Strings setStage(const String & new_stage, const String & message, bool sync) override;
-    void setBackupQueryWasSentToOtherHosts() override;
-    bool trySetError(std::exception_ptr exception) override;
-    void finish() override;
-    bool tryFinishAfterError() noexcept override;
-    void waitForOtherHostsToFinish() override;
-    bool tryWaitForOtherHostsToFinishAfterError() noexcept override;
+    bool setError(std::exception_ptr exception, bool throw_if_error) override;
+    bool waitOtherHostsFinish(bool throw_if_error) const override;
+    bool finish(bool throw_if_error) override;
+    bool cleanup(bool throw_if_error) override;

    void addReplicatedPartNames(
        const String & table_zk_path,
@ -110,11 +109,10 @@ private:
    const bool plain_backup;
    LoggerPtr const log;

+    /// The order is important: `stage_sync` must be initialized after `with_retries` and `cleaner`.
    const WithRetries with_retries;
-    BackupConcurrencyCheck concurrency_check;
-    BackupCoordinationStageSync stage_sync;
    BackupCoordinationCleaner cleaner;
-    std::atomic<bool> backup_query_was_sent_to_other_hosts = false;
+    BackupCoordinationStageSync stage_sync;

    mutable std::optional<BackupCoordinationReplicatedTables> replicated_tables TSA_GUARDED_BY(replicated_tables_mutex);
    mutable std::optional<BackupCoordinationReplicatedAccess> replicated_access TSA_GUARDED_BY(replicated_access_mutex);
--- a/src/Backups/BackupCoordinationStageSync.cpp
+++ b/src/Backups/BackupCoordinationStageSync.cpp
--- a/src/Backups/BackupCoordinationStageSync.h
+++ b/src/Backups/BackupCoordinationStageSync.h
@ -1,7 +1,9 @@
 #pragma once

+#include <Backups/BackupConcurrencyCheck.h>
 #include <Backups/WithRetries.h>

+
 namespace DB
 {

@ -9,12 +11,16 @@ namespace DB
 class BackupCoordinationStageSync
 {
 public:
+    /// Empty string as the current host is used to mark the initiator of a BACKUP ON CLUSTER or RESTORE ON CLUSTER query.
+    static const constexpr std::string_view kInitiator;
+
    BackupCoordinationStageSync(
        bool is_restore_,                    /// true if this is a RESTORE ON CLUSTER command, false if this is a BACKUP ON CLUSTER command
        const String & zookeeper_path_,      /// path to the "stage" folder in ZooKeeper
        const String & current_host_,        /// the current host, or an empty string if it's the initiator of the BACKUP/RESTORE ON CLUSTER command
        const Strings & all_hosts_,          /// all the hosts (including the initiator and the current host) performing the BACKUP/RESTORE ON CLUSTER command
        bool allow_concurrency_,             /// whether it's allowed to have concurrent backups or restores.
+        BackupConcurrencyCounters & concurrency_counters_,
        const WithRetries & with_retries_,
        ThreadPoolCallbackRunnerUnsafe<void> schedule_,
        QueryStatusPtr process_list_element_,
@ -22,30 +28,37 @@ public:

    ~BackupCoordinationStageSync();

+    /// Records that the BACKUP or RESTORE query has been sent to other hosts.
+    void setQueryIsSentToOtherHosts();
+    bool isQuerySentToOtherHosts() const;
+
    /// Sets the stage of the current host and signal other hosts if there were other hosts waiting for that.
    void setStage(const String & stage, const String & stage_result = {});

-    /// Waits until all the specified hosts come to the specified stage.
-    /// The function returns the results which specified hosts set when they came to the required stage.
-    /// If it doesn't happen before the timeout then the function will stop waiting and throw an exception.
-    Strings waitForHostsToReachStage(const String & stage_to_wait, const Strings & hosts, std::optional<std::chrono::milliseconds> timeout = {}) const;
-
-    /// Waits until all the other hosts finish their work.
-    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
-    void waitForOtherHostsToFinish() const;
-
-    /// Lets other host know that the current host has finished its work.
-    void finish(bool & other_hosts_also_finished);
+    /// Waits until specified hosts come to the specified stage.
+    /// The function returns the results which the specified hosts set when they came to the required stage.
+    Strings waitHostsReachStage(const Strings & hosts, const String & stage_to_wait) const;

    /// Lets other hosts know that the current host has encountered an error.
-    bool trySetError(std::exception_ptr exception) noexcept;
+    /// The function returns true if it successfully created the error node or if the error node was found to already exist.
+    bool setError(std::exception_ptr exception, bool throw_if_error);
+    bool isErrorSet() const;

-    /// Waits until all the other hosts finish their work (as a part of error-handling process).
-    /// Doesn't stops waiting if some host encounters an error or gets cancelled.
-    bool tryWaitForOtherHostsToFinishAfterError() const noexcept;
+    /// Waits until the hosts other than the current host finish their work. Must be called before finish().
+    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
+    bool waitOtherHostsFinish(bool throw_if_error) const;
+    bool otherHostsFinished() const;

-    /// Lets other host know that the current host has finished its work (as a part of error-handling process).
-    bool tryFinishAfterError(bool & other_hosts_also_finished) noexcept;
+    /// Lets other hosts know that the current host has finished its work.
+    bool finish(bool throw_if_error);
+    bool finished() const;
+
+    /// Returns true if all the hosts have finished.
+    bool allHostsFinished() const { return finished() && otherHostsFinished(); }
+
+    /// Returns a list of the hosts which haven't finished yet.
+    Strings getUnfinishedHosts() const;
+    Strings getUnfinishedOtherHosts() const;

    /// Returns a printable name of a specific host. For empty host the function returns "initiator".
    static String getHostDesc(const String & host);
@ -59,8 +72,8 @@ private:
    void createRootNodes();

    /// Atomically creates both 'start' and 'alive' nodes and also checks that there is no concurrent backup or restore if `allow_concurrency` is false.
-    void createStartAndAliveNodes();
-    void createStartAndAliveNodes(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+    void createStartAndAliveNodesAndCheckConcurrency(BackupConcurrencyCounters & concurrency_counters_);
+    void createStartAndAliveNodesAndCheckConcurrency(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);

    /// Deserialize the version of a node stored in the 'start' node.
    int parseStartNode(const String & start_node_contents, const String & host) const;
@ -78,14 +91,17 @@ private:

    /// Reads the current state from ZooKeeper without throwing exceptions.
    void readCurrentState(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+
+    /// Creates a stage node to let other hosts know we've reached the specified stage.
+    void createStageNode(const String & stage, const String & stage_result, Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
    String getStageNodePath(const String & stage) const;

    /// Lets other hosts know that the current host has encountered an error.
-    bool trySetError(const Exception & exception);
-    void setError(const Exception & exception);
+    bool setError(const Exception & exception, bool throw_if_error);
+    void createErrorNode(const Exception & exception, Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);

    /// Deserializes an error stored in the error node.
-    static std::pair<std::exception_ptr, String> parseErrorNode(const String & error_node_contents);
+    std::pair<std::exception_ptr, String> parseErrorNode(const String & error_node_contents) const;

    /// Reset the `connected` flag for each host.
    void resetConnectedFlag();
@ -102,19 +118,27 @@ private:
    void cancelQueryIfDisconnectedTooLong();

    /// Used by waitForHostsToReachStage() to check if everything is ready to return.
-    bool checkIfHostsReachStage(const Strings & hosts, const String & stage_to_wait, bool time_is_out, std::optional<std::chrono::milliseconds> timeout, Strings & results) const TSA_REQUIRES(mutex);
+    bool checkIfHostsReachStage(const Strings & hosts, const String & stage_to_wait, Strings & results) const TSA_REQUIRES(mutex);

    /// Creates the 'finish' node.
-    bool tryFinishImpl();
-    bool tryFinishImpl(bool & other_hosts_also_finished, bool throw_if_error, WithRetries::Kind retries_kind);
-    void createFinishNodeAndRemoveAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper);
+    bool finishImpl(bool throw_if_error, WithRetries::Kind retries_kind);
+    void createFinishNodeAndRemoveAliveNode(Coordination::ZooKeeperWithFaultInjection::Ptr zookeeper, bool throw_if_error);

    /// Returns the version used by the initiator.
    int getInitiatorVersion() const;

    /// Waits until all the other hosts finish their work.
-    bool tryWaitForOtherHostsToFinishImpl(const String & reason, bool throw_if_error, std::optional<std::chrono::seconds> timeout) const;
-    bool checkIfOtherHostsFinish(const String & reason, bool throw_if_error, bool time_is_out, std::optional<std::chrono::milliseconds> timeout) const TSA_REQUIRES(mutex);
+    bool waitOtherHostsFinishImpl(const String & reason, std::optional<std::chrono::seconds> timeout, bool throw_if_error) const;
+    bool checkIfOtherHostsFinish(const String & reason, std::optional<std::chrono::milliseconds> timeout, bool time_is_out, bool & result, bool throw_if_error) const TSA_REQUIRES(mutex);
+
+    /// Returns true if all the hosts have finished.
+    bool allHostsFinishedNoLock() const TSA_REQUIRES(mutex);
+    bool finishedNoLock() const TSA_REQUIRES(mutex);
+    bool otherHostsFinishedNoLock() const TSA_REQUIRES(mutex);
+
+    /// Returns a list of the hosts which haven't finished yet.
+    Strings getUnfinishedHostsNoLock() const TSA_REQUIRES(mutex);
+    Strings getUnfinishedOtherHostsNoLock() const TSA_REQUIRES(mutex);

    const bool is_restore;
    const String operation_name;
@ -138,15 +162,16 @@ private:
    /// Paths in ZooKeeper.
    const std::filesystem::path zookeeper_path;
    const String root_zookeeper_path;
-    const String operation_node_path;
+    const String operation_zookeeper_path;
    const String operation_node_name;
-    const String stage_node_path;
    const String start_node_path;
    const String finish_node_path;
    const String num_hosts_node_path;
+    const String error_node_path;
    const String alive_node_path;
    const String alive_tracker_node_path;
-    const String error_node_path;
+
+    std::optional<BackupConcurrencyCheck> local_concurrency_check;

    std::shared_ptr<Poco::Event> zk_nodes_changed;

@ -176,25 +201,21 @@ private:
    {
        std::map<String /* host */, HostInfo> hosts; /// std::map because we need to compare states
        std::optional<String> host_with_error;
-        bool cancelled = false;

        bool operator ==(const State & other) const;
        bool operator !=(const State & other) const;
+        void merge(const State & other);
    };

    State state TSA_GUARDED_BY(mutex);
    mutable std::condition_variable state_changed;

    std::future<void> watching_thread_future;
-    std::atomic<bool> should_stop_watching_thread = false;
+    bool should_stop_watching_thread TSA_GUARDED_BY(mutex) = false;

-    struct FinishResult
-    {
-        bool succeeded = false;
-        std::exception_ptr exception;
-        bool other_hosts_also_finished = false;
-    };
-    FinishResult finish_result TSA_GUARDED_BY(mutex);
+    bool query_is_sent_to_other_hosts TSA_GUARDED_BY(mutex) = false;
+    bool tried_to_finish TSA_GUARDED_BY(mutex) = false;
+    bool tried_to_set_error TSA_GUARDED_BY(mutex) = false;

    mutable std::mutex mutex;
 };
--- a/src/Backups/BackupsWorker.cpp
+++ b/src/Backups/BackupsWorker.cpp
@ -329,6 +329,7 @@ std::pair<OperationID, BackupStatus> BackupsWorker::start(const ASTPtr & backup_
 struct BackupsWorker::BackupStarter
 {
    BackupsWorker & backups_worker;
+    LoggerPtr log;
    std::shared_ptr<ASTBackupQuery> backup_query;
    ContextPtr query_context; /// We have to keep `query_context` until the end of the operation because a pointer to it is stored inside the ThreadGroup we're using.
    ContextMutablePtr backup_context;
@ -345,6 +346,7 @@ struct BackupsWorker::BackupStarter

    BackupStarter(BackupsWorker & backups_worker_, const ASTPtr & query_, const ContextPtr & context_)
        : backups_worker(backups_worker_)
+        , log(backups_worker.log)
        , backup_query(std::static_pointer_cast<ASTBackupQuery>(query_->clone()))
        , query_context(context_)
        , backup_context(Context::createCopy(query_context))
@ -399,9 +401,20 @@ struct BackupsWorker::BackupStarter
        chassert(!backup);
        backup = backups_worker.openBackupForWriting(backup_info, backup_settings, backup_coordination, backup_context);

-        backups_worker.doBackup(
-            backup, backup_query, backup_id, backup_name_for_logging, backup_settings, backup_coordination, backup_context,
+        backups_worker.doBackup(backup, backup_query, backup_id, backup_settings, backup_coordination, backup_context,
                                on_cluster, cluster);
+
+        backup_coordination->finish(/* throw_if_error = */ true);
+        backup.reset();
+
+        /// The backup coordination is not needed anymore.
+        if (!is_internal_backup)
+            backup_coordination->cleanup(/* throw_if_error = */ true);
+        backup_coordination.reset();
+
+        /// NOTE: setStatus is called after setNumFilesAndSize in order to have actual information in a backup log record
+        LOG_INFO(log, "{} {} was created successfully", (is_internal_backup ? "Internal backup" : "Backup"), backup_name_for_logging);
+        backups_worker.setStatus(backup_id, BackupStatus::BACKUP_CREATED);
    }

    void onException()
@ -416,15 +429,28 @@ struct BackupsWorker::BackupStarter
        if (backup && !backup->setIsCorrupted())
            should_remove_files_in_backup = false;

-        if (backup_coordination && backup_coordination->trySetError(std::current_exception()))
-        {
-            bool other_hosts_finished = backup_coordination->tryWaitForOtherHostsToFinishAfterError();
+        bool all_hosts_finished = false;

-            if (should_remove_files_in_backup && other_hosts_finished)
+        if (backup_coordination && backup_coordination->setError(std::current_exception(), /* throw_if_error = */ false))
+        {
+            bool other_hosts_finished = !is_internal_backup
+                && (!backup_coordination->isBackupQuerySentToOtherHosts() || backup_coordination->waitOtherHostsFinish(/* throw_if_error = */ false));
+
+            all_hosts_finished = backup_coordination->finish(/* throw_if_error = */ false) && other_hosts_finished;
+        }
+
+        if (!all_hosts_finished)
+            should_remove_files_in_backup = false;
+
+        if (backup && should_remove_files_in_backup)
            backup->tryRemoveAllFiles();

-            backup_coordination->tryFinishAfterError();
-        }
+        backup.reset();
+
+        if (backup_coordination && all_hosts_finished)
+            backup_coordination->cleanup(/* throw_if_error = */ false);
+
+        backup_coordination.reset();

        backups_worker.setStatusSafe(backup_id, getBackupStatusFromCurrentException());
    }
@ -497,7 +523,6 @@ void BackupsWorker::doBackup(
    BackupMutablePtr backup,
    const std::shared_ptr<ASTBackupQuery> & backup_query,
    const OperationID & backup_id,
-    const String & backup_name_for_logging,
    const BackupSettings & backup_settings,
    std::shared_ptr<IBackupCoordination> backup_coordination,
    ContextMutablePtr context,
@ -521,10 +546,10 @@ void BackupsWorker::doBackup(
        backup_settings.copySettingsToQuery(*backup_query);
        sendQueryToOtherHosts(*backup_query, cluster, backup_settings.shard_num, backup_settings.replica_num,
                              context, required_access, backup_coordination->getOnClusterInitializationKeeperRetriesInfo());
-        backup_coordination->setBackupQueryWasSentToOtherHosts();
+        backup_coordination->setBackupQueryIsSentToOtherHosts();

        /// Wait until all the hosts have written their backup entries.
-        backup_coordination->waitForOtherHostsToFinish();
+        backup_coordination->waitOtherHostsFinish(/* throw_if_error = */ true);
    }
    else
    {
@ -569,18 +594,8 @@ void BackupsWorker::doBackup(
        compressed_size = backup->getCompressedSize();
    }

-    /// Close the backup.
-    backup.reset();
-
-    /// The backup coordination is not needed anymore.
-    backup_coordination->finish();
-
    /// NOTE: we need to update metadata again after backup->finalizeWriting(), because backup metadata is written there.
    setNumFilesAndSize(backup_id, num_files, total_size, num_entries, uncompressed_size, compressed_size, 0, 0);
-
-    /// NOTE: setStatus is called after setNumFilesAndSize in order to have actual information in a backup log record
-    LOG_INFO(log, "{} {} was created successfully", (is_internal_backup ? "Internal backup" : "Backup"), backup_name_for_logging);
-    setStatus(backup_id, BackupStatus::BACKUP_CREATED);
 }


@ -687,6 +702,7 @@ void BackupsWorker::writeBackupEntries(
 struct BackupsWorker::RestoreStarter
 {
    BackupsWorker & backups_worker;
+    LoggerPtr log;
    std::shared_ptr<ASTBackupQuery> restore_query;
    ContextPtr query_context; /// We have to keep `query_context` until the end of the operation because a pointer to it is stored inside the ThreadGroup we're using.
    ContextMutablePtr restore_context;
@ -702,6 +718,7 @@ struct BackupsWorker::RestoreStarter

    RestoreStarter(BackupsWorker & backups_worker_, const ASTPtr & query_, const ContextPtr & context_)
        : backups_worker(backups_worker_)
+        , log(backups_worker.log)
        , restore_query(std::static_pointer_cast<ASTBackupQuery>(query_->clone()))
        , query_context(context_)
        , restore_context(Context::createCopy(query_context))
@ -753,16 +770,17 @@ struct BackupsWorker::RestoreStarter
        }
        restore_coordination = backups_worker.makeRestoreCoordination(on_cluster, restore_settings, restore_context);

-        backups_worker.doRestore(
-            restore_query,
-            restore_id,
-            backup_name_for_logging,
-            backup_info,
-            restore_settings,
-            restore_coordination,
-            restore_context,
-            on_cluster,
-            cluster);
+        backups_worker.doRestore(restore_query, restore_id, backup_info, restore_settings, restore_coordination, restore_context,
+                                 on_cluster, cluster);
+
+        /// The restore coordination is not needed anymore.
+        restore_coordination->finish(/* throw_if_error = */ true);
+        if (!is_internal_restore)
+            restore_coordination->cleanup(/* throw_if_error = */ true);
+        restore_coordination.reset();
+
+        LOG_INFO(log, "Restored from {} {} successfully", (is_internal_restore ? "internal backup" : "backup"), backup_name_for_logging);
+        backups_worker.setStatus(restore_id, BackupStatus::RESTORED);
    }

    void onException()
@ -770,12 +788,16 @@ struct BackupsWorker::RestoreStarter
        /// Something bad happened, some data were not restored.
        tryLogCurrentException(backups_worker.log, fmt::format("Failed to restore from {} {}", (is_internal_restore ? "internal backup" : "backup"), backup_name_for_logging));

-        if (restore_coordination && restore_coordination->trySetError(std::current_exception()))
+        if (restore_coordination && restore_coordination->setError(std::current_exception(), /* throw_if_error = */ false))
        {
-            restore_coordination->tryWaitForOtherHostsToFinishAfterError();
-            restore_coordination->tryFinishAfterError();
+            bool other_hosts_finished = !is_internal_restore
+                && (!restore_coordination->isRestoreQuerySentToOtherHosts() || restore_coordination->waitOtherHostsFinish(/* throw_if_error = */ false));
+            if (restore_coordination->finish(/* throw_if_error = */ false) && other_hosts_finished)
+                restore_coordination->cleanup(/* throw_if_error = */ false);
        }

+        restore_coordination.reset();
+
        backups_worker.setStatusSafe(restore_id, getRestoreStatusFromCurrentException());
    }
 };
@ -838,7 +860,6 @@ BackupPtr BackupsWorker::openBackupForReading(const BackupInfo & backup_info, co
 void BackupsWorker::doRestore(
    const std::shared_ptr<ASTBackupQuery> & restore_query,
    const OperationID & restore_id,
-    const String & backup_name_for_logging,
    const BackupInfo & backup_info,
    RestoreSettings restore_settings,
    std::shared_ptr<IRestoreCoordination> restore_coordination,
@ -882,10 +903,10 @@ void BackupsWorker::doRestore(
        restore_settings.copySettingsToQuery(*restore_query);
        sendQueryToOtherHosts(*restore_query, cluster, restore_settings.shard_num, restore_settings.replica_num,
                              context, {}, restore_coordination->getOnClusterInitializationKeeperRetriesInfo());
-        restore_coordination->setRestoreQueryWasSentToOtherHosts();
+        restore_coordination->setRestoreQueryIsSentToOtherHosts();

        /// Wait until all the hosts have done with their restoring work.
-        restore_coordination->waitForOtherHostsToFinish();
+        restore_coordination->waitOtherHostsFinish(/* throw_if_error = */ true);
    }
    else
    {
@ -905,12 +926,6 @@ void BackupsWorker::doRestore(
                                    backup, context, getThreadPool(ThreadPoolId::RESTORE), after_task_callback};
        restorer.run(RestorerFromBackup::RESTORE);
    }
-
-    /// The restore coordination is not needed anymore.
-    restore_coordination->finish();
-
-    LOG_INFO(log, "Restored from {} {} successfully", (is_internal_restore ? "internal backup" : "backup"), backup_name_for_logging);
-    setStatus(restore_id, BackupStatus::RESTORED);
 }


@ -943,7 +958,7 @@ BackupsWorker::makeBackupCoordination(bool on_cluster, const BackupSettings & ba
    if (!on_cluster)
    {
        return std::make_shared<BackupCoordinationLocal>(
-            *backup_settings.backup_uuid, !backup_settings.deduplicate_files, allow_concurrent_backups, *concurrency_counters);
+            !backup_settings.deduplicate_files, allow_concurrent_backups, *concurrency_counters);
    }

    bool is_internal_backup = backup_settings.internal;
@ -981,8 +996,7 @@ BackupsWorker::makeRestoreCoordination(bool on_cluster, const RestoreSettings &
 {
    if (!on_cluster)
    {
-        return std::make_shared<RestoreCoordinationLocal>(
-            *restore_settings.restore_uuid, allow_concurrent_restores, *concurrency_counters);
+        return std::make_shared<RestoreCoordinationLocal>(allow_concurrent_restores, *concurrency_counters);
    }

    bool is_internal_restore = restore_settings.internal;
--- a/src/Backups/BackupsWorker.h
+++ b/src/Backups/BackupsWorker.h
@ -81,7 +81,6 @@ private:
        BackupMutablePtr backup,
        const std::shared_ptr<ASTBackupQuery> & backup_query,
        const BackupOperationID & backup_id,
-        const String & backup_name_for_logging,
        const BackupSettings & backup_settings,
        std::shared_ptr<IBackupCoordination> backup_coordination,
        ContextMutablePtr context,
@ -102,7 +101,6 @@ private:
    void doRestore(
        const std::shared_ptr<ASTBackupQuery> & restore_query,
        const BackupOperationID & restore_id,
-        const String & backup_name_for_logging,
        const BackupInfo & backup_info,
        RestoreSettings restore_settings,
        std::shared_ptr<IRestoreCoordination> restore_coordination,
--- a/src/Backups/IBackupCoordination.h
+++ b/src/Backups/IBackupCoordination.h
@ -20,29 +20,27 @@ class IBackupCoordination
 public:
    virtual ~IBackupCoordination() = default;

+    /// Records that the backup query has been sent to other hosts.
+    /// Function waitOtherHostsFinish() checks this to find out whether it really has to wait or not.
+    virtual void setBackupQueryIsSentToOtherHosts() = 0;
+    virtual bool isBackupQuerySentToOtherHosts() const = 0;
+
    /// Sets the current stage and waits for other hosts to come to this stage too.
    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;

-    /// Sets that the backup query was sent to other hosts.
-    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
-    virtual void setBackupQueryWasSentToOtherHosts() = 0;
-
    /// Lets other hosts know that the current host has encountered an error.
-    virtual bool trySetError(std::exception_ptr exception) = 0;
-
-    /// Lets other hosts know that the current host has finished its work.
-    virtual void finish() = 0;
-
-    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
-    virtual bool tryFinishAfterError() noexcept = 0;
+    /// Returns true if the information is successfully passed so other hosts can read it.
+    virtual bool setError(std::exception_ptr exception, bool throw_if_error) = 0;

    /// Waits until all the other hosts finish their work.
    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
-    virtual void waitForOtherHostsToFinish() = 0;
+    virtual bool waitOtherHostsFinish(bool throw_if_error) const = 0;

-    /// Waits until all the other hosts finish their work (as a part of error-handling process).
-    /// Doesn't stops waiting if some host encounters an error or gets cancelled.
-    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;
+    /// Lets other hosts know that the current host has finished its work.
+    virtual bool finish(bool throw_if_error) = 0;
+
+    /// Removes temporary nodes in ZooKeeper.
+    virtual bool cleanup(bool throw_if_error) = 0;

    struct PartNameAndChecksum
    {
--- a/src/Backups/IRestoreCoordination.h
+++ b/src/Backups/IRestoreCoordination.h
@ -18,29 +18,27 @@ class IRestoreCoordination
 public:
    virtual ~IRestoreCoordination() = default;

+    /// Records that the restore query has been sent to other hosts.
+    /// Function waitOtherHostsFinish() checks this to find out whether it really has to wait or not.
+    virtual void setRestoreQueryIsSentToOtherHosts() = 0;
+    virtual bool isRestoreQuerySentToOtherHosts() const = 0;
+
    /// Sets the current stage and waits for other hosts to come to this stage too.
    virtual Strings setStage(const String & new_stage, const String & message, bool sync) = 0;

-    /// Sets that the restore query was sent to other hosts.
-    /// Function waitForOtherHostsToFinish() will check that to find out if it should really wait or not.
-    virtual void setRestoreQueryWasSentToOtherHosts() = 0;
-
    /// Lets other hosts know that the current host has encountered an error.
-    virtual bool trySetError(std::exception_ptr exception) = 0;
-
-    /// Lets other hosts know that the current host has finished its work.
-    virtual void finish() = 0;
-
-    /// Lets other hosts know that the current host has finished its work (as a part of error-handling process).
-    virtual bool tryFinishAfterError() noexcept = 0;
+    /// Returns true if the information is successfully passed so other hosts can read it.
+    virtual bool setError(std::exception_ptr exception, bool throw_if_error) = 0;

    /// Waits until all the other hosts finish their work.
    /// Stops waiting and throws an exception if another host encounters an error or if some host gets cancelled.
-    virtual void waitForOtherHostsToFinish() = 0;
+    virtual bool waitOtherHostsFinish(bool throw_if_error) const = 0;

-    /// Waits until all the other hosts finish their work (as a part of error-handling process).
-    /// Doesn't stops waiting if some host encounters an error or gets cancelled.
-    virtual bool tryWaitForOtherHostsToFinishAfterError() noexcept = 0;
+    /// Lets other hosts know that the current host has finished its work.
+    virtual bool finish(bool throw_if_error) = 0;
+
+    /// Removes temporary nodes in ZooKeeper.
+    virtual bool cleanup(bool throw_if_error) = 0;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    virtual bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) = 0;
--- a/src/Backups/RestoreCoordinationLocal.cpp
+++ b/src/Backups/RestoreCoordinationLocal.cpp
@ -10,9 +10,9 @@ namespace DB
 {

 RestoreCoordinationLocal::RestoreCoordinationLocal(
-    const UUID & restore_uuid, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_)
+    bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_)
    : log(getLogger("RestoreCoordinationLocal"))
-    , concurrency_check(restore_uuid, /* is_restore = */ true, /* on_cluster = */ false, allow_concurrent_restore_, concurrency_counters_)
+    , concurrency_check(/* is_restore = */ true, /* on_cluster = */ false, /* zookeeper_path = */ "", allow_concurrent_restore_, concurrency_counters_)
 {
 }

--- a/src/Backups/RestoreCoordinationLocal.h
+++ b/src/Backups/RestoreCoordinationLocal.h
@ -17,16 +17,16 @@ class ASTCreateQuery;
 class RestoreCoordinationLocal : public IRestoreCoordination
 {
 public:
-    RestoreCoordinationLocal(const UUID & restore_uuid_, bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_);
+    RestoreCoordinationLocal(bool allow_concurrent_restore_, BackupConcurrencyCounters & concurrency_counters_);
    ~RestoreCoordinationLocal() override;

+    void setRestoreQueryIsSentToOtherHosts() override {}
+    bool isRestoreQuerySentToOtherHosts() const override { return false; }
    Strings setStage(const String &, const String &, bool) override { return {}; }
-    void setRestoreQueryWasSentToOtherHosts() override {}
-    bool trySetError(std::exception_ptr) override { return true; }
-    void finish() override {}
-    bool tryFinishAfterError() noexcept override { return true; }
-    void waitForOtherHostsToFinish() override {}
-    bool tryWaitForOtherHostsToFinishAfterError() noexcept override { return true; }
+    bool setError(std::exception_ptr, bool) override { return true; }
+    bool waitOtherHostsFinish(bool) const override { return true; }
+    bool finish(bool) override { return true; }
+    bool cleanup(bool) override { return true; }

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
--- a/src/Backups/RestoreCoordinationOnCluster.cpp
+++ b/src/Backups/RestoreCoordinationOnCluster.cpp
@ -35,17 +35,21 @@ RestoreCoordinationOnCluster::RestoreCoordinationOnCluster(
    , current_host_index(BackupCoordinationOnCluster::findCurrentHostIndex(current_host, all_hosts))
    , log(getLogger("RestoreCoordinationOnCluster"))
    , with_retries(log, get_zookeeper_, keeper_settings, process_list_element_, [root_zookeeper_path_](Coordination::ZooKeeperWithFaultInjection::Ptr zk) { zk->sync(root_zookeeper_path_); })
-    , concurrency_check(restore_uuid_, /* is_restore = */ true, /* on_cluster = */ true, allow_concurrent_restore_, concurrency_counters_)
-    , stage_sync(/* is_restore = */ true, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_restore_, with_retries, schedule_, process_list_element_, log)
-    , cleaner(zookeeper_path, with_retries, log)
+    , cleaner(/* is_restore = */ true, zookeeper_path, with_retries, log)
+    , stage_sync(/* is_restore = */ true, fs::path{zookeeper_path} / "stage", current_host, all_hosts, allow_concurrent_restore_, concurrency_counters_, with_retries, schedule_, process_list_element_, log)
+{
+    try
    {
        createRootNodes();
    }
-
-RestoreCoordinationOnCluster::~RestoreCoordinationOnCluster()
+    catch (...)
    {
-    tryFinishImpl();
+        stage_sync.setError(std::current_exception(), /* throw_if_error = */ false);
+        throw;
    }
+}
+
+RestoreCoordinationOnCluster::~RestoreCoordinationOnCluster() = default;

 void RestoreCoordinationOnCluster::createRootNodes()
 {
@ -66,69 +70,52 @@ void RestoreCoordinationOnCluster::createRootNodes()
        });
 }

+void RestoreCoordinationOnCluster::setRestoreQueryIsSentToOtherHosts()
+{
+    stage_sync.setQueryIsSentToOtherHosts(); /// This override only forwards the call to `stage_sync`.
+}
+
+bool RestoreCoordinationOnCluster::isRestoreQuerySentToOtherHosts() const
+{
+    return stage_sync.isQuerySentToOtherHosts(); /// Delegates to `stage_sync` and returns its answer unchanged.
+}
+
 Strings RestoreCoordinationOnCluster::setStage(const String & new_stage, const String & message, bool sync)
 {
    stage_sync.setStage(new_stage, message);
-
-    if (!sync)
+    if (sync)
+        return stage_sync.waitHostsReachStage(all_hosts_without_initiator, new_stage);
    return {};
-
-    return stage_sync.waitForHostsToReachStage(new_stage, all_hosts_without_initiator);
 }

-void RestoreCoordinationOnCluster::setRestoreQueryWasSentToOtherHosts()
+bool RestoreCoordinationOnCluster::setError(std::exception_ptr exception, bool throw_if_error)
 {
-    restore_query_was_sent_to_other_hosts = true;
+    return stage_sync.setError(exception, throw_if_error);
 }

-bool RestoreCoordinationOnCluster::trySetError(std::exception_ptr exception)
+bool RestoreCoordinationOnCluster::waitOtherHostsFinish(bool throw_if_error) const
 {
-    return stage_sync.trySetError(exception);
+    return stage_sync.waitOtherHostsFinish(throw_if_error);
 }

-void RestoreCoordinationOnCluster::finish()
+bool RestoreCoordinationOnCluster::finish(bool throw_if_error)
 {
-    bool other_hosts_also_finished = false;
-    stage_sync.finish(other_hosts_also_finished);
-
-    if ((current_host == kInitiator) && (other_hosts_also_finished || !restore_query_was_sent_to_other_hosts))
-        cleaner.cleanup();
+    return stage_sync.finish(throw_if_error);
 }

-bool RestoreCoordinationOnCluster::tryFinishAfterError() noexcept
+bool RestoreCoordinationOnCluster::cleanup(bool throw_if_error)
 {
-    return tryFinishImpl();
-}
-
-bool RestoreCoordinationOnCluster::tryFinishImpl() noexcept
+    /// All the hosts must finish before we remove the coordination nodes.
+    bool expect_other_hosts_finished = stage_sync.isQuerySentToOtherHosts() || !stage_sync.isErrorSet();
+    bool all_hosts_finished = stage_sync.finished() && (stage_sync.otherHostsFinished() || !expect_other_hosts_finished);
+    if (!all_hosts_finished)
    {
-    bool other_hosts_also_finished = false;
-    if (!stage_sync.tryFinishAfterError(other_hosts_also_finished))
-        return false;
-
-    if ((current_host == kInitiator) && (other_hosts_also_finished || !restore_query_was_sent_to_other_hosts))
-    {
-        if (!cleaner.tryCleanupAfterError())
+        auto unfinished_hosts = expect_other_hosts_finished ? stage_sync.getUnfinishedHosts() : Strings{current_host};
+        LOG_INFO(log, "Skipping removing nodes from ZooKeeper because hosts {} didn't finish",
+                 BackupCoordinationStageSync::getHostsDesc(unfinished_hosts));
        return false;
    }
-
-    return true;
-}
-
-void RestoreCoordinationOnCluster::waitForOtherHostsToFinish()
-{
-    if ((current_host != kInitiator) || !restore_query_was_sent_to_other_hosts)
-        return;
-    stage_sync.waitForOtherHostsToFinish();
-}
-
-bool RestoreCoordinationOnCluster::tryWaitForOtherHostsToFinishAfterError() noexcept
-{
-    if (current_host != kInitiator)
-        return false;
-    if (!restore_query_was_sent_to_other_hosts)
-        return true;
-    return stage_sync.tryWaitForOtherHostsToFinishAfterError();
+    return cleaner.cleanup(throw_if_error);
 }

 ZooKeeperRetriesInfo RestoreCoordinationOnCluster::getOnClusterInitializationKeeperRetriesInfo() const
--- a/src/Backups/RestoreCoordinationOnCluster.h
+++ b/src/Backups/RestoreCoordinationOnCluster.h
@ -1,7 +1,6 @@
 #pragma once

 #include <Backups/IRestoreCoordination.h>
-#include <Backups/BackupConcurrencyCheck.h>
 #include <Backups/BackupCoordinationCleaner.h>
 #include <Backups/BackupCoordinationStageSync.h>
 #include <Backups/WithRetries.h>
@ -15,7 +14,7 @@ class RestoreCoordinationOnCluster : public IRestoreCoordination
 {
 public:
    /// Empty string as the current host is used to mark the initiator of a RESTORE ON CLUSTER query.
-    static const constexpr std::string_view kInitiator;
+    static const constexpr std::string_view kInitiator = BackupCoordinationStageSync::kInitiator;

    RestoreCoordinationOnCluster(
        const UUID & restore_uuid_,
@ -31,13 +30,13 @@ public:

    ~RestoreCoordinationOnCluster() override;

+    void setRestoreQueryIsSentToOtherHosts() override;
+    bool isRestoreQuerySentToOtherHosts() const override;
    Strings setStage(const String & new_stage, const String & message, bool sync) override;
-    void setRestoreQueryWasSentToOtherHosts() override;
-    bool trySetError(std::exception_ptr exception) override;
-    void finish() override;
-    bool tryFinishAfterError() noexcept override;
-    void waitForOtherHostsToFinish() override;
-    bool tryWaitForOtherHostsToFinishAfterError() noexcept override;
+    bool setError(std::exception_ptr exception, bool throw_if_error) override;
+    bool waitOtherHostsFinish(bool throw_if_error) const override;
+    bool finish(bool throw_if_error) override;
+    bool cleanup(bool throw_if_error) override;

    /// Starts creating a table in a replicated database. Returns false if there is another host which is already creating this table.
    bool acquireCreatingTableInReplicatedDatabase(const String & database_zk_path, const String & table_name) override;
@ -78,11 +77,10 @@ private:
    const size_t current_host_index;
    LoggerPtr const log;

+    /// The order is important: `stage_sync` must be initialized after `with_retries` and `cleaner`.
    const WithRetries with_retries;
-    BackupConcurrencyCheck concurrency_check;
-    BackupCoordinationStageSync stage_sync;
    BackupCoordinationCleaner cleaner;
-    std::atomic<bool> restore_query_was_sent_to_other_hosts = false;
+    BackupCoordinationStageSync stage_sync;
 };

 }
--- a/src/Client/ClientBase.cpp
+++ b/src/Client/ClientBase.cpp
@ -68,15 +68,16 @@
 #include <Access/AccessControl.h>
 #include <Storages/ColumnsDescription.h>

-#include <boost/algorithm/string/case_conv.hpp>
-#include <boost/algorithm/string/replace.hpp>
-#include <iostream>
 #include <filesystem>
+#include <iostream>
 #include <limits>
 #include <map>
 #include <memory>
+#include <mutex>
 #include <string_view>
 #include <unordered_map>
+#include <boost/algorithm/string/case_conv.hpp>
+#include <boost/algorithm/string/replace.hpp>

 #include <Common/config_version.h>
 #include <base/find_symbols.h>
@ -441,9 +442,15 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query)

    /// If results are written INTO OUTFILE, we can avoid clearing progress to avoid flicker.
    if (need_render_progress && tty_buf && (!select_into_file || select_into_file_and_stdout))
-        progress_indication.clearProgressOutput(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_indication.clearProgressOutput(*tty_buf, lock);
+    }
    if (need_render_progress_table && tty_buf && (!select_into_file || select_into_file_and_stdout))
-        progress_table.clearTableOutput(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_table.clearTableOutput(*tty_buf, lock);
+    }

    try
    {
@ -464,13 +471,15 @@ void ClientBase::onData(Block & block, ASTPtr parsed_query)
    {
        if (select_into_file && !select_into_file_and_stdout)
            error_stream << "\r";
-        progress_indication.writeProgress(*tty_buf);
+        std::unique_lock lock(tty_mutex);
+        progress_indication.writeProgress(*tty_buf, lock);
    }
    if (need_render_progress_table && tty_buf && !cancelled)
    {
        if (!need_render_progress && select_into_file && !select_into_file_and_stdout)
            error_stream << "\r";
-        progress_table.writeTable(*tty_buf, progress_table_toggle_on.load(), progress_table_toggle_enabled);
+        std::unique_lock lock(tty_mutex);
+        progress_table.writeTable(*tty_buf, lock, progress_table_toggle_on.load(), progress_table_toggle_enabled);
    }
 }

@ -479,9 +488,15 @@ void ClientBase::onLogData(Block & block)
 {
    initLogsOutputStream();
    if (need_render_progress && tty_buf)
-        progress_indication.clearProgressOutput(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_indication.clearProgressOutput(*tty_buf, lock);
+    }
    if (need_render_progress_table && tty_buf)
-        progress_table.clearTableOutput(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_table.clearTableOutput(*tty_buf, lock);
+    }
    logs_out_stream->writeLogs(block);
    logs_out_stream->flush();
 }
@ -1151,34 +1166,8 @@ void ClientBase::receiveResult(ASTPtr parsed_query, Int32 signals_before_stop, b

    std::exception_ptr local_format_error;

-    if (keystroke_interceptor)
-    {
-        progress_table_toggle_on = false;
-        try
-        {
-            keystroke_interceptor->startIntercept();
-        }
-        catch (const DB::Exception &)
-        {
-            error_stream << getCurrentExceptionMessage(false);
-            keystroke_interceptor.reset();
-        }
-    }
-
-    SCOPE_EXIT({
-        if (keystroke_interceptor)
-        {
-            try
-            {
-                keystroke_interceptor->stopIntercept();
-            }
-            catch (...)
-            {
-                error_stream << getCurrentExceptionMessage(false);
-                keystroke_interceptor.reset();
-            }
-        }
-    });
+    startKeystrokeInterceptorIfExists();
+    SCOPE_EXIT({ stopKeystrokeInterceptorIfExists(); });

    while (true)
    {
@ -1318,7 +1307,10 @@ void ClientBase::onProgress(const Progress & value)
        output_format->onProgress(value);

    if (need_render_progress && tty_buf)
-        progress_indication.writeProgress(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_indication.writeProgress(*tty_buf, lock);
+    }
 }

 void ClientBase::onTimezoneUpdate(const String & tz)
@ -1330,9 +1322,15 @@ void ClientBase::onTimezoneUpdate(const String & tz)
 void ClientBase::onEndOfStream()
 {
    if (need_render_progress && tty_buf)
-        progress_indication.clearProgressOutput(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_indication.clearProgressOutput(*tty_buf, lock);
+    }
    if (need_render_progress_table && tty_buf)
-        progress_table.clearTableOutput(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_table.clearTableOutput(*tty_buf, lock);
+    }

    if (output_format)
    {
@ -1414,11 +1412,15 @@ void ClientBase::onProfileEvents(Block & block)
        progress_table.updateTable(block);

        if (need_render_progress && tty_buf)
-            progress_indication.writeProgress(*tty_buf);
+        {
+            std::unique_lock lock(tty_mutex);
+            progress_indication.writeProgress(*tty_buf, lock);
+        }
        if (need_render_progress_table && tty_buf && !cancelled)
        {
            bool toggle_enabled = getClientConfiguration().getBool("enable-progress-table-toggle", true);
-            progress_table.writeTable(*tty_buf, progress_table_toggle_on.load(), toggle_enabled);
+            std::unique_lock lock(tty_mutex);
+            progress_table.writeTable(*tty_buf, lock, progress_table_toggle_on.load(), toggle_enabled);
        }

        if (profile_events.print)
@ -1429,9 +1431,15 @@ void ClientBase::onProfileEvents(Block & block)
                profile_events.watch.restart();
                initLogsOutputStream();
                if (need_render_progress && tty_buf)
-                    progress_indication.clearProgressOutput(*tty_buf);
+                {
+                    std::unique_lock lock(tty_mutex);
+                    progress_indication.clearProgressOutput(*tty_buf, lock);
+                }
                if (need_render_progress_table && tty_buf)
-                    progress_table.clearTableOutput(*tty_buf);
+                {
+                    std::unique_lock lock(tty_mutex);
+                    progress_table.clearTableOutput(*tty_buf, lock);
+                }
                logs_out_stream->writeProfileEvents(block);
                logs_out_stream->flush();

@ -1450,7 +1458,10 @@ void ClientBase::onProfileEvents(Block & block)
 void ClientBase::resetOutput()
 {
    if (need_render_progress_table && tty_buf)
-        progress_table.clearTableOutput(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_table.clearTableOutput(*tty_buf, lock);
+    }

    /// Order is important: format, compression, file

@ -1619,6 +1630,9 @@ void ClientBase::processInsertQuery(const String & query_to_execute, ASTPtr pars
    if (send_external_tables)
        sendExternalTables(parsed_query);

+    startKeystrokeInterceptorIfExists();
+    SCOPE_EXIT({ stopKeystrokeInterceptorIfExists(); });
+
    /// Receive description of table structure.
    Block sample;
    ColumnsDescription columns_description;
@ -1665,7 +1679,7 @@ void ClientBase::sendData(Block & sample, const ColumnsDescription & columns_des

        /// Set callback to be called on file progress.
        if (tty_buf)
-            progress_indication.setFileProgressCallback(client_context, *tty_buf);
+            progress_indication.setFileProgressCallback(client_context, *tty_buf, tty_mutex);
    }

    /// If data fetched from file (maybe compressed file)
@ -1947,9 +1961,15 @@ void ClientBase::cancelQuery()
        }

    if (need_render_progress && tty_buf)
-        progress_indication.clearProgressOutput(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_indication.clearProgressOutput(*tty_buf, lock);
+    }
    if (need_render_progress_table && tty_buf)
-        progress_table.clearTableOutput(*tty_buf);
+    {
+        std::unique_lock lock(tty_mutex);
+        progress_table.clearTableOutput(*tty_buf, lock);
+    }

    if (is_interactive)
        output_stream << "Cancelling query." << std::endl;
@ -2112,9 +2132,15 @@ void ClientBase::processParsedSingleQuery(const String & full_query, const Strin
    {
        initLogsOutputStream();
        if (need_render_progress && tty_buf)
-            progress_indication.clearProgressOutput(*tty_buf);
+        {
+            std::unique_lock lock(tty_mutex);
+            progress_indication.clearProgressOutput(*tty_buf, lock);
+        }
        if (need_render_progress_table && tty_buf)
-            progress_table.clearTableOutput(*tty_buf);
+        {
+            std::unique_lock lock(tty_mutex);
+            progress_table.clearTableOutput(*tty_buf, lock);
+        }
        logs_out_stream->writeProfileEvents(profile_events.last_block);
        logs_out_stream->flush();

@ -2613,6 +2639,39 @@ bool ClientBase::addMergeTreeSettings(ASTCreateQuery & ast_create)
    return added_new_setting;
 }

+/// Starts keystroke interception when an interceptor is present; does nothing otherwise.
+/// The progress table toggle is switched off before the interceptor is started.
+/// If starting throws a DB::Exception, the message goes to the error stream and the interceptor is dropped.
+void ClientBase::startKeystrokeInterceptorIfExists()
+{
+    if (!keystroke_interceptor)
+        return;
+
+    progress_table_toggle_on = false;
+
+    try
+    {
+        keystroke_interceptor->startIntercept();
+    }
+    catch (const DB::Exception &)
+    {
+        /// Report the failure and continue without an interceptor.
+        error_stream << getCurrentExceptionMessage(false);
+        keystroke_interceptor.reset();
+    }
+}
+
+/// Stops keystroke interception when an interceptor is present; does nothing otherwise.
+/// Any exception thrown while stopping is written to the error stream and the interceptor is dropped.
+void ClientBase::stopKeystrokeInterceptorIfExists()
+{
+    if (!keystroke_interceptor)
+        return;
+
+    try
+    {
+        keystroke_interceptor->stopIntercept();
+    }
+    catch (...)
+    {
+        /// Nothing escapes from here: every exception type is caught and reported.
+        error_stream << getCurrentExceptionMessage(false);
+        keystroke_interceptor.reset();
+    }
+}
+
 void ClientBase::runInteractive()
 {
    if (getClientConfiguration().has("query_id"))
--- a/src/Client/ClientBase.h
+++ b/src/Client/ClientBase.h
@ -208,6 +208,9 @@ private:
    void initQueryIdFormats();
    bool addMergeTreeSettings(ASTCreateQuery & ast_create);

+    void startKeystrokeInterceptorIfExists();
+    void stopKeystrokeInterceptorIfExists();
+
 protected:

    class QueryInterruptHandler : private boost::noncopyable
@ -325,6 +328,7 @@ protected:
    /// /dev/tty if accessible or std::cerr - for progress bar.
    /// We prefer to output progress bar directly to tty to allow user to redirect stdout and stderr and still get the progress indication.
    std::unique_ptr<WriteBufferFromFileDescriptor> tty_buf;
+    std::mutex tty_mutex;

    String home_path;
    String history_file; /// Path to a file containing command history.
--- a/src/Client/ClientBaseHelpers.cpp
+++ b/src/Client/ClientBaseHelpers.cpp
@ -140,8 +140,6 @@ void highlight(const String & query, std::vector<replxx::Replxx::Color> & colors
    /// We don't do highlighting for foreign dialects, such as PRQL and Kusto.
    /// Only normal ClickHouse SQL queries are highlighted.

-    /// Currently we highlight only the first query in the multi-query mode.
-
    ParserQuery parser(end, false, context.getSettingsRef()[Setting::implicit_select]);
    ASTPtr ast;
    bool parse_res = false;
--- a/src/Client/ProgressTable.cpp
+++ b/src/Client/ProgressTable.cpp
@ -14,6 +14,7 @@
 #include <Common/formatReadable.h>

 #include <format>
+#include <mutex>
 #include <numeric>
 #include <unordered_map>

@ -192,7 +193,8 @@ void writeWithWidthStrict(Out & out, std::string_view s, size_t width)

 }

-void ProgressTable::writeTable(WriteBufferFromFileDescriptor & message, bool show_table, bool toggle_enabled)
+void ProgressTable::writeTable(
+    WriteBufferFromFileDescriptor & message, std::unique_lock<std::mutex> &, bool show_table, bool toggle_enabled)
 {
    std::lock_guard lock{mutex};
    if (!show_table && toggle_enabled)
@ -360,7 +362,7 @@ void ProgressTable::updateTable(const Block & block)
    written_first_block = true;
 }

-void ProgressTable::clearTableOutput(WriteBufferFromFileDescriptor & message)
+void ProgressTable::clearTableOutput(WriteBufferFromFileDescriptor & message, std::unique_lock<std::mutex> &)
 {
    message << "\r" << CLEAR_TO_END_OF_SCREEN << SHOW_CURSOR;
    message.next();
--- a/src/Client/ProgressTable.h
+++ b/src/Client/ProgressTable.h
@ -27,8 +27,9 @@ public:
    }

    /// Write progress table with metrics.
-    void writeTable(WriteBufferFromFileDescriptor & message, bool show_table, bool toggle_enabled);
-    void clearTableOutput(WriteBufferFromFileDescriptor & message);
+    void writeTable(WriteBufferFromFileDescriptor & message, std::unique_lock<std::mutex> & message_lock,
+            bool show_table, bool toggle_enabled);
+    void clearTableOutput(WriteBufferFromFileDescriptor & message, std::unique_lock<std::mutex> & message_lock);
    void writeFinalTable();

    /// Update the metric values. They can be updated from:
--- a/src/Columns/ColumnArray.cpp
+++ b/src/Columns/ColumnArray.cpp
@ -662,6 +662,8 @@ ColumnPtr ColumnArray::filter(const Filter & filt, ssize_t result_size_hint) con
        return filterNumber<Int128>(filt, result_size_hint);
    if (typeid_cast<const ColumnInt256 *>(data.get()))
        return filterNumber<Int256>(filt, result_size_hint);
+    if (typeid_cast<const ColumnBFloat16 *>(data.get()))
+        return filterNumber<BFloat16>(filt, result_size_hint);
    if (typeid_cast<const ColumnFloat32 *>(data.get()))
        return filterNumber<Float32>(filt, result_size_hint);
    if (typeid_cast<const ColumnFloat64 *>(data.get()))
@ -1065,6 +1067,8 @@ ColumnPtr ColumnArray::replicate(const Offsets & replicate_offsets) const
        return replicateNumber<Int128>(replicate_offsets);
    if (typeid_cast<const ColumnInt256 *>(data.get()))
        return replicateNumber<Int256>(replicate_offsets);
+    if (typeid_cast<const ColumnBFloat16 *>(data.get()))
+        return replicateNumber<BFloat16>(replicate_offsets);
    if (typeid_cast<const ColumnFloat32 *>(data.get()))
        return replicateNumber<Float32>(replicate_offsets);
    if (typeid_cast<const ColumnFloat64 *>(data.get()))
--- a/src/Columns/ColumnUnique.cpp
+++ b/src/Columns/ColumnUnique.cpp
@ -16,6 +16,7 @@ template class ColumnUnique<ColumnInt128>;
 template class ColumnUnique<ColumnUInt128>;
 template class ColumnUnique<ColumnInt256>;
 template class ColumnUnique<ColumnUInt256>;
+template class ColumnUnique<ColumnBFloat16>;
 template class ColumnUnique<ColumnFloat32>;
 template class ColumnUnique<ColumnFloat64>;
 template class ColumnUnique<ColumnString>;
--- a/src/Columns/ColumnUnique.h
+++ b/src/Columns/ColumnUnique.h
@ -760,6 +760,7 @@ extern template class ColumnUnique<ColumnInt128>;
 extern template class ColumnUnique<ColumnUInt128>;
 extern template class ColumnUnique<ColumnInt256>;
 extern template class ColumnUnique<ColumnUInt256>;
+extern template class ColumnUnique<ColumnBFloat16>;
 extern template class ColumnUnique<ColumnFloat32>;
 extern template class ColumnUnique<ColumnFloat64>;
 extern template class ColumnUnique<ColumnString>;
--- a/src/Columns/ColumnVector.cpp
+++ b/src/Columns/ColumnVector.cpp
@ -118,9 +118,9 @@ struct ColumnVector<T>::less_stable
        if (unlikely(parent.data[lhs] == parent.data[rhs]))
            return lhs < rhs;

-        if constexpr (std::is_floating_point_v<T>)
+        if constexpr (is_floating_point<T>)
        {
-            if (unlikely(std::isnan(parent.data[lhs]) && std::isnan(parent.data[rhs])))
+            if (unlikely(isNaN(parent.data[lhs]) && isNaN(parent.data[rhs])))
            {
                return lhs < rhs;
            }
@ -150,9 +150,9 @@ struct ColumnVector<T>::greater_stable
        if (unlikely(parent.data[lhs] == parent.data[rhs]))
            return lhs < rhs;

-        if constexpr (std::is_floating_point_v<T>)
+        if constexpr (is_floating_point<T>)
        {
-            if (unlikely(std::isnan(parent.data[lhs]) && std::isnan(parent.data[rhs])))
+            if (unlikely(isNaN(parent.data[lhs]) && isNaN(parent.data[rhs])))
            {
                return lhs < rhs;
            }
@ -224,9 +224,9 @@ void ColumnVector<T>::getPermutation(IColumn::PermutationSortDirection direction

    iota(res.data(), data_size, IColumn::Permutation::value_type(0));

-    if constexpr (has_find_extreme_implementation<T> && !std::is_floating_point_v<T>)
+    if constexpr (has_find_extreme_implementation<T> && !is_floating_point<T>)
    {
-        /// Disabled for:floating point
+        /// Disabled for:
        /// * floating point: We don't deal with nan_direction_hint
        /// * stability::Stable: We might return any value, not the first
        if ((limit == 1) && (stability == IColumn::PermutationSortStability::Unstable))
@ -256,7 +256,7 @@ void ColumnVector<T>::getPermutation(IColumn::PermutationSortDirection direction
            bool sort_is_stable = stability == IColumn::PermutationSortStability::Stable;

            /// TODO: LSD RadixSort is currently not stable if direction is descending, or value is floating point
-            bool use_radix_sort = (sort_is_stable && ascending && !std::is_floating_point_v<T>) || !sort_is_stable;
+            bool use_radix_sort = (sort_is_stable && ascending && !is_floating_point<T>) || !sort_is_stable;

            /// Thresholds on size. Lower threshold is arbitrary. Upper threshold is chosen by the type for histogram counters.
            if (data_size >= 256 && data_size <= std::numeric_limits<UInt32>::max() && use_radix_sort)
@ -283,7 +283,7 @@ void ColumnVector<T>::getPermutation(IColumn::PermutationSortDirection direction

                /// Radix sort treats all NaNs to be greater than all numbers.
                /// If the user needs the opposite, we must move them accordingly.
-                if (std::is_floating_point_v<T> && nan_direction_hint < 0)
+                if (is_floating_point<T> && nan_direction_hint < 0)
                {
                    size_t nans_to_move = 0;

@ -330,7 +330,7 @@ void ColumnVector<T>::updatePermutation(IColumn::PermutationSortDirection direct
        if constexpr (is_arithmetic_v<T> && !is_big_int_v<T>)
        {
            /// TODO: LSD RadixSort is currently not stable if direction is descending, or value is floating point
-            bool use_radix_sort = (sort_is_stable && ascending && !std::is_floating_point_v<T>) || !sort_is_stable;
+            bool use_radix_sort = (sort_is_stable && ascending && !is_floating_point<T>) || !sort_is_stable;
            size_t size = end - begin;

            /// Thresholds on size. Lower threshold is arbitrary. Upper threshold is chosen by the type for histogram counters.
@ -353,7 +353,7 @@ void ColumnVector<T>::updatePermutation(IColumn::PermutationSortDirection direct

                /// Radix sort treats all NaNs to be greater than all numbers.
                /// If the user needs the opposite, we must move them accordingly.
-                if (std::is_floating_point_v<T> && nan_direction_hint < 0)
+                if (is_floating_point<T> && nan_direction_hint < 0)
                {
                    size_t nans_to_move = 0;

@ -1005,6 +1005,7 @@ template class ColumnVector<Int32>;
 template class ColumnVector<Int64>;
 template class ColumnVector<Int128>;
 template class ColumnVector<Int256>;
+template class ColumnVector<BFloat16>;
 template class ColumnVector<Float32>;
 template class ColumnVector<Float64>;
 template class ColumnVector<UUID>;
--- a/src/Columns/ColumnVector.h
+++ b/src/Columns/ColumnVector.h
@ -481,6 +481,7 @@ extern template class ColumnVector<Int32>;
 extern template class ColumnVector<Int64>;
 extern template class ColumnVector<Int128>;
 extern template class ColumnVector<Int256>;
+extern template class ColumnVector<BFloat16>;
 extern template class ColumnVector<Float32>;
 extern template class ColumnVector<Float64>;
 extern template class ColumnVector<UUID>;
--- a/src/Columns/ColumnsCommon.cpp
+++ b/src/Columns/ColumnsCommon.cpp
@ -328,6 +328,7 @@ INSTANTIATE(Int32)
 INSTANTIATE(Int64)
 INSTANTIATE(Int128)
 INSTANTIATE(Int256)
+INSTANTIATE(BFloat16)
 INSTANTIATE(Float32)
 INSTANTIATE(Float64)
 INSTANTIATE(Decimal32)
--- a/src/Columns/ColumnsNumber.h
+++ b/src/Columns/ColumnsNumber.h
@ -23,6 +23,7 @@ using ColumnInt64 = ColumnVector<Int64>;
 using ColumnInt128 = ColumnVector<Int128>;
 using ColumnInt256 = ColumnVector<Int256>;

+using ColumnBFloat16 = ColumnVector<BFloat16>;
 using ColumnFloat32 = ColumnVector<Float32>;
 using ColumnFloat64 = ColumnVector<Float64>;

--- a/src/Columns/IColumn.cpp
+++ b/src/Columns/IColumn.cpp
@ -443,6 +443,7 @@ template class IColumnHelper<ColumnVector<Int32>, ColumnFixedSizeHelper>;
 template class IColumnHelper<ColumnVector<Int64>, ColumnFixedSizeHelper>;
 template class IColumnHelper<ColumnVector<Int128>, ColumnFixedSizeHelper>;
 template class IColumnHelper<ColumnVector<Int256>, ColumnFixedSizeHelper>;
+template class IColumnHelper<ColumnVector<BFloat16>, ColumnFixedSizeHelper>;
 template class IColumnHelper<ColumnVector<Float32>, ColumnFixedSizeHelper>;
 template class IColumnHelper<ColumnVector<Float64>, ColumnFixedSizeHelper>;
 template class IColumnHelper<ColumnVector<UUID>, ColumnFixedSizeHelper>;
--- a/src/Columns/MaskOperations.cpp
+++ b/src/Columns/MaskOperations.cpp
@ -63,6 +63,7 @@ INSTANTIATE(Int32)
 INSTANTIATE(Int64)
 INSTANTIATE(Int128)
 INSTANTIATE(Int256)
+INSTANTIATE(BFloat16)
 INSTANTIATE(Float32)
 INSTANTIATE(Float64)
 INSTANTIATE(Decimal32)
@ -200,6 +201,7 @@ static MaskInfo extractMaskImpl(
          || extractMaskNumeric<inverted, Int16>(mask, column, null_value, null_bytemap, nulls, mask_info)
          || extractMaskNumeric<inverted, Int32>(mask, column, null_value, null_bytemap, nulls, mask_info)
          || extractMaskNumeric<inverted, Int64>(mask, column, null_value, null_bytemap, nulls, mask_info)
+          || extractMaskNumeric<inverted, BFloat16>(mask, column, null_value, null_bytemap, nulls, mask_info)
          || extractMaskNumeric<inverted, Float32>(mask, column, null_value, null_bytemap, nulls, mask_info)
          || extractMaskNumeric<inverted, Float64>(mask, column, null_value, null_bytemap, nulls, mask_info)))
        throw Exception(ErrorCodes::ILLEGAL_COLUMN, "Cannot convert column {} to mask.", column->getName());
--- a/src/Columns/tests/gtest_column_vector.cpp
+++ b/src/Columns/tests/gtest_column_vector.cpp
@ -93,6 +93,7 @@ TEST(ColumnVector, Filter)
    testFilter<Int64>();
    testFilter<UInt128>();
    testFilter<Int256>();
+    testFilter<BFloat16>();
    testFilter<Float32>();
    testFilter<Float64>();
    testFilter<UUID>();
--- a/src/Columns/tests/gtest_low_cardinality.cpp
+++ b/src/Columns/tests/gtest_low_cardinality.cpp
@ -45,6 +45,7 @@ TEST(ColumnLowCardinality, Insert)
    testLowCardinalityNumberInsert<Int128>(std::make_shared<DataTypeInt128>());
    testLowCardinalityNumberInsert<Int256>(std::make_shared<DataTypeInt256>());

+    testLowCardinalityNumberInsert<BFloat16>(std::make_shared<DataTypeBFloat16>());
    testLowCardinalityNumberInsert<Float32>(std::make_shared<DataTypeFloat32>());
    testLowCardinalityNumberInsert<Float64>(std::make_shared<DataTypeFloat64>());
 }
--- a/src/Common/CPUID.h
+++ b/src/Common/CPUID.h
@ -266,6 +266,11 @@ inline bool haveAVX512VBMI2() noexcept
    return haveAVX512F() && ((CPUInfo(0x7, 0).registers.ecx >> 6) & 1u);
 }

+inline bool haveAVX512BF16() noexcept
+{
+    return haveAVX512F() && ((CPUInfo(0x7, 1).registers.eax >> 5) & 1u);
+}
+
 inline bool haveRDRAND() noexcept
 {
    return CPUInfo(0x0).registers.eax >= 0x7 && ((CPUInfo(0x1).registers.ecx >> 30) & 1u);
@ -326,6 +331,7 @@ inline bool haveAMXINT8() noexcept
    OP(AVX512VL)             \
    OP(AVX512VBMI)           \
    OP(AVX512VBMI2)          \
+    OP(AVX512BF16)           \
    OP(PREFETCHWT1)          \
    OP(SHA)                  \
    OP(ADX)                  \
--- a/src/Common/CurrentMetrics.cpp
+++ b/src/Common/CurrentMetrics.cpp
@ -49,6 +49,7 @@
    M(TemporaryFilesForSort, "Number of temporary files created for external sorting") \
    M(TemporaryFilesForAggregation, "Number of temporary files created for external aggregation") \
    M(TemporaryFilesForJoin, "Number of temporary files created for JOIN") \
+    M(TemporaryFilesForMerge, "Number of temporary files for vertical merge") \
    M(TemporaryFilesUnknown, "Number of temporary files created without known purpose") \
    M(Read, "Number of read (read, pread, io_getevents, etc.) syscalls in fly") \
    M(RemoteRead, "Number of read with remote reader in fly") \
@ -255,6 +256,7 @@
    M(PartsActive, "Active data part, used by current and upcoming SELECTs.") \
    M(AttachedDatabase, "Active databases.") \
    M(AttachedTable, "Active tables.") \
+    M(AttachedReplicatedTable, "Active replicated tables.") \
    M(AttachedView, "Active views.") \
    M(AttachedDictionary, "Active dictionaries.") \
    M(PartsOutdated, "Not active data part, but could be used by only current SELECTs, could be deleted after SELECTs finishes.") \
--- a/src/Common/FailPoint.cpp
+++ b/src/Common/FailPoint.cpp
@ -86,6 +86,7 @@ APPLY_FOR_FAILPOINTS(M, M, M, M)

 std::unordered_map<String, std::shared_ptr<FailPointChannel>> FailPointInjection::fail_point_wait_channels;
 std::mutex FailPointInjection::mu;
+
 class FailPointChannel : private boost::noncopyable
 {
 public:
--- a/src/Common/FailPoint.h
+++ b/src/Common/FailPoint.h
@ -15,6 +15,7 @@

 #include <unordered_map>

+
 namespace DB
 {

@ -27,6 +28,7 @@ namespace DB
 /// 3. in test file, we can use system failpoint enable/disable 'failpoint_name'

 class FailPointChannel;
+
 class FailPointInjection
 {
 public:
--- a/src/Common/FieldVisitorConvertToNumber.cpp
+++ b/src/Common/FieldVisitorConvertToNumber.cpp
@ -1,5 +1,4 @@
 #include <Common/FieldVisitorConvertToNumber.h>
-#include "base/Decimal.h"

 namespace DB
 {
@ -17,6 +16,7 @@ template class FieldVisitorConvertToNumber<Int128>;
 template class FieldVisitorConvertToNumber<UInt128>;
 template class FieldVisitorConvertToNumber<Int256>;
 template class FieldVisitorConvertToNumber<UInt256>;
+//template class FieldVisitorConvertToNumber<BFloat16>;
 template class FieldVisitorConvertToNumber<Float32>;
 template class FieldVisitorConvertToNumber<Float64>;

--- a/src/Common/FieldVisitorConvertToNumber.h
+++ b/src/Common/FieldVisitorConvertToNumber.h
@ -58,7 +58,7 @@ public:

    T operator() (const Float64 & x) const
    {
-        if constexpr (!std::is_floating_point_v<T>)
+        if constexpr (!is_floating_point<T>)
        {
            if (!isFinite(x))
            {
@ -88,7 +88,7 @@ public:
    template <typename U>
    T operator() (const DecimalField<U> & x) const
    {
-        if constexpr (std::is_floating_point_v<T>)
+        if constexpr (is_floating_point<T>)
            return x.getValue().template convertTo<T>() / x.getScaleMultiplier().template convertTo<T>();
        else
            return (x.getValue() / x.getScaleMultiplier()).template convertTo<T>();
@ -129,6 +129,7 @@ extern template class FieldVisitorConvertToNumber<Int128>;
 extern template class FieldVisitorConvertToNumber<UInt128>;
 extern template class FieldVisitorConvertToNumber<Int256>;
 extern template class FieldVisitorConvertToNumber<UInt256>;
+//extern template class FieldVisitorConvertToNumber<BFloat16>;
 extern template class FieldVisitorConvertToNumber<Float32>;
 extern template class FieldVisitorConvertToNumber<Float64>;

--- a/src/Common/HashTable/Hash.h
+++ b/src/Common/HashTable/Hash.h
@ -322,6 +322,7 @@ DEFINE_HASH(Int32)
 DEFINE_HASH(Int64)
 DEFINE_HASH(Int128)
 DEFINE_HASH(Int256)
+DEFINE_HASH(BFloat16)
 DEFINE_HASH(Float32)
 DEFINE_HASH(Float64)
 DEFINE_HASH(DB::UUID)
--- a/src/Common/HashTable/HashTable.h
+++ b/src/Common/HashTable/HashTable.h
@ -76,7 +76,7 @@ struct HashTableNoState
 template <typename T>
 inline bool bitEquals(T a, T b)
 {
-    if constexpr (std::is_floating_point_v<T>)
+    if constexpr (is_floating_point<T>)
        /// Note that memcmp with constant size is a compiler builtin.
        return 0 == memcmp(&a, &b, sizeof(T)); /// NOLINT
    else
--- a/src/Common/HostResolvePool.cpp
+++ b/src/Common/HostResolvePool.cpp
@ -9,6 +9,7 @@

 #include <mutex>
 #include <algorithm>
+#include <Poco/Timespan.h>


 namespace ProfileEvents
@ -49,16 +50,18 @@ HostResolver::WeakPtr HostResolver::getWeakFromThis()
 }

 HostResolver::HostResolver(String host_, Poco::Timespan history_)
-    : host(std::move(host_))
-    , history(history_)
-    , resolve_function([](const String & host_to_resolve) { return DNSResolver::instance().resolveHostAllInOriginOrder(host_to_resolve); })
-{
-    update();
-}
+    : HostResolver(
+        [](const String & host_to_resolve) { return DNSResolver::instance().resolveHostAllInOriginOrder(host_to_resolve); },
+        host_,
+        history_)
+{}

 HostResolver::HostResolver(
    ResolveFunction && resolve_function_, String host_, Poco::Timespan history_)
-    : host(std::move(host_)), history(history_), resolve_function(std::move(resolve_function_))
+    : host(std::move(host_))
+    , history(history_)
+    , resolve_interval(history_.totalMicroseconds() / 3)
+    , resolve_function(std::move(resolve_function_))
 {
    update();
 }
@ -203,7 +206,7 @@ bool HostResolver::isUpdateNeeded()
    Poco::Timestamp now;

    std::lock_guard lock(mutex);
-    return last_resolve_time + history < now || records.empty();
+    return last_resolve_time + resolve_interval < now || records.empty();
 }

 void HostResolver::updateImpl(Poco::Timestamp now, std::vector<Poco::Net::IPAddress> & next_gen)
--- a/src/Common/HostResolvePool.h
+++ b/src/Common/HostResolvePool.h
@ -26,7 +26,7 @@
 //    a) it still occurs in resolve set after `history_` time or b) all other addresses are pessimized as well.
 // - resolve schedule
 //    Addresses are resolved through `DB::DNSResolver::instance()`.
-//    Usually it does not happen more often than once in `history_` time.
+//    Usually it does not happen more often than 3 times in `history_` period.
 //    But a new resolve is also performed on each `setFail()` call.

 namespace DB
@ -212,6 +212,7 @@ protected:

    const String host;
    const Poco::Timespan history;
+    const Poco::Timespan resolve_interval;
    const HostResolverMetrics metrics = getMetrics();

    // for tests purpose
@ -245,4 +246,3 @@ private:
 };

 }
-
--- a/src/Common/NaNUtils.h
+++ b/src/Common/NaNUtils.h
@ -3,24 +3,24 @@
 #include <cmath>
 #include <limits>
 #include <type_traits>
+#include <base/DecomposedFloat.h>


 template <typename T>
 inline bool isNaN(T x)
 {
    /// To be sure that this function is zero-cost for non-floating-point types.
-    if constexpr (std::is_floating_point_v<T>)
-        return std::isnan(x);
+    if constexpr (is_floating_point<T>)
+        return DecomposedFloat(x).isNaN();
    else
        return false;
 }

-
 template <typename T>
 inline bool isFinite(T x)
 {
-    if constexpr (std::is_floating_point_v<T>)
-        return std::isfinite(x);
+    if constexpr (is_floating_point<T>)
+        return DecomposedFloat(x).isFinite();
    else
        return true;
 }
@ -28,7 +28,7 @@ inline bool isFinite(T x)
 template <typename T>
 bool canConvertTo(Float64 x)
 {
-    if constexpr (std::is_floating_point_v<T>)
+    if constexpr (is_floating_point<T>)
        return true;
    if (!isFinite(x))
        return false;
@ -46,3 +46,12 @@ T NaNOrZero()
    else
        return {};
 }
+
+template <typename T>
+bool signBit(T x)
+{
+    if constexpr (is_floating_point<T>)
+        return DecomposedFloat(x).isNegative();
+    else
+        return x < 0;
+}
--- a/src/Common/ProgressIndication.cpp
+++ b/src/Common/ProgressIndication.cpp
@ -2,6 +2,7 @@
 #include <algorithm>
 #include <cstddef>
 #include <iostream>
+#include <mutex>
 #include <numeric>
 #include <filesystem>
 #include <cmath>
@ -49,12 +50,13 @@ void ProgressIndication::resetProgress()
    }
 }

-void ProgressIndication::setFileProgressCallback(ContextMutablePtr context, WriteBufferFromFileDescriptor & message)
+void ProgressIndication::setFileProgressCallback(ContextMutablePtr context, WriteBufferFromFileDescriptor & message, std::mutex & message_mutex)
 {
    context->setFileProgressCallback([&](const FileProgress & file_progress)
    {
        progress.incrementPiecewiseAtomically(Progress(file_progress));
-        writeProgress(message);
+        std::unique_lock message_lock(message_mutex);
+        writeProgress(message, message_lock);
    });
 }

@ -113,7 +115,7 @@ void ProgressIndication::writeFinalProgress()
        output_stream << "\nPeak memory usage: " << formatReadableSizeWithBinarySuffix(peak_memory_usage) << ".";
 }

-void ProgressIndication::writeProgress(WriteBufferFromFileDescriptor & message)
+void ProgressIndication::writeProgress(WriteBufferFromFileDescriptor & message, std::unique_lock<std::mutex> &)
 {
    std::lock_guard lock(progress_mutex);

@ -274,7 +276,7 @@ void ProgressIndication::writeProgress(WriteBufferFromFileDescriptor & message)
    message.next();
 }

-void ProgressIndication::clearProgressOutput(WriteBufferFromFileDescriptor & message)
+void ProgressIndication::clearProgressOutput(WriteBufferFromFileDescriptor & message, std::unique_lock<std::mutex> &)
 {
    std::lock_guard lock(progress_mutex);

--- a/src/Common/ProgressIndication.h
+++ b/src/Common/ProgressIndication.h
@ -8,6 +8,7 @@

 #include <iostream>
 #include <mutex>
+#include <queue>
 #include <unordered_map>
 #include <unordered_set>

@ -47,8 +48,8 @@ public:
    }

    /// Write progress bar.
-    void writeProgress(WriteBufferFromFileDescriptor & message);
-    void clearProgressOutput(WriteBufferFromFileDescriptor & message);
+    void writeProgress(WriteBufferFromFileDescriptor & message, std::unique_lock<std::mutex> & message_lock);
+    void clearProgressOutput(WriteBufferFromFileDescriptor & message, std::unique_lock<std::mutex> & message_lock);

    /// Write summary.
    void writeFinalProgress();
@ -67,7 +68,7 @@ public:
    /// In some cases the progress value needs to be updated when there is no access to the progress_indication object.
    /// In this case it is added via context.
    /// `write_progress_on_update` is needed to write progress for loading files data via pipe in non-interactive mode.
-    void setFileProgressCallback(ContextMutablePtr context, WriteBufferFromFileDescriptor & message);
+    void setFileProgressCallback(ContextMutablePtr context, WriteBufferFromFileDescriptor & message, std::mutex & message_mutex);

    /// How many seconds have passed since query execution started.
    double elapsedSeconds() const { return getElapsedNanoseconds() / 1e9; }
--- a/src/Common/TargetSpecific.cpp
+++ b/src/Common/TargetSpecific.cpp
@ -23,6 +23,8 @@ UInt32 getSupportedArchs()
        result |= static_cast<UInt32>(TargetArch::AVX512VBMI);
    if (CPU::CPUFlagsCache::have_AVX512VBMI2)
        result |= static_cast<UInt32>(TargetArch::AVX512VBMI2);
+    if (CPU::CPUFlagsCache::have_AVX512BF16)
+        result |= static_cast<UInt32>(TargetArch::AVX512BF16);
    if (CPU::CPUFlagsCache::have_AMXBF16)
        result |= static_cast<UInt32>(TargetArch::AMXBF16);
    if (CPU::CPUFlagsCache::have_AMXTILE)
@ -50,6 +52,7 @@ String toString(TargetArch arch)
        case TargetArch::AVX512BW:    return "avx512bw";
        case TargetArch::AVX512VBMI:  return "avx512vbmi";
        case TargetArch::AVX512VBMI2: return "avx512vbmi2";
+        case TargetArch::AVX512BF16:  return "avx512bf16";
        case TargetArch::AMXBF16: return "amxbf16";
        case TargetArch::AMXTILE: return "amxtile";
        case TargetArch::AMXINT8: return "amxint8";
--- a/src/Common/TargetSpecific.h
+++ b/src/Common/TargetSpecific.h
@ -83,9 +83,10 @@ enum class TargetArch : UInt32
    AVX512BW    = (1 << 4),
    AVX512VBMI  = (1 << 5),
    AVX512VBMI2 = (1 << 6),
-    AMXBF16 = (1 << 7),
-    AMXTILE = (1 << 8),
-    AMXINT8 = (1 << 9),
+    AVX512BF16 = (1 << 7),
+    AMXBF16 = (1 << 8),
+    AMXTILE = (1 << 9),
+    AMXINT8 = (1 << 10),
 };

 /// Runtime detection.
@ -102,6 +103,7 @@ String toString(TargetArch arch);
 /// NOLINTNEXTLINE
 #define USE_MULTITARGET_CODE 1

+#define AVX512BF16_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,avx512bf16")))
 #define AVX512VBMI2_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2")))
 #define AVX512VBMI_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi")))
 #define AVX512BW_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw")))
@ -111,6 +113,8 @@ String toString(TargetArch arch);
 #define SSE42_FUNCTION_SPECIFIC_ATTRIBUTE __attribute__((target("sse,sse2,sse3,ssse3,sse4,popcnt")))
 #define DEFAULT_FUNCTION_SPECIFIC_ATTRIBUTE

+#   define BEGIN_AVX512BF16_SPECIFIC_CODE \
+        _Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2,avx512bf16\"))),apply_to=function)")
 #   define BEGIN_AVX512VBMI2_SPECIFIC_CODE \
        _Pragma("clang attribute push(__attribute__((target(\"sse,sse2,sse3,ssse3,sse4,popcnt,avx,avx2,avx512f,avx512bw,avx512vl,avx512vbmi,avx512vbmi2\"))),apply_to=function)")
 #   define BEGIN_AVX512VBMI_SPECIFIC_CODE \
@ -197,6 +201,14 @@ namespace TargetSpecific::AVX512VBMI2 { \
 } \
 END_TARGET_SPECIFIC_CODE

+#define DECLARE_AVX512BF16_SPECIFIC_CODE(...) \
+BEGIN_AVX512BF16_SPECIFIC_CODE \
+namespace TargetSpecific::AVX512BF16 { \
+    DUMMY_FUNCTION_DEFINITION \
+    using namespace DB::TargetSpecific::AVX512BF16; \
+    __VA_ARGS__ \
+} \
+END_TARGET_SPECIFIC_CODE

 #else

@ -211,6 +223,7 @@ END_TARGET_SPECIFIC_CODE
 #define DECLARE_AVX512BW_SPECIFIC_CODE(...)
 #define DECLARE_AVX512VBMI_SPECIFIC_CODE(...)
 #define DECLARE_AVX512VBMI2_SPECIFIC_CODE(...)
+#define DECLARE_AVX512BF16_SPECIFIC_CODE(...)

 #endif

@ -229,7 +242,8 @@ DECLARE_AVX2_SPECIFIC_CODE   (__VA_ARGS__) \
 DECLARE_AVX512F_SPECIFIC_CODE(__VA_ARGS__) \
 DECLARE_AVX512BW_SPECIFIC_CODE    (__VA_ARGS__) \
 DECLARE_AVX512VBMI_SPECIFIC_CODE  (__VA_ARGS__) \
-DECLARE_AVX512VBMI2_SPECIFIC_CODE (__VA_ARGS__)
+DECLARE_AVX512VBMI2_SPECIFIC_CODE (__VA_ARGS__) \
+DECLARE_AVX512BF16_SPECIFIC_CODE (__VA_ARGS__)

 DECLARE_DEFAULT_CODE(
    constexpr auto BuildArch = TargetArch::Default; /// NOLINT
@ -263,6 +277,10 @@ DECLARE_AVX512VBMI2_SPECIFIC_CODE(
    constexpr auto BuildArch = TargetArch::AVX512VBMI2; /// NOLINT
 ) // DECLARE_AVX512VBMI2_SPECIFIC_CODE

+DECLARE_AVX512BF16_SPECIFIC_CODE(
+    constexpr auto BuildArch = TargetArch::AVX512BF16; /// NOLINT
+) // DECLARE_AVX512BF16_SPECIFIC_CODE
+
 /** Runtime Dispatch helpers for class members.
  *
  * Example of usage:
--- a/src/Common/ThreadStatus.cpp
+++ b/src/Common/ThreadStatus.cpp
@ -204,6 +204,16 @@ bool ThreadStatus::isQueryCanceled() const
    return false;
 }

+size_t ThreadStatus::getNextPlanStepIndex() const
+{
+    return local_data.plan_step_index->fetch_add(1);
+}
+
+size_t ThreadStatus::getNextPipelineProcessorIndex() const
+{
+    return local_data.pipeline_processor_index->fetch_add(1);
+}
+
 ThreadStatus::~ThreadStatus()
 {
    flushUntrackedMemory();
--- a/src/Common/ThreadStatus.h
+++ b/src/Common/ThreadStatus.h
@ -11,6 +11,7 @@

 #include <boost/noncopyable.hpp>

+#include <atomic>
 #include <functional>
 #include <memory>
 #include <mutex>
@ -90,6 +91,11 @@ public:
        String query_for_logs;
        UInt64 normalized_query_hash = 0;

+        // Since processors might be added on the fly within expand() function we use atomic_size_t.
+        // These two fields are used for EXPLAIN PLAN / PIPELINE.
+        std::shared_ptr<std::atomic_size_t> plan_step_index = std::make_shared<std::atomic_size_t>(0);
+        std::shared_ptr<std::atomic_size_t> pipeline_processor_index = std::make_shared<std::atomic_size_t>(0);
+
        QueryIsCanceledPredicate query_is_canceled_predicate = {};
    };

@ -313,6 +319,9 @@ public:

    void initGlobalProfiler(UInt64 global_profiler_real_time_period, UInt64 global_profiler_cpu_time_period);

+    size_t getNextPlanStepIndex() const;
+    size_t getNextPipelineProcessorIndex() const;
+
 private:
    void applyGlobalSettings();
    void applyQuerySettings();
--- a/src/Common/findExtreme.cpp
+++ b/src/Common/findExtreme.cpp
@ -47,7 +47,7 @@ MULTITARGET_FUNCTION_AVX2_SSE42(

        /// Unroll the loop manually for floating point, since the compiler doesn't do it without fastmath
        /// as it might change the return value
-        if constexpr (std::is_floating_point_v<T>)
+        if constexpr (is_floating_point<T>)
        {
            constexpr size_t unroll_block = 512 / sizeof(T); /// Chosen via benchmarks with AVX2 so YMMV
            size_t unrolled_end = i + (((count - i) / unroll_block) * unroll_block);
--- a/src/Common/transformEndianness.h
+++ b/src/Common/transformEndianness.h
@ -38,7 +38,7 @@ inline void transformEndianness(T & x)
 }

 template <std::endian ToEndian, std::endian FromEndian = std::endian::native, typename T>
-requires std::is_floating_point_v<T>
+requires is_floating_point<T>
 inline void transformEndianness(T & value)
 {
    if constexpr (ToEndian != FromEndian)
--- a/src/Compression/CompressionCodecNone.h
+++ b/src/Compression/CompressionCodecNone.h
@ -3,7 +3,7 @@
 #include <IO/WriteBuffer.h>
 #include <Compression/ICompressionCodec.h>
 #include <IO/BufferWithOwnMemory.h>
-#include <Parsers/StringRange.h>
+

 namespace DB
 {
--- a/src/Compression/tests/gtest_compressionCodec.cpp
+++ b/src/Compression/tests/gtest_compressionCodec.cpp
@ -7,7 +7,6 @@
 #include <Parsers/ExpressionElementParsers.h>
 #include <Parsers/IParser.h>
 #include <Parsers/TokenIterator.h>
-#include <base/types.h>
 #include <Common/PODArray.h>
 #include <Common/Stopwatch.h>

--- a/src/Core/AccurateComparison.h
+++ b/src/Core/AccurateComparison.h
@ -25,7 +25,7 @@ bool lessOp(A a, B b)
        return a < b;

    /// float vs float
-    if constexpr (std::is_floating_point_v<A> && std::is_floating_point_v<B>)
+    if constexpr (is_floating_point<A> && is_floating_point<B>)
        return a < b;

    /// anything vs NaN
@ -49,7 +49,7 @@ bool lessOp(A a, B b)
    }

    /// int vs float
-    if constexpr (is_integer<A> && std::is_floating_point_v<B>)
+    if constexpr (is_integer<A> && is_floating_point<B>)
    {
        if constexpr (sizeof(A) <= 4)
            return static_cast<double>(a) < static_cast<double>(b);
@ -57,7 +57,7 @@ bool lessOp(A a, B b)
        return DecomposedFloat<B>(b).greater(a);
    }

-    if constexpr (std::is_floating_point_v<A> && is_integer<B>)
+    if constexpr (is_floating_point<A> && is_integer<B>)
    {
        if constexpr (sizeof(B) <= 4)
            return static_cast<double>(a) < static_cast<double>(b);
@ -65,8 +65,8 @@ bool lessOp(A a, B b)
        return DecomposedFloat<A>(a).less(b);
    }

-    static_assert(is_integer<A> || std::is_floating_point_v<A>);
-    static_assert(is_integer<B> || std::is_floating_point_v<B>);
+    static_assert(is_integer<A> || is_floating_point<A>);
+    static_assert(is_integer<B> || is_floating_point<B>);
    UNREACHABLE();
 }

@ -101,7 +101,7 @@ bool equalsOp(A a, B b)
        return a == b;

    /// float vs float
-    if constexpr (std::is_floating_point_v<A> && std::is_floating_point_v<B>)
+    if constexpr (is_floating_point<A> && is_floating_point<B>)
        return a == b;

    /// anything vs NaN
@ -125,7 +125,7 @@ bool equalsOp(A a, B b)
    }

    /// int vs float
-    if constexpr (is_integer<A> && std::is_floating_point_v<B>)
+    if constexpr (is_integer<A> && is_floating_point<B>)
    {
        if constexpr (sizeof(A) <= 4)
            return static_cast<double>(a) == static_cast<double>(b);
@ -133,7 +133,7 @@ bool equalsOp(A a, B b)
        return DecomposedFloat<B>(b).equals(a);
    }

-    if constexpr (std::is_floating_point_v<A> && is_integer<B>)
+    if constexpr (is_floating_point<A> && is_integer<B>)
    {
        if constexpr (sizeof(B) <= 4)
            return static_cast<double>(a) == static_cast<double>(b);
@ -163,7 +163,7 @@ inline bool NO_SANITIZE_UNDEFINED convertNumeric(From value, To & result)
        return true;
    }

-    if constexpr (std::is_floating_point_v<From> && std::is_floating_point_v<To>)
+    if constexpr (is_floating_point<From> && is_floating_point<To>)
    {
        /// Note that NaNs don't compare equal to anything, but they are still in range of any Float type.
        if (isNaN(value))
--- a/src/Core/DecimalFunctions.h
+++ b/src/Core/DecimalFunctions.h
@ -17,6 +17,7 @@ class DataTypeNumber;

 namespace ErrorCodes
 {
+    extern const int NOT_IMPLEMENTED;
    extern const int DECIMAL_OVERFLOW;
    extern const int ARGUMENT_OUT_OF_BOUND;
 }
@ -310,7 +311,14 @@ ReturnType convertToImpl(const DecimalType & decimal, UInt32 scale, To & result)
    using DecimalNativeType = typename DecimalType::NativeType;
    static constexpr bool throw_exception = std::is_void_v<ReturnType>;

-    if constexpr (std::is_floating_point_v<To>)
+    if constexpr (std::is_same_v<To, BFloat16>)
+    {
+        if constexpr (throw_exception)
+            throw Exception(ErrorCodes::NOT_IMPLEMENTED, "Conversion from Decimal to BFloat16 is not implemented");
+        else
+            return ReturnType(false);
+    }
+    else if constexpr (is_floating_point<To>)
    {
        result = static_cast<To>(decimal.value) / static_cast<To>(scaleMultiplier<DecimalNativeType>(scale));
    }
--- a/src/Core/Field.h
+++ b/src/Core/Field.h
@ -257,6 +257,7 @@ template <> struct NearestFieldTypeImpl<DecimalField<Decimal64>> { using Type =
 template <> struct NearestFieldTypeImpl<DecimalField<Decimal128>> { using Type = DecimalField<Decimal128>; };
 template <> struct NearestFieldTypeImpl<DecimalField<Decimal256>> { using Type = DecimalField<Decimal256>; };
 template <> struct NearestFieldTypeImpl<DecimalField<DateTime64>> { using Type = DecimalField<DateTime64>; };
+template <> struct NearestFieldTypeImpl<BFloat16> { using Type = Float64; };
 template <> struct NearestFieldTypeImpl<Float32> { using Type = Float64; };
 template <> struct NearestFieldTypeImpl<Float64> { using Type = Float64; };
 template <> struct NearestFieldTypeImpl<const char *> { using Type = String; };
--- a/src/Core/ServerSettings.cpp
+++ b/src/Core/ServerSettings.cpp
@ -131,6 +131,9 @@ namespace DB
    DECLARE(UInt64, max_database_num_to_warn, 1000lu, "If the number of databases is greater than this value, the server will create a warning that will displayed to user.", 0) \
    DECLARE(UInt64, max_part_num_to_warn, 100000lu, "If the number of parts is greater than this value, the server will create a warning that will displayed to user.", 0) \
    DECLARE(UInt64, max_table_num_to_throw, 0lu, "If number of tables is greater than this value, server will throw an exception. 0 means no limitation. View, remote tables, dictionary, system tables are not counted. Only count table in Atomic/Ordinary/Replicated/Lazy database engine.", 0) \
+    DECLARE(UInt64, max_replicated_table_num_to_throw, 0lu, "If number of replicated tables is greater than this value, server will throw an exception. 0 means no limitation. Only count table in Atomic/Ordinary/Replicated/Lazy database engine.", 0) \
+    DECLARE(UInt64, max_dictionary_num_to_throw, 0lu, "If number of dictionaries is greater than this value, server will throw an exception. 0 means no limitation. Only count table in Atomic/Ordinary/Replicated/Lazy database engine.", 0) \
+    DECLARE(UInt64, max_view_num_to_throw, 0lu, "If number of views is greater than this value, server will throw an exception. 0 means no limitation. Only count table in Atomic/Ordinary/Replicated/Lazy database engine.", 0) \
    DECLARE(UInt64, max_database_num_to_throw, 0lu, "If number of databases is greater than this value, server will throw an exception. 0 means no limitation.", 0) \
    DECLARE(UInt64, max_authentication_methods_per_user, 100, "The maximum number of authentication methods a user can be created with or altered. Changing this setting does not affect existing users. Zero means unlimited", 0) \
    DECLARE(UInt64, concurrent_threads_soft_limit_num, 0, "Sets how many concurrent thread can be allocated before applying CPU pressure. Zero means unlimited.", 0) \
--- a/src/Core/Settings.cpp
+++ b/src/Core/Settings.cpp
@ -1794,7 +1794,7 @@ Possible values:

 - 0 — Disabled.
 - 1 — Enabled.
-)", 0) \
+)", 1) \
    DECLARE(Int64, http_zlib_compression_level, 3, R"(
 Sets the level of data compression in the response to an HTTP request if [enable_http_compression = 1](#enable_http_compression).

@ -3669,6 +3669,11 @@ Given that, for example, dictionaries, can be out of sync across nodes, mutation

 </profiles>
 ```
+)", 0) \
+ DECLARE(Bool, validate_mutation_query, true, R"(
+Validate mutation queries before accepting them. Mutations are executed in the background, and running an invalid query will cause mutations to get stuck, requiring manual intervention.
+
+Only change this setting if you encounter a backward-incompatible bug.
 )", 0) \
    DECLARE(Seconds, lock_acquire_timeout, DBMS_DEFAULT_LOCK_ACQUIRE_TIMEOUT_SEC, R"(
 Defines how many seconds a locking request waits before failing.
@ -5737,7 +5742,10 @@ Enable experimental functions for natural language processing.
 Enable experimental hash functions
 )", EXPERIMENTAL) \
    DECLARE(Bool, allow_experimental_object_type, false, R"(
-Allow Object and JSON data types
+Allow the obsolete Object data type
+)", EXPERIMENTAL) \
+    DECLARE(Bool, allow_experimental_bfloat16_type, false, R"(
+Allow BFloat16 data type (under development).
 )", EXPERIMENTAL) \
    DECLARE(Bool, allow_experimental_time_series_table, false, R"(
 Allows creation of tables with the [TimeSeries](../../engines/table-engines/integrations/time-series.md) table engine.
--- a/src/Core/SettingsChangesHistory.cpp
+++ b/src/Core/SettingsChangesHistory.cpp
@ -64,6 +64,8 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
    },
    {"24.11",
        {
+            {"enable_http_compression", false, true, "Improvement for read-only clients since they can't change settings"},
+            {"validate_mutation_query", false, true, "New setting to validate mutation queries by default."},
            {"enable_job_stack_trace", false, true, "Enable by default collecting stack traces from job's scheduling."},
            {"allow_suspicious_types_in_group_by", true, false, "Don't allow Variant/Dynamic types in GROUP BY by default"},
            {"allow_suspicious_types_in_order_by", true, false, "Don't allow Variant/Dynamic types in ORDER BY by default"},
@ -78,6 +80,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
            {"backup_restore_finish_timeout_after_error_sec", 0, 180, "New setting."},
            {"query_plan_merge_filters", false, true, "Allow to merge filters in the query plan. This is required to properly support filter-push-down with a new analyzer."},
            {"parallel_replicas_local_plan", false, true, "Use local plan for local replica in a query with parallel replicas"},
+            {"allow_experimental_bfloat16_type", false, false, "Add new experimental BFloat16 type"},
            {"filesystem_cache_skip_download_if_exceeds_per_query_cache_write_limit", 1, 1, "Rename of setting skip_download_if_exceeds_query_cache_limit"},
            {"filesystem_cache_prefer_bigger_buffer_size", true, true, "New setting"},
            {"read_in_order_use_virtual_row", false, false, "Use virtual row while reading in order of primary key or its monotonic function fashion. It is useful when searching over multiple parts as only relevant ones are touched."},
@ -126,7 +129,7 @@ static std::initializer_list<std::pair<ClickHouseVersion, SettingsChangesHistory
            {"allow_experimental_refreshable_materialized_view", false, true, "Not experimental anymore"},
            {"max_parts_to_move", 0, 1000, "New setting"},
            {"hnsw_candidate_list_size_for_search", 64, 256, "New setting. Previously, the value was optionally specified in CREATE INDEX and 64 by default."},
-            {"allow_reorder_prewhere_conditions", false, true, "New setting"},
+            {"allow_reorder_prewhere_conditions", true, true, "New setting"},
            {"input_format_parquet_bloom_filter_push_down", false, true, "When reading Parquet files, skip whole row groups based on the WHERE/PREWHERE expressions and bloom filter in the Parquet metadata."},
            {"date_time_64_output_format_cut_trailing_zeros_align_to_groups_of_thousands", false, false, "Dynamically trim the trailing zeros of datetime64 values to adjust the output scale to (0, 3, 6), corresponding to 'seconds', 'milliseconds', and 'microseconds'."},
        }
--- a/src/Core/SortCursor.h
+++ b/src/Core/SortCursor.h
@ -726,6 +726,7 @@ private:
        SortingQueueImpl<SpecializedSingleColumnSortCursor<ColumnVector<Int128>>, strategy>,
        SortingQueueImpl<SpecializedSingleColumnSortCursor<ColumnVector<Int256>>, strategy>,

+        SortingQueueImpl<SpecializedSingleColumnSortCursor<ColumnVector<BFloat16>>, strategy>,
        SortingQueueImpl<SpecializedSingleColumnSortCursor<ColumnVector<Float32>>, strategy>,
        SortingQueueImpl<SpecializedSingleColumnSortCursor<ColumnVector<Float64>>, strategy>,

--- a/src/Core/TypeId.h
+++ b/src/Core/TypeId.h
@ -21,6 +21,7 @@ enum class TypeIndex : uint8_t
    Int64,
    Int128,
    Int256,
+    BFloat16,
    Float32,
    Float64,
    Date,
@ -94,6 +95,7 @@ TYPEID_MAP(Int32)
 TYPEID_MAP(Int64)
 TYPEID_MAP(Int128)
 TYPEID_MAP(Int256)
+TYPEID_MAP(BFloat16)
 TYPEID_MAP(Float32)
 TYPEID_MAP(Float64)
 TYPEID_MAP(UUID)
--- a/src/Core/Types_fwd.h
+++ b/src/Core/Types_fwd.h
@ -21,6 +21,7 @@ using Int128 = wide::integer<128, signed>;
 using UInt128 = wide::integer<128, unsigned>;
 using Int256 = wide::integer<256, signed>;
 using UInt256 = wide::integer<256, unsigned>;
+class BFloat16;

 namespace DB
 {
--- a/src/Core/callOnTypeIndex.h
+++ b/src/Core/callOnTypeIndex.h
@ -63,6 +63,7 @@ static bool callOnBasicType(TypeIndex number, F && f)
    {
        switch (number)
        {
+            case TypeIndex::BFloat16:     return f(TypePair<T, BFloat16>());
            case TypeIndex::Float32:      return f(TypePair<T, Float32>());
            case TypeIndex::Float64:      return f(TypePair<T, Float64>());
            default:
@ -133,6 +134,7 @@ static inline bool callOnBasicTypes(TypeIndex type_num1, TypeIndex type_num2, F
    {
        switch (type_num1)
        {
+            case TypeIndex::BFloat16: return callOnBasicType<BFloat16, _int, _float, _decimal, _datetime>(type_num2, std::forward<F>(f));
            case TypeIndex::Float32: return callOnBasicType<Float32, _int, _float, _decimal, _datetime>(type_num2, std::forward<F>(f));
            case TypeIndex::Float64: return callOnBasicType<Float64, _int, _float, _decimal, _datetime>(type_num2, std::forward<F>(f));
            default:
@ -190,6 +192,7 @@ static bool callOnIndexAndDataType(TypeIndex number, F && f, ExtraArgs && ... ar
        case TypeIndex::Int128:         return f(TypePair<DataTypeNumber<Int128>, T>(), std::forward<ExtraArgs>(args)...);
        case TypeIndex::Int256:         return f(TypePair<DataTypeNumber<Int256>, T>(), std::forward<ExtraArgs>(args)...);

+        case TypeIndex::BFloat16:        return f(TypePair<DataTypeNumber<BFloat16>, T>(), std::forward<ExtraArgs>(args)...);
        case TypeIndex::Float32:        return f(TypePair<DataTypeNumber<Float32>, T>(), std::forward<ExtraArgs>(args)...);
        case TypeIndex::Float64:        return f(TypePair<DataTypeNumber<Float64>, T>(), std::forward<ExtraArgs>(args)...);

--- a/src/DataTypes/DataTypeNumberBase.cpp
+++ b/src/DataTypes/DataTypeNumberBase.cpp
@ -42,6 +42,7 @@ template class DataTypeNumberBase<Int32>;
 template class DataTypeNumberBase<Int64>;
 template class DataTypeNumberBase<Int128>;
 template class DataTypeNumberBase<Int256>;
+template class DataTypeNumberBase<BFloat16>;
 template class DataTypeNumberBase<Float32>;
 template class DataTypeNumberBase<Float64>;

--- a/src/DataTypes/DataTypeNumberBase.h
+++ b/src/DataTypes/DataTypeNumberBase.h
@ -68,6 +68,7 @@ extern template class DataTypeNumberBase<Int32>;
 extern template class DataTypeNumberBase<Int64>;
 extern template class DataTypeNumberBase<Int128>;
 extern template class DataTypeNumberBase<Int256>;
+extern template class DataTypeNumberBase<BFloat16>;
 extern template class DataTypeNumberBase<Float32>;
 extern template class DataTypeNumberBase<Float64>;

--- a/Show More
+++ b/Show More