From 8eb81752583e50e27bfdf66b2ab258f17b388d67 Mon Sep 17 00:00:00 2001
From: Amos Bird
Date: Mon, 28 Sep 2020 17:07:57 +0800
Subject: [PATCH 01/72] Fix multiple column transformers.

---
 .../TranslateQualifiedNamesVisitor.cpp        | 22 +++++++++++-------
 src/Parsers/ASTColumnsMatcher.cpp             |  3 +++
 .../01470_columns_transformers.reference      | 10 +++++++++
 .../01470_columns_transformers.sql            |  4 ++++
 4 files changed, 31 insertions(+), 8 deletions(-)

diff --git a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
index 74622c72865..4683a4e3cd7 100644
--- a/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
+++ b/src/Interpreters/TranslateQualifiedNamesVisitor.cpp
@@ -228,6 +228,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt

     for (const auto & child : old_children)
     {
+        ASTs columns;
         if (const auto * asterisk = child->as<ASTAsterisk>())
         {
             bool first_table = true;
@@ -237,7 +238,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
                 {
                     if (first_table || !data.join_using_columns.count(column.name))
                     {
-                        addIdentifier(node.children, table.table, column.name, AsteriskSemantic::getAliases(*asterisk));
+                        addIdentifier(columns, table.table, column.name, AsteriskSemantic::getAliases(*asterisk));
                     }
                 }

@@ -245,7 +246,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
             }
             for (const auto & transformer : asterisk->children)
             {
-                IASTColumnsTransformer::transform(transformer, node.children);
+                IASTColumnsTransformer::transform(transformer, columns);
             }
         }
         else if (const auto * asterisk_pattern = child->as<ASTColumnsMatcher>())
         {
@@ -253,7 +254,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
             if (asterisk_pattern->column_list)
             {
                 for (const auto & ident : asterisk_pattern->column_list->children)
-                    node.children.emplace_back(ident->clone());
+                    columns.emplace_back(ident->clone());
             }
             else
             {
@@ -264,7 +265,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
                 {
                     if (asterisk_pattern->isColumnMatching(column.name) && (first_table || !data.join_using_columns.count(column.name)))
                     {
-                        addIdentifier(node.children, table.table, column.name, AsteriskSemantic::getAliases(*asterisk_pattern));
+                        addIdentifier(columns, table.table, column.name, AsteriskSemantic::getAliases(*asterisk_pattern));
                     }
                 }

@@ -274,7 +275,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
             // ColumnsMatcher's transformers start to appear at child 1
             for (auto it = asterisk_pattern->children.begin() + 1; it != asterisk_pattern->children.end(); ++it)
             {
-                IASTColumnsTransformer::transform(*it, node.children);
+                IASTColumnsTransformer::transform(*it, columns);
             }
         }
         else if (const auto * qualified_asterisk = child->as<ASTQualifiedAsterisk>())
         {
@@ -287,7 +288,7 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
             {
                 for (const auto & column : table.columns)
                 {
-                    addIdentifier(node.children, table.table, column.name, AsteriskSemantic::getAliases(*qualified_asterisk));
+                    addIdentifier(columns, table.table, column.name, AsteriskSemantic::getAliases(*qualified_asterisk));
                 }
                 break;
             }

@@ -295,11 +296,16 @@ void TranslateQualifiedNamesMatcher::visit(ASTExpressionList & node, const ASTPt
             // QualifiedAsterisk's transformers start to appear at child 1
             for (auto it = qualified_asterisk->children.begin() + 1; it != qualified_asterisk->children.end(); ++it)
             {
-                IASTColumnsTransformer::transform(*it, node.children);
+                IASTColumnsTransformer::transform(*it,
columns); } } else - node.children.emplace_back(child); + columns.emplace_back(child); + + node.children.insert( + node.children.end(), + std::make_move_iterator(columns.begin()), + std::make_move_iterator(columns.end())); } } diff --git a/src/Parsers/ASTColumnsMatcher.cpp b/src/Parsers/ASTColumnsMatcher.cpp index e9b2c4cc562..aab8e841981 100644 --- a/src/Parsers/ASTColumnsMatcher.cpp +++ b/src/Parsers/ASTColumnsMatcher.cpp @@ -32,7 +32,10 @@ void ASTColumnsMatcher::formatImpl(const FormatSettings & settings, FormatState { settings.ostr << (settings.hilite ? hilite_keyword : "") << "COLUMNS" << (settings.hilite ? hilite_none : "") << "("; if (column_list) + { + frame.expression_list_prepend_whitespace = false; column_list->formatImpl(settings, state, frame); + } else settings.ostr << quoteString(original_pattern); settings.ostr << ")"; diff --git a/tests/queries/0_stateless/01470_columns_transformers.reference b/tests/queries/0_stateless/01470_columns_transformers.reference index ba23352c420..2d8a1802289 100644 --- a/tests/queries/0_stateless/01470_columns_transformers.reference +++ b/tests/queries/0_stateless/01470_columns_transformers.reference @@ -67,3 +67,13 @@ SELECT sum(j), sum(k) FROM columns_transformers +100 10 100 10 324 10 +120 8 120 8 23 8 +SELECT + i, + j, + toFloat64(i), + toFloat64(j), + toFloat64(k), + j +FROM columns_transformers diff --git a/tests/queries/0_stateless/01470_columns_transformers.sql b/tests/queries/0_stateless/01470_columns_transformers.sql index 55335110c97..f95cee51fb0 100644 --- a/tests/queries/0_stateless/01470_columns_transformers.sql +++ b/tests/queries/0_stateless/01470_columns_transformers.sql @@ -37,4 +37,8 @@ EXPLAIN SYNTAX SELECT * REPLACE(i + 1 AS i) REPLACE(i + 1 AS i) from columns_tra SELECT COLUMNS(i, j, k) APPLY(sum) from columns_transformers; EXPLAIN SYNTAX SELECT COLUMNS(i, j, k) APPLY(sum) from columns_transformers; +-- Multiple column matchers and transformers +SELECT i, j, COLUMNS(i, j, k) APPLY(toFloat64), COLUMNS(i, j) EXCEPT (i) from columns_transformers; +EXPLAIN SYNTAX SELECT i, j, COLUMNS(i, j, k) APPLY(toFloat64), COLUMNS(i, j) EXCEPT (i) from columns_transformers; + DROP TABLE columns_transformers; From ea9045da8ddf9c4f4d2cad30688991745e3c4e7c Mon Sep 17 00:00:00 2001 From: Sergey Ryzhkov <53r93y@yandex-team.ru> Date: Fri, 2 Oct 2020 08:09:44 +0300 Subject: [PATCH 02/72] DOCSUP-2866: Document the allow_experimental_bigint_types setting --- docs/en/operations/settings/settings.md | 13 ++++++++++++- docs/en/sql-reference/data-types/decimal.md | 7 +++++-- docs/en/sql-reference/data-types/int-uint.md | 7 +++++-- .../functions/type-conversion-functions.md | 8 +++++--- 4 files changed, 27 insertions(+), 8 deletions(-) diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md index e97a418f1ed..8493bd7bf57 100644 --- a/docs/en/operations/settings/settings.md +++ b/docs/en/operations/settings/settings.md @@ -1565,7 +1565,7 @@ See also: ## allow\_introspection\_functions {#settings-allow_introspection_functions} -Enables of disables [introspections functions](../../sql-reference/functions/introspection.md) for query profiling. +Enables or disables [introspections functions](../../sql-reference/functions/introspection.md) for query profiling. 
Possible values:

@@ -2027,3 +2027,14 @@ Result:
 ```

 [Original article](https://clickhouse.tech/docs/en/operations/settings/settings/)
+
+## allow_experimental_bigint_types {#allow_experimental_bigint_types}
+
+Enables or disables the experimental extended-range types (`Int128`, `Int256`, `UInt256`, `Decimal256`), whose values exceed the range of the regular 64-bit data types.
+
+Possible values:
+
+- 1 — Big integer types are enabled.
+- 0 — Big integer types are disabled.
+
+Default value: `0`.
\ No newline at end of file
diff --git a/docs/en/sql-reference/data-types/decimal.md b/docs/en/sql-reference/data-types/decimal.md
index 7902394e3ee..bedd0f13744 100644
--- a/docs/en/sql-reference/data-types/decimal.md
+++ b/docs/en/sql-reference/data-types/decimal.md
@@ -3,25 +3,27 @@ toc_priority: 42
 toc_title: Decimal
 ---

-# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S) {#decimalp-s-decimal32s-decimal64s-decimal128s}
+# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S), Decimal256(S) {#decimalp-s-decimal32s-decimal64s-decimal128s-decimal256s}

 Signed fixed-point numbers that keep precision during add, subtract and multiply operations. For division least significant digits are discarded (not rounded).

 ## Parameters {#parameters}

-- P - precision. Valid range: \[ 1 : 38 \]. Determines how many decimal digits number can have (including fraction).
+- P - precision. Valid range: \[ 1 : 76 \]. Determines how many decimal digits number can have (including fraction).
 - S - scale. Valid range: \[ 0 : P \]. Determines how many decimal digits fraction can have.

 Depending on P parameter value Decimal(P, S) is a synonym for:
 - P from \[ 1 : 9 \] - for Decimal32(S)
 - P from \[ 10 : 18 \] - for Decimal64(S)
 - P from \[ 19 : 38 \] - for Decimal128(S)
+- P from \[ 39 : 76 \] - for Decimal256(S)

 ## Decimal Value Ranges {#decimal-value-ranges}

 - Decimal32(S) - ( -1 \* 10^(9 - S), 1 \* 10^(9 - S) )
 - Decimal64(S) - ( -1 \* 10^(18 - S), 1 \* 10^(18 - S) )
 - Decimal128(S) - ( -1 \* 10^(38 - S), 1 \* 10^(38 - S) )
+- Decimal256(S) - ( -1 \* 10^(76 - S), 1 \* 10^(76 - S) )

 For example, Decimal32(4) can contain numbers from -99999.9999 to 99999.9999 with 0.0001 step.

@@ -38,6 +40,7 @@ Binary operations on Decimal result in wider result type (with any order of argu
 - `Decimal64(S1) Decimal32(S2) -> Decimal64(S)`
 - `Decimal128(S1) Decimal32(S2) -> Decimal128(S)`
 - `Decimal128(S1) Decimal64(S2) -> Decimal128(S)`
+- `Decimal256(S1) Decimal128(S2) -> Decimal256(S)`

 Rules for scale:

diff --git a/docs/en/sql-reference/data-types/int-uint.md b/docs/en/sql-reference/data-types/int-uint.md
index 84ef120af7d..83a4bb6b677 100644
--- a/docs/en/sql-reference/data-types/int-uint.md
+++ b/docs/en/sql-reference/data-types/int-uint.md
@@ -1,9 +1,9 @@
 ---
 toc_priority: 40
-toc_title: UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64
+toc_title: UInt8, UInt16, UInt32, UInt64, UInt256, Int8, Int16, Int32, Int64, Int128, Int256
 ---

-# UInt8, UInt16, UInt32, UInt64, Int8, Int16, Int32, Int64 {#uint8-uint16-uint32-uint64-int8-int16-int32-int64}
+# UInt8, UInt16, UInt32, UInt64, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 {#uint8-uint16-uint32-uint64-uint256-int8-int16-int32-int64-int128-int256}

 Fixed-length integers, with or without a sign.

@@ -13,6 +13,8 @@
- Int16 - \[-32768 : 32767\] - Int32 - \[-2147483648 : 2147483647\] - Int64 - \[-9223372036854775808 : 9223372036854775807\] +- Int128 - \[-170141183460469231731687303715884105728 : 170141183460469231731687303715884105727\] +- Int256 - \[-57896044618658097711785492504343953926634992332820282019728792003956564819968 : 57896044618658097711785492504343953926634992332820282019728792003956564819967\] ## Uint Ranges {#uint-ranges} @@ -20,5 +22,6 @@ Fixed-length integers, with or without a sign. - UInt16 - \[0 : 65535\] - UInt32 - \[0 : 4294967295\] - UInt64 - \[0 : 18446744073709551615\] +- UInt256 - \[0 : 115792089237316195423570985008687907853269984665640564039457584007913129639935\] [Original article](https://clickhouse.tech/docs/en/data_types/int_uint/) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index e5c321041c2..3791b6581c7 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -11,7 +11,7 @@ When you convert a value from one to another data type, you should remember that ClickHouse has the [same behavior as C++ programs](https://en.cppreference.com/w/cpp/language/implicit_conversion). -## toInt(8\|16\|32\|64) {#toint8163264} +## toInt(8\|16\|32\|64\|128\|256) {#toint8163264128256} Converts an input value to the [Int](../../sql-reference/data-types/int-uint.md) data type. This function family includes: @@ -19,6 +19,8 @@ Converts an input value to the [Int](../../sql-reference/data-types/int-uint.md) - `toInt16(expr)` — Results in the `Int16` data type. - `toInt32(expr)` — Results in the `Int32` data type. - `toInt64(expr)` — Results in the `Int64` data type. +- `toInt128(expr)` — Results in the `Int128` data type. +- `toInt256(expr)` — Results in the `Int256` data type. **Parameters** @@ -26,7 +28,7 @@ Converts an input value to the [Int](../../sql-reference/data-types/int-uint.md) **Returned value** -Integer value in the `Int8`, `Int16`, `Int32`, or `Int64` data type. +Integer value in the `Int8`, `Int16`, `Int32`, `Int64`, `Int128` or `Int256` data type. Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. @@ -46,7 +48,7 @@ SELECT toInt64(nan), toInt32(32), toInt16('16'), toInt8(8.8) ## toInt(8\|16\|32\|64)OrZero {#toint8163264orzero} -It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 32 \| 64). If failed, returns 0. +It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 32 \| 64 \| 128 \| 256). If failed, returns 0. 
**Example** From 0d5923d87c4029b43d5a77b4bd6c72874ae64157 Mon Sep 17 00:00:00 2001 From: Sergey Ryzhkov <53r93y@yandex-team.ru> Date: Fri, 2 Oct 2020 10:11:14 +0300 Subject: [PATCH 03/72] Update type-conversion-functions.md --- .../functions/type-conversion-functions.md | 22 +++++++++++-------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/docs/en/sql-reference/functions/type-conversion-functions.md b/docs/en/sql-reference/functions/type-conversion-functions.md index 3791b6581c7..88f92d571f4 100644 --- a/docs/en/sql-reference/functions/type-conversion-functions.md +++ b/docs/en/sql-reference/functions/type-conversion-functions.md @@ -62,9 +62,9 @@ select toInt64OrZero('123123'), toInt8OrZero('123qwe123') └─────────────────────────┴───────────────────────────┘ ``` -## toInt(8\|16\|32\|64)OrNull {#toint8163264ornull} +## toInt(8\|16\|32\|64\|128\|256)OrNull {#toint8163264128256ornull} -It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 32 \| 64). If failed, returns NULL. +It takes an argument of type String and tries to parse it into Int (8 \| 16 \| 32 \| 64 \| 128 \| 256). If failed, returns NULL. **Example** @@ -78,7 +78,7 @@ select toInt64OrNull('123123'), toInt8OrNull('123qwe123') └─────────────────────────┴───────────────────────────┘ ``` -## toUInt(8\|16\|32\|64) {#touint8163264} +## toUInt(8\|16\|32\|64\|256) {#touint8163264256} Converts an input value to the [UInt](../../sql-reference/data-types/int-uint.md) data type. This function family includes: @@ -86,6 +86,7 @@ Converts an input value to the [UInt](../../sql-reference/data-types/int-uint.md - `toUInt16(expr)` — Results in the `UInt16` data type. - `toUInt32(expr)` — Results in the `UInt32` data type. - `toUInt64(expr)` — Results in the `UInt64` data type. +- `toUInt256(expr)` — Results in the `UInt256` data type. **Parameters** @@ -93,7 +94,7 @@ Converts an input value to the [UInt](../../sql-reference/data-types/int-uint.md **Returned value** -Integer value in the `UInt8`, `UInt16`, `UInt32`, or `UInt64` data type. +Integer value in the `UInt8`, `UInt16`, `UInt32`, `UInt64` or `UInt256` data type. Functions use [rounding towards zero](https://en.wikipedia.org/wiki/Rounding#Rounding_towards_zero), meaning they truncate fractional digits of numbers. @@ -111,9 +112,9 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) └─────────────────────┴───────────────┴────────────────┴──────────────┘ ``` -## toUInt(8\|16\|32\|64)OrZero {#touint8163264orzero} +## toUInt(8\|16\|32\|64\|256)OrZero {#touint8163264256orzero} -## toUInt(8\|16\|32\|64)OrNull {#touint8163264ornull} +## toUInt(8\|16\|32\|64\|256)OrNull {#touint8163264256ornull} ## toFloat(32\|64) {#tofloat3264} @@ -133,21 +134,23 @@ SELECT toUInt64(nan), toUInt32(-32), toUInt16('16'), toUInt8(8.8) ## toDateTimeOrNull {#todatetimeornull} -## toDecimal(32\|64\|128) {#todecimal3264128} +## toDecimal(32\|64\|128\|256) {#todecimal3264128256} Converts `value` to the [Decimal](../../sql-reference/data-types/decimal.md) data type with precision of `S`. The `value` can be a number or a string. The `S` (scale) parameter specifies the number of decimal places. - `toDecimal32(value, S)` - `toDecimal64(value, S)` - `toDecimal128(value, S)` +- `toDecimal256(value, S)` -## toDecimal(32\|64\|128)OrNull {#todecimal3264128ornull} +## toDecimal(32\|64\|128\|256)OrNull {#todecimal3264128256ornull} Converts an input string to a [Nullable(Decimal(P,S))](../../sql-reference/data-types/decimal.md) data type value. 
This family of functions include:

 - `toDecimal32OrNull(expr, S)` — Results in `Nullable(Decimal32(S))` data type.
 - `toDecimal64OrNull(expr, S)` — Results in `Nullable(Decimal64(S))` data type.
 - `toDecimal128OrNull(expr, S)` — Results in `Nullable(Decimal128(S))` data type.
+- `toDecimal256OrNull(expr, S)` — Results in `Nullable(Decimal256(S))` data type.

 These functions should be used instead of `toDecimal*()` functions, if you prefer to get a `NULL` value instead of an exception in the event of an input value parsing error.

@@ -185,13 +188,14 @@ SELECT toDecimal32OrNull(toString(-1.111), 2) AS val, toTypeName(val)
 └──────┴────────────────────────────────────────────────────┘
 ```

-## toDecimal(32\|64\|128)OrZero {#todecimal3264128orzero}
+## toDecimal(32\|64\|128\|256)OrZero {#todecimal3264128256orzero}

 Converts an input value to the [Decimal(P,S)](../../sql-reference/data-types/decimal.md) data type. This family of functions include:

 - `toDecimal32OrZero( expr, S)` — Results in `Decimal32(S)` data type.
 - `toDecimal64OrZero( expr, S)` — Results in `Decimal64(S)` data type.
 - `toDecimal128OrZero( expr, S)` — Results in `Decimal128(S)` data type.
+- `toDecimal256OrZero( expr, S)` — Results in `Decimal256(S)` data type.

 These functions should be used instead of `toDecimal*()` functions, if you prefer to get a `0` value instead of an exception in the event of an input value parsing error.

From aa68f2d51adc13d99fa436c92a270c890aef0e29 Mon Sep 17 00:00:00 2001
From: Sergey Ryzhkov <53r93y@yandex-team.ru>
Date: Fri, 2 Oct 2020 11:40:31 +0300
Subject: [PATCH 04/72] I hereby agree to the terms of the CLA available at:
 https://yandex.ru/legal/cla/?lang=en

Changelog category:

Documentation

for #13097

From c3e9317d32c714fb97fb1f9859e9f8a236e3d27e Mon Sep 17 00:00:00 2001
From: Alexander Tokmakov
Date: Fri, 2 Oct 2020 17:50:37 +0300
Subject: [PATCH 05/72] add test draft

---
 ...on_stuck_after_replace_partition.reference | 29 ++++++++++++++
 ...mutation_stuck_after_replace_partition.sql | 40 +++++++++++++++++++
 2 files changed, 69 insertions(+)
 create mode 100644 tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.reference
 create mode 100644 tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql

diff --git a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.reference b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.reference
new file mode 100644
index 00000000000..a0bbc35566d
--- /dev/null
+++ b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.reference
@@ -0,0 +1,29 @@
+1
+2
+44
+33
+3
+4
+default mt 0 0_1_1_0 2
+default mt 3 3_2_2_0 1
+default mt 4 4_3_3_0 1
+default rmt 0 0_0_0_0 2
+3
+4
+default mt 0 0_1_1_0 2
+default mt 3 3_2_2_0 1
+default mt 4 4_3_3_0 1
+default rmt 0 0_2_2_0 2
+3
+4
+1 1
+2 2
+3
+4
+default mt 0 0_1_1_0 2
+default mt 3 3_2_2_0 1
+default mt 4 4_3_3_0 1
+default rmt 0 0_6_6_0 2
+default rmt 0000000000 DROP COLUMN s 2020-10-02 17:46:31 ['0'] [3] ['0_0_5_4294967295'] 1 1 1970-01-01 03:00:00
+3
+4
diff --git a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql
new file mode 100644
index 00000000000..6cb5953e8f4
--- /dev/null
+++ b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql
@@ -0,0 +1,40 @@
+drop table if exists mt;
+drop table if exists rmt;
+
+
+create table mt
(`n` UInt64) engine = MergeTree partition by intDiv(n, 10) order by n; + +insert into mt values (3), (4), (33), (44); + +create table rmt (`n` UInt64) engine = ReplicatedMergeTree('/clickhouse/test_01149/rmt1', 'r1') partition by intDiv(n, 10) order by n; + +insert into rmt values (1), (2); +select * from rmt; +select * from mt; + +select database, table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; + +alter table rmt replace partition '0' from mt; +select * from rmt; +select database, table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; + +alter table rmt add column s String; +alter table rmt drop column s; +select * from rmt; +alter table rmt add column s String; +insert into rmt values (1, '1'), (2, '2'); +alter table mt add column s String; +select * from rmt; +alter table rmt replace partition '0' from mt; -- (probably) removes parts incorrectly + +select database, table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; + +alter table rmt drop column s; -- hangs waiting for mutation of removed parts + +-- see parts_to_do +select * from system.mutations where database=currentDatabase() and table='rmt'; + +select * from rmt; + +drop table mt; +drop table rmt; From bd790f399fab0a63c74b85130f780cb1e0c1efc9 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Fri, 2 Oct 2020 22:34:37 +0300 Subject: [PATCH 06/72] simplify test --- ...on_stuck_after_replace_partition.reference | 35 ++++++------------- ...mutation_stuck_after_replace_partition.sql | 31 ++++++---------- 2 files changed, 20 insertions(+), 46 deletions(-) diff --git a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.reference b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.reference index a0bbc35566d..edf8a7e391c 100644 --- a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.reference +++ b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.reference @@ -1,29 +1,14 @@ -1 -2 -44 -33 -3 -4 -default mt 0 0_1_1_0 2 -default mt 3 3_2_2_0 1 -default mt 4 4_3_3_0 1 -default rmt 0 0_0_0_0 2 -3 -4 -default mt 0 0_1_1_0 2 -default mt 3 3_2_2_0 1 -default mt 4 4_3_3_0 1 -default rmt 0 0_2_2_0 2 -3 -4 1 1 2 2 -3 -4 -default mt 0 0_1_1_0 2 -default mt 3 3_2_2_0 1 -default mt 4 4_3_3_0 1 -default rmt 0 0_6_6_0 2 -default rmt 0000000000 DROP COLUMN s 2020-10-02 17:46:31 ['0'] [3] ['0_0_5_4294967295'] 1 1 1970-01-01 03:00:00 +3 3 +4 4 +mt 0 0_1_1_0 2 +rmt 0 0_0_0_0 2 +1 1 +2 2 +mt 0 0_1_1_0 2 +rmt 0 0_3_3_0 2 +0000000000 UPDATE s = concat(\'s\', toString(n)) WHERE 1 [] 0 1 +0000000001 DROP COLUMN s [] 0 1 3 4 diff --git a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql index 6cb5953e8f4..c9b355b1e4e 100644 --- a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql +++ b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql @@ -1,39 +1,28 @@ drop table if exists mt; drop table if exists rmt; +create table mt (n UInt64, s String) engine = MergeTree partition by intDiv(n, 10) order by n; +insert into mt values (3, '3'), (4, '4'); -create table mt (`n` UInt64) engine = MergeTree partition by intDiv(n, 10) 
order by n; +create table rmt (n UInt64, s String) engine = ReplicatedMergeTree('/clickhouse/test_01149/rmt', 'r1') partition by intDiv(n, 10) order by n; +insert into rmt values (1,'1'), (2, '2'); -insert into mt values (3), (4), (33), (44); - -create table rmt (`n` UInt64) engine = ReplicatedMergeTree('/clickhouse/test_01149/rmt1', 'r1') partition by intDiv(n, 10) order by n; - -insert into rmt values (1), (2); select * from rmt; select * from mt; +select table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; -select database, table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; - -alter table rmt replace partition '0' from mt; -select * from rmt; -select database, table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; - -alter table rmt add column s String; -alter table rmt drop column s; -select * from rmt; -alter table rmt add column s String; -insert into rmt values (1, '1'), (2, '2'); -alter table mt add column s String; +alter table rmt update s = 's'||toString(n) where 1; -- increment mutation version select * from rmt; alter table rmt replace partition '0' from mt; -- (probably) removes parts incorrectly -select database, table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; +--system restart replica rmt; -- workaround + +select table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; alter table rmt drop column s; -- hangs waiting for mutation of removed parts -- see parts_to_do -select * from system.mutations where database=currentDatabase() and table='rmt'; - +select mutation_id, command, parts_to_do_names, parts_to_do, is_done from system.mutations where database=currentDatabase() and table='rmt'; select * from rmt; drop table mt; From 1e638d2a61ae25fae244e68f7c43e7c6539cd4cd Mon Sep 17 00:00:00 2001 From: r1j1k Date: Mon, 5 Oct 2020 08:29:45 +0300 Subject: [PATCH 07/72] Update int-uint.md --- docs/en/sql-reference/data-types/int-uint.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/data-types/int-uint.md b/docs/en/sql-reference/data-types/int-uint.md index 4c2377a634b..2af855a340d 100644 --- a/docs/en/sql-reference/data-types/int-uint.md +++ b/docs/en/sql-reference/data-types/int-uint.md @@ -3,7 +3,7 @@ toc_priority: 40 toc_title: UInt8, UInt16, UInt32, UInt64, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 --- -# UInt8, UInt16, UInt32, UInt64, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 {#uint8-uint16-uint32-uint64-int8-int16-int32-int64} +# UInt8, UInt16, UInt32, UInt64, UInt256, Int8, Int16, Int32, Int64, Int128, Int256 {#uint8-uint16-uint32-uint64-uint256-int8-int16-int32-int64-int128-int256} Fixed-length integers, with or without a sign. 
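
Patches 02, 03 and 07 above together document the whole experimental wide-type surface: the `allow_experimental_bigint_types` setting, the `Int128`/`Int256`/`UInt256` and `Decimal256` types, and their `to*` conversion functions. A minimal sketch in the style of the stateless tests in this series ties them together; the table name is invented for illustration, and the literals are the boundary values quoted in int-uint.md:

```sql
-- Wide types are gated behind a setting at this point in the series.
SET allow_experimental_bigint_types = 1;

CREATE TABLE bigint_smoke (i Int256, u UInt256, d Decimal256(4)) ENGINE = Memory;

-- Boundary values taken from the ranges documented above.
INSERT INTO bigint_smoke VALUES
    (toInt256('-57896044618658097711785492504343953926634992332820282019728792003956564819968'),
     toUInt256('115792089237316195423570985008687907853269984665640564039457584007913129639935'),
     toDecimal256('1.2345', 4));

SELECT i, u, d FROM bigint_smoke;

DROP TABLE bigint_smoke;
```
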
From f8fb9ccb7b93a37a8a7bd4f37243ebdc7a50ec03 Mon Sep 17 00:00:00 2001 From: r1j1k Date: Mon, 5 Oct 2020 08:42:21 +0300 Subject: [PATCH 08/72] Update decimal.md -decimal256s} --- docs/en/sql-reference/data-types/decimal.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/en/sql-reference/data-types/decimal.md b/docs/en/sql-reference/data-types/decimal.md index fe80e735667..dce50380888 100644 --- a/docs/en/sql-reference/data-types/decimal.md +++ b/docs/en/sql-reference/data-types/decimal.md @@ -3,7 +3,7 @@ toc_priority: 42 toc_title: Decimal --- -# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S), Decimal256(S) {#decimalp-s-decimal32s-decimal64s-decimal128s} +# Decimal(P, S), Decimal32(S), Decimal64(S), Decimal128(S), Decimal256(S) {#decimalp-s-decimal32s-decimal64s-decimal128s-decimal256s} Signed fixed-point numbers that keep precision during add, subtract and multiply operations. For division least significant digits are discarded (not rounded). From a3d27962a78d0da7e524755ea63fa1d427f3fb30 Mon Sep 17 00:00:00 2001 From: Danila Kutenin Date: Mon, 5 Oct 2020 09:01:51 +0300 Subject: [PATCH 09/72] Add experimental pass manager by default in clang --- CMakeLists.txt | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/CMakeLists.txt b/CMakeLists.txt index aea5e0617a4..7e86f0f280a 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -300,6 +300,11 @@ if (COMPILER_CLANG) option(ENABLE_THINLTO "Clang-specific link time optimization" ON) endif() + # Set new experimental pass manager, it's a performance, build time and binary size win. + # Can be removed after https://reviews.llvm.org/D66490 merged and released to at least two versions of clang. + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -fexperimental-new-pass-manager") + set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} -fexperimental-new-pass-manager") + # We cannot afford to use LTO when compiling unit tests, and it's not enough # to only supply -fno-lto at the final linking stage. So we disable it # completely. 
From 4c91bc71665e35769783ca1cbbcbe8202645535f Mon Sep 17 00:00:00 2001 From: Yatsishin Ilya <2159081+qoega@users.noreply.github.com> Date: Mon, 5 Oct 2020 15:27:19 +0300 Subject: [PATCH 10/72] Check binary, darwin, aarch64, freebsd builds with clang-11 --- tests/ci/ci_config.json | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/ci/ci_config.json b/tests/ci/ci_config.json index 6810b54bd32..6cc7a4a398a 100644 --- a/tests/ci/ci_config.json +++ b/tests/ci/ci_config.json @@ -102,7 +102,7 @@ "with_coverage": false }, { - "compiler": "clang-10", + "compiler": "clang-11", "build-type": "", "sanitizer": "", "package-type": "binary", @@ -134,7 +134,7 @@ "with_coverage": false }, { - "compiler": "clang-10-darwin", + "compiler": "clang-11-darwin", "build-type": "", "sanitizer": "", "package-type": "binary", @@ -144,7 +144,7 @@ "with_coverage": false }, { - "compiler": "clang-10-aarch64", + "compiler": "clang-11-aarch64", "build-type": "", "sanitizer": "", "package-type": "binary", @@ -154,7 +154,7 @@ "with_coverage": false }, { - "compiler": "clang-10-freebsd", + "compiler": "clang-11-freebsd", "build-type": "", "sanitizer": "", "package-type": "binary", From 30a18f7567d3209e79b39be76f40d751f0260f34 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 5 Oct 2020 16:52:03 +0300 Subject: [PATCH 11/72] make it better --- src/Storages/StorageReplicatedMergeTree.cpp | 30 ++++++++++--------- src/Storages/StorageReplicatedMergeTree.h | 2 +- ...mutation_stuck_after_replace_partition.sql | 17 +++++------ 3 files changed, 24 insertions(+), 25 deletions(-) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9613bd5111d..c461fac43e6 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4053,6 +4053,7 @@ Pipe StorageReplicatedMergeTree::alterPartition( /// If new version returns ordinary name, else returns part name containing the first and last month of the month +/// NOTE: use it in pair with getFakePartCoveringAllPartsInPartition(...) static String getPartNamePossiblyFake(MergeTreeDataFormatVersion format_version, const MergeTreePartInfo & part_info) { if (format_version < MERGE_TREE_DATA_MIN_FORMAT_VERSION_WITH_CUSTOM_PARTITIONING) @@ -4068,7 +4069,7 @@ static String getPartNamePossiblyFake(MergeTreeDataFormatVersion format_version, return part_info.getPartName(); } -bool StorageReplicatedMergeTree::getFakePartCoveringAllPartsInPartition(const String & partition_id, MergeTreePartInfo & part_info) +bool StorageReplicatedMergeTree::getFakePartCoveringAllPartsInPartition(const String & partition_id, MergeTreePartInfo & part_info, bool for_replace_partition) { /// Even if there is no data in the partition, you still need to mark the range for deletion. /// - Because before executing DETACH, tasks for downloading parts to this partition can be executed. @@ -4091,11 +4092,15 @@ bool StorageReplicatedMergeTree::getFakePartCoveringAllPartsInPartition(const St mutation_version = queue.getCurrentMutationVersion(partition_id, right); } - /// Empty partition. - if (right == 0) - return false; + /// REPLACE PARTITION does not decrement max_block of DROP_RANGE for unknown (probably historical) reason. + if (!for_replace_partition) + { + /// Empty partition. + if (right == 0) + return false; - --right; + --right; + } /// Artificial high level is chosen, to make this part "covering" all parts inside. 
part_info = MergeTreePartInfo(partition_id, left, right, MergeTreePartInfo::MAX_LEVEL, mutation_version); @@ -5305,11 +5310,11 @@ void StorageReplicatedMergeTree::replacePartitionFrom( /// Firstly, generate last block number and compute drop_range /// NOTE: Even if we make ATTACH PARTITION instead of REPLACE PARTITION drop_range will not be empty, it will contain a block. /// So, such case has special meaning, if drop_range contains only one block it means that nothing to drop. + /// TODO why not to add normal DROP_RANGE entry to replication queue if `replace` is true? MergeTreePartInfo drop_range; - drop_range.partition_id = partition_id; - drop_range.max_block = allocateBlockNumber(partition_id, zookeeper)->getNumber(); - drop_range.min_block = replace ? 0 : drop_range.max_block; - drop_range.level = std::numeric_limits::max(); + getFakePartCoveringAllPartsInPartition(partition_id, drop_range, true); + if (!replace) + drop_range.min_block = drop_range.max_block; String drop_range_fake_part_name = getPartNamePossiblyFake(format_version, drop_range); @@ -5502,11 +5507,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta /// A range for log entry to remove parts from the source table (myself). MergeTreePartInfo drop_range; - drop_range.partition_id = partition_id; - drop_range.max_block = allocateBlockNumber(partition_id, zookeeper)->getNumber(); - drop_range.min_block = 0; - drop_range.level = std::numeric_limits::max(); - + getFakePartCoveringAllPartsInPartition(partition_id, drop_range, true); String drop_range_fake_part_name = getPartNamePossiblyFake(format_version, drop_range); if (drop_range.getBlocksCount() > 1) @@ -5561,6 +5562,7 @@ void StorageReplicatedMergeTree::movePartitionToTable(const StoragePtr & dest_ta drop_range_dest.max_block = drop_range.max_block; drop_range_dest.min_block = drop_range.max_block; drop_range_dest.level = drop_range.level; + drop_range_dest.mutation = drop_range.mutation; entry.type = ReplicatedMergeTreeLogEntryData::REPLACE_RANGE; entry.source_replica = dest_table_storage->replica_name; diff --git a/src/Storages/StorageReplicatedMergeTree.h b/src/Storages/StorageReplicatedMergeTree.h index d851082d5c2..2126a4d2fec 100644 --- a/src/Storages/StorageReplicatedMergeTree.h +++ b/src/Storages/StorageReplicatedMergeTree.h @@ -526,7 +526,7 @@ private: /// Produce an imaginary part info covering all parts in the specified partition (at the call moment). /// Returns false if the partition doesn't exist yet. - bool getFakePartCoveringAllPartsInPartition(const String & partition_id, MergeTreePartInfo & part_info); + bool getFakePartCoveringAllPartsInPartition(const String & partition_id, MergeTreePartInfo & part_info, bool for_replace_partition = false); /// Check for a node in ZK. If it is, remember this information, and then immediately answer true. 
std::unordered_set existing_nodes_cache; diff --git a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql index c9b355b1e4e..f478be89eea 100644 --- a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql +++ b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql @@ -1,5 +1,5 @@ drop table if exists mt; -drop table if exists rmt; +drop table if exists rmt sync; create table mt (n UInt64, s String) engine = MergeTree partition by intDiv(n, 10) order by n; insert into mt values (3, '3'), (4, '4'); @@ -9,21 +9,18 @@ insert into rmt values (1,'1'), (2, '2'); select * from rmt; select * from mt; -select table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; +select table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') and active=1 order by table, name; -alter table rmt update s = 's'||toString(n) where 1; -- increment mutation version +alter table rmt update s = 's'||toString(n) where 1; select * from rmt; -alter table rmt replace partition '0' from mt; -- (probably) removes parts incorrectly +alter table rmt replace partition '0' from mt; ---system restart replica rmt; -- workaround +select table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') and active=1 order by table, name; -select table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') order by table, name; +alter table rmt drop column s; -alter table rmt drop column s; -- hangs waiting for mutation of removed parts - --- see parts_to_do select mutation_id, command, parts_to_do_names, parts_to_do, is_done from system.mutations where database=currentDatabase() and table='rmt'; select * from rmt; drop table mt; -drop table rmt; +drop table rmt sync; From 0e4fd5f24b0089ba0e7c709b8af97649458095a0 Mon Sep 17 00:00:00 2001 From: nvartolomei Date: Mon, 5 Oct 2020 16:27:03 +0100 Subject: [PATCH 12/72] Use https protocol for llvm repository in fasttest --- docker/test/fasttest/Dockerfile | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docker/test/fasttest/Dockerfile b/docker/test/fasttest/Dockerfile index 65ec078d3ca..3cfa57bd747 100644 --- a/docker/test/fasttest/Dockerfile +++ b/docker/test/fasttest/Dockerfile @@ -11,7 +11,7 @@ RUN apt-get update \ && echo "${LLVM_PUBKEY_HASH} /tmp/llvm-snapshot.gpg.key" | sha384sum -c \ && apt-key add /tmp/llvm-snapshot.gpg.key \ && export CODENAME="$(lsb_release --codename --short | tr 'A-Z' 'a-z')" \ - && echo "deb [trusted=yes] http://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ + && echo "deb [trusted=yes] https://apt.llvm.org/${CODENAME}/ llvm-toolchain-${CODENAME}-${LLVM_VERSION} main" >> \ /etc/apt/sources.list # initial packages From 8ec58c17f3f37099e376d3101bedf838cd641071 Mon Sep 17 00:00:00 2001 From: alesapin Date: Mon, 5 Oct 2020 19:41:46 +0300 Subject: [PATCH 13/72] More consistent metadata usage --- src/Storages/MergeTree/DataPartsExchange.cpp | 2 +- src/Storages/MergeTree/MergeTreeData.cpp | 27 ++++++++++--------- src/Storages/MergeTree/MergeTreeData.h | 2 ++ .../MergeTree/MergeTreeDataPartTTLInfo.cpp | 7 +++-- .../MergeTree/MergeTreeDataWriter.cpp | 2 +- src/Storages/StorageMergeTree.cpp | 13 ++++++--- 
src/Storages/StorageReplicatedMergeTree.cpp | 6 ++--- 7 files changed, 36 insertions(+), 23 deletions(-) diff --git a/src/Storages/MergeTree/DataPartsExchange.cpp b/src/Storages/MergeTree/DataPartsExchange.cpp index f9fb157942a..3da0e203f14 100644 --- a/src/Storages/MergeTree/DataPartsExchange.cpp +++ b/src/Storages/MergeTree/DataPartsExchange.cpp @@ -276,7 +276,7 @@ MergeTreeData::MutableDataPartPtr Fetcher::fetchPart( ReadBufferFromString ttl_infos_buffer(ttl_infos_string); assertString("ttl format version: 1\n", ttl_infos_buffer); ttl_infos.read(ttl_infos_buffer); - reservation = data.reserveSpacePreferringTTLRules(sum_files_size, ttl_infos, std::time(nullptr), 0, true); + reservation = data.reserveSpacePreferringTTLRules(metadata_snapshot, sum_files_size, ttl_infos, std::time(nullptr), 0, true); } else reservation = data.reserveSpace(sum_files_size); diff --git a/src/Storages/MergeTree/MergeTreeData.cpp b/src/Storages/MergeTree/MergeTreeData.cpp index f2b26a928c1..ef96c446287 100644 --- a/src/Storages/MergeTree/MergeTreeData.cpp +++ b/src/Storages/MergeTree/MergeTreeData.cpp @@ -3037,28 +3037,31 @@ ReservationPtr MergeTreeData::tryReserveSpace(UInt64 expected_size, SpacePtr spa return space->reserve(expected_size); } -ReservationPtr MergeTreeData::reserveSpacePreferringTTLRules(UInt64 expected_size, - const IMergeTreeDataPart::TTLInfos & ttl_infos, - time_t time_of_move, - size_t min_volume_index, - bool is_insert) const +ReservationPtr MergeTreeData::reserveSpacePreferringTTLRules( + const StorageMetadataPtr & metadata_snapshot, + UInt64 expected_size, + const IMergeTreeDataPart::TTLInfos & ttl_infos, + time_t time_of_move, + size_t min_volume_index, + bool is_insert) const { expected_size = std::max(RESERVATION_MIN_ESTIMATION_SIZE, expected_size); - ReservationPtr reservation = tryReserveSpacePreferringTTLRules(expected_size, ttl_infos, time_of_move, min_volume_index, is_insert); + ReservationPtr reservation = tryReserveSpacePreferringTTLRules(metadata_snapshot, expected_size, ttl_infos, time_of_move, min_volume_index, is_insert); return checkAndReturnReservation(expected_size, std::move(reservation)); } -ReservationPtr MergeTreeData::tryReserveSpacePreferringTTLRules(UInt64 expected_size, - const IMergeTreeDataPart::TTLInfos & ttl_infos, - time_t time_of_move, - size_t min_volume_index, - bool is_insert) const +ReservationPtr MergeTreeData::tryReserveSpacePreferringTTLRules( + const StorageMetadataPtr & metadata_snapshot, + UInt64 expected_size, + const IMergeTreeDataPart::TTLInfos & ttl_infos, + time_t time_of_move, + size_t min_volume_index, + bool is_insert) const { expected_size = std::max(RESERVATION_MIN_ESTIMATION_SIZE, expected_size); - auto metadata_snapshot = getInMemoryMetadataPtr(); ReservationPtr reservation; auto move_ttl_entry = selectTTLDescriptionForTTLInfos(metadata_snapshot->getMoveTTLs(), ttl_infos.moves_ttl, time_of_move, true); diff --git a/src/Storages/MergeTree/MergeTreeData.h b/src/Storages/MergeTree/MergeTreeData.h index 1125eb32b66..5c18661dad1 100644 --- a/src/Storages/MergeTree/MergeTreeData.h +++ b/src/Storages/MergeTree/MergeTreeData.h @@ -632,6 +632,7 @@ public: /// Reserves space at least 1MB preferring best destination according to `ttl_infos`. 
ReservationPtr reserveSpacePreferringTTLRules( + const StorageMetadataPtr & metadata_snapshot, UInt64 expected_size, const IMergeTreeDataPart::TTLInfos & ttl_infos, time_t time_of_move, @@ -639,6 +640,7 @@ public: bool is_insert = false) const; ReservationPtr tryReserveSpacePreferringTTLRules( + const StorageMetadataPtr & metadata_snapshot, UInt64 expected_size, const IMergeTreeDataPart::TTLInfos & ttl_infos, time_t time_of_move, diff --git a/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.cpp b/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.cpp index 7d3c00aa19c..92c8a66e828 100644 --- a/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.cpp +++ b/src/Storages/MergeTree/MergeTreeDataPartTTLInfo.cpp @@ -182,6 +182,10 @@ std::optional selectTTLDescriptionForTTLInfos(const TTLDescripti for (auto ttl_entry_it = descriptions.begin(); ttl_entry_it != descriptions.end(); ++ttl_entry_it) { auto ttl_info_it = ttl_info_map.find(ttl_entry_it->result_column); + + if (ttl_info_it == ttl_info_map.end()) + continue; + time_t ttl_time; if (use_max) @@ -190,8 +194,7 @@ std::optional selectTTLDescriptionForTTLInfos(const TTLDescripti ttl_time = ttl_info_it->second.min; /// Prefer TTL rule which went into action last. - if (ttl_info_it != ttl_info_map.end() - && ttl_time <= current_time + if (ttl_time <= current_time && best_ttl_time <= ttl_time) { best_entry_it = ttl_entry_it; diff --git a/src/Storages/MergeTree/MergeTreeDataWriter.cpp b/src/Storages/MergeTree/MergeTreeDataWriter.cpp index 739aff31a06..e42bb786f46 100644 --- a/src/Storages/MergeTree/MergeTreeDataWriter.cpp +++ b/src/Storages/MergeTree/MergeTreeDataWriter.cpp @@ -237,7 +237,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataWriter::writeTempPart(BlockWithPa updateTTL(ttl_entry, move_ttl_infos, move_ttl_infos.moves_ttl[ttl_entry.result_column], block, false); NamesAndTypesList columns = metadata_snapshot->getColumns().getAllPhysical().filter(block.getNames()); - ReservationPtr reservation = data.reserveSpacePreferringTTLRules(expected_size, move_ttl_infos, time(nullptr), 0, true); + ReservationPtr reservation = data.reserveSpacePreferringTTLRules(metadata_snapshot, expected_size, move_ttl_infos, time(nullptr), 0, true); VolumePtr volume = data.getStoragePolicy()->getVolume(0); auto new_data_part = data.createPart( diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp index 55fb42b550e..6edfd5551ed 100644 --- a/src/Storages/StorageMergeTree.cpp +++ b/src/Storages/StorageMergeTree.cpp @@ -286,7 +286,12 @@ struct CurrentlyMergingPartsTagger StorageMergeTree & storage; public: - CurrentlyMergingPartsTagger(FutureMergedMutatedPart & future_part_, size_t total_size, StorageMergeTree & storage_, bool is_mutation) + CurrentlyMergingPartsTagger( + FutureMergedMutatedPart & future_part_, + size_t total_size, + StorageMergeTree & storage_, + const StorageMetadataPtr & metadata_snapshot, + bool is_mutation) : future_part(future_part_), storage(storage_) { /// Assume mutex is already locked, because this method is called from mergeTask. 
@@ -304,7 +309,7 @@ public: max_volume_index = std::max(max_volume_index, storage.getStoragePolicy()->getVolumeIndexByDisk(part_ptr->volume->getDisk())); } - reserved_space = storage.tryReserveSpacePreferringTTLRules(total_size, ttl_infos, time(nullptr), max_volume_index); + reserved_space = storage.tryReserveSpacePreferringTTLRules(metadata_snapshot, total_size, ttl_infos, time(nullptr), max_volume_index); } if (!reserved_space) { @@ -715,7 +720,7 @@ bool StorageMergeTree::merge( return false; } - merging_tagger.emplace(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace(future_part.parts), *this, false); + merging_tagger.emplace(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace(future_part.parts), *this, metadata_snapshot, false); auto table_id = getStorageID(); merge_entry = global_context.getMergeList().insert(table_id.database_name, table_id.table_name, future_part); } @@ -856,7 +861,7 @@ bool StorageMergeTree::tryMutatePart() future_part.name = part->getNewName(new_part_info); future_part.type = part->getType(); - tagger.emplace(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}), *this, true); + tagger.emplace(future_part, MergeTreeDataMergerMutator::estimateNeededDiskSpace({part}), *this, metadata_snapshot, true); break; } } diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9613bd5111d..0c0d7ea6407 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -1416,11 +1416,11 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry) ttl_infos.update(part_ptr->ttl_infos); max_volume_index = std::max(max_volume_index, getStoragePolicy()->getVolumeIndexByDisk(part_ptr->volume->getDisk())); } - ReservationPtr reserved_space = reserveSpacePreferringTTLRules(estimated_space_for_merge, - ttl_infos, time(nullptr), max_volume_index); - auto table_lock = lockForShare(RWLockImpl::NO_QUERY, storage_settings_ptr->lock_acquire_timeout_for_background_operations); + StorageMetadataPtr metadata_snapshot = getInMemoryMetadataPtr(); + ReservationPtr reserved_space = reserveSpacePreferringTTLRules( + metadata_snapshot, estimated_space_for_merge, ttl_infos, time(nullptr), max_volume_index); FutureMergedMutatedPart future_merged_part(parts, entry.new_part_type); if (future_merged_part.name != entry.new_part_name) From 418766e92b4ac839b21f3b1a06b4159471832d2b Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Mon, 5 Oct 2020 22:16:28 +0300 Subject: [PATCH 14/72] fix another bug in mutations --- src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp | 10 ++++++++++ src/Storages/StorageReplicatedMergeTree.cpp | 8 ++++++-- ...keeper_mutation_stuck_after_replace_partition.sql | 12 ++++++++++++ 3 files changed, 28 insertions(+), 2 deletions(-) diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp index ef7ebead966..45e16e81208 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeQueue.cpp @@ -715,6 +715,16 @@ void ReplicatedMergeTreeQueue::updateMutations(zkutil::ZooKeeperPtr zookeeper, C for (const String & produced_part_name : queue_entry->getVirtualPartNames()) { auto part_info = MergeTreePartInfo::fromPartName(produced_part_name, format_version); + + /// Oddly enough, getVirtualPartNames() may return _virtual_ part name. 
+ /// Such parts do not exist and will never appear, so we should not add virtual parts to parts_to_do list. + /// Fortunately, it's easy to distinguish virtual parts from normal parts by part level. + /// See StorageReplicatedMergeTree::getFakePartCoveringAllPartsInPartition(...) + auto max_level = MergeTreePartInfo::MAX_LEVEL; /// DROP/DETACH PARTITION + auto another_max_level = std::numeric_limits::max(); /// REPLACE/MOVE PARTITION + if (part_info.level == max_level || part_info.level == another_max_level) + continue; + auto it = entry->block_numbers.find(part_info.partition_id); if (it != entry->block_numbers.end() && it->second > part_info.getDataVersion()) mutation.parts_to_do.add(produced_part_name); diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index c461fac43e6..193fe645909 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -4092,9 +4092,12 @@ bool StorageReplicatedMergeTree::getFakePartCoveringAllPartsInPartition(const St mutation_version = queue.getCurrentMutationVersion(partition_id, right); } - /// REPLACE PARTITION does not decrement max_block of DROP_RANGE for unknown (probably historical) reason. + /// REPLACE PARTITION uses different max level and does not decrement max_block of DROP_RANGE for unknown (probably historical) reason. + auto max_level = std::numeric_limits::max(); if (!for_replace_partition) { + max_level = MergeTreePartInfo::MAX_LEVEL; + /// Empty partition. if (right == 0) return false; @@ -4103,7 +4106,7 @@ bool StorageReplicatedMergeTree::getFakePartCoveringAllPartsInPartition(const St } /// Artificial high level is chosen, to make this part "covering" all parts inside. - part_info = MergeTreePartInfo(partition_id, left, right, MergeTreePartInfo::MAX_LEVEL, mutation_version); + part_info = MergeTreePartInfo(partition_id, left, right, max_level, mutation_version); return true; } @@ -5393,6 +5396,7 @@ void StorageReplicatedMergeTree::replacePartitionFrom( } /// We are almost ready to commit changes, remove fetches and merges from drop range + /// FIXME it's unsafe to remove queue entries before we actually commit REPLACE_RANGE to replication log queue.removePartProducingOpsInRange(zookeeper, drop_range, entry); /// Remove deduplication block_ids of replacing parts diff --git a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql index f478be89eea..667831a6797 100644 --- a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql +++ b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql @@ -22,5 +22,17 @@ alter table rmt drop column s; select mutation_id, command, parts_to_do_names, parts_to_do, is_done from system.mutations where database=currentDatabase() and table='rmt'; select * from rmt; +drop table rmt sync; + +set replication_alter_partitions_sync=0; +create table rmt (n UInt64, s String) engine = ReplicatedMergeTree('/clickhouse/test_01149/rmt', 'r1') partition by intDiv(n, 10) order by n; +insert into rmt values (1,'1'), (2, '2'); + +alter table rmt update s = 's'||toString(n) where 1; +alter table rmt drop partition '0'; + +set replication_alter_partitions_sync=1; +alter table rmt drop column s; + drop table mt; drop table rmt sync; From 28e12c559c9f2f3f990d50cec4889fd0f132fb08 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Tue, 6 Oct 2020 09:35:35 
+0300 Subject: [PATCH 15/72] Ensure resource destruction order in
 ReadFromStorageStep.

---
 src/Processors/Pipe.h                 |  2 +-
 src/Processors/QueryPipeline.h        |  2 +-
 .../QueryPlan/ReadFromStorageStep.cpp | 41 ++++++++-----------
 .../QueryPlan/ReadFromStorageStep.h   | 14 -------
 4 files changed, 19 insertions(+), 40 deletions(-)

diff --git a/src/Processors/Pipe.h b/src/Processors/Pipe.h
index f674663154d..f5feca7a2db 100644
--- a/src/Processors/Pipe.h
+++ b/src/Processors/Pipe.h
@@ -101,7 +101,7 @@ public:
     void setQuota(const std::shared_ptr<const EnabledQuota> & quota);

     /// Do not allow to change the table while the processors of pipe are alive.
-    void addTableLock(const TableLockHolder & lock) { holder.table_locks.push_back(lock); }
+    void addTableLock(TableLockHolder lock) { holder.table_locks.emplace_back(std::move(lock)); }
     /// This methods are from QueryPipeline. Needed to make conversion from pipeline to pipe possible.
     void addInterpreterContext(std::shared_ptr<Context> context) { holder.interpreter_context.emplace_back(std::move(context)); }
     void addStorageHolder(StoragePtr storage) { holder.storage_holders.emplace_back(std::move(storage)); }

diff --git a/src/Processors/QueryPipeline.h b/src/Processors/QueryPipeline.h
index 80ae1d591a4..6321928b357 100644
--- a/src/Processors/QueryPipeline.h
+++ b/src/Processors/QueryPipeline.h
@@ -101,7 +101,7 @@ public:

     const Block & getHeader() const { return pipe.getHeader(); }

-    void addTableLock(const TableLockHolder & lock) { pipe.addTableLock(lock); }
+    void addTableLock(TableLockHolder lock) { pipe.addTableLock(std::move(lock)); }
     void addInterpreterContext(std::shared_ptr<Context> context) { pipe.addInterpreterContext(std::move(context)); }
     void addStorageHolder(StoragePtr storage) { pipe.addStorageHolder(std::move(storage)); }
     void addQueryPlan(std::unique_ptr<QueryPlan> plan) { pipe.addQueryPlan(std::move(plan)); }

diff --git a/src/Processors/QueryPlan/ReadFromStorageStep.cpp b/src/Processors/QueryPlan/ReadFromStorageStep.cpp
index b085c177ad4..474410148fa 100644
--- a/src/Processors/QueryPlan/ReadFromStorageStep.cpp
+++ b/src/Processors/QueryPlan/ReadFromStorageStep.cpp
@@ -13,33 +13,26 @@ namespace DB

 ReadFromStorageStep::ReadFromStorageStep(
     TableLockHolder table_lock_,
-    StorageMetadataPtr metadata_snapshot_,
-    StreamLocalLimits & limits_,
-    SizeLimits & leaf_limits_,
-    std::shared_ptr<const EnabledQuota> quota_,
+    StorageMetadataPtr metadata_snapshot,
+    StreamLocalLimits & limits,
+    SizeLimits & leaf_limits,
+    std::shared_ptr<const EnabledQuota> quota,
     StoragePtr storage_,
-    const Names & required_columns_,
-    const SelectQueryInfo & query_info_,
+    const Names & required_columns,
+    const SelectQueryInfo & query_info,
     std::shared_ptr<Context> context_,
-    QueryProcessingStage::Enum processing_stage_,
-    size_t max_block_size_,
-    size_t max_streams_)
-    : table_lock(std::move(table_lock_))
-    , metadata_snapshot(std::move(metadata_snapshot_))
-    , limits(limits_)
-    , leaf_limits(leaf_limits_)
-    , quota(std::move(quota_))
-    , storage(std::move(storage_))
-    , required_columns(required_columns_)
-    , query_info(query_info_)
-    , context(std::move(context_))
-    , processing_stage(processing_stage_)
-    , max_block_size(max_block_size_)
-    , max_streams(max_streams_)
+    QueryProcessingStage::Enum processing_stage,
+    size_t max_block_size,
+    size_t max_streams)
 {
     /// Note: we read from storage in constructor of step because we don't know real header before reading.
     /// It will be fixed when storage return QueryPlanStep itself.

+    /// Move arguments into stack in order to ensure order of destruction in case of exception.
+ auto context = std::move(context_); + auto storage = std::move(storage_); + auto table_lock = std::move(table_lock_); + Pipe pipe = storage->read(required_columns, metadata_snapshot, query_info, *context, processing_stage, max_block_size, max_streams); if (pipe.empty()) @@ -83,9 +76,6 @@ ReadFromStorageStep::ReadFromStorageStep( pipeline = std::make_unique(); QueryPipelineProcessorsCollector collector(*pipeline, this); - /// Table lock is stored inside pipeline here. - pipeline->addTableLock(table_lock); - pipe.setLimits(limits); /** @@ -103,8 +93,11 @@ ReadFromStorageStep::ReadFromStorageStep( pipeline->init(std::move(pipe)); + /// Add resources to pipeline. The order is important. + /// Add in reverse order of destruction. Pipeline will be destroyed at the end in case of exception. pipeline->addInterpreterContext(std::move(context)); pipeline->addStorageHolder(std::move(storage)); + pipeline->addTableLock(std::move(table_lock)); processors = collector.detachProcessors(); diff --git a/src/Processors/QueryPlan/ReadFromStorageStep.h b/src/Processors/QueryPlan/ReadFromStorageStep.h index 98cde63a863..93a948116e8 100644 --- a/src/Processors/QueryPlan/ReadFromStorageStep.h +++ b/src/Processors/QueryPlan/ReadFromStorageStep.h @@ -45,20 +45,6 @@ public: void describePipeline(FormatSettings & settings) const override; private: - TableLockHolder table_lock; - StorageMetadataPtr metadata_snapshot; - StreamLocalLimits limits; - SizeLimits leaf_limits; - std::shared_ptr quota; - - StoragePtr storage; - const Names & required_columns; - const SelectQueryInfo & query_info; - std::shared_ptr context; - QueryProcessingStage::Enum processing_stage; - size_t max_block_size; - size_t max_streams; - QueryPipelinePtr pipeline; Processors processors; }; From bad8171f24f79356dacd256bd060839fa833fec0 Mon Sep 17 00:00:00 2001 From: Artem Zuikov Date: Tue, 6 Oct 2020 11:18:25 +0300 Subject: [PATCH 16/72] Fix distributed IN without database on initiator (#15538) --- .../InJoinSubqueriesPreprocessor.cpp | 4 ++- ...87_distributed_in_not_default_db.reference | 2 ++ .../01487_distributed_in_not_default_db.sql | 36 +++++++++++++++++++ 3 files changed, 41 insertions(+), 1 deletion(-) create mode 100644 tests/queries/0_stateless/01487_distributed_in_not_default_db.reference create mode 100644 tests/queries/0_stateless/01487_distributed_in_not_default_db.sql diff --git a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp index 244693396ca..2e922393131 100644 --- a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp +++ b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp @@ -27,7 +27,9 @@ namespace StoragePtr tryGetTable(const ASTPtr & database_and_table, const Context & context) { - auto table_id = context.resolveStorageID(database_and_table); + auto table_id = context.tryResolveStorageID(database_and_table); + if (!table_id) + return {}; return DatabaseCatalog::instance().tryGetTable(table_id, context); } diff --git a/tests/queries/0_stateless/01487_distributed_in_not_default_db.reference b/tests/queries/0_stateless/01487_distributed_in_not_default_db.reference new file mode 100644 index 00000000000..0d66ea1aee9 --- /dev/null +++ b/tests/queries/0_stateless/01487_distributed_in_not_default_db.reference @@ -0,0 +1,2 @@ +0 +1 diff --git a/tests/queries/0_stateless/01487_distributed_in_not_default_db.sql b/tests/queries/0_stateless/01487_distributed_in_not_default_db.sql new file mode 100644 index 00000000000..f6f7471711a --- /dev/null +++ 
From bad8171f24f79356dacd256bd060839fa833fec0 Mon Sep 17 00:00:00 2001
From: Artem Zuikov
Date: Tue, 6 Oct 2020 11:18:25 +0300
Subject: [PATCH 16/72] Fix distributed IN without database on initiator
 (#15538)

---
 .../InJoinSubqueriesPreprocessor.cpp          |  4 ++-
 ...87_distributed_in_not_default_db.reference |  2 ++
 .../01487_distributed_in_not_default_db.sql   | 36 +++++++++++++++++++
 3 files changed, 41 insertions(+), 1 deletion(-)
 create mode 100644 tests/queries/0_stateless/01487_distributed_in_not_default_db.reference
 create mode 100644 tests/queries/0_stateless/01487_distributed_in_not_default_db.sql

diff --git a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp
index 244693396ca..2e922393131 100644
--- a/src/Interpreters/InJoinSubqueriesPreprocessor.cpp
+++ b/src/Interpreters/InJoinSubqueriesPreprocessor.cpp
@@ -27,7 +27,9 @@ namespace

 StoragePtr tryGetTable(const ASTPtr & database_and_table, const Context & context)
 {
-    auto table_id = context.resolveStorageID(database_and_table);
+    auto table_id = context.tryResolveStorageID(database_and_table);
+    if (!table_id)
+        return {};
     return DatabaseCatalog::instance().tryGetTable(table_id, context);
 }

diff --git a/tests/queries/0_stateless/01487_distributed_in_not_default_db.reference b/tests/queries/0_stateless/01487_distributed_in_not_default_db.reference
new file mode 100644
index 00000000000..0d66ea1aee9
--- /dev/null
+++ b/tests/queries/0_stateless/01487_distributed_in_not_default_db.reference
@@ -0,0 +1,2 @@
+0
+1
diff --git a/tests/queries/0_stateless/01487_distributed_in_not_default_db.sql b/tests/queries/0_stateless/01487_distributed_in_not_default_db.sql
new file mode 100644
index 00000000000..f6f7471711a
--- /dev/null
+++ b/tests/queries/0_stateless/01487_distributed_in_not_default_db.sql
@@ -0,0 +1,36 @@
+CREATE DATABASE IF NOT EXISTS shard_0;
+CREATE DATABASE IF NOT EXISTS shard_1;
+CREATE DATABASE IF NOT EXISTS main_01487;
+CREATE DATABASE IF NOT EXISTS test_01487;
+
+USE main_01487;
+
+DROP TABLE IF EXISTS shard_0.l;
+DROP TABLE IF EXISTS shard_1.l;
+DROP TABLE IF EXISTS d;
+DROP TABLE IF EXISTS t;
+
+CREATE TABLE shard_0.l (value UInt8) ENGINE = MergeTree ORDER BY value;
+CREATE TABLE shard_1.l (value UInt8) ENGINE = MergeTree ORDER BY value;
+CREATE TABLE t (value UInt8) ENGINE = Memory;
+
+INSERT INTO shard_0.l VALUES (0);
+INSERT INTO shard_1.l VALUES (1);
+INSERT INTO t VALUES (0), (1), (2);
+
+CREATE TABLE d AS t ENGINE = Distributed(test_cluster_two_shards_different_databases, currentDatabase(), t);
+
+USE test_01487;
+DROP DATABASE test_01487;
+
+SELECT * FROM main_01487.d WHERE value IN (SELECT l.value FROM l) ORDER BY value;
+
+USE main_01487;
+
+DROP TABLE IF EXISTS shard_0.l;
+DROP TABLE IF EXISTS shard_1.l;
+DROP TABLE IF EXISTS d;
+DROP TABLE IF EXISTS t;
+
+DROP DATABASE shard_0;
+DROP DATABASE shard_1;

From 69b4bc6f337c01ebe5cc8b48a9224ffee6fff830 Mon Sep 17 00:00:00 2001
From: Pavel Kovalenko
Date: Tue, 6 Oct 2020 12:38:00 +0300
Subject: [PATCH 17/72] Proper error handling during insert into MergeTree with
 S3.

---
 src/Disks/DiskCacheWrapper.cpp                |   1 +
 src/Disks/IDisk.cpp                           |   1 +
 src/Disks/S3/registerDiskS3.cpp               |   4 +
 src/Storages/MergeTree/IMergeTreeDataPart.cpp |   1 +
 .../MergeTreeDataPartWriterCompact.cpp        |   2 +
 .../MergeTreeDataPartWriterOnDisk.cpp         |   7 +-
 src/Storages/MergeTree/MergeTreePartition.cpp |   1 +
 .../MergeTree/MergedBlockOutputStream.cpp     |   5 +
 .../test_merge_tree_s3_failover/__init__.py   |   0
 .../configs/config.d/log_conf.xml             |  12 ++
 .../configs/config.d/storage_conf.xml         |  26 ++++
 .../configs/config.d/users.xml                |   5 +
 .../configs/config.xml                        |  20 +++
 .../s3_endpoint/endpoint.py                   |  49 ++++++++
 .../test_merge_tree_s3_failover/test.py       | 117 ++++++++++++++++++
 15 files changed, 250 insertions(+), 1 deletion(-)
 create mode 100644 tests/integration/test_merge_tree_s3_failover/__init__.py
 create mode 100644 tests/integration/test_merge_tree_s3_failover/configs/config.d/log_conf.xml
 create mode 100644 tests/integration/test_merge_tree_s3_failover/configs/config.d/storage_conf.xml
 create mode 100644 tests/integration/test_merge_tree_s3_failover/configs/config.d/users.xml
 create mode 100644 tests/integration/test_merge_tree_s3_failover/configs/config.xml
 create mode 100644 tests/integration/test_merge_tree_s3_failover/s3_endpoint/endpoint.py
 create mode 100644 tests/integration/test_merge_tree_s3_failover/test.py

diff --git a/src/Disks/DiskCacheWrapper.cpp b/src/Disks/DiskCacheWrapper.cpp
index c60f69920f4..7ce963380d4 100644
--- a/src/Disks/DiskCacheWrapper.cpp
+++ b/src/Disks/DiskCacheWrapper.cpp
@@ -194,6 +194,7 @@ DiskCacheWrapper::writeFile(const String & path, size_t buf_size, WriteMode mode
             auto src_buffer = cache_disk->readFile(path, buf_size, estimated_size, aio_threshold, 0);
             auto dst_buffer = DiskDecorator::writeFile(path, buf_size, mode, estimated_size, aio_threshold);
             copyData(*src_buffer, *dst_buffer);
+            dst_buffer->finalize();
         },
         buf_size);
 }
diff --git a/src/Disks/IDisk.cpp b/src/Disks/IDisk.cpp
index 5ae96ca6c23..8f11d6549e9 100644
--- a/src/Disks/IDisk.cpp
+++ b/src/Disks/IDisk.cpp
@@ -27,6 +27,7 @@ void copyFile(IDisk & from_disk, const String & from_path, IDisk & to_disk, cons
     auto in = from_disk.readFile(from_path);
     auto out = to_disk.writeFile(to_path);
     copyData(*in, *out);
+    out->finalize();
 }

diff --git a/src/Disks/S3/registerDiskS3.cpp b/src/Disks/S3/registerDiskS3.cpp
index 1c7a5e24282..862fd388476 100644
--- a/src/Disks/S3/registerDiskS3.cpp
+++ b/src/Disks/S3/registerDiskS3.cpp
@@ -1,3 +1,4 @@
+#include <aws/core/client/DefaultRetryStrategy.h>
 #include
 #include
 #include
@@ -123,6 +124,9 @@ void registerDiskS3(DiskFactory & factory)
         if (proxy_config)
             cfg.perRequestConfiguration = [proxy_config](const auto & request) { return proxy_config->getConfiguration(request); };

+        cfg.retryStrategy = std::make_shared<Aws::Client::DefaultRetryStrategy>(
+            config.getUInt(config_prefix + ".retry_attempts", 10));
+
         auto client = S3::ClientFactory::instance().create(
             cfg,
             uri.is_virtual_hosted_style,
diff --git a/src/Storages/MergeTree/IMergeTreeDataPart.cpp b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
index 486e444763d..40f12428561 100644
--- a/src/Storages/MergeTree/IMergeTreeDataPart.cpp
+++ b/src/Storages/MergeTree/IMergeTreeDataPart.cpp
@@ -88,6 +88,7 @@ void IMergeTreeDataPart::MinMaxIndex::store(
         out_hashing.next();
         out_checksums.files[file_name].file_size = out_hashing.count();
         out_checksums.files[file_name].file_hash = out_hashing.getHash();
+        out->finalize();
     }
 }

diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp
index c81894ee36d..0b22f1271e3 100644
--- a/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartWriterCompact.cpp
@@ -229,6 +229,8 @@ void MergeTreeDataPartWriterCompact::finishDataSerialization(IMergeTreeDataPart:
     marks.next();
     addToChecksums(checksums);

+    plain_file->finalize();
+    marks_file->finalize();
     if (sync)
     {
         plain_file->sync();
diff --git a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp
index 8295b881d87..c6b689da33a 100644
--- a/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataPartWriterOnDisk.cpp
@@ -17,8 +17,12 @@ namespace
 void MergeTreeDataPartWriterOnDisk::Stream::finalize()
 {
     compressed.next();
-    plain_file->next();
+    /// 'compressed_buf' doesn't call next() on underlying buffer ('plain_hashing'). We should do it manually.
+    plain_hashing.next();
     marks.next();
+
+    plain_file->finalize();
+    marks_file->finalize();
 }

 void MergeTreeDataPartWriterOnDisk::Stream::sync() const
@@ -331,6 +335,7 @@ void MergeTreeDataPartWriterOnDisk::finishPrimaryIndexSerialization(
         index_stream->next();
         checksums.files["primary.idx"].file_size = index_stream->count();
         checksums.files["primary.idx"].file_hash = index_stream->getHash();
+        index_file_stream->finalize();
         if (sync)
             index_file_stream->sync();
         index_stream = nullptr;
diff --git a/src/Storages/MergeTree/MergeTreePartition.cpp b/src/Storages/MergeTree/MergeTreePartition.cpp
index 4a846f63b7c..9b02b9f1fd8 100644
--- a/src/Storages/MergeTree/MergeTreePartition.cpp
+++ b/src/Storages/MergeTree/MergeTreePartition.cpp
@@ -156,6 +156,7 @@ void MergeTreePartition::store(const Block & partition_key_sample, const DiskPtr
     out_hashing.next();
     checksums.files["partition.dat"].file_size = out_hashing.count();
     checksums.files["partition.dat"].file_hash = out_hashing.getHash();
+    out->finalize();
 }

 void MergeTreePartition::create(const StorageMetadataPtr & metadata_snapshot, Block block, size_t row)
diff --git a/src/Storages/MergeTree/MergedBlockOutputStream.cpp b/src/Storages/MergeTree/MergedBlockOutputStream.cpp
index c91ed545ac5..1b40f9ab292 100644
--- a/src/Storages/MergeTree/MergedBlockOutputStream.cpp
+++ b/src/Storages/MergeTree/MergedBlockOutputStream.cpp
@@ -148,6 +148,7 @@ void MergedBlockOutputStream::finalizePartOnDisk(
         count_out_hashing.next();
         checksums.files["count.txt"].file_size = count_out_hashing.count();
         checksums.files["count.txt"].file_hash = count_out_hashing.getHash();
+        count_out->finalize();
         if (sync)
             count_out->sync();
     }
@@ -160,6 +161,7 @@ void MergedBlockOutputStream::finalizePartOnDisk(
         new_part->ttl_infos.write(out_hashing);
         checksums.files["ttl.txt"].file_size = out_hashing.count();
         checksums.files["ttl.txt"].file_hash = out_hashing.getHash();
+        out->finalize();
         if (sync)
             out->sync();
     }
@@ -170,6 +172,7 @@ void MergedBlockOutputStream::finalizePartOnDisk(
         /// Write a file with a description of columns.
         auto out = volume->getDisk()->writeFile(part_path + "columns.txt", 4096);
         part_columns.writeText(*out);
+        out->finalize();
         if (sync)
             out->sync();
     }
@@ -178,6 +181,7 @@ void MergedBlockOutputStream::finalizePartOnDisk(
     {
         auto out = volume->getDisk()->writeFile(part_path + IMergeTreeDataPart::DEFAULT_COMPRESSION_CODEC_FILE_NAME, 4096);
         DB::writeText(queryToString(default_codec->getFullCodecDesc()), *out);
+        out->finalize();
     }
     else
     {
@@ -189,6 +193,7 @@ void MergedBlockOutputStream::finalizePartOnDisk(
         /// Write file with checksums.
         auto out = volume->getDisk()->writeFile(part_path + "checksums.txt", 4096);
         checksums.write(*out);
+        out->finalize();
         if (sync)
             out->sync();
     }
diff --git a/tests/integration/test_merge_tree_s3_failover/__init__.py b/tests/integration/test_merge_tree_s3_failover/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/integration/test_merge_tree_s3_failover/configs/config.d/log_conf.xml b/tests/integration/test_merge_tree_s3_failover/configs/config.d/log_conf.xml
new file mode 100644
index 00000000000..318a6bca95d
--- /dev/null
+++ b/tests/integration/test_merge_tree_s3_failover/configs/config.d/log_conf.xml
@@ -0,0 +1,12 @@
+<yandex>
+    <shutdown_wait_unfinished>3</shutdown_wait_unfinished>
+    <logger>
+        <level>trace</level>
+        <log>/var/log/clickhouse-server/log.log</log>
+        <errorlog>/var/log/clickhouse-server/log.err.log</errorlog>
+        <size>1000M</size>
+        <count>10</count>
+        <stderr>/var/log/clickhouse-server/stderr.log</stderr>
+        <stdout>/var/log/clickhouse-server/stdout.log</stdout>
+    </logger>
+</yandex>
diff --git a/tests/integration/test_merge_tree_s3_failover/configs/config.d/storage_conf.xml b/tests/integration/test_merge_tree_s3_failover/configs/config.d/storage_conf.xml
new file mode 100644
index 00000000000..d4d53ab5efe
--- /dev/null
+++ b/tests/integration/test_merge_tree_s3_failover/configs/config.d/storage_conf.xml
@@ -0,0 +1,26 @@
+<yandex>
+    <storage_configuration>
+        <disks>
+            <s3>
+                <type>s3</type>
+                <!-- Use custom S3 endpoint -->
+                <endpoint>http://resolver:8080/root/data/</endpoint>
+                <access_key_id>minio</access_key_id>
+                <secret_access_key>minio123</secret_access_key>
+                <!-- ClickHouse starts earlier than the custom S3 endpoint. Skip the access check to avoid failing on start-up -->
+                <skip_access_check>true</skip_access_check>
+                <!-- Avoid extra retries to speed up tests -->
+                <retry_attempts>0</retry_attempts>
+            </s3>
+        </disks>
+        <policies>
+            <s3>
+                <volumes>
+                    <main>
+                        <disk>s3</disk>
+                    </main>
+                </volumes>
+            </s3>
+        </policies>
+    </storage_configuration>
+</yandex>
diff --git a/tests/integration/test_merge_tree_s3_failover/configs/config.d/users.xml b/tests/integration/test_merge_tree_s3_failover/configs/config.d/users.xml
new file mode 100644
index 00000000000..797113053f4
--- /dev/null
+++ b/tests/integration/test_merge_tree_s3_failover/configs/config.d/users.xml
@@ -0,0 +1,5 @@
+<yandex>
+    <profiles>
+        <default/>
+    </profiles>
+</yandex>
diff --git a/tests/integration/test_merge_tree_s3_failover/configs/config.xml b/tests/integration/test_merge_tree_s3_failover/configs/config.xml
new file mode 100644
index 00000000000..24b7344df3a
--- /dev/null
+++ b/tests/integration/test_merge_tree_s3_failover/configs/config.xml
@@ -0,0 +1,20 @@
+<yandex>
+    <tcp_port>9000</tcp_port>
+    <listen_host>127.0.0.1</listen_host>
+
+    <openSSL>
+        <client>
+            <cacheSessions>true</cacheSessions>
+            <verificationMode>none</verificationMode>
+            <invalidCertificateHandler>
+                <name>AcceptCertificateHandler</name>
+            </invalidCertificateHandler>
+        </client>
+    </openSSL>
+
+    <max_concurrent_queries>500</max_concurrent_queries>
+    <mark_cache_size>5368709120</mark_cache_size>
+    <path>./clickhouse/</path>
+    <users_config>users.xml</users_config>
+
+</yandex>
diff --git a/tests/integration/test_merge_tree_s3_failover/s3_endpoint/endpoint.py b/tests/integration/test_merge_tree_s3_failover/s3_endpoint/endpoint.py
new file mode 100644
index 00000000000..e7614dd9536
--- /dev/null
+++ b/tests/integration/test_merge_tree_s3_failover/s3_endpoint/endpoint.py
@@ -0,0 +1,49 @@
+from bottle import request, route, run, response
+
+
+# The endpoint can be configured to return a 500 error on the N-th request attempt.
+# Otherwise it just redirects to the original Minio server.
+
+# Dict that stores the number of the request that should fail.
+cache = {}
+
+
+@route('/fail_request/<_request_number>')
+def fail_request(_request_number):
+    request_number = int(_request_number)
+    if request_number > 0:
+        cache['request_number'] = request_number
+    else:
+        cache.pop('request_number', None)
+    return 'OK'
+
+
+# Handler for MultipleObjectsDelete.
+@route('/<_bucket>', ['POST'])
+def delete(_bucket):
+    response.set_header("Location", "http://minio1:9001/" + _bucket + "?" + request.query_string)
+    response.status = 307
+    return 'Redirected'
+
+
+@route('/<_bucket>/<_path:path>', ['GET', 'POST', 'PUT', 'DELETE'])
+def server(_bucket, _path):
+    if cache.get('request_number', None):
+        request_number = cache.pop('request_number') - 1
+        if request_number > 0:
+            cache['request_number'] = request_number
+        else:
+            response.status = 500
+            return 'Expected Error'
+
+    response.set_header("Location", "http://minio1:9001/" + _bucket + '/' + _path)
+    response.status = 307
+    return 'Redirected'
+
+
+@route('/')
+def ping():
+    return 'OK'
+
+
+run(host='0.0.0.0', port=8080)
diff --git a/tests/integration/test_merge_tree_s3_failover/test.py b/tests/integration/test_merge_tree_s3_failover/test.py
new file mode 100644
index 00000000000..59006e2e99a
--- /dev/null
+++ b/tests/integration/test_merge_tree_s3_failover/test.py
@@ -0,0 +1,117 @@
+import logging
+import os
+import time
+
+import pytest
+
+from helpers.client import QueryRuntimeException
+from helpers.cluster import ClickHouseCluster
+
+logging.getLogger().setLevel(logging.INFO)
+logging.getLogger().addHandler(logging.StreamHandler())
+
+
+# Runs the custom python-based S3 endpoint.
+def run_endpoint(cluster):
+    logging.info("Starting custom S3 endpoint")
+    container_id = cluster.get_container_id('resolver')
+    current_dir = os.path.dirname(__file__)
+    cluster.copy_file_to_container(container_id, os.path.join(current_dir, "s3_endpoint", "endpoint.py"), "endpoint.py")
+    cluster.exec_in_container(container_id, ["python", "endpoint.py"], detach=True)
+
+    # Wait for the S3 endpoint to start.
+    for attempt in range(10):
+        ping_response = cluster.exec_in_container(cluster.get_container_id('resolver'),
+                                                  ["curl", "-s", "http://resolver:8080/"], nothrow=True)
+        if ping_response != 'OK':
+            if attempt == 9:
+                assert ping_response == 'OK', 'Expected "OK", but got "{}"'.format(ping_response)
+            else:
+                time.sleep(1)
+        else:
+            break
+
+    logging.info("S3 endpoint started")
+
+
+def fail_request(cluster, request):
+    response = cluster.exec_in_container(cluster.get_container_id('resolver'),
+                                         ["curl", "-s", "http://resolver:8080/fail_request/{}".format(request)])
+    assert response == 'OK', 'Expected "OK", but got "{}"'.format(response)
+
+
+@pytest.fixture(scope="module")
+def cluster():
+    try:
+        cluster = ClickHouseCluster(__file__)
+        cluster.add_instance("node",
+                             main_configs=["configs/config.d/log_conf.xml", "configs/config.d/storage_conf.xml"],
+                             with_minio=True)
+        logging.info("Starting cluster...")
+        cluster.start()
+        logging.info("Cluster started")
+
+        run_endpoint(cluster)
+
+        yield cluster
+    finally:
+        cluster.shutdown()
+
+
+@pytest.fixture(autouse=True)
+def drop_table(cluster):
+    yield
+    node = cluster.instances["node"]
+    node.query("DROP TABLE IF EXISTS s3_failover_test NO DELAY")
+
+
+# The S3 request for the corresponding part file write will fail.
+FILES_PER_PART_BASE = 5  # partition.dat, default_compression_codec.txt, count.txt, columns.txt, checksums.txt
+FILES_PER_PART_WIDE = FILES_PER_PART_BASE + 1 + 1 + 3 * 2  # Primary index, MinMax, Mark and data file for column(s)
+FILES_PER_PART_COMPACT = FILES_PER_PART_BASE + 1 + 1 + 2
+
+
+@pytest.mark.parametrize(
+    "min_bytes_for_wide_part,request_count",
+    [
+        (0, FILES_PER_PART_WIDE),
+        (1024 * 1024, FILES_PER_PART_COMPACT)
+    ]
+)
+def test_write_failover(cluster, min_bytes_for_wide_part, request_count):
+    node = cluster.instances["node"]
+
+    node.query(
+        """
+        CREATE TABLE s3_failover_test (
+            dt Date,
+            id Int64,
+            data String
+        ) ENGINE=MergeTree()
+        ORDER BY id
+        PARTITION BY dt
+        SETTINGS storage_policy='s3', min_bytes_for_wide_part={}
+        """
+        .format(min_bytes_for_wide_part)
+    )
+
+    for request in range(request_count + 1):
+        # Fail the N-th request to S3.
+        fail_request(cluster, request + 1)
+
+        data = "('2020-03-01',0,'data'),('2020-03-01',1,'data')"
+        positive = request == request_count
+        try:
+            node.query("INSERT INTO s3_failover_test VALUES {}".format(data))
+
+            assert positive, "Insert query should be failed, request {}".format(request)
+        except QueryRuntimeException as e:
+            assert not positive, "Insert query shouldn't be failed, request {}".format(request)
+            assert str(e).find("Expected Error") != -1, "Unexpected error {}".format(str(e))
+
+        if positive:
+            # Disable request failing.
+            fail_request(cluster, 0)
+
+    assert node.query("CHECK TABLE s3_failover_test") == '1\n'
+    assert node.query("SELECT * FROM s3_failover_test FORMAT Values") == data
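[Note on the patch above: the fix follows a common pattern for remote writes. A destructor must not throw, so a write buffer needs an explicit finalize() call whose errors can still propagate to the caller. The class below is a hedged, self-contained sketch of that pattern, not the actual ClickHouse WriteBuffer; flushToRemote() is a hypothetical stand-in for the S3 upload.]

```cpp
#include <stdexcept>
#include <string>

// Illustrative sketch of the "finalize before destruction" pattern.
class RemoteWriteBuffer
{
public:
    void write(const std::string & data) { pending += data; }

    // Explicit completion point: a failed flush (e.g. an S3 5xx) throws here,
    // where the caller can handle it and abort the insert cleanly.
    void finalize()
    {
        if (finalized)
            return;
        flushToRemote(pending); // may throw
        finalized = true;
    }

    ~RemoteWriteBuffer()
    {
        // Destructors must not throw; without the explicit finalize() above,
        // a failure here would have to be swallowed and the error lost.
        try { finalize(); } catch (...) { /* log only */ }
    }

private:
    static void flushToRemote(const std::string &) { /* network I/O */ }
    std::string pending;
    bool finalized = false;
};
```

This is why the patch adds `out->finalize()` after every metadata file write: the error surfaces inside the insert, instead of silently producing a broken part.
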
From 3564ba1c632f182ddfacd996dbbfef2d0db2ed45 Mon Sep 17 00:00:00 2001
From: Nikolai Kochetov
Date: Tue, 6 Oct 2020 13:02:07 +0300
Subject: [PATCH 18/72] Remove moves.

---
 src/Processors/QueryPlan/ReadFromStorageStep.cpp | 11 +++--------
 1 file changed, 3 insertions(+), 8 deletions(-)

diff --git a/src/Processors/QueryPlan/ReadFromStorageStep.cpp b/src/Processors/QueryPlan/ReadFromStorageStep.cpp
index 474410148fa..fd38dd9218b 100644
--- a/src/Processors/QueryPlan/ReadFromStorageStep.cpp
+++ b/src/Processors/QueryPlan/ReadFromStorageStep.cpp
@@ -12,15 +12,15 @@ namespace DB
 {

 ReadFromStorageStep::ReadFromStorageStep(
-    TableLockHolder table_lock_,
+    TableLockHolder table_lock,
     StorageMetadataPtr metadata_snapshot,
     StreamLocalLimits & limits,
     SizeLimits & leaf_limits,
     std::shared_ptr<const EnabledQuota> quota,
-    StoragePtr storage_,
+    StoragePtr storage,
     const Names & required_columns,
     const SelectQueryInfo & query_info,
-    std::shared_ptr<Context> context_,
+    std::shared_ptr<Context> context,
     QueryProcessingStage::Enum processing_stage,
     size_t max_block_size,
     size_t max_streams)
@@ -28,11 +28,6 @@ ReadFromStorageStep::ReadFromStorageStep(
     /// Note: we read from storage in constructor of step because we don't know real header before reading.
     /// It will be fixed when storage return QueryPlanStep itself.

-    /// Move arguments into stack in order to ensure order of destruction in case of exception.
-    auto context = std::move(context_);
-    auto storage = std::move(storage_);
-    auto table_lock = std::move(table_lock_);
-
     Pipe pipe = storage->read(required_columns, metadata_snapshot, query_info, *context, processing_stage, max_block_size, max_streams);

     if (pipe.empty())

From c008555bc526e83b44f935a180da51d0ff5311cb Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 6 Oct 2020 14:00:52 +0300
Subject: [PATCH 19/72] Trying to fix race in AMQP-CPP

---
 .gitmodules      | 2 +-
 contrib/AMQP-CPP | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.gitmodules b/.gitmodules
index 13ed2614f4d..ace36122e6e 100644
--- a/.gitmodules
+++ b/.gitmodules
@@ -158,7 +158,7 @@
 	url = https://github.com/openldap/openldap.git
 [submodule "contrib/AMQP-CPP"]
 	path = contrib/AMQP-CPP
-	url = https://github.com/CopernicaMarketingSoftware/AMQP-CPP.git
+	url = https://github.com/ClickHouse-Extras/AMQP-CPP.git
 [submodule "contrib/cassandra"]
 	path = contrib/cassandra
 	url = https://github.com/ClickHouse-Extras/cpp-driver.git
diff --git a/contrib/AMQP-CPP b/contrib/AMQP-CPP
index 1c08399ab0a..d63e1f01658 160000
--- a/contrib/AMQP-CPP
+++ b/contrib/AMQP-CPP
@@ -1 +1 @@
-Subproject commit 1c08399ab0ab9e4042ef8e2bbe9e208e5dcbc13b
+Subproject commit d63e1f016582e9faaaf279aa24513087a07bc6e7

From dd939ce62328aabfac6398f36d274b157fa8ef75 Mon Sep 17 00:00:00 2001
From: olgarev <56617294+olgarev@users.noreply.github.com>
Date: Tue, 6 Oct 2020 14:17:19 +0300
Subject: [PATCH 20/72] DOCSUP-1674: Docs for the extractAllGroupsHorizontal
 and extractAllGroupsVertical functions (English) (#14317)

* Docs for the extractAllGroupsHorizontal and extractAllGroupsVertical functions (English).

* Minor fixes (en).

* Misspelling fixed.

* English docs corrected and translated into Russian.

* English misspelling corrected.

Co-authored-by: Olga Revyakina
Co-authored-by: Olga Revyakina
---
 .../functions/string-search-functions.md      | 83 +++++++++++++++++++
 .../functions/string-search-functions.md      | 83 +++++++++++++++++++
 2 files changed, 166 insertions(+)

diff --git a/docs/en/sql-reference/functions/string-search-functions.md b/docs/en/sql-reference/functions/string-search-functions.md
index a625af14505..5f08417f349 100644
--- a/docs/en/sql-reference/functions/string-search-functions.md
+++ b/docs/en/sql-reference/functions/string-search-functions.md
@@ -360,6 +360,89 @@ Extracts a fragment of a string using a regular expression. If ‘haystack’ do

 Extracts all the fragments of a string using a regular expression. If ‘haystack’ doesn’t match the ‘pattern’ regex, an empty string is returned. Returns an array of strings consisting of all matches to the regex. In general, the behavior is the same as the ‘extract’ function (it takes the first subpattern, or the entire expression if there isn’t a subpattern).

+## extractAllGroupsHorizontal {#extractallgroups-horizontal}
+
+Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where the first array includes all fragments matching the first group, the second array - matching the second group, etc.
+
+!!! note "Note"
+    `extractAllGroupsHorizontal` function is slower than [extractAllGroupsVertical](#extractallgroups-vertical).
+
+**Syntax**
+
+``` sql
+extractAllGroupsHorizontal(haystack, pattern)
+```
+
+**Parameters**
+
+- `haystack` — Input string. Type: [String](../../sql-reference/data-types/string.md).
+- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. Type: [String](../../sql-reference/data-types/string.md).
+
+**Returned value**
+
+- Type: [Array](../../sql-reference/data-types/array.md).
+
+If `haystack` doesn’t match the `pattern` regex, an array of empty arrays is returned.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
+```
+
+Result:
+
+``` text
+┌─extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')─┐
+│ [['abc','def','ghi'],['111','222','333']]                                                │
+└──────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**See also**
+- [extractAllGroupsVertical](#extractallgroups-vertical)
+
+## extractAllGroupsVertical {#extractallgroups-vertical}
+
+Matches all groups of the `haystack` string using the `pattern` regular expression. Returns an array of arrays, where each array includes matching fragments from every group. Fragments are grouped in order of appearance in the `haystack`.
+
+**Syntax**
+
+``` sql
+extractAllGroupsVertical(haystack, pattern)
+```
+
+**Parameters**
+
+- `haystack` — Input string. Type: [String](../../sql-reference/data-types/string.md).
+- `pattern` — Regular expression with [re2 syntax](https://github.com/google/re2/wiki/Syntax). Must contain groups, each group enclosed in parentheses. If `pattern` contains no groups, an exception is thrown. Type: [String](../../sql-reference/data-types/string.md).
+
+**Returned value**
+
+- Type: [Array](../../sql-reference/data-types/array.md).
+
+If `haystack` doesn’t match the `pattern` regex, an empty array is returned.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
+```
+
+Result:
+
+``` text
+┌─extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')─┐
+│ [['abc','111'],['def','222'],['ghi','333']]                                            │
+└────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**See also**
+- [extractAllGroupsHorizontal](#extractallgroups-horizontal)
+
 ## like(haystack, pattern), haystack LIKE pattern operator {#function-like}

 Checks whether a string matches a simple regular expression.
diff --git a/docs/ru/sql-reference/functions/string-search-functions.md b/docs/ru/sql-reference/functions/string-search-functions.md
index de713031046..29dd67fd0eb 100644
--- a/docs/ru/sql-reference/functions/string-search-functions.md
+++ b/docs/ru/sql-reference/functions/string-search-functions.md
@@ -341,6 +341,89 @@ Result:

 Извлечение всех фрагментов строки по регулярному выражению. Если haystack не соответствует регулярному выражению pattern, то возвращается пустая строка. Возвращается массив строк, состоящий из всех соответствий регулярному выражению. В остальном, поведение аналогично функции extract (по прежнему, вынимается первый subpattern, или всё выражение, если subpattern-а нет).

+## extractAllGroupsHorizontal {#extractallgroups-horizontal}
+
+Разбирает строку `haystack` на фрагменты, соответствующие группам регулярного выражения `pattern`. Возвращает массив массивов, где первый массив содержит все фрагменты, соответствующие первой группе регулярного выражения, второй массив - соответствующие второй группе, и т.д.
+
+!!! note "Замечание"
+    Функция `extractAllGroupsHorizontal` работает медленнее, чем функция [extractAllGroupsVertical](#extractallgroups-vertical).
+
+**Синтаксис**
+
+``` sql
+extractAllGroupsHorizontal(haystack, pattern)
+```
+
+**Параметры**
+
+- `haystack` — строка для разбора. Тип: [String](../../sql-reference/data-types/string.md).
+- `pattern` — регулярное выражение, построенное по синтаксическим правилам [re2](https://github.com/google/re2/wiki/Syntax). Выражение должно содержать группы, заключенные в круглые скобки. Если выражение не содержит групп, генерируется исключение. Тип: [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемое значение**
+
+- Тип: [Array](../../sql-reference/data-types/array.md).
+
+Если в строке `haystack` нет групп, соответствующих регулярному выражению `pattern`, возвращается массив пустых массивов.
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
+```
+
+Результат:
+
+``` text
+┌─extractAllGroupsHorizontal('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')─┐
+│ [['abc','def','ghi'],['111','222','333']]                                                │
+└──────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**См. также**
+- функция [extractAllGroupsVertical](#extractallgroups-vertical)
+
+## extractAllGroupsVertical {#extractallgroups-vertical}
+
+Разбирает строку `haystack` на фрагменты, соответствующие группам регулярного выражения `pattern`. Возвращает массив массивов, где каждый массив содержит по одному фрагменту, соответствующему каждой группе регулярного выражения. Фрагменты группируются в массивы в соответствии с порядком появления в исходной строке.
+
+**Синтаксис**
+
+``` sql
+extractAllGroupsVertical(haystack, pattern)
+```
+
+**Параметры**
+
+- `haystack` — строка для разбора. Тип: [String](../../sql-reference/data-types/string.md).
+- `pattern` — регулярное выражение, построенное по синтаксическим правилам [re2](https://github.com/google/re2/wiki/Syntax). Выражение должно содержать группы, заключенные в круглые скобки. Если выражение не содержит групп, генерируется исключение. Тип: [String](../../sql-reference/data-types/string.md).
+
+**Возвращаемое значение**
+
+- Тип: [Array](../../sql-reference/data-types/array.md).
+
+Если в строке `haystack` нет групп, соответствующих регулярному выражению `pattern`, возвращается пустой массив.
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')
+```
+
+Результат:
+
+``` text
+┌─extractAllGroupsVertical('abc=111, def=222, ghi=333', '("[^"]+"|\\w+)=("[^"]+"|\\w+)')─┐
+│ [['abc','111'],['def','222'],['ghi','333']]                                            │
+└────────────────────────────────────────────────────────────────────────────────────────┘
+```
+
+**См. также**
+- функция [extractAllGroupsHorizontal](#extractallgroups-horizontal)
+
 ## like(haystack, pattern), оператор haystack LIKE pattern {#function-like}

 Проверка строки на соответствие простому регулярному выражению.

From 9594e463b4de8745c084de65e3c01e4623c98492 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 6 Oct 2020 14:24:55 +0300
Subject: [PATCH 21/72] Cancel other processors on query exception (#15578)

* Add a test for canceling distributed queries on error

* Cancel other processors on exception in PipelineExecutor

Fixes: 01514_distributed_cancel_query_on_error
---
 src/Processors/Executors/PipelineExecutor.cpp |  2 +-
 ...istributed_cancel_query_on_error.reference |  0
 ...01514_distributed_cancel_query_on_error.sh | 21 +++++++++++++++
 3 files changed, 22 insertions(+), 1 deletion(-)
 create mode 100644 tests/queries/0_stateless/01514_distributed_cancel_query_on_error.reference
 create mode 100755 tests/queries/0_stateless/01514_distributed_cancel_query_on_error.sh

diff --git a/src/Processors/Executors/PipelineExecutor.cpp b/src/Processors/Executors/PipelineExecutor.cpp
index 858f53cb047..271903add86 100644
--- a/src/Processors/Executors/PipelineExecutor.cpp
+++ b/src/Processors/Executors/PipelineExecutor.cpp
@@ -566,7 +566,7 @@ void PipelineExecutor::executeStepImpl(size_t thread_num, size_t num_threads, st
             }

             if (node->exception)
-                finish();
+                cancel();

             if (finished)
                 break;
diff --git a/tests/queries/0_stateless/01514_distributed_cancel_query_on_error.reference b/tests/queries/0_stateless/01514_distributed_cancel_query_on_error.reference
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/tests/queries/0_stateless/01514_distributed_cancel_query_on_error.sh b/tests/queries/0_stateless/01514_distributed_cancel_query_on_error.sh
new file mode 100755
index 00000000000..a24bb00fdf3
--- /dev/null
+++ b/tests/queries/0_stateless/01514_distributed_cancel_query_on_error.sh
@@ -0,0 +1,21 @@
+#!/usr/bin/env bash
+
+CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
+. "$CURDIR"/../shell_config.sh
+
+# _shard_num:
+# 1 on 127.2
+# 2 on 127.3
+# max_block_size to fail faster
+# max_memory_usage/_shard_num/repeat() will allow failure on the first shard earlier.
+opts=(
+    --max_memory_usage=3G
+    --max_block_size=50
+    --max_threads=1
+    --max_distributed_connections=2
+)
+${CLICKHOUSE_CLIENT} "${opts[@]}" -q "SELECT groupArray(repeat('a', 1000*_shard_num)), number%100000 k from remote('127.{2,3}', system.numbers) GROUP BY k LIMIT 10e6" |& {
+    # the query should fail earlier on 127.3, and 127.2 should not even reach the memory limit exceeded error.
+    fgrep -q 'DB::Exception: Received from 127.3:9000. DB::Exception: Memory limit (for query) exceeded:'
+    # otherwise it will get the exception from 127.2:9000 and fail
+}
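[Note on the patch above: the one-line change from finish() to cancel() implements a general pattern: when one worker fails, its peers should be actively cancelled rather than allowed to drain their remaining work. A minimal, self-contained sketch of that pattern follows; it is illustrative only and is not the real PipelineExecutor.]

```cpp
#include <atomic>
#include <thread>
#include <vector>

// Shared cancellation flag, checked cooperatively by every worker.
std::atomic<bool> cancelled{false};

void worker(int id)
{
    for (int step = 0; step < 1'000'000; ++step)
    {
        if (cancelled.load(std::memory_order_relaxed))
            return; // peers stop promptly instead of finishing their work

        if (id == 0 && step == 10) // simulate an exception on one processor
        {
            cancelled.store(true, std::memory_order_relaxed); // cancel(), not finish()
            return;
        }
    }
}

int main()
{
    std::vector<std::thread> pool;
    for (int i = 0; i < 4; ++i)
        pool.emplace_back(worker, i);
    for (auto & t : pool)
        t.join();
}
```

In the distributed test this is exactly the observable difference: once the query fails on one replica, the initiator propagates that error instead of letting the other replica run into its own memory limit.
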
From 585ca870014b3596749ab1c906df5a85709b86b3 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 6 Oct 2020 14:26:27 +0300
Subject: [PATCH 22/72] Fix cmake

---
 contrib/amqpcpp-cmake/CMakeLists.txt | 1 +
 1 file changed, 1 insertion(+)

diff --git a/contrib/amqpcpp-cmake/CMakeLists.txt b/contrib/amqpcpp-cmake/CMakeLists.txt
index ca79b84c523..4853983680e 100644
--- a/contrib/amqpcpp-cmake/CMakeLists.txt
+++ b/contrib/amqpcpp-cmake/CMakeLists.txt
@@ -16,6 +16,7 @@ set (SRCS
     ${LIBRARY_DIR}/src/flags.cpp
     ${LIBRARY_DIR}/src/linux_tcp/openssl.cpp
     ${LIBRARY_DIR}/src/linux_tcp/tcpconnection.cpp
+    ${LIBRARY_DIR}/src/inbuffer.cpp
     ${LIBRARY_DIR}/src/receivedframe.cpp
     ${LIBRARY_DIR}/src/table.cpp
     ${LIBRARY_DIR}/src/watchable.cpp

From 00f29e400fa85b87296992211e91cb4349f0e194 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 6 Oct 2020 16:00:42 +0300
Subject: [PATCH 23/72] Fix race condition in hdfs

---
 contrib/libhdfs3              |  2 +-
 src/IO/ReadBufferFromHDFS.cpp | 13 +++++++++++--
 2 files changed, 12 insertions(+), 3 deletions(-)

diff --git a/contrib/libhdfs3 b/contrib/libhdfs3
index 1b666578c85..7d370205486 160000
--- a/contrib/libhdfs3
+++ b/contrib/libhdfs3
@@ -1 +1 @@
-Subproject commit 1b666578c85094306b061352078022f6350bfab8
+Subproject commit 7d370205486ff8c2216cb477c3021d49781f1122
diff --git a/src/IO/ReadBufferFromHDFS.cpp b/src/IO/ReadBufferFromHDFS.cpp
index d6dfd12bd82..0b9f75d8c46 100644
--- a/src/IO/ReadBufferFromHDFS.cpp
+++ b/src/IO/ReadBufferFromHDFS.cpp
@@ -3,6 +3,7 @@
 #if USE_HDFS
 #include
 #include
+#include <mutex>


 namespace DB
@@ -17,6 +18,9 @@ ReadBufferFromHDFS::~ReadBufferFromHDFS() = default;

 struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl
 {
+    /// HDFS create/open functions are not thread safe
+    static std::mutex hdfs_init_mutex;
+
     std::string hdfs_uri;
     hdfsFile fin;
     HDFSBuilderPtr builder;
@@ -24,9 +28,11 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl

     explicit ReadBufferFromHDFSImpl(const std::string & hdfs_name_)
         : hdfs_uri(hdfs_name_)
-        , builder(createHDFSBuilder(hdfs_uri))
-        , fs(createHDFSFS(builder.get()))
     {
+        std::lock_guard lock(hdfs_init_mutex);
+
+        builder = createHDFSBuilder(hdfs_uri);
+        fs = createHDFSFS(builder.get());
         const size_t begin_of_path = hdfs_uri.find('/', hdfs_uri.find("//") + 2);
         const std::string path = hdfs_uri.substr(begin_of_path);
         fin = hdfsOpenFile(fs.get(), path.c_str(), O_RDONLY, 0, 0, 0);
@@ -47,10 +53,13 @@ struct ReadBufferFromHDFS::ReadBufferFromHDFSImpl

     ~ReadBufferFromHDFSImpl()
     {
+        std::lock_guard lock(hdfs_init_mutex);
         hdfsCloseFile(fs.get(), fin);
     }
 };

+std::mutex ReadBufferFromHDFS::ReadBufferFromHDFSImpl::hdfs_init_mutex;
+
 ReadBufferFromHDFS::ReadBufferFromHDFS(const std::string & hdfs_name_, size_t buf_size)
     : BufferWithOwnMemory<ReadBuffer>(buf_size)
     , impl(std::make_unique<ReadBufferFromHDFSImpl>(hdfs_name_))
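[Note on the patch above: serializing the init/open/close entry points of a non-thread-safe C library behind one static mutex is a reusable pattern. Below is a hedged, self-contained C++ sketch of it; Handle, unsafe_open and unsafe_close are hypothetical stand-ins for the hdfs* calls, not a real API.]

```cpp
#include <mutex>

struct Handle { /* opaque library state */ };

// Stand-ins for library entry points that are not thread safe.
Handle * unsafe_open(const char *) { return new Handle{}; }
void unsafe_close(Handle * h) { delete h; }

class SafeFile
{
    // One mutex shared by every instance: concurrent constructors and
    // destructors of different files are serialized against each other.
    static std::mutex init_mutex;
    Handle * h = nullptr;

public:
    explicit SafeFile(const char * path)
    {
        std::lock_guard<std::mutex> lock(init_mutex);
        h = unsafe_open(path);
    }

    ~SafeFile()
    {
        std::lock_guard<std::mutex> lock(init_mutex);
        unsafe_close(h);
    }
};

std::mutex SafeFile::init_mutex;

int main()
{
    SafeFile f("/some/path"); // open and close both take the guard
}
```

Reads themselves stay unguarded, so only the racy setup/teardown paths pay for the lock.
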
From db03a894e584ec39b97d6cadcfbcdd5b79e4b4a9 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Tue, 6 Oct 2020 16:29:50 +0300
Subject: [PATCH 24/72] Change test to run flaky check

---
 tests/integration/test_storage_hdfs/test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tests/integration/test_storage_hdfs/test.py b/tests/integration/test_storage_hdfs/test.py
index 996df28ac81..59944a3725e 100644
--- a/tests/integration/test_storage_hdfs/test.py
+++ b/tests/integration/test_storage_hdfs/test.py
@@ -6,7 +6,7 @@ from helpers.hdfs_api import HDFSApi
 SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__))

 cluster = ClickHouseCluster(__file__)
-node1 = cluster.add_instance('node1', with_hdfs=True, user_configs=[], main_configs=['configs/log_conf.xml'])
+node1 = cluster.add_instance('node1', with_hdfs=True, main_configs=['configs/log_conf.xml'])


 @pytest.fixture(scope="module")

From e54ff6e60ae520d4e58b97277eb70d4d0dd9510e Mon Sep 17 00:00:00 2001
From: Alexander Kazakov
Date: Tue, 6 Oct 2020 16:45:17 +0300
Subject: [PATCH 25/72] Lock-less totalRows/totalBytes + more clear postponed
 init

---
 src/Storages/StorageMemory.cpp | 135 +++++++++++++++++----------------
 src/Storages/StorageMemory.h   |   5 ++
 2 files changed, 73 insertions(+), 67 deletions(-)

diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp
index 25e232dc4ad..1b722f5fa73 100644
--- a/src/Storages/StorageMemory.cpp
+++ b/src/Storages/StorageMemory.cpp
@@ -21,81 +21,64 @@ namespace ErrorCodes

 class MemorySource : public SourceWithProgress
 {
+    using InitializerFunc = std::function<void(size_t &, size_t &)>;
 public:
-    /// We use range [first, last] which includes right border.
     /// Blocks are stored in std::list which may be appended in another thread.
-    /// We don't use synchronisation here, because elements in range [first, last] won't be modified.
+    /// We use pointer to the beginning of the list and its current size.
+    /// We don't need synchronisation in this reader, because while we hold SharedLock on storage,
+    /// only new elements can be added to the back of the list, so our iterators remain valid
+
     MemorySource(
-        Names column_names_,
-        BlocksList::iterator first_,
+        const Names & column_names_,
+        BlocksList::const_iterator first_,
         size_t num_blocks_,
         const StorageMemory & storage,
-        const StorageMetadataPtr & metadata_snapshot)
+        const StorageMetadataPtr & metadata_snapshot,
+        InitializerFunc initializer_func_ = [](size_t &, size_t &) {})
         : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID()))
-        , column_names(std::move(column_names_))
+        , column_names(column_names_)
        , current_it(first_)
         , num_blocks(num_blocks_)
+        , initializer_func(std::move(initializer_func_))
     {
     }

-    /// If called, will initialize the number of blocks at first read.
-    /// It allows to read data which was inserted into memory table AFTER Storage::read was called.
-    /// This hack is needed for global subqueries.
-    void delayInitialization(BlocksList * data_, std::mutex * mutex_)
-    {
-        data = data_;
-        mutex = mutex_;
-    }
-
     String getName() const override { return "Memory"; }

 protected:
     Chunk generate() override
     {
-        if (data)
+        if (!postponed_init_done)
         {
-            std::lock_guard guard(*mutex);
-            current_it = data->begin();
-            num_blocks = data->size();
-            is_finished = num_blocks == 0;
-
-            data = nullptr;
-            mutex = nullptr;
+            initializer_func(current_it, num_blocks);
+            postponed_init_done = true;
         }

-        if (is_finished)
-        {
+        if (current_block_idx == num_blocks)
             return {};
-        }
-        else
-        {
-            const Block & src = *current_it;
-            Columns columns;
-            columns.reserve(column_names.size());

-            /// Add only required columns to `res`.
-            for (const auto & name : column_names)
-                columns.emplace_back(src.getByName(name).column);
+        const Block & src = *current_it;
+        Columns columns;
+        columns.reserve(column_names.size());

-            ++current_block_idx;
+        /// Add only required columns to `res`.
+        for (const auto & name : column_names)
+            columns.push_back(src.getByName(name).column);

-            if (current_block_idx == num_blocks)
-                is_finished = true;
-            else
-                ++current_it;
+        if (++current_block_idx < num_blocks)
+            ++current_it;

-            return Chunk(std::move(columns), src.rows());
-        }
+        return Chunk(std::move(columns), src.rows());
     }

-private:
-    Names column_names;
-    BlocksList::iterator current_it;
-    size_t current_block_idx = 0;
-    size_t num_blocks;
-    bool is_finished = false;

-    BlocksList * data = nullptr;
-    std::mutex * mutex = nullptr;
+private:
+    const Names column_names;
+    BlocksList::const_iterator current_it;
+    size_t num_blocks;
+    size_t current_block_idx = 0;
+
+    bool postponed_init_done = false;
+    InitializerFunc initializer_func;
 };

@@ -113,9 +96,18 @@ public:

     void write(const Block & block) override
     {
+        const auto size_bytes_diff = block.allocatedBytes();
+        const auto size_rows_diff = block.rows();
+
         metadata_snapshot->check(block, true);
-        std::lock_guard lock(storage.mutex);
-        storage.data.push_back(block);
+        {
+            std::lock_guard lock(storage.mutex);
+            storage.data.push_back(block);
+
+            storage.total_size_bytes.fetch_add(size_bytes_diff, std::memory_order_relaxed);
+            storage.total_size_rows.fetch_add(size_rows_diff, std::memory_order_relaxed);
+        }
+
     }
 private:
     StorageMemory & storage;
@@ -144,8 +136,6 @@ Pipe StorageMemory::read(
 {
     metadata_snapshot->check(column_names, getVirtuals(), getStorageID());

-    std::lock_guard lock(mutex);
-
     if (delay_read_for_global_subqueries)
     {
         /// Note: for global subquery we use single source.
@@ -156,11 +146,21 @@ Pipe StorageMemory::read(
         /// set for IN or hash table for JOIN, which can't be done concurrently.
         /// Since no other manipulation with data is done, multiple sources shouldn't give any profit.

-        auto source = std::make_shared<MemorySource>(column_names, data.begin(), data.size(), *this, metadata_snapshot);
-        source->delayInitialization(&data, &mutex);
-        return Pipe(std::move(source));
+        return {std::make_shared<MemorySource>(
+            column_names, data.end(), 0, *this, metadata_snapshot,
+            /// This hack is needed for global subqueries.
+            /// It allows to set up this Source for read AFTER Storage::read() has been called and just before actual reading
+            [this](size_t & current_it, size_t & num_blocks)
+            {
+                std::lock_guard guard(mutex);
+                current_it = data.begin();
+                num_blocks = data.size();
+            }
+        )};
     }

+    std::lock_guard lock(mutex);
+
     size_t size = data.size();

     if (num_streams > size)
@@ -178,7 +178,7 @@ Pipe StorageMemory::read(

         assert(num_blocks > 0);

-        pipes.emplace_back(std::make_shared<MemorySource>(column_names, it, num_blocks, *this, metadata_snapshot));
+        pipes.push_back(std::make_shared<MemorySource>(column_names, it, num_blocks, *this, metadata_snapshot));

         while (offset < next_offset)
         {
@@ -201,31 +201,32 @@ void StorageMemory::drop()
 {
     std::lock_guard lock(mutex);
     data.clear();
+    total_size_bytes.store(0, std::memory_order_relaxed);
+    total_size_rows.store(0, std::memory_order_relaxed);
 }

+
 void StorageMemory::truncate(
     const ASTPtr &, const StorageMetadataPtr &, const Context &, TableExclusiveLockHolder &)
 {
     std::lock_guard lock(mutex);
     data.clear();
+    total_size_bytes.store(0, std::memory_order_relaxed);
+    total_size_rows.store(0, std::memory_order_relaxed);
 }

+
 std::optional<UInt64> StorageMemory::totalRows() const
 {
-    UInt64 rows = 0;
-    std::lock_guard lock(mutex);
-    for (const auto & buffer : data)
-        rows += buffer.rows();
-    return rows;
+    /// All modifications of these counters are done under mutex which automatically guarantees synchronization/consistency
+    /// When run concurrently we are fine with any value: "before" or "after"
+    return total_size_rows.load(std::memory_order_relaxed);
 }

+
 std::optional<UInt64> StorageMemory::totalBytes() const
 {
-    UInt64 bytes = 0;
-    std::lock_guard lock(mutex);
-    for (const auto & buffer : data)
-        bytes += buffer.allocatedBytes();
-    return bytes;
+    return total_size_bytes.load(std::memory_order_relaxed);
 }

diff --git a/src/Storages/StorageMemory.h b/src/Storages/StorageMemory.h
index e67e3015028..6b525cd6dbb 100644
--- a/src/Storages/StorageMemory.h
+++ b/src/Storages/StorageMemory.h
@@ -1,5 +1,7 @@
 #pragma once

+#include <atomic>
+#include <mutex>
 #include
 #include

@@ -93,6 +95,9 @@ private:

     bool delay_read_for_global_subqueries = false;

+    std::atomic<UInt64> total_size_bytes = 0;
+    std::atomic<UInt64> total_size_rows = 0;
+
 protected:
     StorageMemory(const StorageID & table_id_, ColumnsDescription columns_description_, ConstraintsDescription constraints_);
 };

From ae2c106f94ad9a666879652f16dd746b7e5f703d Mon Sep 17 00:00:00 2001
From: Alexander Kazakov
Date: Tue, 6 Oct 2020 17:04:08 +0300
Subject: [PATCH 26/72] Minor fixes

---
 src/Storages/StorageMemory.cpp | 15 ++++++++-------
 1 file changed, 8 insertions(+), 7 deletions(-)

diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp
index 1b722f5fa73..111e121162b 100644
--- a/src/Storages/StorageMemory.cpp
+++ b/src/Storages/StorageMemory.cpp
@@ -21,7 +21,7 @@ namespace ErrorCodes

 class MemorySource : public SourceWithProgress
 {
-    using InitializerFunc = std::function<void(size_t &, size_t &)>;
+    using InitializerFunc = std::function<void(BlocksList::const_iterator &, size_t &)>;
 public:
     /// Blocks are stored in std::list which may be appended in another thread.
     /// We use pointer to the beginning of the list and its current size.
@@ -34,7 +34,7 @@ public:
         size_t num_blocks_,
         const StorageMemory & storage,
         const StorageMetadataPtr & metadata_snapshot,
-        InitializerFunc initializer_func_ = [](size_t &, size_t &) {})
+        InitializerFunc initializer_func_ = [](BlocksList::const_iterator &, size_t &) {})
         : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID()))
         , column_names(column_names_)
         , current_it(first_)
@@ -146,17 +146,18 @@ Pipe StorageMemory::read(
         /// set for IN or hash table for JOIN, which can't be done concurrently.
         /// Since no other manipulation with data is done, multiple sources shouldn't give any profit.

-        return {std::make_shared<MemorySource>(
+        return Pipe(
+            std::make_shared<MemorySource>(
             column_names, data.end(), 0, *this, metadata_snapshot,
             /// This hack is needed for global subqueries.
            /// It allows to set up this Source for read AFTER Storage::read() has been called and just before actual reading
-            [this](size_t & current_it, size_t & num_blocks)
+            [this](BlocksList::const_iterator & current_it, size_t & num_blocks)
             {
                 std::lock_guard guard(mutex);
                 current_it = data.begin();
                 num_blocks = data.size();
             }
-        )};
+        ));
     }

     std::lock_guard lock(mutex);
@@ -168,7 +169,7 @@ Pipe StorageMemory::read(

     Pipes pipes;

-    BlocksList::iterator it = data.begin();
+    BlocksList::const_iterator it = data.begin();

     size_t offset = 0;
     for (size_t stream = 0; stream < num_streams; ++stream)
@@ -178,7 +179,7 @@ Pipe StorageMemory::read(

         assert(num_blocks > 0);

-        pipes.push_back(std::make_shared<MemorySource>(column_names, it, num_blocks, *this, metadata_snapshot));
+        pipes.emplace_back(std::make_shared<MemorySource>(column_names, it, num_blocks, *this, metadata_snapshot));

         while (offset < next_offset)
         {
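[Note on the two patches above: the counter scheme they introduce is worth spelling out. Writers update the totals under the same mutex that guards the container, while readers only perform a relaxed atomic load, so totalRows()/totalBytes() never block an insert. A minimal, self-contained sketch, with simplified types rather than the real StorageMemory:]

```cpp
#include <atomic>
#include <cstdint>
#include <list>
#include <mutex>

class Counted
{
    std::mutex mutex;
    std::list<std::uint64_t> data; // stand-in for the list of blocks
    std::atomic<std::uint64_t> total_rows{0};

public:
    void add(std::uint64_t rows)
    {
        std::lock_guard<std::mutex> lock(mutex);
        data.push_back(rows);
        // Updated under the mutex, so the counter and the container stay consistent.
        total_rows.fetch_add(rows, std::memory_order_relaxed);
    }

    // Lock-free: a concurrent reader sees either the "before" or the "after"
    // value of an in-flight insert, and both are acceptable answers.
    std::uint64_t totalRows() const { return total_rows.load(std::memory_order_relaxed); }
};
```

Relaxed ordering is enough here because the counter is a standalone statistic; no other memory is expected to be synchronized by reading it.
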
b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql index 667831a6797..178f9b81ead 100644 --- a/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql +++ b/tests/queries/0_stateless/01149_zookeeper_mutation_stuck_after_replace_partition.sql @@ -12,6 +12,7 @@ select * from mt; select table, partition_id, name, rows from system.parts where database=currentDatabase() and table in ('mt', 'rmt') and active=1 order by table, name; alter table rmt update s = 's'||toString(n) where 1; + select * from rmt; alter table rmt replace partition '0' from mt; From cabab90ab201adc68d282f920266d00ef860c62c Mon Sep 17 00:00:00 2001 From: Alexander Kazakov Date: Tue, 6 Oct 2020 17:54:22 +0300 Subject: [PATCH 29/72] Get back to moving args --- src/Storages/StorageMemory.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Storages/StorageMemory.cpp b/src/Storages/StorageMemory.cpp index 111e121162b..f16df6983ed 100644 --- a/src/Storages/StorageMemory.cpp +++ b/src/Storages/StorageMemory.cpp @@ -29,14 +29,14 @@ public: /// only new elements can be added to the back of the list, so our iterators remain valid MemorySource( - const Names & column_names_, + Names column_names_, BlocksList::const_iterator first_, size_t num_blocks_, const StorageMemory & storage, const StorageMetadataPtr & metadata_snapshot, InitializerFunc initializer_func_ = [](BlocksList::const_iterator &, size_t &) {}) : SourceWithProgress(metadata_snapshot->getSampleBlockForColumns(column_names_, storage.getVirtuals(), storage.getStorageID())) - , column_names(column_names_) + , column_names(std::move(column_names_)) , current_it(first_) , num_blocks(num_blocks_) , initializer_func(std::move(initializer_func_)) From f950198fa5998d285aaa3b65ddc801ea3e812412 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Oct 2020 19:00:42 +0300 Subject: [PATCH 30/72] Allow to use c11 with clang pedantic --- cmake/warnings.cmake | 1 + 1 file changed, 1 insertion(+) diff --git a/cmake/warnings.cmake b/cmake/warnings.cmake index 3b2215f9bb6..f70caebf95a 100644 --- a/cmake/warnings.cmake +++ b/cmake/warnings.cmake @@ -31,6 +31,7 @@ if (COMPILER_CLANG) add_warning(pedantic) no_warning(vla-extension) no_warning(zero-length-array) + no_warning(c11-extensions) add_warning(comma) add_warning(conditional-uninitialized) From ade916570bc52c6c3f8fd063b27f61fa5e0e7bba Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Oct 2020 19:01:13 +0300 Subject: [PATCH 31/72] Add log config --- .../configs/log_conf.xml | 12 ++++++++++++ .../test_mysql.py | 1 + 2 files changed, 13 insertions(+) create mode 100644 tests/integration/test_dictionaries_all_layouts_separate_sources/configs/log_conf.xml diff --git a/tests/integration/test_dictionaries_all_layouts_separate_sources/configs/log_conf.xml b/tests/integration/test_dictionaries_all_layouts_separate_sources/configs/log_conf.xml new file mode 100644 index 00000000000..b52d833cde8 --- /dev/null +++ b/tests/integration/test_dictionaries_all_layouts_separate_sources/configs/log_conf.xml @@ -0,0 +1,12 @@ + + + + trace + /var/log/clickhouse-server/log.log + /var/log/clickhouse-server/log.err.log + 1000M + 10 + /var/log/clickhouse-server/stderr.log + /var/log/clickhouse-server/stdout.log + + diff --git a/tests/integration/test_dictionaries_all_layouts_separate_sources/test_mysql.py b/tests/integration/test_dictionaries_all_layouts_separate_sources/test_mysql.py index 69e2543f226..77d16e901a9 100644 --- 
a/tests/integration/test_dictionaries_all_layouts_separate_sources/test_mysql.py +++ b/tests/integration/test_dictionaries_all_layouts_separate_sources/test_mysql.py @@ -42,6 +42,7 @@ def setup_module(module): dictionaries = [] main_configs = [] main_configs.append(os.path.join('configs', 'disable_ssl_verification.xml')) + main_configs.append(os.path.join('configs', 'log_conf.xml')) for fname in os.listdir(DICT_CONFIG_PATH): dictionaries.append(os.path.join(DICT_CONFIG_PATH, fname)) From d08909cc24c0abf718b30a68bd811cbb8d1debd0 Mon Sep 17 00:00:00 2001 From: alesapin Date: Tue, 6 Oct 2020 19:28:32 +0300 Subject: [PATCH 32/72] Update mariadb --- contrib/mariadb-connector-c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 3f512fedf0b..780963dda5c 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 3f512fedf0ba0f769a1b4852b4bac542d92c5b20 +Subproject commit 780963dda5cd55bcc7149a90ab2124ea11b65c19 From 675c70acbcf5c0eee82090444695b29211443c71 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 6 Oct 2020 00:32:46 +0300 Subject: [PATCH 33/72] Remove obsolete settings Came to this when looking at allow_experimental_data_skipping_indices, for implementing force_data_skipping_indices --- src/Core/Settings.h | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/Core/Settings.h b/src/Core/Settings.h index 3ecb79c3fce..54edf32eb8d 100644 --- a/src/Core/Settings.h +++ b/src/Core/Settings.h @@ -389,14 +389,6 @@ class IColumn; \ /** Obsolete settings that do nothing but left for compatibility reasons. Remove each one after half a year of obsolescence. */ \ \ - M(Bool, allow_experimental_low_cardinality_type, true, "Obsolete setting, does nothing. Will be removed after 2019-08-13", 0) \ - M(Bool, compile, false, "Whether query compilation is enabled. Will be removed after 2020-03-13", 0) \ - M(UInt64, min_count_to_compile, 0, "Obsolete setting, does nothing. Will be removed after 2020-03-13", 0) \ - M(Bool, allow_experimental_multiple_joins_emulation, true, "Obsolete setting, does nothing. Will be removed after 2020-05-31", 0) \ - M(Bool, allow_experimental_cross_to_join_conversion, true, "Obsolete setting, does nothing. Will be removed after 2020-05-31", 0) \ - M(Bool, allow_experimental_data_skipping_indices, true, "Obsolete setting, does nothing. Will be removed after 2020-05-31", 0) \ - M(Bool, merge_tree_uniform_read_distribution, true, "Obsolete setting, does nothing. Will be removed after 2020-05-20", 0) \ - M(UInt64, mark_cache_min_lifetime, 0, "Obsolete setting, does nothing. Will be removed after 2020-05-31", 0) \ M(Bool, partial_merge_join, false, "Obsolete. Use join_algorithm='prefer_partial_merge' instead.", 0) \ M(UInt64, max_memory_usage_for_all_queries, 0, "Obsolete. Will be removed after 2020-10-20", 0) \ M(UInt64, multiple_joins_rewriter_version, 0, "Obsolete setting, does nothing. 
Will be removed after 2021-03-31", 0) \ From 7298cf3eccd2433930e8e9814c22a9010e10c14b Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Tue, 6 Oct 2020 01:29:06 +0300 Subject: [PATCH 34/72] Remove obsolete settings from tests --- .../00443_merge_tree_uniform_read_distribution_0.sh | 2 +- .../queries/0_stateless/00443_preferred_block_size_bytes.sh | 4 ++-- ..._tree_uniform_read_distribution_and_max_rows_to_read.sql | 6 ------ tests/queries/0_stateless/01045_bloom_filter_null_array.sql | 3 --- .../0_stateless/01056_negative_with_bloom_filter.sql | 2 -- ...ohibition_secondary_index_with_old_format_merge_tree.sql | 1 - tests/queries/0_stateless/01072_nullable_jit.sql | 4 ++-- 7 files changed, 5 insertions(+), 17 deletions(-) diff --git a/tests/queries/0_stateless/00443_merge_tree_uniform_read_distribution_0.sh b/tests/queries/0_stateless/00443_merge_tree_uniform_read_distribution_0.sh index 29ea645affa..81b9c015fb9 100755 --- a/tests/queries/0_stateless/00443_merge_tree_uniform_read_distribution_0.sh +++ b/tests/queries/0_stateless/00443_merge_tree_uniform_read_distribution_0.sh @@ -4,4 +4,4 @@ set -e CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd) -env CLICKHOUSE_CLIENT_OPT="--merge_tree_uniform_read_distribution=0" bash "$CURDIR"/00443_optimize_final_vertical_merge.sh +bash "$CURDIR"/00443_optimize_final_vertical_merge.sh diff --git a/tests/queries/0_stateless/00443_preferred_block_size_bytes.sh b/tests/queries/0_stateless/00443_preferred_block_size_bytes.sh index 3308343c58a..be986a72457 100755 --- a/tests/queries/0_stateless/00443_preferred_block_size_bytes.sh +++ b/tests/queries/0_stateless/00443_preferred_block_size_bytes.sh @@ -41,10 +41,10 @@ popd > /dev/null #SCRIPTDIR=`dirname "$SCRIPTPATH"` SCRIPTDIR=$SCRIPTPATH -cat "$SCRIPTDIR"/00282_merging.sql | $CLICKHOUSE_CLIENT --preferred_block_size_bytes=10 --merge_tree_uniform_read_distribution=1 -n > "${CLICKHOUSE_TMP}"/preferred_block_size_bytes.stdout 2>&1 +cat "$SCRIPTDIR"/00282_merging.sql | $CLICKHOUSE_CLIENT --preferred_block_size_bytes=10 -n > "${CLICKHOUSE_TMP}"/preferred_block_size_bytes.stdout 2>&1 cmp "$SCRIPTDIR"/00282_merging.reference "${CLICKHOUSE_TMP}"/preferred_block_size_bytes.stdout && echo PASSED || echo FAILED -cat "$SCRIPTDIR"/00282_merging.sql | $CLICKHOUSE_CLIENT --preferred_block_size_bytes=20 --merge_tree_uniform_read_distribution=0 -n > "${CLICKHOUSE_TMP}"/preferred_block_size_bytes.stdout 2>&1 +cat "$SCRIPTDIR"/00282_merging.sql | $CLICKHOUSE_CLIENT --preferred_block_size_bytes=20 -n > "${CLICKHOUSE_TMP}"/preferred_block_size_bytes.stdout 2>&1 cmp "$SCRIPTDIR"/00282_merging.reference "${CLICKHOUSE_TMP}"/preferred_block_size_bytes.stdout && echo PASSED || echo FAILED rm "${CLICKHOUSE_TMP}"/preferred_block_size_bytes.stdout diff --git a/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql b/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql index b3d9612b39f..d31989d65bd 100644 --- a/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql +++ b/tests/queries/0_stateless/00971_merge_tree_uniform_read_distribution_and_max_rows_to_read.sql @@ -5,18 +5,12 @@ INSERT INTO merge_tree SELECT 0 FROM numbers(1000000); SET max_threads = 4; SET max_rows_to_read = 1100000; -SET merge_tree_uniform_read_distribution = 1; SELECT count() FROM merge_tree; - -SET merge_tree_uniform_read_distribution = 0; SELECT count() FROM merge_tree; SET max_rows_to_read = 900000; -SET merge_tree_uniform_read_distribution 
= 1; SELECT count() FROM merge_tree WHERE not ignore(); -- { serverError 158 } - -SET merge_tree_uniform_read_distribution = 0; SELECT count() FROM merge_tree WHERE not ignore(); -- { serverError 158 } DROP TABLE merge_tree; diff --git a/tests/queries/0_stateless/01045_bloom_filter_null_array.sql b/tests/queries/0_stateless/01045_bloom_filter_null_array.sql index a021a2b9e28..3dfc04ae8ff 100644 --- a/tests/queries/0_stateless/01045_bloom_filter_null_array.sql +++ b/tests/queries/0_stateless/01045_bloom_filter_null_array.sql @@ -1,6 +1,3 @@ -SET allow_experimental_data_skipping_indices = 1; - - DROP TABLE IF EXISTS bloom_filter_null_array; CREATE TABLE bloom_filter_null_array (v Array(LowCardinality(Nullable(String))), INDEX idx v TYPE bloom_filter(0.1) GRANULARITY 1) ENGINE = MergeTree() ORDER BY v; diff --git a/tests/queries/0_stateless/01056_negative_with_bloom_filter.sql b/tests/queries/0_stateless/01056_negative_with_bloom_filter.sql index 271754b848f..4816d865c1c 100644 --- a/tests/queries/0_stateless/01056_negative_with_bloom_filter.sql +++ b/tests/queries/0_stateless/01056_negative_with_bloom_filter.sql @@ -1,5 +1,3 @@ -SET allow_experimental_data_skipping_indices = 1; - DROP TABLE IF EXISTS test; CREATE TABLE test (`int8` Int8, `int16` Int16, `int32` Int32, `int64` Int64, INDEX idx (`int8`, `int16`, `int32`, `int64`) TYPE bloom_filter(0.01) GRANULARITY 8192 ) ENGINE = MergeTree() ORDER BY `int8`; diff --git a/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.sql b/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.sql index 50259c28420..4c005f6ba56 100644 --- a/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.sql +++ b/tests/queries/0_stateless/01071_prohibition_secondary_index_with_old_format_merge_tree.sql @@ -1,5 +1,4 @@ CREATE TABLE old_syntax_01071_test (date Date, id UInt8) ENGINE = MergeTree(date, id, 8192); -SET allow_experimental_data_skipping_indices=1; ALTER TABLE old_syntax_01071_test ADD INDEX id_minmax id TYPE minmax GRANULARITY 1; -- { serverError 36 } CREATE TABLE new_syntax_01071_test (date Date, id UInt8) ENGINE = MergeTree() ORDER BY id; ALTER TABLE new_syntax_01071_test ADD INDEX id_minmax id TYPE minmax GRANULARITY 1; diff --git a/tests/queries/0_stateless/01072_nullable_jit.sql b/tests/queries/0_stateless/01072_nullable_jit.sql index 8c00103bfa8..45e9f110bcb 100644 --- a/tests/queries/0_stateless/01072_nullable_jit.sql +++ b/tests/queries/0_stateless/01072_nullable_jit.sql @@ -12,7 +12,7 @@ CREATE TABLE foo ( INSERT INTO foo VALUES (1, 0.5, 0.2, 0.3, 0.8); -SELECT divide(sum(a) + sum(b), nullIf(sum(c) + sum(d), 0)) FROM foo SETTINGS compile_expressions = 1, min_count_to_compile = 0; -SELECT divide(sum(a) + sum(b), nullIf(sum(c) + sum(d), 0)) FROM foo SETTINGS compile_expressions = 1, min_count_to_compile = 0; +SELECT divide(sum(a) + sum(b), nullIf(sum(c) + sum(d), 0)) FROM foo SETTINGS compile_expressions = 1; +SELECT divide(sum(a) + sum(b), nullIf(sum(c) + sum(d), 0)) FROM foo SETTINGS compile_expressions = 1; DROP TABLE foo; From ef6d12967f8c2d21552d3b3b36e361abd5280a01 Mon Sep 17 00:00:00 2001 From: Azat Khuzhin Date: Mon, 5 Oct 2020 23:50:03 +0300 Subject: [PATCH 35/72] Implement force_data_skipping_indices setting --- docs/en/operations/settings/settings.md | 6 ++++ src/Core/Settings.h | 1 + .../MergeTree/MergeTreeDataSelectExecutor.cpp | 27 ++++++++++++++++++ ...1515_force_data_skipping_indices.reference | 0 
.../01515_force_data_skipping_indices.sql     |  28 +++++++++++++++++++
 .../queries/0_stateless/arcadia_skip_list.txt |   1 +
 6 files changed, 63 insertions(+)
 create mode 100644 tests/queries/0_stateless/01515_force_data_skipping_indices.reference
 create mode 100644 tests/queries/0_stateless/01515_force_data_skipping_indices.sql

diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index decaf6b9029..e901e478605 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -70,6 +70,12 @@ Works with tables in the MergeTree family.

 If `force_primary_key=1`, ClickHouse checks to see if the query has a primary key condition that can be used for restricting data ranges. If there is no suitable condition, it throws an exception. However, it does not check whether the condition reduces the amount of data to read. For more information about data ranges in MergeTree tables, see [MergeTree](../../engines/table-engines/mergetree-family/mergetree.md).

+## force\_data\_skipping\_indices {#settings-force_data_skipping_indices}
+
+Disables query execution if the passed data skipping indices were not used.
+
+Works with tables in the MergeTree family.
+
 ## format\_schema {#format-schema}

 This parameter is useful when you are using formats that require a schema definition, such as [Cap’n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format.
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 3ecb79c3fce..901c4fd0b03 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -135,6 +135,7 @@ class IColumn;
     \
     M(Bool, force_index_by_date, 0, "Throw an exception if there is a partition key in a table, and it is not used.", 0) \
     M(Bool, force_primary_key, 0, "Throw an exception if there is primary key in a table, and it is not used.", 0) \
+    M(String, force_data_skipping_indices, "", "Comma separated list of data skipping indices that should be used, otherwise an exception will be thrown.", 0) \
     \
     M(Float, max_streams_to_max_threads_ratio, 1, "Allows you to use more sources than the number of threads - to more evenly distribute work across threads. It is assumed that this is a temporary solution, since it will be possible in the future to make the number of sources equal to the number of threads, but for each source to dynamically select available work for itself.", 0) \
     M(Float, max_streams_multiplier_for_merge_tables, 5, "Ask more streams when reading from Merge table. Streams will be spread across tables that Merge table will use. This allows more even distribution of work across threads and especially helpful when merged tables differ in size.", 0) \
diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
index 9128694c793..09ab3201085 100644
--- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp
@@ -1,6 +1,8 @@
 #include /// For calculations related to sampling coefficients.
+#include #include #include +#include #include @@ -552,6 +554,31 @@ Pipe MergeTreeDataSelectExecutor::readFromParts( useful_indices.emplace_back(index_helper, condition); } + if (settings.force_data_skipping_indices.changed) + { + std::unordered_set useful_indices_names; + for (const auto & useful_index : useful_indices) + useful_indices_names.insert(useful_index.first->index.name); + + std::vector forced_indices; + boost::split(forced_indices, + settings.force_data_skipping_indices.toString(), + [](char c){ return c == ','; }); + + for (const auto & index_name : forced_indices) + { + if (index_name.empty()) + continue; + + if (!useful_indices_names.count(index_name)) + { + throw Exception(ErrorCodes::INDEX_NOT_USED, + "Index {} is not used and setting 'force_data_skipping_indices' contains it", + backQuote(index_name)); + } + } + } + RangesInDataParts parts_with_ranges(parts.size()); size_t sum_marks = 0; std::atomic sum_marks_pk = 0; diff --git a/tests/queries/0_stateless/01515_force_data_skipping_indices.reference b/tests/queries/0_stateless/01515_force_data_skipping_indices.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01515_force_data_skipping_indices.sql b/tests/queries/0_stateless/01515_force_data_skipping_indices.sql new file mode 100644 index 00000000000..7a78cfe5ab5 --- /dev/null +++ b/tests/queries/0_stateless/01515_force_data_skipping_indices.sql @@ -0,0 +1,28 @@ +DROP TABLE IF EXISTS data_01515; +CREATE TABLE data_01515 +( + key Int, + d1 Int, + d1_null Nullable(Int), + INDEX d1_idx d1 TYPE minmax GRANULARITY 1, + INDEX d1_null_idx assumeNotNull(d1_null) TYPE minmax GRANULARITY 1 +) +Engine=MergeTree() +ORDER BY key; + +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices=''; +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_idx'; -- { serverError 277 } +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_null_idx'; -- { serverError 277 } + +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx'; +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',d1_idx,'; +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',,d1_idx,,'; +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx,d1_null_idx'; -- { serverError 277 } +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_null_idx,d1_idx'; -- { serverError 277 } +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_null_idx,d1_idx,,'; -- { serverError 277 } +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',,d1_null_idx,d1_idx'; -- { serverError 277 } +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_null_idx'; -- { serverError 277 } +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',,d1_null_idx,,'; -- { serverError 277 } + +SELECT * FROM data_01515 WHERE d1_null = 0 SETTINGS force_data_skipping_indices='d1_null_idx'; -- { serverError 277 } +SELECT * FROM data_01515 WHERE assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='d1_null_idx'; diff --git a/tests/queries/0_stateless/arcadia_skip_list.txt b/tests/queries/0_stateless/arcadia_skip_list.txt index e59a2634d0c..78dfbb2e9e1 100644 --- a/tests/queries/0_stateless/arcadia_skip_list.txt +++ b/tests/queries/0_stateless/arcadia_skip_list.txt @@ -149,3 +149,4 @@ 00609_mv_index_in_in 00510_materizlized_view_and_deduplication_zookeeper 
00738_lock_for_inner_table
+01515_force_data_skipping_indices

From 75e612fc16544e1902a517d75c2d971f7a260783 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Tue, 6 Oct 2020 09:46:12 +0300
Subject: [PATCH 36/72] Use full featured parser for force_data_skipping_indices

---
 docs/en/operations/settings/settings.md       | 23 ++++++++++++++++
 src/Core/Settings.h                           |  2 +-
 .../MergeTree/MergeTreeDataSelectExecutor.cpp | 26 ++++++++++++-------
 .../01515_force_data_skipping_indices.sql     | 13 ++++++----
 4 files changed, 49 insertions(+), 15 deletions(-)

diff --git a/docs/en/operations/settings/settings.md b/docs/en/operations/settings/settings.md
index e901e478605..e7ac9c4a17d 100644
--- a/docs/en/operations/settings/settings.md
+++ b/docs/en/operations/settings/settings.md
@@ -74,6 +74,29 @@ If `force_primary_key=1`, ClickHouse checks to see if the query has a primary ke

 Disables query execution if the passed data skipping indices were not used.

+Consider the following example:
+
+```sql
+CREATE TABLE data_01515
+(
+    key Int,
+    d1 Int,
+    d1_null Nullable(Int),
+    INDEX d1_idx d1 TYPE minmax GRANULARITY 1,
+    INDEX d1_null_idx assumeNotNull(d1_null) TYPE minmax GRANULARITY 1
+)
+Engine=MergeTree()
+ORDER BY key;
+
+SELECT * FROM data_01515;
+SELECT * FROM data_01515 SETTINGS force_data_skipping_indices=''; -- query will produce CANNOT_PARSE_TEXT error.
+SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_idx'; -- query will produce INDEX_NOT_USED error.
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx'; -- Ok.
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`'; -- Ok (example of full featured parser).
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- query will produce INDEX_NOT_USED error, since d1_null_idx is not used.
+SELECT * FROM data_01515 WHERE d1 = 0 AND assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='`d1_idx`, d1_null_idx'; -- Ok.
+```
+
 Works with tables in the MergeTree family.

 ## format\_schema {#format-schema}

 This parameter is useful when you are using formats that require a schema definition, such as [Cap’n Proto](https://capnproto.org/) or [Protobuf](https://developers.google.com/protocol-buffers/). The value depends on the format.
diff --git a/src/Core/Settings.h b/src/Core/Settings.h
index 901c4fd0b03..6d3437defb2 100644
--- a/src/Core/Settings.h
+++ b/src/Core/Settings.h
@@ -135,7 +135,7 @@ class IColumn;
     \
     M(Bool, force_index_by_date, 0, "Throw an exception if there is a partition key in a table, and it is not used.", 0) \
     M(Bool, force_primary_key, 0, "Throw an exception if there is primary key in a table, and it is not used.", 0) \
-    M(String, force_data_skipping_indices, "", "Comma separated list of data skipping indices that should be used, otherwise an exception will be thrown.", 0) \
+    M(String, force_data_skipping_indices, "", "Comma separated list of strings or literals with the name of the data skipping indices that should be used during query execution, otherwise an exception will be thrown.", 0) \
     \
     M(Float, max_streams_to_max_threads_ratio, 1, "Allows you to use more sources than the number of threads - to more evenly distribute work across threads. It is assumed that this is a temporary solution, since it will be possible in the future to make the number of sources equal to the number of threads, but for each source to dynamically select available work for itself.", 0) \
     M(Float, max_streams_multiplier_for_merge_tables, 5, "Ask more streams when reading from Merge table. Streams will be spread across tables that Merge table will use.
This allows more even distribution of work across threads and especially helpful when merged tables differ in size.", 0) \ diff --git a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp index 09ab3201085..f30e37d3e31 100644 --- a/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp +++ b/src/Storages/MergeTree/MergeTreeDataSelectExecutor.cpp @@ -1,5 +1,4 @@ #include /// For calculations related to sampling coefficients. -#include #include #include #include @@ -20,6 +19,7 @@ #include #include #include +#include #include #include @@ -61,6 +61,7 @@ namespace ErrorCodes extern const int ILLEGAL_COLUMN; extern const int ARGUMENT_OUT_OF_BOUND; extern const int TOO_MANY_ROWS; + extern const int CANNOT_PARSE_TEXT; } @@ -556,20 +557,27 @@ Pipe MergeTreeDataSelectExecutor::readFromParts( if (settings.force_data_skipping_indices.changed) { + const auto & indices = settings.force_data_skipping_indices.toString(); + + Strings forced_indices; + { + Tokens tokens(&indices[0], &indices[indices.size()], settings.max_query_size); + IParser::Pos pos(tokens, settings.max_parser_depth); + Expected expected; + if (!parseIdentifiersOrStringLiterals(pos, expected, forced_indices)) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, + "Cannot parse force_data_skipping_indices ('{}')", indices); + } + + if (forced_indices.empty()) + throw Exception(ErrorCodes::CANNOT_PARSE_TEXT, "No indices parsed from force_data_skipping_indices ('{}')", indices); + std::unordered_set useful_indices_names; for (const auto & useful_index : useful_indices) useful_indices_names.insert(useful_index.first->index.name); - std::vector forced_indices; - boost::split(forced_indices, - settings.force_data_skipping_indices.toString(), - [](char c){ return c == ','; }); - for (const auto & index_name : forced_indices) { - if (index_name.empty()) - continue; - if (!useful_indices_names.count(index_name)) { throw Exception(ErrorCodes::INDEX_NOT_USED, diff --git a/tests/queries/0_stateless/01515_force_data_skipping_indices.sql b/tests/queries/0_stateless/01515_force_data_skipping_indices.sql index 7a78cfe5ab5..5e45018707d 100644 --- a/tests/queries/0_stateless/01515_force_data_skipping_indices.sql +++ b/tests/queries/0_stateless/01515_force_data_skipping_indices.sql @@ -10,19 +10,22 @@ CREATE TABLE data_01515 Engine=MergeTree() ORDER BY key; -SELECT * FROM data_01515 SETTINGS force_data_skipping_indices=''; +SELECT * FROM data_01515; +SELECT * FROM data_01515 SETTINGS force_data_skipping_indices=''; -- { serverError 6 } SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_idx'; -- { serverError 277 } SELECT * FROM data_01515 SETTINGS force_data_skipping_indices='d1_null_idx'; -- { serverError 277 } SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx'; -SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',d1_idx,'; -SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',,d1_idx,,'; +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='`d1_idx`'; +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=' d1_idx '; +SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=' d1_idx '; SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_idx,d1_null_idx'; -- { serverError 277 } SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_null_idx,d1_idx'; -- { serverError 277 } SELECT * FROM 
data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_null_idx,d1_idx,,'; -- { serverError 277 }
-SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',,d1_null_idx,d1_idx'; -- { serverError 277 }
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=' d1_null_idx,d1_idx'; -- { serverError 277 }
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=' `d1_null_idx`,d1_idx'; -- { serverError 277 }
 SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices='d1_null_idx'; -- { serverError 277 }
-SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=',,d1_null_idx,,'; -- { serverError 277 }
+SELECT * FROM data_01515 WHERE d1 = 0 SETTINGS force_data_skipping_indices=' d1_null_idx '; -- { serverError 277 }

 SELECT * FROM data_01515 WHERE d1_null = 0 SETTINGS force_data_skipping_indices='d1_null_idx'; -- { serverError 277 }
 SELECT * FROM data_01515 WHERE assumeNotNull(d1_null) = 0 SETTINGS force_data_skipping_indices='d1_null_idx';

From ea163700244784b9c0823e921b9965061c9587e9 Mon Sep 17 00:00:00 2001
From: Alexander Tokmakov
Date: Tue, 6 Oct 2020 23:05:28 +0300
Subject: [PATCH 37/72] improvements in lost replica recovery

---
 src/Storages/MergeTree/MergeTreeSettings.h    |   3 +-
 .../ReplicatedMergeTreeCleanupThread.cpp      |   1 +
 src/Storages/StorageReplicatedMergeTree.cpp   | 102 ++++++++++++++----
 .../configs/remote_servers.xml                |  19 ----
 .../integration/test_recovery_replica/test.py |  37 +++++--
 5 files changed, 117 insertions(+), 45 deletions(-)
 delete mode 100644 tests/integration/test_recovery_replica/configs/remote_servers.xml

diff --git a/src/Storages/MergeTree/MergeTreeSettings.h b/src/Storages/MergeTree/MergeTreeSettings.h
index d26ff5c79bd..97bc73caf5b 100644
--- a/src/Storages/MergeTree/MergeTreeSettings.h
+++ b/src/Storages/MergeTree/MergeTreeSettings.h
@@ -58,7 +58,7 @@ struct Settings;
     /** Replication settings. */ \
     M(UInt64, replicated_deduplication_window, 100, "How many last blocks of hashes should be kept in ZooKeeper (old blocks will be deleted).", 0) \
     M(UInt64, replicated_deduplication_window_seconds, 7 * 24 * 60 * 60 /* one week */, "Similar to \"replicated_deduplication_window\", but determines old blocks by their lifetime. Hash of an inserted block will be deleted (and the block will not be deduplicated after) if it outside of one \"window\". You can set very big replicated_deduplication_window to avoid duplicating INSERTs during that period of time.", 0) \
-    M(UInt64, max_replicated_logs_to_keep, 100, "How many records may be in log, if there is inactive replica.", 0) \
+    M(UInt64, max_replicated_logs_to_keep, 1000, "How many records may be in log, if there is inactive replica. Inactive replica becomes lost when this number is exceeded.", 0) \
     M(UInt64, min_replicated_logs_to_keep, 10, "Keep about this number of last records in ZooKeeper log, even if they are obsolete. It doesn't affect work of tables: used only to diagnose ZooKeeper log before cleaning.", 0) \
     M(Seconds, prefer_fetch_merged_part_time_threshold, 3600, "If time passed after replication log entry creation exceeds this threshold and sum size of parts is greater than \"prefer_fetch_merged_part_size_threshold\", prefer fetching merged part from replica instead of doing merge locally.
To speed up very long merges.", 0) \ M(UInt64, prefer_fetch_merged_part_size_threshold, 10ULL * 1024 * 1024 * 1024, "If sum size of parts exceeds this threshold and time passed after replication log entry creation is greater than \"prefer_fetch_merged_part_time_threshold\", prefer fetching merged part from replica instead of doing merge locally. To speed up very long merges.", 0) \ @@ -75,6 +75,7 @@ struct Settings; M(UInt64, replicated_max_parallel_sends_for_table, 0, "Limit parallel sends for one table.", 0) \ M(Bool, replicated_can_become_leader, true, "If true, Replicated tables replicas on this node will try to acquire leadership.", 0) \ M(Seconds, zookeeper_session_expiration_check_period, 60, "ZooKeeper session expiration check period, in seconds.", 0) \ + M(Bool, detach_old_local_parts_when_cloning_replica, 1, "Do not remove old local parts when repairing lost replica.", 0) \ \ /** Check delay of replicas settings. */ \ M(UInt64, min_relative_delay_to_measure, 120, "Calculate relative replica delay only if absolute delay is not less that this value.", 0) \ diff --git a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp index 11f23a5c110..5de0a79b2c3 100644 --- a/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp +++ b/src/Storages/MergeTree/ReplicatedMergeTreeCleanupThread.cpp @@ -274,6 +274,7 @@ void ReplicatedMergeTreeCleanupThread::markLostReplicas(const std::unordered_map for (const auto & pair : log_pointers_candidate_lost_replicas) { String replica = pair.first; + LOG_WARNING(log, "Will mark replica {} as lost, because it has stale log pointer: {}", replica, pair.second); Coordination::Requests ops; /// If host changed version we can not mark replicas, because replica started to be active. ops.emplace_back(zkutil::makeCheckRequest( diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 9613bd5111d..1e38abf90ef 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -2182,8 +2182,6 @@ bool StorageReplicatedMergeTree::executeReplaceRange(const LogEntry & entry) void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coordination::Stat source_is_lost_stat, zkutil::ZooKeeperPtr & zookeeper) { - LOG_INFO(log, "Will mimic {}", source_replica); - String source_path = zookeeper_path + "/replicas/" + source_replica; /** TODO: it will be deleted! (It is only to support old version of CH server). @@ -2309,6 +2307,17 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo LOG_WARNING(log, "Source replica does not have part {}. 
Removing it from working set.", part->name);
 }
 }
+
+    if (getSettings()->detach_old_local_parts_when_cloning_replica)
+    {
+        auto metadata_snapshot = getInMemoryMetadataPtr();
+        for (const auto & part : parts_to_remove_from_working_set)
+        {
+            LOG_INFO(log, "Detaching {}", part->relative_path);
+            part->makeCloneInDetached("clone", metadata_snapshot);
+        }
+    }
+
     removePartsFromWorkingSet(parts_to_remove_from_working_set, true);

     for (const String & name : active_parts)
@@ -2336,47 +2345,102 @@ void StorageReplicatedMergeTree::cloneReplica(const String & source_replica, Coo

 void StorageReplicatedMergeTree::cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zookeeper)
 {
+    Coordination::Stat is_lost_stat;
+    bool is_new_replica = true;
     String res;
-    if (zookeeper->tryGet(replica_path + "/is_lost", res))
+    if (zookeeper->tryGet(replica_path + "/is_lost", res, &is_lost_stat))
     {
         if (res == "0")
             return;
+        if (is_lost_stat.version)
+            is_new_replica = false;
     }
     else
     {
         /// Replica was created by old version of CH, so we must create "/is_lost".
         /// Note that in old versions of CH there were no "lost" replicas possible.
+        /// TODO is_lost node should always exist since v18.12, maybe we can replace `tryGet` with `get` and remove old code?
         zookeeper->create(replica_path + "/is_lost", "0", zkutil::CreateMode::Persistent);
         return;
     }

     /// is_lost is "1": it means that we are in repair mode.
-
-    String source_replica;
-    Coordination::Stat source_is_lost_stat;
-    source_is_lost_stat.version = -1;
-
-    for (const String & source_replica_name : zookeeper->getChildren(zookeeper_path + "/replicas"))
+    /// Try to choose a source replica to clone.
+    /// Source replica must not be lost and should have minimal queue size and maximal log pointer.
+    Strings replicas = zookeeper->getChildren(zookeeper_path + "/replicas");
+    std::vector futures;
+    for (const String & source_replica_name : replicas)
     {
+        /// Do not clone from myself.
+        if (source_replica_name == replica_name)
+            continue;
+
         String source_replica_path = zookeeper_path + "/replicas/" + source_replica_name;

-        /// Do not clone from myself.
-        if (source_replica_path != replica_path)
+        /// Obviously the following get operations are not atomic, but it's ok to choose a good enough replica, not the best one.
+        /// NOTE: We may count some entries twice if log_pointer is moved.
+        futures.emplace_back(zookeeper->asyncTryGet(source_replica_path + "/is_lost"));
+        futures.emplace_back(zookeeper->asyncTryGet(source_replica_path + "/log_pointer"));
+        futures.emplace_back(zookeeper->asyncTryGet(source_replica_path + "/queue"));
+    }
+
+    Strings log_entries = zookeeper->getChildren(zookeeper_path + "/log");
+    size_t max_log_entry = 0;
+    if (!log_entries.empty())
+    {
+        String last_entry_num = std::max_element(log_entries.begin(), log_entries.end())->substr(4);
+        max_log_entry = std::stol(last_entry_num);
+    }
+    ++max_log_entry;
+
+    size_t min_replication_lag = std::numeric_limits::max();
+    String source_replica;
+    Coordination::Stat source_is_lost_stat;
+    size_t future_num = 0;
+    for (const String & source_replica_name : replicas)
+    {
+        if (source_replica_name == replica_name)
+            continue;
+
+        auto get_is_lost = futures[future_num++].get();
+        auto get_log_pointer = futures[future_num++].get();
+        auto get_queue = futures[future_num++].get();
+
+        if (get_is_lost.error == Coordination::Error::ZNONODE)
         {
-            /// Do not clone from lost replicas.
- String source_replica_is_lost_value; - if (!zookeeper->tryGet(source_replica_path + "/is_lost", source_replica_is_lost_value, &source_is_lost_stat) - || source_replica_is_lost_value == "0") - { - source_replica = source_replica_name; - break; - } + /// For compatibility with older ClickHouse versions + get_is_lost.stat.version = -1; + } + else if (get_is_lost.data != "0") + continue; + if (get_log_pointer.error != Coordination::Error::ZOK) + continue; + if (get_queue.error != Coordination::Error::ZOK) + continue; + + /// Replica is not lost and we can clone it. Let's calculate approx replication lag. + size_t source_log_pointer = get_log_pointer.data.empty() ? 0 : parse(get_log_pointer.data); + assert(source_log_pointer <= max_log_entry); + size_t replica_queue_lag = max_log_entry - source_log_pointer; + size_t replica_queue_size = get_queue.stat.numChildren; + size_t replication_lag = replica_queue_lag + replica_queue_size; + LOG_INFO(log, "Replica {} has approximate {} queue lag and {} queue size", source_replica_name, replica_queue_lag, replica_queue_size); + if (replication_lag < min_replication_lag) + { + source_replica = source_replica_name; + source_is_lost_stat = get_is_lost.stat; + min_replication_lag = replication_lag; } } if (source_replica.empty()) throw Exception("All replicas are lost", ErrorCodes::ALL_REPLICAS_LOST); + if (is_new_replica) + LOG_INFO(log, "Will mimic {}", source_replica); + else + LOG_WARNING(log, "Will mimic {}", source_replica); + /// Clear obsolete queue that we no longer need. zookeeper->removeChildren(replica_path + "/queue"); diff --git a/tests/integration/test_recovery_replica/configs/remote_servers.xml b/tests/integration/test_recovery_replica/configs/remote_servers.xml deleted file mode 100644 index a6e80ce2b08..00000000000 --- a/tests/integration/test_recovery_replica/configs/remote_servers.xml +++ /dev/null @@ -1,19 +0,0 @@ - - - - - true - - shard_0 - node1 - 9000 - - - shard_0 - node2 - 9000 - - - - - diff --git a/tests/integration/test_recovery_replica/test.py b/tests/integration/test_recovery_replica/test.py index 9320a176f1e..22dba7aafec 100644 --- a/tests/integration/test_recovery_replica/test.py +++ b/tests/integration/test_recovery_replica/test.py @@ -9,16 +9,15 @@ def fill_nodes(nodes, shard): for node in nodes: node.query( ''' - CREATE DATABASE test; - CREATE TABLE test_table(date Date, id UInt32) - ENGINE = ReplicatedMergeTree('/clickhouse/tables/test{shard}/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; + ENGINE = ReplicatedMergeTree('/clickhouse/tables/test/replicated', '{replica}') ORDER BY id PARTITION BY toYYYYMM(date) SETTINGS min_replicated_logs_to_keep=3, max_replicated_logs_to_keep=5, cleanup_delay_period=0, cleanup_delay_period_random_add=0; '''.format(shard=shard, replica=node.name)) cluster = ClickHouseCluster(__file__) -node1 = cluster.add_instance('node1', main_configs=['configs/remote_servers.xml'], with_zookeeper=True) -node2 = cluster.add_instance('node2', main_configs=['configs/remote_servers.xml'], with_zookeeper=True) +node1 = cluster.add_instance('node1', with_zookeeper=True) +node2 = cluster.add_instance('node2', with_zookeeper=True) +node3 = cluster.add_instance('node3', with_zookeeper=True) @pytest.fixture(scope="module") @@ -26,7 +25,7 @@ def start_cluster(): try: cluster.start() - fill_nodes([node1, node2], 1) + fill_nodes([node1, node2, node3], 1) yield cluster 
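Aside (not part of the patch): the source-replica choice implemented in `cloneReplicaIfNeeded` above boils down to minimizing an approximate replication lag per candidate. A condensed sketch with simplified names:

```cpp
// Illustrative sketch of the ranking used in cloneReplicaIfNeeded() above.
// max_log_entry is one past the newest /log entry number; log_pointer and
// queue_size are read (asynchronously) from the candidate's ZooKeeper nodes.
size_t approximateReplicationLag(size_t max_log_entry, size_t log_pointer, size_t queue_size)
{
    size_t queue_lag = max_log_entry - log_pointer; // log entries not yet pulled into the replica's queue
    return queue_lag + queue_size;                  // plus entries pulled but not yet executed
}
```

The non-lost candidate with the smallest value wins, which is what the `min_replication_lag` loop above computes.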
@@ -49,3 +48,29 @@ def test_recovery(start_cluster): check_callback=lambda x: len(node2.query("select * from test_table")) > 0) assert_eq_with_retry(node2, "SELECT count(*) FROM test_table", node1.query("SELECT count(*) FROM test_table")) + lost_marker = "Will mark replica node2 as lost" + assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker) + +def test_choose_source_replica(start_cluster): + node3.query("INSERT INTO test_table VALUES (2, 1)") + time.sleep(1) + node2.query("DETACH TABLE test_table") + node1.query("SYSTEM STOP FETCHES test_table") # node1 will have many entries in queue, so node2 will clone node3 + + for i in range(100): + node3.query("INSERT INTO test_table VALUES (2, {})".format(i)) + + node2.query_with_retry("ATTACH TABLE test_table", + check_callback=lambda x: len(node2.query("select * from test_table")) > 0) + + node1.query("SYSTEM START FETCHES test_table") + node1.query("SYSTEM SYNC REPLICA test_table") + node2.query("SYSTEM SYNC REPLICA test_table") + + assert node1.query("SELECT count(*) FROM test_table") == node3.query("SELECT count(*) FROM test_table") + assert node2.query("SELECT count(*) FROM test_table") == node3.query("SELECT count(*) FROM test_table") + + lost_marker = "Will mark replica node2 as lost" + assert node1.contains_in_log(lost_marker) or node3.contains_in_log(lost_marker) + assert node2.contains_in_log("Will mimic node3") + From 47e9fe9ff16aa48c3f9b77ea1e9764b9cc17af20 Mon Sep 17 00:00:00 2001 From: Alexander Tokmakov Date: Wed, 7 Oct 2020 03:05:48 +0300 Subject: [PATCH 38/72] fix --- src/Storages/StorageReplicatedMergeTree.cpp | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp index 1e38abf90ef..35e0c9150ac 100644 --- a/src/Storages/StorageReplicatedMergeTree.cpp +++ b/src/Storages/StorageReplicatedMergeTree.cpp @@ -2384,6 +2384,10 @@ void StorageReplicatedMergeTree::cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zooke futures.emplace_back(zookeeper->asyncTryGet(source_replica_path + "/queue")); } + /// Wait for results before getting log entries + for (auto & future : futures) + future.wait(); + Strings log_entries = zookeeper->getChildren(zookeeper_path + "/log"); size_t max_log_entry = 0; if (!log_entries.empty()) @@ -2391,6 +2395,7 @@ void StorageReplicatedMergeTree::cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zooke String last_entry_num = std::max_element(log_entries.begin(), log_entries.end())->substr(4); max_log_entry = std::stol(last_entry_num); } + /// log_pointer can point to future entry, which was not created yet ++max_log_entry; size_t min_replication_lag = std::numeric_limits::max(); From 8dbae0966bc3dfdf46ab09f8c7ce6a5ec4925165 Mon Sep 17 00:00:00 2001 From: alesapin Date: Wed, 7 Oct 2020 10:51:57 +0300 Subject: [PATCH 39/72] Apply strange fix for g++ --- contrib/mariadb-connector-c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/contrib/mariadb-connector-c b/contrib/mariadb-connector-c index 780963dda5c..f5638e954a7 160000 --- a/contrib/mariadb-connector-c +++ b/contrib/mariadb-connector-c @@ -1 +1 @@ -Subproject commit 780963dda5cd55bcc7149a90ab2124ea11b65c19 +Subproject commit f5638e954a79f50bac7c7a5deaa5a241e0ce8b5f From 7d97c8b05399dea274f6ee491ec79bba506545e8 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 7 Oct 2020 11:21:32 +0300 Subject: [PATCH 40/72] Remove useless code --- programs/server/Server.cpp | 6 ------ 1 file changed, 6 deletions(-) diff --git a/programs/server/Server.cpp 
b/programs/server/Server.cpp index 0019ba7e76c..da5760acc09 100644 --- a/programs/server/Server.cpp +++ b/programs/server/Server.cpp @@ -267,12 +267,6 @@ int Server::main(const std::vector & /*args*/) registerDictionaries(); registerDisks(); -#if !defined(ARCADIA_BUILD) -#if USE_OPENCL - BitonicSort::getInstance().configure(); -#endif -#endif - CurrentMetrics::set(CurrentMetrics::Revision, ClickHouseRevision::getVersionRevision()); CurrentMetrics::set(CurrentMetrics::VersionInteger, ClickHouseRevision::getVersionInteger()); From c620d9de86b29137deb5f182c283e4ea47a1c5b4 Mon Sep 17 00:00:00 2001 From: Alexey Milovidov Date: Wed, 7 Oct 2020 11:25:36 +0300 Subject: [PATCH 41/72] Remove useless file --- utils/test-data-generator/MarkovModel.h | 188 ------------------------ 1 file changed, 188 deletions(-) delete mode 100644 utils/test-data-generator/MarkovModel.h diff --git a/utils/test-data-generator/MarkovModel.h b/utils/test-data-generator/MarkovModel.h deleted file mode 100644 index 338aee2e61f..00000000000 --- a/utils/test-data-generator/MarkovModel.h +++ /dev/null @@ -1,188 +0,0 @@ -#pragma once - -#include -#include -#include -#include -#include - - -namespace DB -{ - - -class MarkovModel -{ -private: - using NGramHash = UInt32; - - struct HistogramElement - { - UInt8 byte; - UInt32 count; - }; - - struct Histogram - { - UInt32 total = 0; - std::vector data; - - void add(UInt8 byte) - { - ++total; - - for (auto & elem : data) - { - if (elem.byte == byte) - { - ++elem.count; - return; - } - } - - data.emplace_back(HistogramElement{.byte = byte, .count = 1}); - } - - UInt8 sample(UInt32 random) const - { - random %= total; - - UInt32 sum = 0; - for (const auto & elem : data) - { - sum += elem.count; - if (sum > random) - return elem.byte; - } - - __builtin_unreachable(); - } - }; - - using Table = HashMap; - Table table; - - size_t n; - - - NGramHash hashContext(const char * pos, const char * data, size_t size) const - { - if (pos >= data + n) - return CRC32Hash()(StringRef(pos - n, n)); - else - return CRC32Hash()(StringRef(data, pos - data)); - } - -public: - explicit MarkovModel(size_t n_) : n(n_) {} - MarkovModel() {} - - void consume(const char * data, size_t size) - { - const char * pos = data; - const char * end = data + size; - - while (pos < end) - { - table[hashContext(pos, data, size)].add(*pos); - ++pos; - } - - /// Mark end of string as zero byte. - table[hashContext(pos, data, size)].add(0); - } - - - template - size_t generate(char * data, size_t size, Random && random) const - { - char * pos = data; - char * end = data + size; - - while (pos < end) - { - auto it = table.find(hashContext(pos, data, size)); - if (table.end() == it) - return pos - data; - - *pos = it->getMapped().sample(random()); - - /// Zero byte marks end of string. - if (0 == *pos) - return pos - data; - - ++pos; - } - - return size; - } - - - /// Allows to add random noise to frequencies. 
- template
-    void modifyCounts(Transform && transform)
-    {
-        for (auto & elem : table)
-        {
-            UInt32 new_total = 0;
-            for (auto & frequency : elem.getMapped().data)
-            {
-                frequency.count = transform(frequency.count);
-                new_total += frequency.count;
-            }
-            elem.getMapped().total = new_total;
-        }
-    }
-
-
-    void write(WriteBuffer & out) const
-    {
-        writeBinary(UInt8(n), out);
-        writeVarUInt(table.size(), out);
-
-        for (const auto & elem : table)
-        {
-            writeBinary(elem.getKey(), out);
-            writeBinary(UInt8(elem.getMapped().data.size()), out);
-
-            for (const auto & frequency : elem.getMapped().data)
-            {
-                writeBinary(frequency.byte, out);
-                writeVarUInt(frequency.count, out);
-            }
-        }
-    }
-
-
-    void read(ReadBuffer & in)
-    {
-        table.clear();
-
-        UInt8 read_n = 0;
-        readBinary(read_n, in);
-        n = read_n;
-
-        size_t read_size = 0;
-        readVarUInt(read_size, in);
-
-        for (size_t i = 0; i < read_size; ++i)
-        {
-            NGramHash key = 0;
-            UInt8 historgam_size = 0;
-            readBinary(key, in);
-            readBinary(historgam_size, in);
-
-            Histogram & histogram = table[key];
-            histogram.data.resize(historgam_size);
-
-            for (size_t j = 0; j < historgam_size; ++j)
-            {
-                readBinary(histogram.data[j].byte, in);
-                readVarUInt(histogram.data[j].count, in);
-                histogram.total += histogram.data[j].count;
-            }
-        }
-    }
-};
-
-}

From e588463ce720bb4a8f799d1fb873ad6ec31804db Mon Sep 17 00:00:00 2001
From: Alexey Milovidov
Date: Wed, 7 Oct 2020 11:25:46 +0300
Subject: [PATCH 42/72] Fix build on my machine

---
 utils/test-data-generator/CMakeLists.txt | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/utils/test-data-generator/CMakeLists.txt b/utils/test-data-generator/CMakeLists.txt
index 20c37854c0a..80d7d4301e7 100644
--- a/utils/test-data-generator/CMakeLists.txt
+++ b/utils/test-data-generator/CMakeLists.txt
@@ -9,4 +9,10 @@ if (USE_PROTOBUF)
     target_link_libraries (ProtobufDelimitedMessagesSerializer PRIVATE ${Protobuf_LIBRARY} boost::program_options)
     get_filename_component(ProtobufDelimitedMessagesSerializer_OutputDir "${CMAKE_CURRENT_LIST_DIR}/../../tests/queries/0_stateless" REALPATH)
     target_compile_definitions(ProtobufDelimitedMessagesSerializer PRIVATE OUTPUT_DIR="${ProtobufDelimitedMessagesSerializer_OutputDir}")
+
+    # Protoc generates substandard code.
+    check_cxx_compiler_flag("-Wsuggest-destructor-override" HAS_SUGGEST_DESTRUCTOR_OVERRIDE)
+    if (HAS_SUGGEST_DESTRUCTOR_OVERRIDE)
+        target_compile_options(ProtobufDelimitedMessagesSerializer PRIVATE -Wno-suggest-destructor-override)
+    endif()
 endif ()

From e00583cf2baa414f5d7a5edc878493f3fe3aa701 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 7 Oct 2020 11:27:29 +0300
Subject: [PATCH 43/72] Build with gcc-9

---
 contrib/libhdfs3 | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/contrib/libhdfs3 b/contrib/libhdfs3
index 7d370205486..24b058c3567 160000
--- a/contrib/libhdfs3
+++ b/contrib/libhdfs3
@@ -1 +1 @@
-Subproject commit 7d370205486ff8c2216cb477c3021d49781f1122
+Subproject commit 24b058c356794ef6cc2d31323dc9adf0386652ff

From fba013334221a52fbb35f31c7c0f5c34a44cc98e Mon Sep 17 00:00:00 2001
From: alesapin
Date: Wed, 7 Oct 2020 12:38:03 +0300
Subject: [PATCH 44/72] Bump ci

From df02573c926dfb2ad384289c137e9e8a95957322 Mon Sep 17 00:00:00 2001
From: Nikolai Kochetov
Date: Wed, 7 Oct 2020 13:11:58 +0300
Subject: [PATCH 45/72] Fix ARRAY JOIN optimisation when reading from MV.
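The fix that follows has two halves: `ExpressionActions::splitActionsBeforeArrayJoin` stops force-projecting its inputs, and `StorageValues::read` (used when a block is pushed into a materialized view) returns only the columns the query actually asked for instead of the whole prepared block. A minimal sketch of the latter idea (illustrative only; a simplified free-function form of what the hunk below does inline):

```cpp
// Keep only the requested columns from a fully materialized block.
// getByName() throws if a required column is missing, which is the desired behavior.
Block projectToRequested(const Block & res_block, const Names & column_names)
{
    Block block;
    for (const auto & name : column_names)
        block.insert(res_block.getByName(name));
    return block;
}
```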
--- src/Interpreters/ExpressionActions.cpp | 4 ---- src/Storages/StorageValues.cpp | 9 +++++++-- 2 files changed, 7 insertions(+), 6 deletions(-) diff --git a/src/Interpreters/ExpressionActions.cpp b/src/Interpreters/ExpressionActions.cpp index 36c08d945eb..1325ff87d63 100644 --- a/src/Interpreters/ExpressionActions.cpp +++ b/src/Interpreters/ExpressionActions.cpp @@ -1114,10 +1114,6 @@ ExpressionActionsPtr ExpressionActions::splitActionsBeforeArrayJoin(const NameSe input_columns = split_actions->getSampleBlock().getNamesAndTypesList(); input_columns.insert(input_columns.end(), inputs_from_array_join.begin(), inputs_from_array_join.end()); - /// Remove not needed columns. - if (!actions.empty()) - prependProjectInput(); - return split_actions; } diff --git a/src/Storages/StorageValues.cpp b/src/Storages/StorageValues.cpp index 387d2065f92..a07d49fd3f8 100644 --- a/src/Storages/StorageValues.cpp +++ b/src/Storages/StorageValues.cpp @@ -32,8 +32,13 @@ Pipe StorageValues::read( { metadata_snapshot->check(column_names, getVirtuals(), getStorageID()); - Chunk chunk(res_block.getColumns(), res_block.rows()); - return Pipe(std::make_shared(res_block.cloneEmpty(), std::move(chunk))); + /// Get only required columns. + Block block; + for (const auto & name : column_names) + block.insert(res_block.getByName(name)); + + Chunk chunk(block.getColumns(), block.rows()); + return Pipe(std::make_shared(block.cloneEmpty(), std::move(chunk))); } } From cbf43a870de1733b15edf43178af9e38532f49f1 Mon Sep 17 00:00:00 2001 From: Nikolai Kochetov Date: Wed, 7 Oct 2020 13:43:06 +0300 Subject: [PATCH 46/72] Added test. --- ..._and_array_join_optimisation_bag.reference | 0 ...515_mv_and_array_join_optimisation_bag.sql | 48 +++++++++++++++++++ 2 files changed, 48 insertions(+) create mode 100644 tests/queries/0_stateless/01515_mv_and_array_join_optimisation_bag.reference create mode 100644 tests/queries/0_stateless/01515_mv_and_array_join_optimisation_bag.sql diff --git a/tests/queries/0_stateless/01515_mv_and_array_join_optimisation_bag.reference b/tests/queries/0_stateless/01515_mv_and_array_join_optimisation_bag.reference new file mode 100644 index 00000000000..e69de29bb2d diff --git a/tests/queries/0_stateless/01515_mv_and_array_join_optimisation_bag.sql b/tests/queries/0_stateless/01515_mv_and_array_join_optimisation_bag.sql new file mode 100644 index 00000000000..9cca5e85da7 --- /dev/null +++ b/tests/queries/0_stateless/01515_mv_and_array_join_optimisation_bag.sql @@ -0,0 +1,48 @@ +CREATE TABLE visits +( + `CounterID` UInt32, + `StartDate` Date, + `StartTime` DateTime, + `GoalsID` Array(UInt32), + `Sign` Int8 +) +ENGINE = Null; + + +CREATE MATERIALIZED VIEW goal_view TO goal +( + `CounterID` UInt32, + `StartDate` Date, + `GoalID` UInt32, + `Visits` AggregateFunction(sumIf, Int8, UInt8), + `GoalReaches` AggregateFunction(sum, Int8) +) AS +SELECT + CounterID, + StartDate, + GoalID, + sumIfState(Sign, _uniq = 1) AS Visits, + sumState(Sign) AS GoalReaches +FROM visits +ARRAY JOIN + GoalsID AS GoalID, + arrayEnumerateUniq(GoalsID) AS _uniq +GROUP BY + CounterID, + StartDate, + GoalID +ORDER BY + CounterID ASC, + StartDate ASC, + GoalID ASC; + +CREATE TABLE goal +( + `CounterID` UInt32, + `StartDate` Date, + `GoalID` UInt32, + `Visits` AggregateFunction(sumIf, Int8, UInt8), + `GoalReaches` AggregateFunction(sum, Int8) +) ENGINE = AggregatingMergeTree PARTITION BY toStartOfMonth(StartDate) ORDER BY (CounterID, StartDate, GoalID) SETTINGS index_granularity = 256; + +INSERT INTO visits 
(`CounterID`,`StartDate`,`StartTime`,`Sign`,`GoalsID`) VALUES (1, toDate('2000-01-01'), toDateTime(toDate('2000-01-01')), 1, [1]); From 403a5320f5d01e4f44a997b29a81fe8f83994e0a Mon Sep 17 00:00:00 2001 From: Artem Zuikov Date: Wed, 7 Oct 2020 14:32:57 +0300 Subject: [PATCH 47/72] enable bigint float division (#15691) --- src/Functions/divide.cpp | 6 +---- .../01440_big_int_arithm.reference | 11 +++++++++ .../0_stateless/01440_big_int_arithm.sql | 24 +++++++++---------- 3 files changed, 24 insertions(+), 17 deletions(-) diff --git a/src/Functions/divide.cpp b/src/Functions/divide.cpp index 34ba33effb4..aa36e23d344 100644 --- a/src/Functions/divide.cpp +++ b/src/Functions/divide.cpp @@ -5,7 +5,6 @@ namespace DB { namespace ErrorCodes { - extern const int NOT_IMPLEMENTED; extern const int LOGICAL_ERROR; } @@ -18,10 +17,7 @@ struct DivideFloatingImpl template static inline NO_SANITIZE_UNDEFINED Result apply(A a [[maybe_unused]], B b [[maybe_unused]]) { - if constexpr (is_big_int_v || is_big_int_v) - throw Exception("DivideFloatingImpl are not implemented for big integers", ErrorCodes::NOT_IMPLEMENTED); - else - return static_cast(a) / b; + return static_cast(a) / b; } #if USE_EMBEDDED_COMPILER diff --git a/tests/queries/0_stateless/01440_big_int_arithm.reference b/tests/queries/0_stateless/01440_big_int_arithm.reference index befa4e8368f..f500a9f67e2 100644 --- a/tests/queries/0_stateless/01440_big_int_arithm.reference +++ b/tests/queries/0_stateless/01440_big_int_arithm.reference @@ -42,3 +42,14 @@ 1 1 Int128 Int256 1 1 Int128 Int256 -1 -1 Int128 Int256 +1 1 Float64 Float64 +1 1 Float64 Float64 +1 1 Float64 Float64 +1 1 Float64 Float64 +-1 -1 Float64 Float64 +-1 -1 Float64 Float64 +-1 -1 Float64 Float64 +-1 -1 Float64 Float64 +1 1 Float64 Float64 +1 1 Float64 Float64 +-1 -1 Float64 Float64 diff --git a/tests/queries/0_stateless/01440_big_int_arithm.sql b/tests/queries/0_stateless/01440_big_int_arithm.sql index 0ac6a04ad76..3eec2b3cc62 100644 --- a/tests/queries/0_stateless/01440_big_int_arithm.sql +++ b/tests/queries/0_stateless/01440_big_int_arithm.sql @@ -58,16 +58,16 @@ select intDiv(toInt128(-1), toInt256(-1)) x, intDiv(toInt256(-1), toInt256(-1)) select intDiv(toInt128(-1), toUInt256(1)) x, intDiv(toInt256(-1), toUInt256(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) / toInt8(1)) x, (toInt256(-1) / toInt8(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) / toInt16(1)) x, (toInt256(-1) / toInt16(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) / toInt32(1)) x, (toInt256(-1) / toInt32(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) / toInt64(1)) x, (toInt256(-1) / toInt64(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) / toUInt8(1)) x, (toInt256(-1) / toUInt8(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) / toUInt16(1)) x, (toInt256(-1) / toUInt16(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) / toUInt32(1)) x, (toInt256(-1) / toUInt32(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) / toUInt64(1)) x, (toInt256(-1) / toUInt64(1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toInt8(-1)) x, (toInt256(-1) / toInt8(-1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toInt16(-1)) x, (toInt256(-1) / toInt16(-1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toInt32(-1)) x, (toInt256(-1) / toInt32(-1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toInt64(-1)) x, (toInt256(-1) / toInt64(-1)) y, toTypeName(x), toTypeName(y); +select 
(toInt128(-1) / toUInt8(1)) x, (toInt256(-1) / toUInt8(1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toUInt16(1)) x, (toInt256(-1) / toUInt16(1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toUInt32(1)) x, (toInt256(-1) / toUInt32(1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toUInt64(1)) x, (toInt256(-1) / toUInt64(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) * toInt128(1)) x, (toInt256(-1) * toInt128(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) * toInt256(1)) x, (toInt256(-1) * toInt256(1)) y, toTypeName(x), toTypeName(y); --- --select (toInt128(-1) * toUInt128(1)) x, (toInt256(-1) * toUInt128(1)) y, toTypeName(x), toTypeName(y); --- select (toInt128(-1) * toUInt256(1)) x, (toInt256(-1) * toUInt256(1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toInt128(-1)) x, (toInt256(-1) / toInt128(-1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toInt256(-1)) x, (toInt256(-1) / toInt256(-1)) y, toTypeName(x), toTypeName(y); +--select (toInt128(-1) / toUInt128(1)) x, (toInt256(-1) / toUInt128(1)) y, toTypeName(x), toTypeName(y); +select (toInt128(-1) / toUInt256(1)) x, (toInt256(-1) / toUInt256(1)) y, toTypeName(x), toTypeName(y); From cc451e41f5ebf3c482b2a3db62053a6e401032ec Mon Sep 17 00:00:00 2001 From: Anton Popov Date: Wed, 7 Oct 2020 18:36:34 +0300 Subject: [PATCH 48/72] fix assertion with transform_null_in --- src/Interpreters/ActionsVisitor.cpp | 24 ++++++++++++------- .../01507_transform_null_in.refernce | 8 +++++++ .../0_stateless/01507_transform_null_in.sql | 10 ++++++++ 3 files changed, 34 insertions(+), 8 deletions(-) create mode 100644 tests/queries/0_stateless/01507_transform_null_in.refernce create mode 100644 tests/queries/0_stateless/01507_transform_null_in.sql diff --git a/src/Interpreters/ActionsVisitor.cpp b/src/Interpreters/ActionsVisitor.cpp index 8905eef4528..34410736605 100644 --- a/src/Interpreters/ActionsVisitor.cpp +++ b/src/Interpreters/ActionsVisitor.cpp @@ -74,7 +74,7 @@ static size_t getTypeDepth(const DataTypePtr & type) } template -static Block createBlockFromCollection(const Collection & collection, const DataTypes & types, const Context & context) +static Block createBlockFromCollection(const Collection & collection, const DataTypes & types, bool transform_null_in) { size_t columns_num = types.size(); MutableColumns columns(columns_num); @@ -87,7 +87,8 @@ static Block createBlockFromCollection(const Collection & collection, const Data if (columns_num == 1) { auto field = convertFieldToType(value, *types[0]); - if (!field.isNull() || context.getSettingsRef().transform_null_in) + bool need_insert_null = transform_null_in && types[0]->isNullable(); + if (!field.isNull() || need_insert_null) columns[0]->insert(std::move(field)); } else @@ -110,7 +111,8 @@ static Block createBlockFromCollection(const Collection & collection, const Data for (; i < tuple_size; ++i) { tuple_values[i] = convertFieldToType(tuple[i], *types[i]); - if (tuple_values[i].isNull() && !context.getSettingsRef().transform_null_in) + bool need_insert_null = transform_null_in && types[i]->isNullable(); + if (tuple_values[i].isNull() && !need_insert_null) break; } @@ -155,6 +157,7 @@ static Block createBlockFromAST(const ASTPtr & node, const DataTypes & types, co DataTypePtr tuple_type; Row tuple_values; const auto & list = node->as(); + bool transform_null_in = context.getSettingsRef().transform_null_in; for (const auto & elem : list.children) { if (num_columns == 1) @@ -162,8 +165,9 @@ static Block 
createBlockFromAST(const ASTPtr & node, const DataTypes & types, co
 DataTypePtr tuple_type;
 Row tuple_values;
 const auto & list = node->as();
+    bool transform_null_in = context.getSettingsRef().transform_null_in;
 for (const auto & elem : list.children)
 {
 if (num_columns == 1)
 {
 /// One column at the left of IN.

 Field value = extractValueFromNode(elem, *types[0], context);
+            bool need_insert_null = transform_null_in && types[0]->isNullable();

-            if (!value.isNull() || context.getSettingsRef().transform_null_in)
+            if (!value.isNull() || need_insert_null)
 columns[0]->insert(value);
 }
 else if (elem->as() || elem->as())
@@ -217,9 +221,11 @@ static Block createBlockFromAST(const ASTPtr & node, const DataTypes & types, co
 Field value = tuple ? convertFieldToType((*tuple)[i], *types[i])
 : extractValueFromNode(func->arguments->children[i], *types[i], context);

+            bool need_insert_null = transform_null_in && types[i]->isNullable();
+
 /// If at least one of the elements of the tuple has an impossible (outside the range of the type) value,
 /// then the entire tuple too.
-            if (value.isNull() && !context.getSettings().transform_null_in)
+            if (value.isNull() && !need_insert_null)
 break;

 tuple_values[i] = value;
@@ -254,20 +260,22 @@ Block createBlockForSet(
 };

 Block block;
+    bool transform_null_in = context.getSettingsRef().transform_null_in;
+
 /// 1 in 1; (1, 2) in (1, 2); identity(tuple(tuple(tuple(1)))) in tuple(tuple(tuple(1))); etc.
 if (left_type_depth == right_type_depth)
 {
 Array array{right_arg_value};
-        block = createBlockFromCollection(array, set_element_types, context);
+        block = createBlockFromCollection(array, set_element_types, transform_null_in);
 }
 /// 1 in (1, 2); (1, 2) in ((1, 2), (3, 4)); etc.
 else if (left_type_depth + 1 == right_type_depth)
 {
 auto type_index = right_arg_type->getTypeId();
 if (type_index == TypeIndex::Tuple)
-            block = createBlockFromCollection(DB::get(right_arg_value), set_element_types, context);
+            block = createBlockFromCollection(DB::get(right_arg_value), set_element_types, transform_null_in);
 else if (type_index == TypeIndex::Array)
-            block = createBlockFromCollection(DB::get(right_arg_value), set_element_types, context);
+            block = createBlockFromCollection(DB::get(right_arg_value), set_element_types, transform_null_in);
 else
 throw_unsupported_type(right_arg_type);
 }
diff --git a/tests/queries/0_stateless/01507_transform_null_in.refernce b/tests/queries/0_stateless/01507_transform_null_in.refernce
new file mode 100644
index 00000000000..d7cc69cea12
--- /dev/null
+++ b/tests/queries/0_stateless/01507_transform_null_in.refernce
@@ -0,0 +1,8 @@
+1
+0
+1
+1
+0
+0
+1
+1
diff --git a/tests/queries/0_stateless/01507_transform_null_in.sql b/tests/queries/0_stateless/01507_transform_null_in.sql
new file mode 100644
index 00000000000..2377ccf43eb
--- /dev/null
+++ b/tests/queries/0_stateless/01507_transform_null_in.sql
@@ -0,0 +1,10 @@
+SET transform_null_in = 1;
+
+SELECT NULL IN NULL;
+SELECT 1 IN NULL;
+SELECT 1 IN (1, NULL);
+SELECT 1 IN tuple(1, NULL);
+SELECT (1, 2) IN (1, NULL);
+SELECT (1, 2) IN tuple(1, NULL);
+SELECT (1, 2) IN ((1, NULL), (1, 2));
+SELECT (1, NULL) IN (1, NULL);

From eb7f139cb15ed8d6464af31e13dfccb31e635acd Mon Sep 17 00:00:00 2001
From: Anton Popov
Date: Wed, 7 Oct 2020 19:27:04 +0300
Subject: [PATCH 49/72] Rename 01507_transform_null_in.refernce to 01507_transform_null_in.reference

---
 ...ansform_null_in.refernce => 01507_transform_null_in.reference} | 0
 1 file changed, 0 insertions(+), 0 deletions(-)
 rename tests/queries/0_stateless/{01507_transform_null_in.refernce => 01507_transform_null_in.reference} (100%)

diff --git a/tests/queries/0_stateless/01507_transform_null_in.refernce b/tests/queries/0_stateless/01507_transform_null_in.reference
similarity index
100%
rename from tests/queries/0_stateless/01507_transform_null_in.refernce
rename to tests/queries/0_stateless/01507_transform_null_in.reference

From eb1cb506c9eca1f3937ee2fa14e0044f46915cf1 Mon Sep 17 00:00:00 2001
From: Artem Zuikov
Date: Wed, 7 Oct 2020 20:20:07 +0300
Subject: [PATCH 50/72] minor code improvement (#15668)

---
 src/Core/DecimalFunctions.h | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/src/Core/DecimalFunctions.h b/src/Core/DecimalFunctions.h
index cd5a2b5a670..52e8ebae322 100644
--- a/src/Core/DecimalFunctions.h
+++ b/src/Core/DecimalFunctions.h
@@ -152,12 +152,14 @@ inline typename DecimalType::NativeType getFractionalPartWithScaleMultiplier(
 {
     using T = typename DecimalType::NativeType;

-    T result = decimal.value;
+    /// There's UB with the min integer value here. But it does not matter for Decimals, because they do not use the full integer range.
+    /// In any case, we apply the modulo before the comparison, so that the result for scale_multiplier > 1 is unaffected.
+    T result = decimal.value % scale_multiplier;
     if constexpr (!keep_sign)
         if (result < T(0))
             result = -result;

-    return result % scale_multiplier;
+    return result;
 }

 /** Get fractional part from decimal

From cc9dafcb309f750cb427a639a17d324f662ab17b Mon Sep 17 00:00:00 2001
From: Anna <42538400+adevyatova@users.noreply.github.com>
Date: Wed, 7 Oct 2020 21:13:01 +0300
Subject: [PATCH 51/72] DOCSUP-2614: Added description for isDecimalOverflow and countDigits function (#15109)

* Added description for isDecimalOverflow and countDigits function
* Update docs/en/sql-reference/data-types/decimal.md
Co-authored-by: BayoNet
* Update docs/en/sql-reference/data-types/decimal.md
Co-authored-by: BayoNet
* Update docs/en/sql-reference/data-types/decimal.md
Co-authored-by: BayoNet
* Translated on russian
* Move to functions/other-functions.md
* Fixed errors
* Update docs/ru/sql-reference/functions/other-functions.md
Co-authored-by: BayoNet
* Update docs/ru/sql-reference/functions/other-functions.md
Co-authored-by: BayoNet
* Update docs/ru/sql-reference/functions/other-functions.md
Co-authored-by: BayoNet
* Update docs/ru/sql-reference/functions/other-functions.md
Co-authored-by: BayoNet
* Update docs/ru/sql-reference/functions/other-functions.md
Co-authored-by: BayoNet
Co-authored-by: BayoNet

---
 docs/en/sql-reference/data-types/decimal.md |  4 +
 .../functions/other-functions.md            | 75 +++++++++++++++++++
 docs/ru/sql-reference/data-types/decimal.md |  5 ++
 .../functions/other-functions.md            | 75 +++++++++++++++++++
 4 files changed, 159 insertions(+)

diff --git a/docs/en/sql-reference/data-types/decimal.md b/docs/en/sql-reference/data-types/decimal.md
index fe80e735667..f4c39832661 100644
--- a/docs/en/sql-reference/data-types/decimal.md
+++ b/docs/en/sql-reference/data-types/decimal.md
@@ -107,4 +107,8 @@ SELECT toDecimal32(1, 8) < 100

 DB::Exception: Can't compare.
```
+**See also**
+- [isDecimalOverflow](../../sql-reference/functions/other-functions.md#is-decimal-overflow)
+- [countDigits](../../sql-reference/functions/other-functions.md#count-digits)
+
 [Original article](https://clickhouse.tech/docs/en/data_types/decimal/)
diff --git a/docs/en/sql-reference/functions/other-functions.md b/docs/en/sql-reference/functions/other-functions.md
index 3d06bbfff9e..00e00fa9bad 100644
--- a/docs/en/sql-reference/functions/other-functions.md
+++ b/docs/en/sql-reference/functions/other-functions.md
@@ -1526,5 +1526,80 @@ SELECT getSetting('custom_a');

 - [Custom Settings](../../operations/settings/index.md#custom_settings)

+## isDecimalOverflow {#is-decimal-overflow}
+
+Checks whether the [Decimal](../../sql-reference/data-types/decimal.md#decimalp-s-decimal32s-decimal64s-decimal128s) value is out of its (or specified) precision.
+
+**Syntax**
+
+``` sql
+isDecimalOverflow(d, [p])
+```
+
+**Parameters**
+
+- `d` — value. [Decimal](../../sql-reference/data-types/decimal.md#decimalp-s-decimal32s-decimal64s-decimal128s).
+- `p` — precision. Optional. If omitted, the initial precision of the first argument is used. Using this parameter can be helpful for data extraction to another DBMS or file. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges).
+
+**Returned values**
+
+- `1` — Decimal value has more digits than its precision allows,
+- `0` — Decimal value satisfies the specified precision.
+
+**Example**
+
+Query:
+
+``` sql
+SELECT isDecimalOverflow(toDecimal32(1000000000, 0), 9),
+       isDecimalOverflow(toDecimal32(1000000000, 0)),
+       isDecimalOverflow(toDecimal32(-1000000000, 0), 9),
+       isDecimalOverflow(toDecimal32(-1000000000, 0));
+```
+
+Result:
+
+``` text
+1 1 1 1
+```
+
+## countDigits {#count-digits}
+
+Returns the number of decimal digits you need to represent the value.
+
+**Syntax**
+
+``` sql
+countDigits(x)
+```
+
+**Parameters**
+
+- `x` — [Int](../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64) or [Decimal](../../sql-reference/data-types/decimal.md#decimalp-s-decimal32s-decimal64s-decimal128s) value.
+
+**Returned value**
+
+Number of digits.
+
+Type: [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges).
+
+ !!! note "Note"
+ For `Decimal` values it takes their scale into account: the result is calculated over the underlying integer type, which holds `(value * scale)`. For example: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. I.e. you may check decimal overflow for `Decimal64` with `countDigits(x) > 18`. It's a slow variant of [isDecimalOverflow](#is-decimal-overflow).
+
+**Example**
+
+Query:
+
+``` sql
+SELECT countDigits(toDecimal32(1, 9)), countDigits(toDecimal32(-1, 9)),
+       countDigits(toDecimal64(1, 18)), countDigits(toDecimal64(-1, 18)),
+       countDigits(toDecimal128(1, 38)), countDigits(toDecimal128(-1, 38));
+```
+
+Result:
+
+``` text
+10 10 19 19 39 39
+```

 [Original article](https://clickhouse.tech/docs/en/query_language/functions/other_functions/)
diff --git a/docs/ru/sql-reference/data-types/decimal.md b/docs/ru/sql-reference/data-types/decimal.md
index 5e22fcccadc..aa2886d7ec0 100644
--- a/docs/ru/sql-reference/data-types/decimal.md
+++ b/docs/ru/sql-reference/data-types/decimal.md
@@ -102,4 +102,9 @@ SELECT toDecimal32(1, 8) < 100

 DB::Exception: Can't compare.
```
+**Смотрите также**
+- [isDecimalOverflow](../../sql-reference/functions/other-functions.md#is-decimal-overflow)
+- [countDigits](../../sql-reference/functions/other-functions.md#count-digits)
+
+
 [Оригинальная статья](https://clickhouse.tech/docs/ru/data_types/decimal/)
diff --git a/docs/ru/sql-reference/functions/other-functions.md b/docs/ru/sql-reference/functions/other-functions.md
index 7b9dacf21cd..f162df4a703 100644
--- a/docs/ru/sql-reference/functions/other-functions.md
+++ b/docs/ru/sql-reference/functions/other-functions.md
@@ -1431,5 +1431,80 @@ SELECT randomStringUTF8(13)
 ```

+
+## isDecimalOverflow {#is-decimal-overflow}
+
+Проверяет, находится ли число [Decimal](../../sql-reference/data-types/decimal.md#decimalp-s-decimal32s-decimal64s-decimal128s) вне собственной (или заданной) области значений.
+
+**Синтаксис**
+
+``` sql
+isDecimalOverflow(d, [p])
+```
+
+**Параметры**
+
+- `d` — число. [Decimal](../../sql-reference/data-types/decimal.md#decimalp-s-decimal32s-decimal64s-decimal128s).
+- `p` — точность. Необязательный параметр. Если опущен, используется исходная точность первого аргумента. Использование этого параметра может быть полезно для извлечения данных в другую СУБД или файл. [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges).
+
+**Возвращаемое значение**
+
+- `1` — число имеет больше цифр, чем позволяет точность.
+- `0` — число удовлетворяет заданной точности.
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT isDecimalOverflow(toDecimal32(1000000000, 0), 9),
+       isDecimalOverflow(toDecimal32(1000000000, 0)),
+       isDecimalOverflow(toDecimal32(-1000000000, 0), 9),
+       isDecimalOverflow(toDecimal32(-1000000000, 0));
+```
+
+Результат:
+
+``` text
+1 1 1 1
+```
+
+## countDigits {#count-digits}
+
+Возвращает количество десятичных цифр, необходимых для представления значения.
+
+**Синтаксис**
+
+``` sql
+countDigits(x)
+```
+
+**Параметры**
+
+- `x` — [целое](../../sql-reference/data-types/int-uint.md#uint8-uint16-uint32-uint64-int8-int16-int32-int64) или [дробное](../../sql-reference/data-types/decimal.md#decimalp-s-decimal32s-decimal64s-decimal128s) число.
+
+**Возвращаемое значение**
+
+Количество цифр.
+
+Тип: [UInt8](../../sql-reference/data-types/int-uint.md#uint-ranges).
+
+ !!! note "Примечание"
+ Для `Decimal` значений учитывается их масштаб: вычисляется результат по базовому целочисленному типу, полученному как `(value * scale)`. Например: `countDigits(42) = 2`, `countDigits(42.000) = 5`, `countDigits(0.04200) = 4`. То есть вы можете проверить десятичное переполнение для `Decimal64` с помощью `countDigits(x) > 18`. Это медленный вариант [isDecimalOverflow](#is-decimal-overflow).
+
+**Пример**
+
+Запрос:
+
+``` sql
+SELECT countDigits(toDecimal32(1, 9)), countDigits(toDecimal32(-1, 9)),
+       countDigits(toDecimal64(1, 18)), countDigits(toDecimal64(-1, 18)),
+       countDigits(toDecimal128(1, 38)), countDigits(toDecimal128(-1, 38));
+```
+
+Результат:
+
+``` text
+10	10	19	19	39	39
+```
 
 [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/functions/other_functions/)

From 553f6aa4cdc5273174e83113b371546022237b04 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Wed, 7 Oct 2020 21:30:53 +0300
Subject: [PATCH 52/72] Remove unused functions

---
 src/Common/ThreadProfileEvents.h | 14 --------------
 1 file changed, 14 deletions(-)

diff --git a/src/Common/ThreadProfileEvents.h b/src/Common/ThreadProfileEvents.h
index 69db595b426..9ebb5fbc2c6 100644
--- a/src/Common/ThreadProfileEvents.h
+++ b/src/Common/ThreadProfileEvents.h
@@ -82,13 +82,6 @@ inline UInt64 getCurrentTimeNanoseconds(clockid_t clock_type = CLOCK_MONOTONIC)
     return ts.tv_sec * 1000000000ULL + ts.tv_nsec;
 }
 
-inline UInt64 getCurrentTimeMicroseconds()
-{
-    struct timeval tv;
-    gettimeofday(&tv, nullptr);
-    return (tv.tv_sec) * 1000000U + (tv.tv_usec);
-}
-
 struct RUsageCounters
 {
     /// In nanoseconds
@@ -115,13 +108,6 @@ struct RUsageCounters
         hard_page_faults = static_cast<UInt64>(rusage.ru_majflt);
     }
 
-    static RUsageCounters zeros(UInt64 real_time_ = getCurrentTimeNanoseconds())
-    {
-        RUsageCounters res;
-        res.real_time = real_time_;
-        return res;
-    }
-
     static RUsageCounters current(UInt64 real_time_ = getCurrentTimeNanoseconds())
     {
         ::rusage rusage {};

From c0e15ba348dae4030bd592ca1635fb3b09043afc Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Wed, 7 Oct 2020 21:32:36 +0300
Subject: [PATCH 53/72] Fix RealTimeMicroseconds ProfileEvents

RUsageCounters::real_time (RealTimeMicroseconds) was constructed from
CLOCK_REALTIME (or similar) in the ThreadStatus::initPerformanceCounters()
(ThreadStatusExt.cpp), while the intention is to store CLOCK_MONOTONIC
(time from boot, i.e. /proc/uptime) values there (in ThreadProfileEvents.h)
---
 src/Interpreters/ThreadStatusExt.cpp | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/Interpreters/ThreadStatusExt.cpp b/src/Interpreters/ThreadStatusExt.cpp
index 61245782ba9..d1d206009c5 100644
--- a/src/Interpreters/ThreadStatusExt.cpp
+++ b/src/Interpreters/ThreadStatusExt.cpp
@@ -171,7 +171,8 @@ void ThreadStatus::initPerformanceCounters()
     query_start_time_microseconds = time_in_microseconds(now);
     ++queries_started;
 
-    *last_rusage = RUsageCounters::current(query_start_time_nanoseconds);
+    // query_start_time_nanoseconds cannot be used here since RUsageCounters expect CLOCK_MONOTONIC
+    *last_rusage = RUsageCounters::current();
 
     if (query_context)
     {

From 3cd71f3357e0a1ac951c08a5988b585abbdf9d60 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Wed, 7 Oct 2020 21:40:20 +0300
Subject: [PATCH 54/72] Cleanup interceptors for librdkafka in StorageKafka

Wrap them into StorageKafkaInterceptors to allow access to private fields
and add logging inside interceptors if something fails.

This is also preparation for ThreadStatus interceptor.
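For context on the mechanism the next diff sets up (a simplified sketch, not part of the patch): librdkafka interceptors are registered per configuration object, and cppkafka copies that configuration internally, so the `on_conf_dup` hook has to re-register the whole chain on every copy or it silently disappears. Only the function names are taken from the patch; the flow itself is illustrative.

```cpp
// Illustrative registration flow (error handling omitted; `self` is the StorageKafka *):
rd_kafka_conf_t * conf = rd_kafka_conf_new();

// Registered once on the original conf:
rd_kafka_conf_interceptor_add_on_new(conf, "init", StorageKafkaInterceptors::rdKafkaOnNew, self);
rd_kafka_conf_interceptor_add_on_conf_dup(conf, "init", StorageKafkaInterceptors::rdKafkaOnConfDup, self);

// cppkafka then duplicates the conf; each copy triggers rdKafkaOnConfDup, which must
// re-add both hooks on the new conf, otherwise the chain is lost after the first copy.
// Finally, rd_kafka_new() consumes a conf and triggers rdKafkaOnNew, which installs the
// per-thread hook (rdKafkaOnThreadStart) that names librdkafka threads after the table.
```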
---
 src/Storages/Kafka/StorageKafka.cpp | 111 +++++++++++++++++-----------
 src/Storages/Kafka/StorageKafka.h   |   5 +-
 2 files changed, 71 insertions(+), 45 deletions(-)

diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp
index 9ba5ad7a65b..68f3cf6b8f1 100644
--- a/src/Storages/Kafka/StorageKafka.cpp
+++ b/src/Storages/Kafka/StorageKafka.cpp
@@ -50,6 +50,67 @@ namespace ErrorCodes
     extern const int NUMBER_OF_ARGUMENTS_DOESNT_MATCH;
 }
 
+struct StorageKafkaInterceptors
+{
+    static rd_kafka_resp_err_t rdKafkaOnThreadStart(rd_kafka_t *, rd_kafka_thread_type_t thread_type, const char *, void * ctx)
+    {
+        StorageKafka * self = reinterpret_cast<StorageKafka *>(ctx);
+
+        const auto & storage_id = self->getStorageID();
+        const auto & table = storage_id.getTableName();
+
+        switch (thread_type)
+        {
+            case RD_KAFKA_THREAD_MAIN:
+                setThreadName(("rdk:m/" + table.substr(0, 9)).c_str());
+                break;
+            case RD_KAFKA_THREAD_BACKGROUND:
+                setThreadName(("rdk:bg/" + table.substr(0, 8)).c_str());
+                break;
+            case RD_KAFKA_THREAD_BROKER:
+                setThreadName(("rdk:b/" + table.substr(0, 9)).c_str());
+                break;
+        }
+
+        return RD_KAFKA_RESP_ERR_NO_ERROR;
+    }
+
+    static rd_kafka_resp_err_t rdKafkaOnNew(rd_kafka_t * rk, const rd_kafka_conf_t *, void * ctx, char * /*errstr*/, size_t /*errstr_size*/)
+    {
+        StorageKafka * self = reinterpret_cast<StorageKafka *>(ctx);
+        rd_kafka_resp_err_t status;
+
+        status = rd_kafka_interceptor_add_on_thread_start(rk, "init-thread", rdKafkaOnThreadStart, ctx);
+        if (status != RD_KAFKA_RESP_ERR_NO_ERROR)
+        {
+            LOG_ERROR(self->log, "Cannot set on thread start interceptor due to {} error", status);
+            return status;
+        }
+
+        return status;
+    }
+
+    static rd_kafka_resp_err_t rdKafkaOnConfDup(rd_kafka_conf_t * new_conf, const rd_kafka_conf_t * /*old_conf*/, size_t /*filter_cnt*/, const char ** /*filter*/, void * ctx)
+    {
+        StorageKafka * self = reinterpret_cast<StorageKafka *>(ctx);
+        rd_kafka_resp_err_t status;
+
+        // cppkafka copies configuration multiple times
+        status = rd_kafka_conf_interceptor_add_on_conf_dup(new_conf, "init", rdKafkaOnConfDup, ctx);
+        if (status != RD_KAFKA_RESP_ERR_NO_ERROR)
+        {
+            LOG_ERROR(self->log, "Cannot set on conf dup interceptor due to {} error", status);
+            return status;
+        }
+
+        status = rd_kafka_conf_interceptor_add_on_new(new_conf, "init", rdKafkaOnNew, ctx);
+        if (status != RD_KAFKA_RESP_ERR_NO_ERROR)
+            LOG_ERROR(self->log, "Cannot set on conf new interceptor due to {} error", status);
+
+        return status;
+    }
+};
+
 namespace
 {
     const auto RESCHEDULE_MS = 500;
@@ -76,46 +137,6 @@ namespace
             conf.set(key_name, config.getString(key_path));
         }
     }
-
-    rd_kafka_resp_err_t rdKafkaOnThreadStart(rd_kafka_t *, rd_kafka_thread_type_t thread_type, const char *, void * ctx)
-    {
-        StorageKafka * self = reinterpret_cast<StorageKafka *>(ctx);
-
-        const auto & storage_id = self->getStorageID();
-        const auto & table = storage_id.getTableName();
-
-        switch (thread_type)
-        {
-            case RD_KAFKA_THREAD_MAIN:
-                setThreadName(("rdk:m/" + table.substr(0, 9)).c_str());
-                break;
-            case RD_KAFKA_THREAD_BACKGROUND:
-                setThreadName(("rdk:bg/" + table.substr(0, 8)).c_str());
-                break;
-            case RD_KAFKA_THREAD_BROKER:
-                setThreadName(("rdk:b/" + table.substr(0, 9)).c_str());
-                break;
-        }
-        return RD_KAFKA_RESP_ERR_NO_ERROR;
-    }
-
-    rd_kafka_resp_err_t rdKafkaOnNew(rd_kafka_t * rk, const rd_kafka_conf_t *, void * ctx, char * /*errstr*/, size_t /*errstr_size*/)
-    {
-        return rd_kafka_interceptor_add_on_thread_start(rk, "setThreadName", rdKafkaOnThreadStart, ctx);
-    }
-
-    rd_kafka_resp_err_t rdKafkaOnConfDup(rd_kafka_conf_t * new_conf, const rd_kafka_conf_t * /*old_conf*/, size_t /*filter_cnt*/, const char ** /*filter*/, void * ctx)
-    {
-        rd_kafka_resp_err_t status;
-
-        // cppkafka copies configuration multiple times
-        status = rd_kafka_conf_interceptor_add_on_conf_dup(new_conf, "setThreadName", rdKafkaOnConfDup, ctx);
-        if (status != RD_KAFKA_RESP_ERR_NO_ERROR)
-            return status;
-
-        status = rd_kafka_conf_interceptor_add_on_new(new_conf, "setThreadName", rdKafkaOnNew, ctx);
-        return status;
-    }
 }
 
 StorageKafka::StorageKafka(
@@ -443,14 +464,16 @@ void StorageKafka::updateConfiguration(cppkafka::Configuration & conf)
 
         int status;
 
-        status = rd_kafka_conf_interceptor_add_on_new(conf.get_handle(), "setThreadName", rdKafkaOnNew, self);
+        status = rd_kafka_conf_interceptor_add_on_new(conf.get_handle(),
+            "init", StorageKafkaInterceptors::rdKafkaOnNew, self);
         if (status != RD_KAFKA_RESP_ERR_NO_ERROR)
-            LOG_ERROR(log, "Cannot set new interceptor");
+            LOG_ERROR(log, "Cannot set new interceptor due to {} error", status);
 
         // cppkafka always copy the configuration
-        status = rd_kafka_conf_interceptor_add_on_conf_dup(conf.get_handle(), "setThreadName", rdKafkaOnConfDup, self);
+        status = rd_kafka_conf_interceptor_add_on_conf_dup(conf.get_handle(),
+            "init", StorageKafkaInterceptors::rdKafkaOnConfDup, self);
         if (status != RD_KAFKA_RESP_ERR_NO_ERROR)
-            LOG_ERROR(log, "Cannot set dup conf interceptor");
+            LOG_ERROR(log, "Cannot set dup conf interceptor due to {} error", status);
     }
 }
 
diff --git a/src/Storages/Kafka/StorageKafka.h b/src/Storages/Kafka/StorageKafka.h
index 272e419bebe..7e972844883 100644
--- a/src/Storages/Kafka/StorageKafka.h
+++ b/src/Storages/Kafka/StorageKafka.h
@@ -23,12 +23,16 @@ class Configuration;
 namespace DB
 {
 
+struct StorageKafkaInterceptors;
+
 /** Implements a Kafka queue table engine that can be used as a persistent queue / buffer,
   * or as a basic building block for creating pipelines with a continuous insertion / ETL.
   */
 class StorageKafka final : public ext::shared_ptr_helper<StorageKafka>, public IStorage
 {
     friend struct ext::shared_ptr_helper<StorageKafka>;
+    friend struct StorageKafkaInterceptors;
+
 public:
     std::string getName() const override { return "Kafka"; }
 
@@ -109,7 +113,6 @@ private:
     ConsumerBufferPtr createReadBuffer(const size_t consumer_number);
 
     // Update Kafka configuration with values from CH user configuration.
-
     void updateConfiguration(cppkafka::Configuration & conf);
 
     void threadFunc(size_t idx);
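A side note on the `substr` bounds used in the thread-naming code above (an illustrative check, not part of the change): Linux caps thread names at 15 visible characters, and each prefix plus its table-name slice fills that budget exactly.

```cpp
// 15 == TASK_COMM_LEN - 1 (the kernel's visible thread-name limit).
static_assert(sizeof("rdk:m/") - 1 + 9 == 15);   // main thread
static_assert(sizeof("rdk:bg/") - 1 + 8 == 15);  // background thread
static_assert(sizeof("rdk:b/") - 1 + 9 == 15);   // broker thread
```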
From e0bc66942398adaa9d16951684daa6066ced0812 Mon Sep 17 00:00:00 2001
From: Anton Popov
Date: Wed, 7 Oct 2020 21:53:34 +0300
Subject: [PATCH 55/72] run check with wide parts

---
 tests/clickhouse-test                       | 9 +++++++++
 tests/config/config.d/polymorphic_parts.xml | 2 +-
 tests/queries/skip_list.json                | 2 ++
 3 files changed, 12 insertions(+), 1 deletion(-)

diff --git a/tests/clickhouse-test b/tests/clickhouse-test
index 348df2f93be..be8f9551db2 100755
--- a/tests/clickhouse-test
+++ b/tests/clickhouse-test
@@ -512,6 +512,15 @@ def collect_build_flags(client):
     else:
         raise Exception("Cannot get information about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
 
+    clickhouse_proc = Popen(shlex.split(client), stdin=PIPE, stdout=PIPE, stderr=PIPE)
+    (stdout, stderr) = clickhouse_proc.communicate(b"SELECT value FROM system.merge_tree_settings WHERE name = 'min_bytes_for_wide_part'")
+
+    if clickhouse_proc.returncode == 0:
+        if stdout == b'0\n':
+            result.append(BuildFlags.POLYMORPHIC_PARTS)
+    else:
+        raise Exception("Cannot get information about build from server errorcode {}, stderr {}".format(clickhouse_proc.returncode, stderr))
+
     return result
 
diff --git a/tests/config/config.d/polymorphic_parts.xml b/tests/config/config.d/polymorphic_parts.xml
index 2924aa5c69d..5f06bf532db 100644
--- a/tests/config/config.d/polymorphic_parts.xml
+++ b/tests/config/config.d/polymorphic_parts.xml
@@ -1,5 +1,5 @@
 <yandex>
     <merge_tree>
-        <min_bytes_for_wide_part>10485760</min_bytes_for_wide_part>
+        <min_bytes_for_wide_part>0</min_bytes_for_wide_part>
     </merge_tree>
 </yandex>
diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json
index ac8bd77fcac..541a5e2111f 100644
--- a/tests/queries/skip_list.json
+++ b/tests/queries/skip_list.json
@@ -98,5 +98,7 @@
         "00609_mv_index_in_in",
         "00510_materizlized_view_and_deduplication_zookeeper",
         "00738_lock_for_inner_table"
+    ],
+    "polymorphic-parts": [
     ]
 }

From ffb19fa4bc484f27549b74585b83680f03f7b336 Mon Sep 17 00:00:00 2001
From: Alexander Tokmakov
Date: Wed, 7 Oct 2020 22:47:31 +0300
Subject: [PATCH 56/72] fix

---
 src/Storages/StorageReplicatedMergeTree.cpp | 29 ++++++++++++++++-----
 1 file changed, 22 insertions(+), 7 deletions(-)

diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 35e0c9150ac..a9669151958 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -721,7 +721,10 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con
         throw Exception("Table was not dropped because ZooKeeper session has expired.", ErrorCodes::TABLE_WAS_NOT_DROPPED);
 
     auto remote_replica_path = zookeeper_path + "/replicas/" + replica;
-    LOG_INFO(logger, "Removing replica {}", remote_replica_path);
+    LOG_INFO(logger, "Removing replica {}, marking it as lost", remote_replica_path);
+    /// Mark itself lost before removing, because the following recursive removal may fail
+    /// and partially dropped replica may be considered as alive one (until someone will mark it lost)
+    zookeeper->set(zookeeper_path + "/replicas/" + replica + "/is_lost", "1");
     /// It may left some garbage if replica_path subtree are concurrently modified
     zookeeper->tryRemoveRecursive(remote_replica_path);
     if (zookeeper->exists(remote_replica_path))
@@ -2392,8 +2395,8 @@ void StorageReplicatedMergeTree::cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zooke
     size_t max_log_entry = 0;
     if (!log_entries.empty())
     {
-        String last_entry_num = std::max_element(log_entries.begin(), log_entries.end())->substr(4);
-        max_log_entry = std::stol(last_entry_num);
+        String last_entry = *std::max_element(log_entries.begin(), log_entries.end());
+        max_log_entry = parse<UInt64>(last_entry.substr(strlen("log-")));
     }
     /// log_pointer can point to future entry, which was not created yet
     ++max_log_entry;
@@ -2402,6 +2405,7 @@ void StorageReplicatedMergeTree::cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zooke
     String source_replica;
     Coordination::Stat source_is_lost_stat;
     size_t future_num = 0;
+
     for (const String & source_replica_name : replicas)
     {
         if (source_replica_name == replica_name)
@@ -2411,17 +2415,27 @@ void StorageReplicatedMergeTree::cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zooke
         auto get_log_pointer = futures[future_num++].get();
         auto get_queue = futures[future_num++].get();
 
-        if (get_is_lost.error == Coordination::Error::ZNONODE)
+        if (get_is_lost.error != Coordination::Error::ZOK)
         {
-            /// For compatibility with older ClickHouse versions
-            get_is_lost.stat.version = -1;
+            LOG_INFO(log, "Not cloning {}, cannot get '/is_lost': {}", source_replica_name, Coordination::errorMessage(get_is_lost.error));
+            continue;
         }
         else if (get_is_lost.data != "0")
+        {
+            LOG_INFO(log, "Not cloning {}, it's lost", source_replica_name);
             continue;
+        }
 
         if (get_log_pointer.error != Coordination::Error::ZOK)
+        {
+            LOG_INFO(log, "Not cloning {}, cannot get '/log_pointer': {}", source_replica_name, Coordination::errorMessage(get_log_pointer.error));
             continue;
+        }
         if (get_queue.error != Coordination::Error::ZOK)
+        {
+            LOG_INFO(log, "Not cloning {}, cannot get '/queue': {}", source_replica_name, Coordination::errorMessage(get_queue.error));
             continue;
+        }
 
         /// Replica is not lost and we can clone it. Let's calculate approx replication lag.
         size_t source_log_pointer = get_log_pointer.data.empty() ? 0 : parse<UInt64>(get_log_pointer.data);
@@ -2429,7 +2443,8 @@ void StorageReplicatedMergeTree::cloneReplicaIfNeeded(zkutil::ZooKeeperPtr zooke
         size_t replica_queue_lag = max_log_entry - source_log_pointer;
         size_t replica_queue_size = get_queue.stat.numChildren;
         size_t replication_lag = replica_queue_lag + replica_queue_size;
-        LOG_INFO(log, "Replica {} has approximate {} queue lag and {} queue size", source_replica_name, replica_queue_lag, replica_queue_size);
+        LOG_INFO(log, "Replica {} has log pointer '{}', approximate {} queue lag and {} queue size",
+            source_replica_name, get_log_pointer.data, replica_queue_lag, replica_queue_size);
         if (replication_lag < min_replication_lag)
         {
             source_replica = source_replica_name;
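To spell out the arithmetic this commit touches (a standalone sketch with hypothetical inputs): replication log entries are named `log-<zero-padded number>`, and a candidate source replica's lag is estimated as the distance from its log pointer to the newest entry plus whatever already sits in its queue.

```cpp
#include <cstdint>
#include <string>

// Sketch of the estimate used when picking the least-lagged replica to clone from.
uint64_t estimateReplicationLag(
    const std::string & last_log_entry, // e.g. "log-0000000123"
    uint64_t source_log_pointer,        // next log entry the source will process
    uint64_t source_queue_size)         // entries already queued on the source
{
    // Strip the "log-" prefix; +1 because the pointer may target a future entry.
    const uint64_t max_log_entry = std::stoull(last_log_entry.substr(4)) + 1;
    return (max_log_entry - source_log_pointer) + source_queue_size;
}
```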
From 1f51de362f7d6511c6af67d82b0fb6a953be2962 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Wed, 7 Oct 2020 21:40:20 +0300
Subject: [PATCH 57/72] Take memory allocations from librdkafka threads into
 account

---
 src/Storages/Kafka/StorageKafka.cpp | 29 +++++++++++++++++++++++++++++
 src/Storages/Kafka/StorageKafka.h   |  5 +++++
 2 files changed, 34 insertions(+)

diff --git a/src/Storages/Kafka/StorageKafka.cpp b/src/Storages/Kafka/StorageKafka.cpp
index 68f3cf6b8f1..02b34c6b3f2 100644
--- a/src/Storages/Kafka/StorageKafka.cpp
+++ b/src/Storages/Kafka/StorageKafka.cpp
@@ -72,6 +72,31 @@ struct StorageKafkaInterceptors
                 break;
         }
 
+        /// Create ThreadStatus to track memory allocations from librdkafka threads.
+        ///
+        /// And store them in a separate list (thread_statuses) to make sure that they will be destroyed,
+        /// regardless how librdkafka calls the hooks.
+        /// But this can trigger use-after-free if librdkafka will not destroy threads after rd_kafka_wait_destroyed()
+        auto thread_status = std::make_shared<ThreadStatus>();
+        std::lock_guard lock(self->thread_statuses_mutex);
+        self->thread_statuses.emplace_back(std::move(thread_status));
+
+        return RD_KAFKA_RESP_ERR_NO_ERROR;
+    }
+    static rd_kafka_resp_err_t rdKafkaOnThreadExit(rd_kafka_t *, rd_kafka_thread_type_t, const char *, void * ctx)
+    {
+        StorageKafka * self = reinterpret_cast<StorageKafka *>(ctx);
+
+        std::lock_guard lock(self->thread_statuses_mutex);
+        const auto it = std::find_if(self->thread_statuses.begin(), self->thread_statuses.end(), [](const auto & thread_status_ptr)
+        {
+            return thread_status_ptr.get() == current_thread;
+        });
+        if (it == self->thread_statuses.end())
+            throw Exception("No thread status for this librdkafka thread.", ErrorCodes::LOGICAL_ERROR);
+
+        self->thread_statuses.erase(it);
+
         return RD_KAFKA_RESP_ERR_NO_ERROR;
     }
 
@@ -87,6 +112,10 @@ struct StorageKafkaInterceptors
             return status;
         }
 
+        status = rd_kafka_interceptor_add_on_thread_exit(rk, "exit-thread", rdKafkaOnThreadExit, ctx);
+        if (status != RD_KAFKA_RESP_ERR_NO_ERROR)
+            LOG_ERROR(self->log, "Cannot set on thread exit interceptor due to {} error", status);
+
         return status;
     }
 
diff --git a/src/Storages/Kafka/StorageKafka.h b/src/Storages/Kafka/StorageKafka.h
index 7e972844883..52d8651b064 100644
--- a/src/Storages/Kafka/StorageKafka.h
+++ b/src/Storages/Kafka/StorageKafka.h
@@ -11,6 +11,7 @@
 #include <Poco/Semaphore.h>
 #include <ext/shared_ptr_helper.h>
 
+#include <list>
 #include <mutex>
 
 namespace cppkafka
@@ -109,6 +110,10 @@ private:
     std::vector<std::shared_ptr<TaskContext>> tasks;
     bool thread_per_consumer = false;
 
+    /// For memory accounting in the librdkafka threads.
+    std::mutex thread_statuses_mutex;
+    std::list<std::shared_ptr<ThreadStatus>> thread_statuses;
+
     SettingsChanges createSettingsAdjustments();
     ConsumerBufferPtr createReadBuffer(const size_t consumer_number);
 
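The ownership scheme behind `thread_statuses` boils down to a small reusable pattern (a minimal sketch, independent of ClickHouse's `ThreadStatus`): the start hook runs on the new thread and registers a per-thread object under a mutex, and the exit hook, also running on that thread, finds its own entry through a thread-local pointer and erases it, so the owner keeps every object alive until its thread is truly gone.

```cpp
#include <list>
#include <memory>
#include <mutex>

struct PerThreadState { /* counters, allocation hooks, ... */ };
thread_local PerThreadState * current_state = nullptr;

struct Registry
{
    std::mutex mutex;
    std::list<std::shared_ptr<PerThreadState>> states;

    void onThreadStart()
    {
        auto state = std::make_shared<PerThreadState>();
        current_state = state.get();  // runs on the new thread itself
        std::lock_guard lock(mutex);
        states.emplace_back(std::move(state));
    }

    void onThreadExit()
    {
        std::lock_guard lock(mutex);  // erase exactly our own entry
        states.remove_if([](const auto & s) { return s.get() == current_state; });
    }
};
```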
From 983303243b06819d0ea1405ecf16710815325a32 Mon Sep 17 00:00:00 2001
From: Azat Khuzhin
Date: Thu, 8 Oct 2020 01:58:31 +0300
Subject: [PATCH 58/72] Fix 01514_distributed_cancel_query_on_error flakiness

---
 .../0_stateless/01514_distributed_cancel_query_on_error.sh | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/tests/queries/0_stateless/01514_distributed_cancel_query_on_error.sh b/tests/queries/0_stateless/01514_distributed_cancel_query_on_error.sh
index a24bb00fdf3..f251ff6138b 100755
--- a/tests/queries/0_stateless/01514_distributed_cancel_query_on_error.sh
+++ b/tests/queries/0_stateless/01514_distributed_cancel_query_on_error.sh
@@ -9,12 +9,12 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 # max_block_size to fail faster
 # max_memory_usage/_shard_num/repeat() will allow failure on the first shard earlier.
 opts=(
-    --max_memory_usage=3G
+    --max_memory_usage=1G
     --max_block_size=50
     --max_threads=1
     --max_distributed_connections=2
 )
-${CLICKHOUSE_CLIENT} "${opts[@]}" -q "SELECT groupArray(repeat('a', 1000*_shard_num)), number%100000 k from remote('127.{2,3}', system.numbers) GROUP BY k LIMIT 10e6" |& {
+${CLICKHOUSE_CLIENT} "${opts[@]}" -q "SELECT groupArray(repeat('a', if(_shard_num == 2, 100000, 1))), number%100000 k from remote('127.{2,3}', system.numbers) GROUP BY k LIMIT 10e6" |& {
     # the query should fail earlier on 127.3 and 127.2 should not even go to the memory limit exceeded error.
     fgrep -q 'DB::Exception: Received from 127.3:9000. DB::Exception: Memory limit (for query) exceeded:'
     # while if this does not work correctly, it will get the exception from 127.2:9000 and fail

From b3238c2765a7a120463a2af6b23386ab0c625d9e Mon Sep 17 00:00:00 2001
From: Artemeey
Date: Thu, 8 Oct 2020 08:10:17 +0300
Subject: [PATCH 59/72] Update index.md

fix links
---
 docs/ru/sql-reference/table-functions/index.md | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/docs/ru/sql-reference/table-functions/index.md b/docs/ru/sql-reference/table-functions/index.md
index 3a6b3660060..b8c3e51619d 100644
--- a/docs/ru/sql-reference/table-functions/index.md
+++ b/docs/ru/sql-reference/table-functions/index.md
@@ -24,14 +24,14 @@ toc_title: "\u0412\u0432\u0435\u0434\u0435\u043D\u0438\u0435"
 
 | Функция               | Описание                                                                                                                                    |
 |-----------------------|---------------------------------------------------------------------------------------------------------------------------------------------|
+| [file](file.md)       | Создаёт таблицу с движком [File](../../engines/table-engines/special/file.md).                                                              |
+| [merge](merge.md)     | Создаёт таблицу с движком [Merge](../../engines/table-engines/special/merge.md).                                                            |
+| [numbers](numbers.md) | Создаёт таблицу с единственным столбцом, заполненным целыми числами.                                                                        |
+| [remote](remote.md)   | Предоставляет доступ к удалённым серверам, не создавая таблицу с движком [Distributed](../../engines/table-engines/special/distributed.md). |
+| [url](url.md)         | Создаёт таблицу с движком [Url](../../engines/table-engines/special/url.md).                                                                |
+| [mysql](mysql.md)     | Создаёт таблицу с движком [MySQL](../../engines/table-engines/integrations/mysql.md).                                                       |
+| [jdbc](jdbc.md)       | Создаёт таблицу с движком [JDBC](../../engines/table-engines/integrations/jdbc.md).                                                         |
+| [odbc](odbc.md)       | Создаёт таблицу с движком [ODBC](../../engines/table-engines/integrations/odbc.md).                                                         |
+| [hdfs](hdfs.md)       | Создаёт таблицу с движком [HDFS](../../engines/table-engines/integrations/hdfs.md).                                                         |
 
 [Оригинальная статья](https://clickhouse.tech/docs/ru/query_language/table_functions/)

From dd7d46052e6416a5b80458d40854fad15807554a Mon Sep 17 00:00:00 2001
From: vic <1018595261@qq.com>
Date: Thu, 8 Oct 2020 15:30:14 +0800
Subject: [PATCH 60/72] add tcp client for php

add tcp client for php
---
 docs/en/interfaces/third-party/client-libraries.md | 1 +
 1 file changed, 1 insertion(+)

diff --git a/docs/en/interfaces/third-party/client-libraries.md b/docs/en/interfaces/third-party/client-libraries.md
index 9dbbe0a0022..128c3009f5e 100644
--- a/docs/en/interfaces/third-party/client-libraries.md
+++ b/docs/en/interfaces/third-party/client-libraries.md
@@ -20,6 +20,7 @@ toc_title: Client Libraries
     - [simpod/clickhouse-client](https://packagist.org/packages/simpod/clickhouse-client)
     - [seva-code/php-click-house-client](https://packagist.org/packages/seva-code/php-click-house-client)
     - [SeasClick C++ client](https://github.com/SeasX/SeasClick)
+    - [one-ck](https://github.com/lizhichao/one-ck)
 - Go
     - [clickhouse](https://github.com/kshvakov/clickhouse/)
     - [go-clickhouse](https://github.com/roistat/go-clickhouse)

From f953c1be878a3e26b90202718d23d283f293f6d8 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Thu, 8 Oct 2020 13:48:53 +0300
Subject: [PATCH 61/72] Disable cassandra tests with thread sanitizer

---
 .../test_cassandra.py | 15 +++++++++++----
 1 file changed, 11 insertions(+), 4 deletions(-)

diff --git a/tests/integration/test_dictionaries_all_layouts_separate_sources/test_cassandra.py b/tests/integration/test_dictionaries_all_layouts_separate_sources/test_cassandra.py
index c6b2ed370f4..2d54d846169 100644
--- a/tests/integration/test_dictionaries_all_layouts_separate_sources/test_cassandra.py
+++ b/tests/integration/test_dictionaries_all_layouts_separate_sources/test_cassandra.py
@@ -42,7 +42,8 @@ def setup_module(module):
     dictionaries = []
     main_configs = []
     main_configs.append(os.path.join('configs', 'disable_ssl_verification.xml'))
-
+    main_configs.append(os.path.join('configs', 'log_conf.xml'))
+
     for fname in os.listdir(DICT_CONFIG_PATH):
         dictionaries.append(os.path.join(DICT_CONFIG_PATH, fname))
 
@@ -69,14 +70,20 @@ def started_cluster():
     finally:
         cluster.shutdown()
 
+# We have a lot of race conditions in cassandra library
+# https://github.com/ClickHouse/ClickHouse/issues/15754.
+# TODO fix them and enable tests as soon as possible.
 @pytest.mark.parametrize("layout_name", LAYOUTS_SIMPLE)
 def test_simple(started_cluster, layout_name):
-    simple_tester.execute(layout_name, node)
+    if not node.is_built_with_thread_sanitizer():
+        simple_tester.execute(layout_name, node)
 
 @pytest.mark.parametrize("layout_name", LAYOUTS_COMPLEX)
 def test_complex(started_cluster, layout_name):
-    complex_tester.execute(layout_name, node)
+    if not node.is_built_with_thread_sanitizer():
+        complex_tester.execute(layout_name, node)
 
 @pytest.mark.parametrize("layout_name", LAYOUTS_RANGED)
 def test_ranged(started_cluster, layout_name):
-    ranged_tester.execute(layout_name, node)
+    if not node.is_built_with_thread_sanitizer():
+        ranged_tester.execute(layout_name, node)

From 3a2068f19d02ef42f890314721cb9dcbef14be5a Mon Sep 17 00:00:00 2001
From: Pervakov Grigorii
Date: Mon, 5 Oct 2020 20:43:38 +0300
Subject: [PATCH 62/72] Use tmp disk for vertical merge files

---
 .../MergeTree/MergeTreeDataMergerMutator.cpp         | 13 +++++++------
 src/Storages/MergeTree/MergeTreeDataMergerMutator.h  |  1 +
 src/Storages/StorageMergeTree.cpp                    |  2 +-
 src/Storages/StorageReplicatedMergeTree.cpp          |  2 +-
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
index 73b3fe698cc..fb0a488700c 100644
--- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
+++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.cpp
@@ -624,6 +624,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor
     MergeList::Entry & merge_entry,
     TableLockHolder &,
     time_t time_of_merge,
+    const Context & context,
     const ReservationPtr & space_reservation,
     bool deduplicate)
 {
@@ -706,7 +707,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor
     MergeAlgorithm merge_alg = chooseMergeAlgorithm(parts, sum_input_rows_upper_bound, gathering_columns, deduplicate, need_remove_expired_values);
     merge_entry->merge_algorithm = merge_alg;
 
-    LOG_DEBUG(log, "Selected MergeAlgorithm: {}", ((merge_alg == MergeAlgorithm::Vertical) ? "Vertical" : "Horizontal"));
+    LOG_DEBUG(log, "Selected MergeAlgorithm: {}", toString(merge_alg));
 
     /// Note: this is done before creating input streams, because otherwise data.data_parts_mutex
     /// (which is locked in data.getTotalActiveSizeInBytes())
@@ -715,7 +716,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor
     /// deadlock is impossible.
     auto compression_codec = data.getCompressionCodecForPart(merge_entry->total_size_bytes_compressed, new_data_part->ttl_infos, time_of_merge);
 
-    /// TODO: Should it go through IDisk interface?
+    auto tmp_disk = context.getTemporaryVolume()->getDisk();
     String rows_sources_file_path;
     std::unique_ptr<WriteBufferFromFileBase> rows_sources_uncompressed_write_buf;
     std::unique_ptr<WriteBuffer> rows_sources_write_buf;
 
     if (merge_alg == MergeAlgorithm::Vertical)
     {
-        disk->createDirectories(new_part_tmp_path);
+        tmp_disk->createDirectories(new_part_tmp_path);
         rows_sources_file_path = new_part_tmp_path + "rows_sources";
-        rows_sources_uncompressed_write_buf = disk->writeFile(rows_sources_file_path);
+        rows_sources_uncompressed_write_buf = tmp_disk->writeFile(rows_sources_file_path);
         rows_sources_write_buf = std::make_unique<CompressedWriteBuffer>(*rows_sources_uncompressed_write_buf);
 
         for (const MergeTreeData::DataPartPtr & part : parts)
@@ -955,7 +956,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor
                 + ") differs from number of bytes written to rows_sources file (" + toString(rows_sources_count)
                 + "). It is a bug.", ErrorCodes::LOGICAL_ERROR);
 
-        CompressedReadBufferFromFile rows_sources_read_buf(disk->readFile(rows_sources_file_path));
+        CompressedReadBufferFromFile rows_sources_read_buf(tmp_disk->readFile(rows_sources_file_path));
         IMergedBlockOutputStream::WrittenOffsetColumns written_offset_columns;
 
         for (size_t column_num = 0, gathering_column_names_size = gathering_column_names.size();
@@ -1027,7 +1028,7 @@ MergeTreeData::MutableDataPartPtr MergeTreeDataMergerMutator::mergePartsToTempor
             merge_entry->progress.store(progress_before + column_sizes->columnWeight(column_name), std::memory_order_relaxed);
         }
 
-        disk->remove(rows_sources_file_path);
+        tmp_disk->remove(rows_sources_file_path);
     }
 
     for (const auto & part : parts)
diff --git a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h
index a865d5f0cb6..0ad525d1901 100644
--- a/src/Storages/MergeTree/MergeTreeDataMergerMutator.h
+++ b/src/Storages/MergeTree/MergeTreeDataMergerMutator.h
@@ -114,6 +114,7 @@ public:
         MergeListEntry & merge_entry,
         TableLockHolder & table_lock_holder,
         time_t time_of_merge,
+        const Context & context,
        const ReservationPtr & space_reservation,
         bool deduplicate);
 
diff --git a/src/Storages/StorageMergeTree.cpp b/src/Storages/StorageMergeTree.cpp
index 6edfd5551ed..0f1afe1bd62 100644
--- a/src/Storages/StorageMergeTree.cpp
+++ b/src/Storages/StorageMergeTree.cpp
@@ -744,7 +744,7 @@ bool StorageMergeTree::merge(
     try
     {
         new_part = merger_mutator.mergePartsToTemporaryPart(
-            future_part, metadata_snapshot, *merge_entry, table_lock_holder, time(nullptr),
+            future_part, metadata_snapshot, *merge_entry, table_lock_holder, time(nullptr), global_context,
             merging_tagger->reserved_space, deduplicate);
 
         merger_mutator.renameMergedTemporaryPart(new_part, future_part.parts, nullptr);
diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index 900b9d9c8ee..66f06f305a1 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -1450,7 +1450,7 @@ bool StorageReplicatedMergeTree::tryExecuteMerge(const LogEntry & entry)
     {
         part = merger_mutator.mergePartsToTemporaryPart(
             future_merged_part, metadata_snapshot, *merge_entry,
-            table_lock, entry.create_time, reserved_space, entry.deduplicate);
+            table_lock, entry.create_time, global_context, reserved_space, entry.deduplicate);
 
         merger_mutator.renameMergedTemporaryPart(part, parts, &transaction);
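The essence of the change above in a few lines (a sketch restricted to the calls visible in the diff): the `rows_sources` scratch file now lives on the temporary volume's disk for its whole write, read, and remove lifecycle, rather than on the possibly remote disk that stores the merged part itself.

```cpp
// Scratch-file lifecycle on the temporary volume (illustrative):
auto tmp_disk = context.getTemporaryVolume()->getDisk();
tmp_disk->createDirectories(new_part_tmp_path);

String rows_sources_file_path = new_part_tmp_path + "rows_sources";
auto write_buf = tmp_disk->writeFile(rows_sources_file_path);
// ... the horizontal stage records one source-part index per merged row ...

auto read_buf = tmp_disk->readFile(rows_sources_file_path);
// ... the vertical stage gathers each non-key column using those row sources ...
tmp_disk->remove(rows_sources_file_path);
```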
From 8b4566ba755d722ada17d1ac1fb38b7ff52f04e0 Mon Sep 17 00:00:00 2001
From: Anton Popov
Date: Thu, 8 Oct 2020 14:37:39 +0300
Subject: [PATCH 63/72] Update skip_list.json

---
 tests/queries/skip_list.json | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/tests/queries/skip_list.json b/tests/queries/skip_list.json
index 541a5e2111f..714371034a9 100644
--- a/tests/queries/skip_list.json
+++ b/tests/queries/skip_list.json
@@ -100,5 +100,7 @@
         "00738_lock_for_inner_table"
     ],
     "polymorphic-parts": [
+        "01508_partition_pruning", /// bug, should be fixed
+        "01482_move_to_prewhere_and_cast" /// bug, should be fixed
     ]
 }

From 6065a5b65c9aff98032fd0bef89b7dcd09e08f2f Mon Sep 17 00:00:00 2001
From: tavplubix
Date: Thu, 8 Oct 2020 16:37:32 +0300
Subject: [PATCH 64/72] Update StorageReplicatedMergeTree.cpp

---
 src/Storages/StorageReplicatedMergeTree.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/Storages/StorageReplicatedMergeTree.cpp b/src/Storages/StorageReplicatedMergeTree.cpp
index a9669151958..b3de8d2d54f 100644
--- a/src/Storages/StorageReplicatedMergeTree.cpp
+++ b/src/Storages/StorageReplicatedMergeTree.cpp
@@ -724,7 +724,7 @@ void StorageReplicatedMergeTree::dropReplica(zkutil::ZooKeeperPtr zookeeper, con
     LOG_INFO(logger, "Removing replica {}, marking it as lost", remote_replica_path);
     /// Mark itself lost before removing, because the following recursive removal may fail
     /// and partially dropped replica may be considered as alive one (until someone will mark it lost)
-    zookeeper->set(zookeeper_path + "/replicas/" + replica + "/is_lost", "1");
+    zookeeper->trySet(zookeeper_path + "/replicas/" + replica + "/is_lost", "1");
     /// It may left some garbage if replica_path subtree are concurrently modified
     zookeeper->tryRemoveRecursive(remote_replica_path);
     if (zookeeper->exists(remote_replica_path))
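The one-call change above carries a real semantic difference (an illustrative contrast, assuming the throwing/non-throwing split this codebase's ZooKeeper wrapper uses elsewhere in the same function, e.g. `tryRemoveRecursive`): `set` fails loudly if the node is missing, while `trySet` reports failure through a return code, which suits a best-effort marker on a replica that is concurrently being torn down.

```cpp
// Before: an absent /is_lost node (e.g. removed concurrently) would throw
// and abort the whole drop:
//     zookeeper->set(remote_replica_path + "/is_lost", "1");
//
// After: best effort; a failure just yields a non-success code and the drop
// proceeds to the recursive removal below:
zookeeper->trySet(remote_replica_path + "/is_lost", "1");
```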
From bf36ffde33f52855a315cc777bfc861e3b67e1bc Mon Sep 17 00:00:00 2001
From: tavplubix
Date: Thu, 8 Oct 2020 16:48:23 +0300
Subject: [PATCH 65/72] Update 01194_http_query_id.sh

---
 tests/queries/0_stateless/01194_http_query_id.sh | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/tests/queries/0_stateless/01194_http_query_id.sh b/tests/queries/0_stateless/01194_http_query_id.sh
index ae81f267b07..5eb766bbe2a 100755
--- a/tests/queries/0_stateless/01194_http_query_id.sh
+++ b/tests/queries/0_stateless/01194_http_query_id.sh
@@ -6,11 +6,11 @@ CURDIR=$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)
 url="http://${CLICKHOUSE_HOST}:${CLICKHOUSE_PORT_HTTP}/?session_id=test_01194"
 rnd=$RANDOM
 
-${CLICKHOUSE_CURL} -sS "$url&query=SELECT+$rnd,1" > /dev/null
-${CLICKHOUSE_CURL} -sS "$url&query=SELECT+$rnd,2" > /dev/null
-${CLICKHOUSE_CURL} -sS "$url" --data "SELECT $rnd,3" > /dev/null
-${CLICKHOUSE_CURL} -sS "$url" --data "SELECT $rnd,4" > /dev/null
+${CLICKHOUSE_CURL} -sS "$url&query=SELECT+'test_01194',$rnd,1" > /dev/null
+${CLICKHOUSE_CURL} -sS "$url&query=SELECT+'test_01194',$rnd,2" > /dev/null
+${CLICKHOUSE_CURL} -sS "$url" --data "SELECT 'test_01194',$rnd,3" > /dev/null
+${CLICKHOUSE_CURL} -sS "$url" --data "SELECT 'test_01194',$rnd,4" > /dev/null
 
 ${CLICKHOUSE_CURL} -sS "$url" --data "SYSTEM FLUSH LOGS"
 
-${CLICKHOUSE_CURL} -sS "$url&query=SELECT+count(DISTINCT+query_id)+FROM+system.query_log+WHERE+query+LIKE+'SELECT+$rnd%25'"
+${CLICKHOUSE_CURL} -sS "$url&query=SELECT+count(DISTINCT+query_id)+FROM+system.query_log+WHERE+query+LIKE+'SELECT+''test_01194'',$rnd%25'"

From de722cd57cb3ebd5dd516ba5e993969c29059cda Mon Sep 17 00:00:00 2001
From: alesapin
Date: Thu, 8 Oct 2020 18:50:09 +0300
Subject: [PATCH 66/72] Fix race condition in MySQL pool

---
 base/mysqlxx/Pool.h                                |  5 ++++-
 .../test_dictionaries_mysql/configs/log_conf.xml   | 12 ++++++++++++
 tests/integration/test_dictionaries_mysql/test.py  |  2 +-
 3 files changed, 17 insertions(+), 2 deletions(-)
 create mode 100644 tests/integration/test_dictionaries_mysql/configs/log_conf.xml

diff --git a/base/mysqlxx/Pool.h b/base/mysqlxx/Pool.h
index bf9365a064a..59d15e8c9a0 100644
--- a/base/mysqlxx/Pool.h
+++ b/base/mysqlxx/Pool.h
@@ -3,6 +3,7 @@
 #include <list>
 #include <memory>
 #include <mutex>
+#include <atomic>
 
 #include <Poco/Exception.h>
 #include <mysqlxx/Connection.h>
@@ -35,7 +36,9 @@ protected:
     struct Connection
     {
         mysqlxx::Connection conn;
-        int ref_count = 0;
+        /// Ref count modified in constructor/destructor of Entry
+        /// but also read in pool code.
+        std::atomic<int> ref_count = 0;
     };
 
 public:
diff --git a/tests/integration/test_dictionaries_mysql/configs/log_conf.xml b/tests/integration/test_dictionaries_mysql/configs/log_conf.xml
new file mode 100644
index 00000000000..b52d833cde8
--- /dev/null
+++ b/tests/integration/test_dictionaries_mysql/configs/log_conf.xml
@@ -0,0 +1,12 @@
+<yandex>
+    <logger>
+        <level>trace</level>
+        <log>/var/log/clickhouse-server/log.log</log>
+        <errorlog>/var/log/clickhouse-server/log.err.log</errorlog>
+        <size>1000M</size>
+        <count>10</count>
+        <stderr>/var/log/clickhouse-server/stderr.log</stderr>
+        <stdout>/var/log/clickhouse-server/stdout.log</stdout>
+    </logger>
+</yandex>
diff --git a/tests/integration/test_dictionaries_mysql/test.py b/tests/integration/test_dictionaries_mysql/test.py
index a8e91c94d00..16e432c6425 100644
--- a/tests/integration/test_dictionaries_mysql/test.py
+++ b/tests/integration/test_dictionaries_mysql/test.py
@@ -5,7 +5,7 @@ from helpers.cluster import ClickHouseCluster
 CONFIG_FILES = ['configs/dictionaries/mysql_dict1.xml', 'configs/dictionaries/mysql_dict2.xml',
                 'configs/remote_servers.xml']
-CONFIG_FILES += ['configs/enable_dictionaries.xml']
+CONFIG_FILES += ['configs/enable_dictionaries.xml', 'configs/log_conf.xml']
 
 cluster = ClickHouseCluster(__file__)
 instance = cluster.add_instance('instance', main_configs=CONFIG_FILES, with_mysql=True)

From 20b3de3d76ce542161eff4b1af7457a38dcc6167 Mon Sep 17 00:00:00 2001
From: robot-clickhouse
Date: Thu, 8 Oct 2020 20:38:31 +0300
Subject: [PATCH 67/72] Update version_date.tsv after release 20.9.3.45

---
 utils/list-versions/version_date.tsv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv
index 75605968a37..25f680c7779 100644
--- a/utils/list-versions/version_date.tsv
+++ b/utils/list-versions/version_date.tsv
@@ -1,3 +1,4 @@
+v20.9.3.45-stable	2020-10-08
 v20.9.2.20-stable	2020-09-22
 v20.8.3.18-stable	2020-09-18
 v20.8.2.3-stable	2020-09-08

From 70f2a3328d7537f1358a946d0bff37bc184b2ba8 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Thu, 8 Oct 2020 20:45:18 +0300
Subject: [PATCH 68/72] Better atomic operations on refcount

---
 base/mysqlxx/Pool.cpp | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/base/mysqlxx/Pool.cpp b/base/mysqlxx/Pool.cpp
index 99815363a56..78d72e57b81 100644
--- a/base/mysqlxx/Pool.cpp
+++ b/base/mysqlxx/Pool.cpp
@@ -21,8 +21,8 @@ void Pool::Entry::incrementRefCount()
 {
     if (!data)
         return;
-    ++data->ref_count;
-    if (data->ref_count == 1)
+    /// First reference, initialize thread
+    if (data->ref_count.fetch_add(1) == 0)
         mysql_thread_init();
 }
 
@@ -30,12 +30,9 @@ void Pool::Entry::decrementRefCount()
 {
     if (!data)
         return;
-    if (data->ref_count > 0)
-    {
-        --data->ref_count;
-        if (data->ref_count == 0)
-            mysql_thread_end();
-    }
+
+    if (data->ref_count.fetch_sub(1) == 1)
+        mysql_thread_end();
 }
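For clarity on the invariant behind the new `fetch_add`/`fetch_sub` calls (a minimal model, not the mysqlxx code itself): both return the value held *before* the operation, so comparing against `0` detects the first reference and against `1` the last one in a single atomic step, leaving no separate read-then-modify window for another thread to race through.

```cpp
#include <atomic>

std::atomic<int> ref_count{0};

void on_first_use();  // e.g. mysql_thread_init()
void on_last_use();   // e.g. mysql_thread_end()

void acquire()
{
    // fetch_add returns the previous value: 0 means we created the first reference.
    if (ref_count.fetch_add(1) == 0)
        on_first_use();
}

void release()
{
    // fetch_sub returns the previous value: 1 means we dropped the last reference.
    if (ref_count.fetch_sub(1) == 1)
        on_last_use();
}
```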
From 16fa71a4c96ef50124bb14dd0197113d6893c351 Mon Sep 17 00:00:00 2001
From: alesapin
Date: Thu, 8 Oct 2020 20:46:59 +0300
Subject: [PATCH 69/72] Missed comment

---
 base/mysqlxx/Pool.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/base/mysqlxx/Pool.cpp b/base/mysqlxx/Pool.cpp
index 78d72e57b81..d845570f1f2 100644
--- a/base/mysqlxx/Pool.cpp
+++ b/base/mysqlxx/Pool.cpp
@@ -31,6 +31,7 @@ void Pool::Entry::decrementRefCount()
     if (!data)
         return;
 
+    /// We were the last user of this thread, deinitialize it
     if (data->ref_count.fetch_sub(1) == 1)
         mysql_thread_end();
 }

From 17b953ae6d6ef343fac6a19267bdaaea3c93bcc9 Mon Sep 17 00:00:00 2001
From: robot-clickhouse
Date: Fri, 9 Oct 2020 00:00:56 +0300
Subject: [PATCH 70/72] Update version_date.tsv after release 20.3.20.6

---
 utils/list-versions/version_date.tsv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv
index 25f680c7779..23eb5776251 100644
--- a/utils/list-versions/version_date.tsv
+++ b/utils/list-versions/version_date.tsv
@@ -21,6 +21,7 @@ v20.4.5.36-stable 2020-06-10
 v20.4.4.18-stable	2020-05-26
 v20.4.3.16-stable	2020-05-23
 v20.4.2.9-stable	2020-05-12
+v20.3.20.6-lts	2020-10-09
 v20.3.19.4-lts	2020-09-18
 v20.3.18.10-lts	2020-09-08
 v20.3.17.173-lts	2020-08-15

From 9aff247afed1479f95a4714d699c1ddaefc75499 Mon Sep 17 00:00:00 2001
From: robot-clickhouse
Date: Fri, 9 Oct 2020 00:04:34 +0300
Subject: [PATCH 71/72] Update version_date.tsv after release 20.7.4.11

---
 utils/list-versions/version_date.tsv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv
index 23eb5776251..96589e48e74 100644
--- a/utils/list-versions/version_date.tsv
+++ b/utils/list-versions/version_date.tsv
@@ -2,6 +2,7 @@ v20.9.3.45-stable 2020-10-08
 v20.9.2.20-stable	2020-09-22
 v20.8.3.18-stable	2020-09-18
 v20.8.2.3-stable	2020-09-08
+v20.7.4.11-stable	2020-10-09
 v20.7.3.7-stable	2020-09-18
 v20.7.2.30-stable	2020-08-31
 v20.6.7.4-stable	2020-09-18

From af0b96466af3edb47fa082f31f01950f0b0c9a18 Mon Sep 17 00:00:00 2001
From: robot-clickhouse
Date: Fri, 9 Oct 2020 02:39:25 +0300
Subject: [PATCH 72/72] Update version_date.tsv after release 20.8.4.11

---
 utils/list-versions/version_date.tsv | 1 +
 1 file changed, 1 insertion(+)

diff --git a/utils/list-versions/version_date.tsv b/utils/list-versions/version_date.tsv
index 96589e48e74..2873a4fd6e6 100644
--- a/utils/list-versions/version_date.tsv
+++ b/utils/list-versions/version_date.tsv
@@ -1,5 +1,6 @@
 v20.9.3.45-stable	2020-10-08
 v20.9.2.20-stable	2020-09-22
+v20.8.4.11-lts	2020-10-09
 v20.8.3.18-stable	2020-09-18
 v20.8.2.3-stable	2020-09-08